In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Load the cleaned review table and derive a monthly period key from its year/month columns.
review_path = os.path.relpath('../data/filtered/cleaned_review_modi_3.csv')
review = pd.read_csv(review_path)
# to_datetime needs a day component, so every review is pinned to the 1st of its month first.
first_of_month = pd.to_datetime(review[['year', 'month']].assign(DAY=1))
review['month_year'] = first_of_month.dt.to_period('M')

### Make timeseries data

In [126]:
def make_lags(df, lags, cols=None):
    """Append per-business shifted copies of feature columns to ``df``.

    For every ``lag`` the frame is grouped by ``business_id`` and shifted by
    ``lag`` rows within each group; the shifted columns are renamed to
    ``f'{col}_lag_{lag}'`` and appended to the right of ``df``. Negative lags
    therefore pull values from later rows (leads).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that can be grouped by ``'business_id'`` (column or index level).
        Rows are shifted positionally, so they must already be ordered in time
        within each business.
    lags : iterable of int
        Row offsets to generate. An empty iterable returns a copy of ``df``.
    cols : list of column labels, optional
        Columns to lag. ``None`` (default) lags every non-grouping column,
        which is what this function always did before ``cols`` was honoured.

    Returns
    -------
    pandas.DataFrame
        ``df`` followed by one block of lagged columns per entry in ``lags``.

    Notes
    -----
    When ``df`` has tuple (MultiIndex) column labels, the lagged names embed the
    tuple's repr, e.g. ``"('stars', 'mean')_lag_1"``.
    """
    by_business = df.groupby('business_id')
    lagged_frames = []
    for lag in lags:
        shifted = by_business.shift(lag)
        if cols is not None:
            # Restrict after shifting so the grouping key never has to be part of `cols`.
            shifted = shifted[list(cols)]
        shifted.columns = [f'{col}_lag_{lag}' for col in shifted.columns]
        lagged_frames.append(shifted)
    if not lagged_frames:
        # pd.concat raises on an empty list; no lags simply means no extra columns.
        return df.copy()
    return pd.concat([df, *lagged_frames], axis=1)

In [262]:
# Monthly summary statistics computed for each business from the raw reviews.
summary_stats = ['mean', 'std', 'max']
aggregations = {
    'stars': list(summary_stats),
    'text': ['count'],
    'useful': list(summary_stats),
    'funny': list(summary_stats),
    'cool': list(summary_stats),
}

# One row per (business_id, month_year); columns are (variable, statistic) tuples.
monthly_groups = review.groupby(['business_id', 'month_year'])
timeseries = monthly_groups.agg(aggregations)
timeseries = timeseries.sort_index(level=1).sort_index(level=0)

# Add the previous 1..12 rows of every statistic per business, then expose the keys as columns.
# NOTE(review): lags are row offsets, so they equal month offsets only if no business skips a month - confirm.
lagged_df = make_lags(timeseries, np.arange(1, 13)).reset_index()

In [263]:
# Preview the first rows of the lagged monthly feature table.
lagged_df.head()

Unnamed: 0,business_id,month_year,"(stars, mean)","(stars, std)","(stars, max)","(text, count)","(useful, mean)","(useful, std)","(useful, max)","(funny, mean)",...,"('text', 'count')_lag_12","('useful', 'mean')_lag_12","('useful', 'std')_lag_12","('useful', 'max')_lag_12","('funny', 'mean')_lag_12","('funny', 'std')_lag_12","('funny', 'max')_lag_12","('cool', 'mean')_lag_12","('cool', 'std')_lag_12","('cool', 'max')_lag_12"
0,--9e1ONYQuAa-CB_Rrw7Tw,2015-01,4.666667,0.492366,5,12,2.666667,7.547827,26,1.5,...,,,,,,,,,,
1,--9e1ONYQuAa-CB_Rrw7Tw,2015-02,4.125,0.957427,5,16,2.25,4.654747,18,2.1875,...,,,,,,,,,,
2,--9e1ONYQuAa-CB_Rrw7Tw,2015-03,4.307692,1.182132,5,13,0.538462,0.877058,3,0.461538,...,,,,,,,,,,
3,--9e1ONYQuAa-CB_Rrw7Tw,2015-04,4.153846,0.898717,5,13,0.384615,0.869718,3,0.230769,...,,,,,,,,,,
4,--9e1ONYQuAa-CB_Rrw7Tw,2015-05,4.642857,0.744946,5,14,0.071429,0.267261,1,0.071429,...,,,,,,,,,,


### Make business data

In [234]:
#attach business data

businesses = pd.read_csv('../data/raw/yelp_business.csv')
# Keep only businesses that appear in the review table; reset the index so it
# lines up positionally with the category matrix built below.
mybusinesses = businesses[businesses['business_id'].isin(review['business_id'])].reset_index(drop=True)

#vectorize categories

from sklearn.feature_extraction.text import CountVectorizer

# Categories are a ';'-separated string per business. binary=True yields 0/1
# indicators and min_df=10 drops categories carried by fewer than 10 businesses.
# token_pattern=None: the custom tokenizer replaces the default regex, and leaving
# the pattern set makes scikit-learn warn that it is unused.
# NOTE(review): a missing `categories` value would fail inside the tokenizer - confirm the column has no NaN.
vectorizer = CountVectorizer(
    tokenizer=lambda x: x.split(';'),
    token_pattern=None,
    binary=True,
    min_df=10,
)
category_vec = vectorizer.fit_transform(mybusinesses['categories'])
category_df = pd.DataFrame(category_vec.toarray(), columns=vectorizer.get_feature_names_out())

#business_vars
# Static per-business attributes joined with the category indicators on the shared positional index.
business_vars = mybusinesses[['business_id', 'latitude', 'longitude', 'review_count']]
business_df = business_vars.merge(category_df, left_index=True, right_index=True)


In [235]:
# Display the raw business table read from yelp_business.csv.
businesses

Unnamed: 0,business_id,name,neighborhood,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories
0,FYWN1wneV18bWNgQjJ2GNg,"""Dental by Design""",,"""4855 E Warner Rd, Ste B9""",Ahwatukee,AZ,85044,33.330690,-111.978599,4.0,22,1,Dentists;General Dentistry;Health & Medical;Or...
1,He-G7vWjzVUysIKrfNbPUQ,"""Stephen Szabo Salon""",,"""3101 Washington Rd""",McMurray,PA,15317,40.291685,-80.104900,3.0,11,1,Hair Stylists;Hair Salons;Men's Hair Salons;Bl...
2,KQPW8lFf1y5BT2MxiSZ3QA,"""Western Motor Vehicle""",,"""6025 N 27th Ave, Ste 1""",Phoenix,AZ,85017,33.524903,-112.115310,1.5,18,1,Departments of Motor Vehicles;Public Services ...
3,8DShNS-LuFqpEWIp0HxijA,"""Sports Authority""",,"""5000 Arizona Mills Cr, Ste 435""",Tempe,AZ,85282,33.383147,-111.964725,3.0,9,0,Sporting Goods;Shopping
4,PfOCPjBrlQAnz__NXj9h_w,"""Brick House Tavern + Tap""",,"""581 Howe Ave""",Cuyahoga Falls,OH,44221,41.119535,-81.475690,3.5,116,1,American (New);Nightlife;Bars;Sandwiches;Ameri...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
174562,ALV5R8NkZ1KGOZeuZl3u0A,"""Whitby Toyota""",,"""1025 Dundas Street W""",Whitby,ON,L1P 1Z1,43.873460,-78.968033,4.0,4,1,Car Dealers;Automotive
174563,gRGalHVu6BcaUDIAGVW_xQ,"""Village Auto Body""",,"""3957 Brecksville Rd""",Richfield,OH,44286,41.243385,-81.636212,5.0,3,1,Body Shops;Automotive
174564,XXvZBIHoJBU5d6-a-oyMWQ,"""AAM""",,"""1600 W Broadway Rd, Ste 200""",Tempe,AZ,85282,33.407914,-111.965098,1.5,19,1,Home Services;Property Management;Real Estate
174565,lNpPGgM96nPIYM1shxciHg,"""Bronze Beauty Spray Tanning""",,"""300 Camp Horne Rd, Ste 250""",Pittsburgh,PA,15202,40.517724,-80.091466,5.0,14,1,Spray Tanning;Tanning;Beauty & Spas


In [236]:
# Preview the per-business attributes joined with the category indicator columns.
business_df.head()

Unnamed: 0,business_id,latitude,longitude,review_count,american (new),american (traditional),arts & entertainment,asian fusion,bars,breakfast & brunch,...,hotels & travel,italian,mexican,nightlife,resorts,restaurants,salad,sandwiches,seafood,steakhouses
0,frCxZS7lPhEnQRJ3UY6m7A,33.469201,-112.047393,1694,0,0,0,0,1,1,...,0,0,1,1,0,1,0,0,0,0
1,aiX_WP7NKPTdF9CfI-M-wg,33.428533,-111.943955,984,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,7m1Oa1VYV98UUuo_6i0EZg,33.639913,-111.995703,1019,0,1,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
3,5iSmZO0SrKU6EoXK_1M8Kw,36.104614,-115.176021,2507,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,JmI9nslLD7KZqRr__Bg6NQ,33.435943,-112.011026,2215,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Make Review data

In [237]:
# # transformers
# from sentence_transformers import SentenceTransformer, models, InputExample

# model = SentenceTransformer('all-MiniLM-L6-v2')

In [238]:
# review_encodings = model.encode(review['text'])

In [239]:
# review_encodings.shape

In [240]:
# #pickle the encoding
# import pickle
# import os
# with open(os.path.relpath('../data/filtered/review_encodings_5.pkl'), 'wb') as f:
#     pickle.dump(review_encodings, f)


In [147]:
#load the encoding
# Reads the object previously pickled to review_encodings_5.pkl.
# SECURITY: pickle.load can execute arbitrary code; only load files produced by this project.
# NOTE(review): the pickle is suffixed `_5` while the review CSV loaded earlier is `_3` -
# confirm both come from the same filtered review set, since rows are matched by position later.
import pickle
import os
with open(os.path.relpath('../data/filtered/review_encodings_5.pkl'), 'rb') as f:
    review_encodings = pickle.load(f)

In [149]:
# One row of embedding values per review, tagged with that review's business and month.
review_encodings_df = pd.DataFrame(review_encodings)
# The encodings carry no key of their own; they match `review` only by row position,
# so a length mismatch would silently attach the wrong business/month to an embedding.
if len(review_encodings_df) != len(review):
    raise ValueError(
        f'review_encodings has {len(review_encodings_df)} rows but review has {len(review)}; '
        'the encodings were not computed from this review table'
    )
# Reset the index so the assignment is positional even if `review` was filtered or re-indexed.
review_keys = review[['business_id', 'month_year']].reset_index(drop=True)
review_encodings_df['business_id'] = review_keys['business_id']
review_encodings_df['month_year'] = review_keys['month_year']

In [156]:
# Mean embedding per business and month. After the groupby only the integer-labelled
# embedding columns remain, so they are renamed with a callable instead of a hard-coded
# range(384); the result is identical for 384 dimensions and correct for any other width.
review_text_df = (
    review_encodings_df
    .groupby(['business_id', 'month_year'])
    .mean()
    .rename(columns=lambda dim: f'encoding_{dim}')
    .reset_index()
)

### (PCA?)

In [158]:
#TODO

In [170]:
#make y
# Target: each business's mean star rating in its next row (lag -1 is a one-row lead).
next_month = make_lags(timeseries, [-1])

# make_lags names lagged columns f'{col}_lag_{lag}', and the columns of `timeseries`
# are (variable, statistic) tuples, so the lead of the monthly mean rating is
# "('stars', 'mean')_lag_-1". The old flat name 'stars_lag_-1' raises KeyError.
target_col = f"{('stars', 'mean')}_lag_-1"

# Keep months from 2016-01 onward and drop 2017-12, which is excluded as having no following month.
# NOTE(review): a business whose last row is earlier than 2017-12 still gets a NaN target - confirm none exist.
months = next_month.index.get_level_values(1)
in_window = (months >= '2016-01') & (months != '2017-12')

next_month_avg_stars = next_month[in_window][[target_col]]
next_month_avg_stars = next_month_avg_stars.rename(columns={target_col: 'next_month_avg_stars'})
next_month_avg_stars = next_month_avg_stars.reset_index()

In [171]:
# Preview the target table: one next-month average rating per (business_id, month_year).
next_month_avg_stars.head()

Unnamed: 0,business_id,month_year,next_month_avg_stars
0,--9e1ONYQuAa-CB_Rrw7Tw,2016-01,3.882353
1,--9e1ONYQuAa-CB_Rrw7Tw,2016-02,3.833333
2,--9e1ONYQuAa-CB_Rrw7Tw,2016-03,4.2
3,--9e1ONYQuAa-CB_Rrw7Tw,2016-04,4.3125
4,--9e1ONYQuAa-CB_Rrw7Tw,2016-05,4.0


### Combine data

In [264]:
# Assemble the modelling table: lagged monthly statistics + static business attributes
# + monthly mean text embedding + next-month target. All joins use the default inner
# merge, so a row survives only if it has a match in every table.
month_keys = ['business_id', 'month_year']
final_df = (
    lagged_df
    .merge(business_df, on='business_id')
    .merge(review_text_df, on=month_keys)
    .merge(next_month_avg_stars, on=month_keys)
    # Order rows chronologically; the position-based time-series CV below relies on it.
    .sort_values(by=['month_year'])
)

In [265]:
# Preview the merged modelling table (sorted by month_year in the previous cell).
final_df.head()

Unnamed: 0,business_id,month_year,"(stars, mean)","(stars, std)","(stars, max)","(text, count)","(useful, mean)","(useful, std)","(useful, max)","(funny, mean)",...,encoding_375,encoding_376,encoding_377,encoding_378,encoding_379,encoding_380,encoding_381,encoding_382,encoding_383,next_month_avg_stars
0,--9e1ONYQuAa-CB_Rrw7Tw,2016-01,4.285714,0.82542,5,14,0.642857,0.841897,2,0.357143,...,0.020849,0.023258,-0.011667,0.057943,0.045613,0.026869,-0.015658,-0.074155,0.014165,3.882353
1909,ZCzey5aPhd7jYIoHsUfjmQ,2016-01,4.357143,0.744946,5,14,0.428571,0.646206,2,0.071429,...,0.003887,-0.013277,-0.055648,0.057769,0.012789,0.012412,0.040131,-0.078325,0.024494,4.166667
2944,u4sTiCzVeIHZY8OlaL346Q,2016-01,4.8,0.472789,5,35,0.771429,2.365011,13,0.257143,...,-0.005144,0.005249,-0.049,0.095854,0.037362,0.013046,-0.007392,-0.095087,0.025261,4.8
345,5iSmZO0SrKU6EoXK_1M8Kw,2016-01,2.40625,1.499664,5,32,1.34375,2.088515,8,0.3125,...,-0.03476,0.028057,0.01144,0.023033,0.016932,-0.00658,-0.03629,-0.094897,0.013865,2.548387
1357,NvKNe9DnQavC9GstglcBJQ,2016-01,3.862069,1.216674,5,29,0.344828,0.768852,3,0.137931,...,-0.012263,0.027756,-0.020646,0.056686,0.021643,-0.006877,0.000797,-0.079847,0.006258,3.684211


### Make a model

In [276]:
# Column-name bookkeeping for the feature groups.
base_ = ['stars', 'useful', 'funny', 'cool']
stats_ = ['mean', 'std', 'max']

# (variable, statistic) pairs of the monthly aggregates; the review count goes last.
base_stats = [(variable, statistic) for variable in base_ for statistic in stats_]
base_stats.append(('text', 'count'))

# Formatting a tuple in an f-string yields its repr, e.g. "('stars', 'mean')_lag_1".
lag_stats = [
    f'{pair}_lag_{offset}'
    for pair in base_stats
    for offset in range(1, 13)
]

encoding_vars = [f'encoding_{dim}' for dim in range(384)]
business_vars = ['latitude', 'longitude', 'review_count']

# Identifier and target columns that must never be fed to a model as features.
drop_cols = ['business_id', 'month_year', 'next_month_avg_stars']


In [275]:
# timeseries test train split
from sklearn.model_selection import TimeSeriesSplit

# Training/validation rows: every month up to and including 2017-01.
train_rows = final_df[final_df['month_year'] <= '2017-01']
X = train_rows.drop(columns=drop_cols)
y = train_rows['next_month_avg_stars']

# NOTE(review): TimeSeriesSplit cuts by row position and `gap` counts rows, not months;
# with many businesses per month a fold boundary can fall inside a month - confirm this is intended.
tscv = TimeSeriesSplit(n_splits=5, gap=1)

In [297]:
# make pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import GridSearchCV

# Robust-scaled ElasticNet; the scaler sits inside the pipeline so it is refit on each training fold.
pipe = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('regressor', ElasticNet()),
])

# Small grid over regularisation strength and L1/L2 mix.
param_grid = {
    'regressor__alpha': [0.05, 0.1, 0.15],
    'regressor__l1_ratio': [0.15, 0.2, 0.25],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

grid.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




In [295]:
# Hyper-parameters of the best candidate found by the grid search above.
grid.best_params_

{'regressor__alpha': 0.1, 'regressor__l1_ratio': 0.2}

In [298]:
grid.best_score_
# Scores noted from earlier runs. The search scores with neg_mean_squared_error, so
# best_score_ is negative; these notes are presumably the same quantity with the sign dropped.
#   all vars, <=2017-01:    0.11033964063557497
#   no text:                0.10870816146219967
#   no text, robust scaler: 0.10981041968616204

-0.1109364511156111

In [247]:
# Use 2017-02 as test set
holdout = final_df[final_df['month_year'] == '2017-02']
# The target label is appended again in case drop_cols does not already list it.
X_test = holdout.drop(columns=drop_cols + ['next_month_avg_stars'])
y_test = holdout['next_month_avg_stars']

# grid.score returns negated MSE (the search's scoring), hence the sign flip.
print("MSE: ", -grid.score(X_test, y_test))

# Rsquared

from sklearn.metrics import r2_score

holdout_predictions = grid.predict(X_test)
print("R-squared", r2_score(y_test, holdout_predictions))


MSE:  0.09627118371180983
R-squared 0.8056025145274551




In [None]:
# make pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import GridSearchCV

# Lasso variant of the linear search. Running this cell rebinds `pipe`, `param_grid`
# and `grid`, so any later use of `grid` refers to this Lasso search.
pipe = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('regressor', Lasso()),
])

# Only the regularisation strength is tuned (Lasso has no l1_ratio).
param_grid = {
    'regressor__alpha': [0.05, 0.1, 0.15],
}

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

grid.fit(X, y)

In [248]:
# xgboost pipeline
from xgboost import XGBRegressor

# Standard-scaled gradient-boosted trees, tuned with the same time-series CV as the linear models.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', XGBRegressor()),
])

# Only tree depth and L1 regularisation vary; the single-value entries pin the rest.
# The 'regressor' entry swaps in a fresh XGBRegressor, which is why the estimator
# itself shows up in best_params_.
param_grid = {
    'regressor': [XGBRegressor()],
    'regressor__max_depth': [2, 3],
    'regressor__learning_rate': [0.1],
    'regressor__n_estimators': [100],
    'regressor__subsample': [1],
    'regressor__colsample_bytree': [1],
    'regressor__reg_alpha': [0.1, 0.2, 0.3],
}

xgb_grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)

xgb_grid.fit(X, y)

Fitting 5 folds for each of 6 candidates, totalling 30 fits




In [249]:
# Hyper-parameters of the best XGBoost candidate found by the grid search above.
xgb_grid.best_params_

{'regressor': XGBRegressor(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=None, gpu_id=None,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=2, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...),
 'regressor__colsample_bytree': 1,
 'regressor__learning_rate': 0.1,
 'regressor__max_depth': 2,
 'regressor__n_estimators': 100,
 'regressor__reg_alpha': 0.3,
 'regressor__subsample': 1}

In [251]:
# Score noted from a previous run, kept for comparison:
#before best_score:  -0.11816271278176989
# best_score_ is the mean cross-validated neg_mean_squared_error of the best candidate.
print('best_score: ',xgb_grid.best_score_)

best_score:  -0.1178208430922646


In [200]:
# Use 2017-02 as test set
holdout = final_df[final_df['month_year'] == '2017-02']
# The target label is appended again in case drop_cols does not already list it.
X_test = holdout.drop(columns=drop_cols + ['next_month_avg_stars'])
y_test = holdout['next_month_avg_stars']

# xgb_grid.score returns negated MSE (the search's scoring), hence the sign flip.
print("MSE: ", -xgb_grid.score(X_test, y_test))

# Rsquared

from sklearn.metrics import r2_score

holdout_predictions = xgb_grid.predict(X_test)
print("R-squared", r2_score(y_test, holdout_predictions))


MSE:  0.09915165015707351
R-squared 0.7997860758761601


### Some experiments with removing data

In [210]:
# defining all the variables
# Feature groups for the ablation experiments below.
# NOTE(review): these flat names ('stars', 'stars_lag_1', ...) differ from the tuple-derived
# labels shown in the feature-table previews above (e.g. "('text', 'count')_lag_12") -
# confirm which column layout of X this section expects.
encoding_vars = [f'encoding_{dim}' for dim in range(384)]
business_vars = ['latitude', 'longitude', 'review_count']
base_vars = ['stars', 'text', 'useful', 'funny', 'cool']
lag_vars = [
    f'{name}_lag_{offset}'
    for name in base_vars
    for offset in range(1, 13)
]

In [212]:
from sklearn.model_selection import cross_val_score

# Fixed-hyper-parameter pipelines reused across the feature-ablation runs.
# NOTE(review): alpha=0.2 / l1_ratio=0.1 differ from the best_params_ printed for the
# ElasticNet grid search above (alpha=0.1, l1_ratio=0.2) - confirm which setting is intended.
best_lr_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(alpha=0.2, l1_ratio=0.1)),
])

best_xgb_pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regressor', XGBRegressor(
        max_depth=2,
        learning_rate=0.1,
        n_estimators=100,
        subsample=1,
        colsample_bytree=1,
        reg_alpha=0.3,
    )),
])

In [215]:
# cross validation with lr
print("LINEAR REGRESSION")

# Progressively larger feature sets, each scored with the same pipeline and CV splitter.
lr_feature_sets = [
    ("--BASE VARIABLES--", base_vars),
    ("--BASE VARIABLES + LAG --", base_vars + lag_vars),
    ("--BASE VARIABLES + LAG + BUSINESS --", base_vars + lag_vars + business_vars),
    ("--BASE VARIABLES + LAG + BUSINESS + ENCODING --", base_vars + lag_vars + business_vars + encoding_vars),
]

for heading, feature_cols in lr_feature_sets:
    print(heading)
    fold_scores = cross_val_score(best_lr_pipe, X[feature_cols], y, cv=tscv, scoring='neg_mean_squared_error')
    # Scores are negated MSE; flip the sign to report plain MSE.
    print("Validation MSE: ", -fold_scores.mean())

LINEAR REGRESSION
--BASE VARIABLES--
Validation MSE:  0.17005834629774377
--BASE VARIABLES + LAG --
Validation MSE:  0.10855356493616326
--BASE VARIABLES + LAG + BUSINESS --
Validation MSE:  0.10858142365729599
--BASE VARIABLES + LAG + BUSINESS + ENCODING --
Validation MSE:  0.11042032446526269


In [216]:
# cross validation with xgboost
print("XGBOOST")

# Progressively larger feature sets, each scored with the same pipeline and CV splitter.
xgb_feature_sets = [
    ("--BASE VARIABLES--", base_vars),
    ("--BASE VARIABLES + LAG --", base_vars + lag_vars),
    ("--BASE VARIABLES + LAG + BUSINESS --", base_vars + lag_vars + business_vars),
    ("--BASE VARIABLES + LAG + BUSINESS + ENCODING --", base_vars + lag_vars + business_vars + encoding_vars),
]

for heading, feature_cols in xgb_feature_sets:
    print(heading)
    fold_scores = cross_val_score(best_xgb_pipe, X[feature_cols], y, cv=tscv, scoring='neg_mean_squared_error')
    # Scores are negated MSE; flip the sign to report plain MSE.
    print("Validation MSE: ", -fold_scores.mean())

XGBOOST
--BASE VARIABLES--
Validation MSE:  0.1583786732343033
--BASE VARIABLES + LAG --
Validation MSE:  0.11875605548882755
--BASE VARIABLES + LAG + BUSINESS --
Validation MSE:  0.11841144503330532
--BASE VARIABLES + LAG + BUSINESS + ENCODING --
Validation MSE:  0.1179281796148168


### Polynomial Features?

In [217]:
#what about polynomial features?

from sklearn.preprocessing import PolynomialFeatures

# Same ElasticNet as before, with a default PolynomialFeatures expansion after scaling.
lr_pipe_poly = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('regressor', ElasticNet(alpha=0.2, l1_ratio=0.1)),
])

print("LINEAR REGRESSION")

poly_feature_sets = [
    ("--BASE VARIABLES--", base_vars),
    ("--BASE VARIABLES + LAG --", base_vars + lag_vars),
    ("--BASE VARIABLES + LAG + BUSINESS --", base_vars + lag_vars + business_vars),
    ("--BASE VARIABLES + LAG + BUSINESS + ENCODING --", base_vars + lag_vars + business_vars + encoding_vars),
]

for heading, feature_cols in poly_feature_sets:
    print(heading)
    fold_scores = cross_val_score(lr_pipe_poly, X[feature_cols], y, cv=tscv, scoring='neg_mean_squared_error')
    # Scores are negated MSE; flip the sign to report plain MSE.
    print("Validation MSE: ", -fold_scores.mean())



LINEAR REGRESSION
--BASE VARIABLES--
Validation MSE:  0.1655505755034922
--BASE VARIABLES + LAG --
Validation MSE:  0.1156018647213222
--BASE VARIABLES + LAG + BUSINESS --
Validation MSE:  0.11598222656486028
--BASE VARIABLES + LAG + BUSINESS + ENCODING --
Validation MSE:  0.12229862864061951


### Pipe robust scaling

In [227]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler

# Robust-scaled ElasticNet whose hyper-parameters are re-tuned by an inner grid search.
lr_pipe_robscale = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('regressor', ElasticNet()),
])

lr_robscale_params = {
    'regressor__alpha': [0.1, 0.2, 0.3],
    'regressor__l1_ratio': [0.1, 0.2, 0.3],
}

lr_pipe_grid = GridSearchCV(
    lr_pipe_robscale,
    param_grid=lr_robscale_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
)

In [228]:
print("LINEAR REGRESSION")

# Nested CV: the outer cross_val_score folds each refit the inner grid search.
lr_robscale_feature_sets = [
    ("--BASE VARIABLES--", base_vars),
    ("--BASE VARIABLES + LAG --", base_vars + lag_vars),
    ("--BASE VARIABLES + LAG + BUSINESS --", base_vars + lag_vars + business_vars),
    ("--BASE VARIABLES + LAG + BUSINESS + ENCODING --", base_vars + lag_vars + business_vars + encoding_vars),
]

for heading, feature_cols in lr_robscale_feature_sets:
    print(heading)
    fold_scores = cross_val_score(lr_pipe_grid, X[feature_cols], y, cv=tscv, scoring='neg_mean_squared_error')
    # Scores are negated MSE; flip the sign to report plain MSE.
    print("Validation MSE: ", -fold_scores.mean())

LINEAR REGRESSION
--BASE VARIABLES--
Validation MSE:  0.16434474390592543
--BASE VARIABLES + LAG --
Validation MSE:  0.10854690108165466
--BASE VARIABLES + LAG + BUSINESS --
Validation MSE:  0.10912188466022181
--BASE VARIABLES + LAG + BUSINESS + ENCODING --
Validation MSE:  0.11163840206273816


In [229]:
# Robust-scaled XGBoost whose size and depth are re-tuned by an inner grid search.
xgb_pipe_robscale = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('regressor', XGBRegressor()),
])

xgb_robscale_params = {
    'regressor__n_estimators': [50, 100],
    'regressor__max_depth': [2, 3],
}

xgb_pipe_grid = GridSearchCV(
    xgb_pipe_robscale,
    param_grid=xgb_robscale_params,
    cv=tscv,
    scoring='neg_mean_squared_error',
)

In [230]:
print("XGBOOST")

# Nested CV: the outer cross_val_score folds each refit the inner grid search.
xgb_robscale_feature_sets = [
    ("--BASE VARIABLES--", base_vars),
    ("--BASE VARIABLES + LAG --", base_vars + lag_vars),
    ("--BASE VARIABLES + LAG + BUSINESS --", base_vars + lag_vars + business_vars),
    ("--BASE VARIABLES + LAG + BUSINESS + ENCODING --", base_vars + lag_vars + business_vars + encoding_vars),
]

for heading, feature_cols in xgb_robscale_feature_sets:
    print(heading)
    fold_scores = cross_val_score(xgb_pipe_grid, X[feature_cols], y, cv=tscv, scoring='neg_mean_squared_error')
    # Scores are negated MSE; flip the sign to report plain MSE.
    print("Validation MSE: ", -fold_scores.mean())

XGBOOST
--BASE VARIABLES--
Validation MSE:  0.16354528044442693
--BASE VARIABLES + LAG --
Validation MSE:  0.12869094691370947
--BASE VARIABLES + LAG + BUSINESS --
Validation MSE:  0.12685908712788121
--BASE VARIABLES + LAG + BUSINESS + ENCODING --
Validation MSE:  0.1286304890168465
