In [2]:
import numpy as np
import pandas as pd
#viz
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, KernelPCA
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, FastText
import sklearn.feature_extraction.text as sk_text # TFIDF

# preprocess function

In [4]:
# remove numbers
def remove_num(text):
    text_nonum = re.sub(r'\d+', '', text)
    text_nopunct = "".join([char.lower() for char in str(text_nonum) if char not in string.punctuation])
    # substitute multiple whitespace with single whitespace
    # Also, removes leading and trailing whitespaces
    text_no_doublespace = re.sub('\s+', ' ', text_nopunct).strip()
    return text_nonum

# remove_special_characters
def remove_special_characters(text):
    # define the pattern to keep
    pat = r'[^a-zA-z0-9.,!?/:;\"\'\s]' 
    return re.sub(pat, '', text)

# remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if str(i) not in string.punctuation])
    return punctuationfree

#  tokenization
def tokenization(text):
    tokens = re.split(' +',text)
    return tokens

# remove stopwords
nltk.download('stopwords')
my_stopwords = stopwords.words('english')
my_stopwords.remove('be') # BE -> Back Ends
def rm_stopwords(text):
    return [i for i in text if i not in my_stopwords]

# lemmatization
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

# remove_extra_whitespace_tabs
def remove_extra_whitespace_tabs(text):
    #pattern = r'^\s+$|\s+$'
    pattern = r'^\s*|\s\s*'
    return re.sub(pattern, ' ', text).strip()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/qinwenw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/qinwenw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/qinwenw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
review = pd.read_csv('./data/filtered/cleaned_review_modi_5.csv')
review['month_year'] = pd.to_datetime(review[['year', 'month']].assign(DAY=1)).dt.to_period('M')

In [6]:
# drop reviews with the same useful level
# keep the last/ most recent one
review_single = review.drop_duplicates(['business_id','year','month','useful'], keep='last')
# select the most useful review
# for each business for each year_month
idx = review_single.groupby(['business_id','year','month'])['useful'].transform(max) == review_single['useful']
review_most_useful = review_single[idx]
review_most_useful = review_most_useful.drop(columns={'review_id','date','year','month','stars','useful','funny','cool'})
review_most_useful = review_most_useful.sort_values(['business_id','month_year']).reset_index(drop=True)

In [7]:
review_most_useful.head()

Unnamed: 0,business_id,text,month_year
0,--9e1ONYQuAa-CB_Rrw7Tw,"I'm hardly ever in Vegas, and when I am here i...",2015-01
1,--9e1ONYQuAa-CB_Rrw7Tw,I'll start with this: I'm not a fan of Emeril ...,2015-02
2,--9e1ONYQuAa-CB_Rrw7Tw,Celebrated Labor Day weekend in Vegas and was ...,2015-03
3,--9e1ONYQuAa-CB_Rrw7Tw,"I love love love Delmonico! I love the staff, ...",2015-04
4,--9e1ONYQuAa-CB_Rrw7Tw,Decided to try lunch today. I have had the Rib...,2015-05


In [8]:
review_most_useful.loc[:,('text')] = review_most_useful.loc[:,('text')].apply(
        lambda x: lemmatization(rm_stopwords(tokenization(remove_punctuation(remove_extra_whitespace_tabs(remove_num(remove_special_characters(x.lower())))))))
    )
review_most_useful.loc[:,('text')] = review_most_useful.loc[:,('text')].apply(lambda x: ' '.join(x))

In [9]:
review_most_useful.head()

Unnamed: 0,business_id,text,month_year
0,--9e1ONYQuAa-CB_Rrw7Tw,im hardly ever vega rage forage hah hah must a...,2015-01
1,--9e1ONYQuAa-CB_Rrw7Tw,ill start im fan emeril lagasse dont know ive ...,2015-02
2,--9e1ONYQuAa-CB_Rrw7Tw,celebrated labor day weekend vega looking good...,2015-03
3,--9e1ONYQuAa-CB_Rrw7Tw,love love love delmonico love staff food ambia...,2015-04
4,--9e1ONYQuAa-CB_Rrw7Tw,decided try lunch today rib steak loved today ...,2015-05


## TFIDF

In [10]:
# vectorizer = TfidfVectorizer()
vectorizer = sk_text.TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
                                     stop_words= 'english',ngram_range=(1,1))
vectors = vectorizer.fit_transform(review_most_useful['text'])
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
df_tfidf = pd.DataFrame(denselist, columns=feature_names)



In [11]:
df_tfidf.head()

Unnamed: 0,able,absolutely,actual,actually,add,added,addition,additional,afternoon,ago,...,year,yelp,yes,york,youd,youll,youre,youve,yum,yummy
0,0.0,0.262122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.131745,0.044476,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.049307,0.0,0.066665,0.0,0.20052,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# reduce to 5 dimentions
n=5
pca = PCA(n_components=n)
text_pca = pca.fit_transform(df_tfidf)
df_pca = pd.DataFrame(text_pca)
df_pca.columns = ['text_pca'+str(i) for i in range(1,n+1)]

In [13]:
df =  pd.concat([review_most_useful, df_pca], axis=1)
df = df.drop(columns='text')
# df.to_csv("./data/filtered/cleaned_review_modi_3_tfidf.csv",index=False)

In [14]:
df.head()

Unnamed: 0,business_id,month_year,text_pca1,text_pca2,text_pca3,text_pca4,text_pca5
0,--9e1ONYQuAa-CB_Rrw7Tw,2015-01,0.037837,-0.022495,0.048339,-0.126852,-0.086118
1,--9e1ONYQuAa-CB_Rrw7Tw,2015-02,-0.040089,-0.038087,0.064798,0.120718,-0.064648
2,--9e1ONYQuAa-CB_Rrw7Tw,2015-03,-0.106095,-0.019336,0.144173,0.049278,-0.071613
3,--9e1ONYQuAa-CB_Rrw7Tw,2015-04,-0.017845,-0.033601,0.018613,-0.067188,-0.084294
4,--9e1ONYQuAa-CB_Rrw7Tw,2015-05,-0.070687,-0.033032,0.034046,0.019868,-0.073832


## BERT

In [15]:
import torch
from transformers import AutoTokenizer, AutoModel

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [16]:
review_most_useful.head()

Unnamed: 0,business_id,text,month_year
0,--9e1ONYQuAa-CB_Rrw7Tw,im hardly ever vega rage forage hah hah must a...,2015-01
1,--9e1ONYQuAa-CB_Rrw7Tw,ill start im fan emeril lagasse dont know ive ...,2015-02
2,--9e1ONYQuAa-CB_Rrw7Tw,celebrated labor day weekend vega looking good...,2015-03
3,--9e1ONYQuAa-CB_Rrw7Tw,love love love delmonico love staff food ambia...,2015-04
4,--9e1ONYQuAa-CB_Rrw7Tw,decided try lunch today rib steak loved today ...,2015-05


In [17]:
tokenized_review = tokenizer(review_most_useful['text'].values.tolist(),
                             padding = True, truncation = True, return_tensors="pt")

#move on device (GPU)
tokenized_review = {k:torch.tensor(v).to(device) for k,v in tokenized_review.items()}

  tokenized_review = {k:torch.tensor(v).to(device) for k,v in tokenized_review.items()}


In [18]:
tokenized_review

{'input_ids': tensor([[  101, 10047,  6684,  ...,     0,     0,     0],
         [  101,  5665,  2707,  ...,     0,     0,     0],
         [  101,  6334,  4450,  ...,     0,     0,     0],
         ...,
         [  101,  2307,  4157,  ...,     0,     0,     0],
         [  101,  2307,  2312,  ...,     0,     0,     0],
         [  101,  2173,  8966,  ...,     0,     0,     0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [None]:
with torch.no_grad():
    hidden_review = model(**tokenized_review) #dim : [batch_size(nr_sentences), tokens, emb_dim]

#get only the [CLS] hidden states
cls_review = tokenized_review.last_hidden_state[:,0,:]

In [None]:
n = 5
pca = PCA(n_components=n)
description_pca = pca.fit_transform(cls_review)
temp = pd.DataFrame(description_pca)
temp.columns = ['Description_pca'+str(i) for i in range(1,description_n+1)]
df =  pd.concat([df, temp], axis=1)

# Model

In [None]:
# timeseries = review1.join(review2)
timeseries = review3.join(review2)

In [None]:
timeseries.head()

In [None]:
def make_lags(df, lags, cols=['stars', 'text', 'useful', 'funny', 'cool', 'sentiment_score']):
    lag_array = []
    for lag in lags:
        lagged = df.groupby('business_id').shift(lag)
        lagged.columns = [f'{col}_lag_{lag}' for col in lagged.columns]
        lag_array.append(lagged)
    lags = pd.concat(lag_array, axis=1)
    return pd.concat([df, lags], axis=1)

In [None]:
lagged_df = make_lags(timeseries, np.arange(1, 13))

In [None]:
lagged_df

In [None]:
next_month = make_lags(timeseries, [-1])
next_month_avg_stars = next_month[(next_month.index.get_level_values(1) >= '2016-01') &
                                  (next_month.index.get_level_values(1) != '2017-12')][['stars_lag_-1']].rename(
    columns={'stars_lag_-1': 'next_month_avg_stars'})
final_lagged_df = pd.concat([lagged_df[(lagged_df.index.get_level_values(1) >= '2016-01') &
                                       (lagged_df.index.get_level_values(1) != '2017-12')], next_month_avg_stars], axis=1)

#attach business data

businesses = pd.read_csv('./data/yelp_business.csv')
mybusinesses = businesses[businesses['business_id'].isin(review['business_id'])].reset_index(drop=True)

useful_vars = mybusinesses[['business_id', 'latitude', 'longitude', 'review_count']]

final_df = final_lagged_df.reset_index().merge(useful_vars,
                        on='business_id').set_index(['business_id', 'month_year']).sort_index(level=1)

In [None]:
final_df

In [None]:
# timeseries test train split
from sklearn.model_selection import TimeSeriesSplit

X = final_lagged_df[final_lagged_df.index.get_level_values(1) <= '2017-07'].drop('next_month_avg_stars', axis=1)
y = final_lagged_df[final_lagged_df.index.get_level_values(1) <= '2017-07']['next_month_avg_stars']

tscv = TimeSeriesSplit(n_splits=5, gap=1)

In [None]:
# make pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.model_selection import GridSearchCV

warnings.simplefilter("ignore", UserWarning)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet())
])

param_grid = {
    #'regressor__fit_intercept': [True, False],
    'regressor__alpha': [0, 0.1, 0.2],
    'regressor__l1_ratio': [0.05, 0.1, 0.15]
}

grid = GridSearchCV(pipe, param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

grid.fit(X, y)

In [None]:
grid.best_params_

In [None]:
grid.best_score_

In [None]:
# Use 2017-08 as test set
test_X = final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08'].drop('next_month_avg_stars', axis=1)
test_y = final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08']['next_month_avg_stars']

print("MSE: ", -grid.score(test_X, test_y))

# Rsquared

from sklearn.metrics import r2_score

print("R-squared", r2_score(test_y, grid.predict(test_X)))

**without** sentiment

MSE = 0.11009040821693132

R^2 = 0.7860813104464417

**with** sentimetent

MSE = 0.13907533021966362

R^2 = 0.7398459301388225

**TFIDF**

MSE = 0.14055024452036893

R^2 = 0.7370869580233588

In [None]:
# xgboost pipeline
from xgboost import XGBRegressor

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', XGBRegressor())
])

param_grid = {
    'regressor': [XGBRegressor()],
    'regressor__max_depth': [2,3],
    'regressor__learning_rate': [0.1],
    'regressor__n_estimators': [100],
    'regressor__subsample': [1],
    'regressor__colsample_bytree': [1],
    'regressor__reg_alpha': [0.1, 0.2, 0.3],
}

xgb_grid = GridSearchCV(pipe, param_grid, cv=tscv, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)

xgb_grid.fit(X, y)

In [None]:
xgb_grid.best_params_

In [None]:
#.250 - max depth 2
print('best_score: ',xgb_grid.best_score_)

**without** sentiment:

best_score:  -0.11382132077111447

**with** sentiment:

best_score:  -0.1416801342235987

**TFIDF**:

best_score:  -0.14559988200558513

In [None]:
# Use 2017-08 as test set

test_X = final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08'].drop('next_month_avg_stars', axis=1)
test_y = final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08']['next_month_avg_stars']

print("MSE: ", -xgb_grid.score(test_X, test_y))

# Rsquared

from sklearn.metrics import r2_score

print("R-squared", r2_score(test_y, xgb_grid.predict(test_X)))

**without** sentiment:

MSE:  0.11338180969045816

R-squared 0.7796857279300784

**with** sentiment:

MSE:  0.13901107527057768

R-squared 0.7399661253343832

In [None]:
# Baseline model

#using the average 2017 rating
avg_2017_pred = final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08']['stars'].mean()

print("MSE with average of all:", mean_squared_error(final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08']['next_month_avg_stars'], np.repeat(avg_2017_pred, 143)))

#using the average rating for each business

print("MSE with previous month of each business:", mean_squared_error(final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08']['next_month_avg_stars'], final_lagged_df[final_lagged_df.index.get_level_values(1) == '2017-08']['stars']))
