In [1]:
import numpy as np
import pandas as pd
import sqlite3
#viz
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the pre-filtered CSV extracts; all paths are relative to the working directory.
attribute=pd.read_csv("./data/filtered/business_attributes_on.csv")
hour=pd.read_csv("./data/filtered/business_hours_on.csv")
# NOTE(review): this reads check_in_on.csv but binds it to `business_hours`, while the
# business-hours file above is bound to `hour` -- confirm the variable name is intended.
business_hours=pd.read_csv("./data/filtered/check_in_on.csv")
restaurant=pd.read_csv("./data/filtered/res_on.csv")
# `review` is the frame handed to clean_df() in the text-cleaning section below.
review=pd.read_csv("./data/filtered/review_res_on.csv")
tip=pd.read_csv("./data/filtered/tip_on.csv")
user=pd.read_csv("./data/filtered/user_res_on.csv")

In [236]:
import warnings

# The import location of SettingWithCopyWarning differs between pandas releases,
# so try the public `pandas.errors` location first, then the legacy private one.
# If neither exists there is nothing to silence and the filter is skipped.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        SettingWithCopyWarning = None

# Silence the chained-assignment and deprecation noise for the rest of the notebook.
if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
warnings.simplefilter(action="ignore", category=FutureWarning)

## Preprocess

In [3]:
def clean_df(review, years=(2016, 2017), min_reviews=20):
    """Keep reviews from ``years`` for businesses that are well-reviewed in every one of them.

    Parameters
    ----------
    review : pandas.DataFrame
        Needs the columns ``business_id``, ``date``, ``stars``, ``text``,
        ``useful``, ``funny`` and ``cool``. The frame is NOT modified.
    years : iterable of int, default (2016, 2017)
        Calendar years to keep. Rows from any other year are dropped.
    min_reviews : int, default 20
        A business is kept only if it has at least this many reviews in
        *each* of ``years``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``review`` (original index preserved) restricted to ``years``
        and to the qualifying businesses, with the columns
        ``business_id, date, year, month, stars, text, useful, funny, cool``.
    """
    years = list(years)

    # Build the derived columns on a new frame with plain column assignment:
    # writing through ``.loc[:, 'date'] = ...`` mutates the caller's frame and
    # can leave the column as object dtype, which breaks the ``.dt`` accessor.
    dates = pd.to_datetime(review['date'])
    cleaned = review.assign(date=dates, year=dates.dt.year, month=dates.dt.month)

    keep_columns = ['business_id', 'date', 'year', 'month', 'stars',
                    'text', 'useful', 'funny', 'cool']
    cleaned = cleaned.loc[cleaned['year'].isin(years), keep_columns]

    # Intersect, across all requested years, the businesses with enough reviews.
    qualifying = None
    for yr in years:
        counts = cleaned.loc[cleaned['year'] == yr, 'business_id'].value_counts()
        enough = set(counts[counts >= min_reviews].index)
        qualifying = enough if qualifying is None else qualifying & enough
    if qualifying is None:  # no years requested -> nothing can qualify
        qualifying = set()

    return cleaned[cleaned['business_id'].isin(list(qualifying))]

In [4]:
def export_df(dataframe, years=(2016,)):
    """Pivot cleaned reviews into one row per business with per-year review frames.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Output of ``clean_df``; must contain ``business_id``, ``year``,
        ``month``, ``date`` and ``stars``.
    years : iterable of int, default (2016,)
        Years to export. Each year yields two columns, ``'<year>_text'`` and
        ``'<year>_rate'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by business id (order of first appearance). ``'<year>_text'``
        holds a DataFrame of that business's reviews for the year (without the
        ``business_id``, ``year``, ``month`` and ``date`` columns) and
        ``'<year>_rate'`` holds the mean of ``stars`` for the same rows
        (NaN when the business has no reviews that year).
    """
    dataframe = dataframe.drop(columns=['month', 'date'])
    years = list(years)
    names = [f'{yr}_{kind}' for yr in years for kind in ('text', 'rate')]
    new_df = pd.DataFrame(columns=names)

    # One pass over the data; sort=False keeps businesses in first-appearance order.
    for res, df_res in dataframe.groupby('business_id', sort=False):
        res_info = []
        for yr in years:
            df_res_yr = df_res[df_res['year'] == yr]
            res_info.append(df_res_yr.drop(columns=['business_id', 'year']))
            res_info.append(df_res_yr['stars'].mean())

        # Each cell of the *_text columns stores a whole DataFrame object.
        new_df.loc[res] = res_info

    return new_df

In [5]:
def export_df_17(dataframe, year=2017):
    """Pivot one year of cleaned reviews into per-business, per-month columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Output of ``clean_df``; must contain ``business_id``, ``date``,
        ``year``, ``month`` and ``stars``.
    year : int, default 2017
        Year whose twelve months are exported.

    Returns
    -------
    pandas.DataFrame
        Indexed by business id (order of first appearance) with 24 columns
        ``'<year>_MM_text'`` / ``'<year>_MM_rate'`` for MM = 01..12.
        ``*_text`` holds a DataFrame of that month's reviews (without
        ``business_id``, ``date``, ``year``, ``month``); ``*_rate`` holds the
        mean of ``stars`` for the month, NaN when there are no reviews.
    """
    months = range(1, 13)
    names2 = [f'{year}_{month:02d}_{kind}' for month in months for kind in ('text', 'rate')]
    new_df_2017 = pd.DataFrame(columns=names2)

    # One pass over the data; sort=False keeps businesses in first-appearance order.
    for res, df_res in dataframe.groupby('business_id', sort=False):
        df_res_year = df_res[df_res['year'] == year]
        res_info = []

        for month in months:
            df_month = df_res_year[df_res_year['month'] == month]
            res_info.append(df_month.drop(columns=['business_id', 'date', 'year', 'month']))
            res_info.append(df_month['stars'].mean())

        # Each cell of the *_text columns stores a whole DataFrame object.
        new_df_2017.loc[res] = res_info

    return new_df_2017

clean text

In [87]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA, KernelPCA
import re
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models import Word2Vec, FastText

In [88]:
# Filter the raw `review` frame with clean_df (defined in the Preprocess section);
# every later cell works from `clean_review`.
clean_review = clean_df(review)

In [89]:
# remove numbers
def remove_num(text):
    """Return ``text`` with every run of decimal digits removed.

    Only digits are stripped. Lower-casing, punctuation removal and whitespace
    normalisation are handled by the other helpers in the pipeline; an earlier
    version computed them here but never returned the result.
    """
    return re.sub(r'\d+', '', text)

# remove_special_characters
def remove_special_characters(text):
    """Drop every character that is not a letter, digit, whitespace or one of ``. , ! ? / : ; " '``."""
    # Characters to keep. The range must be ``A-Z``: ``A-z`` also spans the
    # ASCII symbols between 'Z' and 'a' ([ \ ] ^ _ `) and would let them through.
    pat = r'[^a-zA-Z0-9.,!?/:;\"\'\s]'
    return re.sub(pat, '', text)

# remove punctuation
def remove_punctuation(text):
    """Concatenate the items of ``text`` whose string form is not found in ``string.punctuation``."""
    kept = []
    for piece in text:
        if str(piece) in string.punctuation:
            continue
        kept.append(piece)
    return "".join(kept)

#  tokenization
def tokenization(text):
    """Split ``text`` on runs of one or more space characters and return the pieces as a list."""
    space_run = re.compile(' +')
    return space_run.split(text)

# remove stopwords
# Fetch the NLTK stopword corpus (the download call runs every time this cell runs)
# and take the English list as the base stopword list.
nltk.download('stopwords')
my_stopwords = stopwords.words('english')
# 'be' is taken out of the stopword list, so it survives rm_stopwords as a token.
# NOTE(review): the rationale below ("Back Ends") reads like it came from a different
# corpus -- confirm it is still wanted for restaurant reviews.
my_stopwords.remove('be') # BE -> Back Ends
def rm_stopwords(text):
    """Return, in order, the items of ``text`` that are not in ``my_stopwords``."""
    kept = []
    for token in text:
        if token in my_stopwords:
            continue
        kept.append(token)
    return kept

# lemmatization
# Fetch the WordNet and omw-1.4 NLTK resources (the download calls run every time
# this cell runs) and create the single lemmatizer instance used by lemmatization().
nltk.download('wordnet')
nltk.download('omw-1.4')
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    """Return a list with ``wordnet_lemmatizer.lemmatize`` applied to each item of ``text``."""
    return list(map(wordnet_lemmatizer.lemmatize, text))

# remove_extra_whitespace_tabs
def remove_extra_whitespace_tabs(text):
    """Collapse whitespace in ``text``.

    The leading-whitespace match and every whitespace run are each replaced by
    a single space, then the result is stripped at both ends.
    """
    whitespace_runs = re.compile(r'^\s*|\s\s*')
    collapsed = whitespace_runs.sub(' ', text)
    return collapsed.strip()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/qinwenw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/qinwenw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/qinwenw/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [90]:
# Normalise every review text in one pass. Steps run innermost-first:
# lowercase -> strip special characters -> strip digits -> collapse whitespace
# -> strip punctuation -> tokenise -> drop stopwords -> lemmatise,
# after which the tokens are re-joined into a single space-separated string.
clean_review.loc[:,('text')] = clean_review.loc[:,('text')].apply(
    lambda raw: ' '.join(
        lemmatization(
            rm_stopwords(
                tokenization(
                    remove_punctuation(
                        remove_extra_whitespace_tabs(
                            remove_num(
                                remove_special_characters(raw.lower())))))))
    )
)

In [91]:
clean_review.head()

Unnamed: 0,business_id,date,year,month,stars,text,useful,funny,cool
19,Tn8O4tv1U-n0PRC8kbJJPg,2016-04-06,2016,4,4,great place chinatown get banh mi make sure kn...,0,0,0
32,3RlylOY452bA8rwliPUeUQ,2017-01-04,2017,1,4,sister love place probably come week every two...,1,0,0
48,An-JJle53UMHokU4MwFktg,2017-01-26,2017,1,3,sister big fan noodle soup option msg way syst...,0,0,0
49,ZilzayEdyk70SoI-wruJbg,2017-01-04,2017,1,4,stop girlfriend whenever im back always order ...,0,0,0
54,kKNauD7d3HOvumRV7JIAfg,2016-04-01,2016,4,4,saw new place neighborhood decided try disappo...,1,0,0


join dataframes

In [92]:
# One row per business: yearly 2016 columns from export_df, and the twelve
# monthly 2017 column pairs from export_df_17. Both are indexed by business id.
review_2016= export_df(clean_review)
review_final_2017= export_df_17(clean_review)

In [93]:
# Index-on-index join: both frames are indexed by business id.
review_final = review_2016.join(review_final_2017)
# set business_id as a column
# (name the index here so that reset_index() below produces a 'business_id' column)
review_final.index.names = ['business_id']
# drop rows with na
## i.e. at least 1 review for each month in 2017
## (a month with no reviews has a NaN mean in its *_rate column, so that business is dropped)
review_final = review_final.dropna().reset_index()

In [94]:
review_final.head()

Unnamed: 0,business_id,2016_text,2016_rate,2017_01_text,2017_01_rate,2017_02_text,2017_02_rate,2017_03_text,2017_03_rate,2017_04_text,...,2017_08_text,2017_08_rate,2017_09_text,2017_09_rate,2017_10_text,2017_10_rate,2017_11_text,2017_11_rate,2017_12_text,2017_12_rate
0,An-JJle53UMHokU4MwFktg,stars ...,3.212121,stars ...,3.714286,stars ...,3.333333,stars ...,4.5,stars ...,...,stars ...,3.75,stars ...,4.0,stars ...,4.0,stars ...,3.0,stars ...,3.0
1,XCxxPZ3Lu5mwmIo7IQRf1g,stars ...,3.830986,stars ...,4.166667,stars ...,3.272727,stars ...,3.5,stars ...,...,stars ...,3.444444,stars ...,3.666667,stars ...,4.0,stars ...,4.0,stars ...,4.0
2,c78Pat78fVUBFPXYeVvbaQ,stars ...,3.666667,stars ...,4.0,stars ...,4.0,stars ...,5.0,stars ...,...,stars ...,4.25,stars ...,1.0,stars ...,3.833333,stars ...,2.5,stars ...,5.0
3,5N8R7ALESZ30EoAzVJtabw,stars ...,3.473118,stars ...,4.5,stars ...,4.8,stars ...,4.5,stars ...,...,stars ...,4.125,stars ...,3.0,stars ...,3.0,stars ...,4.0,stars ...,4.0
4,WFB1fn8rWNukmmIfTg6AMw,stars ...,4.826087,stars ...,4.0,stars ...,4.583333,stars ...,4.230769,stars ...,...,stars ...,4.714286,stars ...,4.333333,stars ...,4.888889,stars ...,4.0,stars ...,4.6


In [95]:
review_final['2016_text'][0].head() # first restaurants with all reviews

Unnamed: 0,stars,text,useful,funny,cool
10349,4,there much say yelpers havent goto soup noodle...,0,0,0
19989,3,visited friday night pm parking mayhem plaza p...,0,0,0
35512,4,restaurant specializes noodle soup provide lis...,0,0,0
48530,4,love noodle soup deer garden well pretty much ...,0,0,2
77778,4,place known noodle fish soup customize soup ba...,0,0,0


In [105]:
# Pool the 2016 reviews of every remaining restaurant into one training frame:
# the models are fit on individual review texts and ratings, not per restaurant.
frames = review_final['2016_text'].tolist()
restaurant_2016_text = pd.concat(frames)
len(restaurant_2016_text)

22448

In [106]:
restaurant_2016_text.head()

Unnamed: 0,stars,text,useful,funny,cool
10349,4,there much say yelpers havent goto soup noodle...,0,0,0
19989,3,visited friday night pm parking mayhem plaza p...,0,0,0
35512,4,restaurant specializes noodle soup provide lis...,0,0,0
48530,4,love noodle soup deer garden well pretty much ...,0,0,2
77778,4,place known noodle fish soup customize soup ba...,0,0,0


In [174]:
# January-2017 evaluation set. Take an explicit copy because a prediction column
# is assigned to this frame later, which on a bare column subset raises
# SettingWithCopyWarning. The per-restaurant DataFrames stored in '2017_01_text'
# are still the same objects as in review_final.
review_2017_jan = review_final[['business_id','2017_01_text','2017_01_rate']].copy()

In [175]:
review_2017_jan.head()

Unnamed: 0,business_id,2017_01_text,2017_01_rate
0,An-JJle53UMHokU4MwFktg,stars ...,3.714286
1,XCxxPZ3Lu5mwmIo7IQRf1g,stars ...,4.166667
2,c78Pat78fVUBFPXYeVvbaQ,stars ...,4.0
3,5N8R7ALESZ30EoAzVJtabw,stars ...,4.5
4,WFB1fn8rWNukmmIfTg6AMw,stars ...,4.0


In [121]:
# Replace the original review row labels with a fresh 0..n-1 index.
restaurant_2016_text = restaurant_2016_text.reset_index(drop=True)

In [122]:
restaurant_2016_text.head()

Unnamed: 0,stars,text,useful,funny,cool
0,4,there much say yelpers havent goto soup noodle...,0,0,0
1,3,visited friday night pm parking mayhem plaza p...,0,0,0
2,4,restaurant specializes noodle soup provide lis...,0,0,0
3,4,love noodle soup deer garden well pretty much ...,0,0,2
4,4,place known noodle fish soup customize soup ba...,0,0,0


## Sentiment Analysis

### 1. VADER

In [197]:
# Load SentimentIntensityAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Instantiate new SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Generate sentiment scores
# Work on a copy: a plain assignment would only alias restaurant_2016_text, so the
# VADER column would also appear on the frame used by the transformer section.
restaurant_2016_vader = restaurant_2016_text.copy()
# Each entry is the full polarity_scores() dict; the next cell reduces it to
# its 'compound' value.
restaurant_2016_vader['polarity_compound'] = restaurant_2016_vader['text'].apply(sid.polarity_scores)

In [198]:
# Reduce each VADER result dict to its 'compound' score.
# Values that are no longer dicts are passed through unchanged, so re-running
# this cell is harmless. Plain column assignment replaces the column outright,
# giving it a numeric dtype instead of keeping the old object dtype.
restaurant_2016_vader['polarity_compound'] = restaurant_2016_vader['polarity_compound'].apply(
    lambda x: x['compound'] if isinstance(x, dict) else x)

In [199]:
restaurant_2016_vader.head()

Unnamed: 0,stars,text,useful,funny,cool,text_polarity,polarity_compound
0,4,there much say yelpers havent goto soup noodle...,0,0,0,"{'neg': 0.0, 'neu': 0.906, 'pos': 0.094, 'comp...",0.6705
1,3,visited friday night pm parking mayhem plaza p...,0,0,0,"{'neg': 0.052, 'neu': 0.794, 'pos': 0.153, 'co...",0.9793
2,4,restaurant specializes noodle soup provide lis...,0,0,0,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...",0.9674
3,4,love noodle soup deer garden well pretty much ...,0,0,2,"{'neg': 0.0, 'neu': 0.421, 'pos': 0.579, 'comp...",0.9081
4,4,place known noodle fish soup customize soup ba...,0,0,0,"{'neg': 0.02, 'neu': 0.791, 'pos': 0.189, 'com...",0.9674


In [206]:
# Add a VADER 'polarity_compound' column to each restaurant's January-2017 reviews.
# Each cell of '2017_01_text' holds a DataFrame object, so assigning a column on
# the iterated object updates the stored frame directly. This avoids chained
# indexing and the intermediate column of dicts, and the cell can be re-run safely.
for jan_reviews in review_2017_jan['2017_01_text']:
    jan_reviews['polarity_compound'] = jan_reviews['text'].apply(
        lambda review_text: sid.polarity_scores(review_text)['compound'])

In [207]:
review_2017_jan['2017_01_text'][0]

Unnamed: 0,stars,text,useful,funny,cool,sentiment_score,polarity_compound
48,3,sister big fan noodle soup option msg way syst...,0,0,0,0.984308,0.0801
65389,4,kind place enjoy coming cold chilly outside wi...,0,0,0,0.986233,0.9201
127082,2,cleaner brighter larger restaurant thick noodl...,1,0,0,-0.982409,0.8016
182667,4,one place frequented bit past ownership name c...,0,0,0,0.981074,0.9939
275395,4,enjoy coming build noodle soup soup base mostl...,2,0,0,0.966202,0.802
284977,4,favourite soup noodle place toronto generous p...,0,0,0,0.990875,0.6124
360589,5,third visit far one favourite noodle shop trie...,0,0,0,0.999738,0.9245


### 2. Transformer

Sentiment Analysis, calculate sentiment scores:

In [126]:
restaurant_2016_text['text'][0:70]

0     there much say yelpers havent goto soup noodle...
1     visited friday night pm parking mayhem plaza p...
2     restaurant specializes noodle soup provide lis...
3     love noodle soup deer garden well pretty much ...
4     place known noodle fish soup customize soup ba...
                            ...                        
65    one go sushi place sashimi always fresh im lov...
66    osaka time dine order take well restaurant qui...
67    neat little japanese restaurant tucked lesser ...
68    great service great food price seem expensive ...
69    came based review yelp quite disappointed sash...
Name: text, Length: 70, dtype: object

In [128]:
from transformers import pipeline
sentiment = pipeline('sentiment-analysis', model='VictorSanh/roberta-base-finetuned-yelp-polarity')
## Score only the first `res_n` pooled 2016 reviews (individual reviews, not restaurants).
res_n = 70
texts_to_score = restaurant_2016_text['text'].iloc[:res_n].tolist()
sentiment_score = sentiment(texts_to_score, truncation=True, max_length=512)

Some weights of the model checkpoint at VictorSanh/roberta-base-finetuned-yelp-polarity were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [129]:
sentiment_score[0:10]

[{'label': 'LABEL_1', 'score': 0.994976818561554},
 {'label': 'LABEL_0', 'score': 0.9990818500518799},
 {'label': 'LABEL_1', 'score': 0.9986188411712646},
 {'label': 'LABEL_1', 'score': 0.9993802309036255},
 {'label': 'LABEL_1', 'score': 0.9789679646492004},
 {'label': 'LABEL_1', 'score': 0.9803270697593689},
 {'label': 'LABEL_0', 'score': 0.9999288320541382},
 {'label': 'LABEL_1', 'score': 0.9723057746887207},
 {'label': 'LABEL_1', 'score': 0.9988142251968384},
 {'label': 'LABEL_1', 'score': 0.9954903721809387}]

Transform sentiment scores:

In [109]:
def transform_sentiment_score(sentiment_score):
    """Convert classifier outputs into signed scores.

    ``sentiment_score`` is an iterable of dicts with ``'label'`` and ``'score'``
    keys. The score is negated when the label is ``'LABEL_0'`` and returned
    unchanged for any other label. Returns a list in input order.
    """
    return [
        -entry['score'] if entry['label'] == 'LABEL_0' else entry['score']
        for entry in sentiment_score
    ]

Weight each sentiment score based on "Useful" column:

In [119]:
restaurant_2016_text.head(res_n)

Unnamed: 0,stars,text,useful,funny,cool
10349,4,there much say yelpers havent goto soup noodle...,0,0,0
19989,3,visited friday night pm parking mayhem plaza p...,0,0,0
35512,4,restaurant specializes noodle soup provide lis...,0,0,0
48530,4,love noodle soup deer garden well pretty much ...,0,0,2
77778,4,place known noodle fish soup customize soup ba...,0,0,0
...,...,...,...,...,...
212169,5,one go sushi place sashimi always fresh im lov...,1,0,0
217436,3,osaka time dine order take well restaurant qui...,0,0,0
232161,3,neat little japanese restaurant tucked lesser ...,2,1,1
240466,4,great service great food price seem expensive ...,1,0,0


In [136]:
# Training rows for the transformer-based models: the first `res_n` pooled 2016 reviews.
# The explicit copy makes the column assignment below act on an independent frame
# instead of a slice, which is what raised SettingWithCopyWarning.
restaurant_2016_transformer = restaurant_2016_text.iloc[:res_n].copy()
# `sentiment_score` must be the pipeline output for these same `res_n` reviews.
restaurant_2016_transformer['sentiment_score'] = transform_sentiment_score(sentiment_score)

# sa_matrix = restaurant_2016_transformer[['business_id', 'sentiment_score', 'stars']]

# sa_matrix = restaurant_2016_text[['business_id', 'useful', 'funny', 'cool', 'sentiment_score', 'stars']].reset_index(drop=True)

# #sample from sa_matrix with equal number of samples in each "stars" group":
# sa_matrix['stars'].value_counts()

# balanced_sa_matrix = sa_matrix.groupby('stars')
# balanced_sa_matrix = pd.DataFrame(balanced_sa_matrix.apply(
#     lambda x: x.sample(balanced_sa_matrix.size().min()).reset_index(drop=True)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  restaurant_2016_transformer['sentiment_score'] = transform_sentiment_score(sentiment_score)


In [137]:
restaurant_2016_transformer.head()

Unnamed: 0,stars,text,useful,funny,cool,text_polarity,sentiment_score
0,4,there much say yelpers havent goto soup noodle...,0,0,0,"{'neg': 0.0, 'neu': 0.906, 'pos': 0.094, 'comp...",0.994977
1,3,visited friday night pm parking mayhem plaza p...,0,0,0,"{'neg': 0.052, 'neu': 0.794, 'pos': 0.153, 'co...",-0.999082
2,4,restaurant specializes noodle soup provide lis...,0,0,0,"{'neg': 0.0, 'neu': 0.777, 'pos': 0.223, 'comp...",0.998619
3,4,love noodle soup deer garden well pretty much ...,0,0,2,"{'neg': 0.0, 'neu': 0.421, 'pos': 0.579, 'comp...",0.99938
4,4,place known noodle fish soup customize soup ba...,0,0,0,"{'neg': 0.02, 'neu': 0.791, 'pos': 0.189, 'com...",0.978968


In [182]:
# Add a signed transformer 'sentiment_score' column to each restaurant's
# January-2017 reviews. Each cell of '2017_01_text' holds a DataFrame object,
# so assigning a column on the iterated object updates the stored frame directly.
# A separate name (`jan_scores`) is used so that the global `sentiment_score`
# holding the 2016 training scores is not overwritten.
for jan_reviews in review_2017_jan['2017_01_text']:
    jan_scores = sentiment(jan_reviews['text'].tolist(),
                           truncation=True, max_length=512)
    jan_reviews['sentiment_score'] = transform_sentiment_score(jan_scores)

In [185]:
review_2017_jan['2017_01_text'][0].head()

Unnamed: 0,stars,text,useful,funny,cool,sentiment_score
48,3,sister big fan noodle soup option msg way syst...,0,0,0,0.984308
65389,4,kind place enjoy coming cold chilly outside wi...,0,0,0,0.986233
127082,2,cleaner brighter larger restaurant thick noodl...,1,0,0,-0.982409
182667,4,one place frequented bit past ownership name c...,0,0,0,0.981074
275395,4,enjoy coming build noodle soup soup base mostl...,2,0,0,0.966202


In [28]:
# balanced_sa_matrix

Unnamed: 0_level_0,Unnamed: 1_level_0,useful,funny,cool,sentiment_score,stars
stars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,1,1,0,-0.999926,1
1,1,1,0,0,-0.999927,1
1,2,1,1,0,-0.998615,1
1,3,0,0,0,-0.999894,1
1,4,0,0,0,-0.999922,1
...,...,...,...,...,...,...
5,9,0,0,0,0.999838,5
5,10,1,0,2,0.999882,5
5,11,3,0,2,0.999891,5
5,12,0,0,0,0.999830,5


## Modeling

In [None]:
# Transformer-based training set: features are the signed sentiment score and the
# 'useful' count; target is the review's star rating.
# Note that restaurant_2016_transformer only holds the first `res_n` pooled 2016 reviews.
X_train_sen = restaurant_2016_transformer[['sentiment_score','useful']]
y_train_sen = restaurant_2016_transformer['stars']

In [None]:
# VADER-based training set: features are the compound polarity score and the
# 'useful' count; target is the review's star rating.
X_train_polar = restaurant_2016_vader[['polarity_compound','useful']]
y_train_polar = restaurant_2016_vader['stars']

### 1. Linear Regression

In [79]:
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score

In [250]:
## No train/test split is needed here: the model is fit on 2016 reviews
## and evaluated on January 2017.

linear_model = LinearRegression().fit(X_train_sen, y_train_sen)

# Predict a rating for each January-2017 review and average per restaurant.
y_pred_mean = [
    linear_model.predict(jan_reviews[['sentiment_score','useful']]).mean()
    for jan_reviews in review_2017_jan['2017_01_text']
]

r2_score(review_2017_jan['2017_01_rate'], y_pred_mean)

0.5043309610913794

In [251]:
# Same evaluation as the previous cell, using the VADER polarity feature instead.
linear_model_polar = LinearRegression().fit(X_train_polar, y_train_polar)

# Predict a rating for each January-2017 review and average per restaurant.
y_pred_mean = [
    linear_model_polar.predict(jan_reviews[['polarity_compound','useful']]).mean()
    for jan_reviews in review_2017_jan['2017_01_text']
]

r2_score(review_2017_jan['2017_01_rate'], y_pred_mean)

0.19683081178650574

### 2. XGBoost

In [225]:
import xgboost as xgb
from xgboost import XGBRegressor

In [248]:
# Gradient-boosted regressor on the transformer sentiment feature.
model = XGBRegressor(n_estimators=1000, max_depth=7)
clf_sen = model.fit(X_train_sen, y_train_sen)

# Predict a rating for each January-2017 review and average per restaurant.
y_pred_mean = [
    clf_sen.predict(jan_reviews[['sentiment_score','useful']]).mean()
    for jan_reviews in review_2017_jan['2017_01_text']
]

print(r2_score(review_2017_jan['2017_01_rate'], y_pred_mean))

0.3813557080628227


In [249]:
# Gradient-boosted regressor on the VADER polarity feature.
model = XGBRegressor(n_estimators=1000, max_depth=7)
clf_polar = model.fit(X_train_polar, y_train_polar)

# Predict a rating for each January-2017 review and average per restaurant.
y_pred_mean = [
    clf_polar.predict(jan_reviews[['polarity_compound','useful']]).mean()
    for jan_reviews in review_2017_jan['2017_01_text']
]

# The stored predictions come from the VADER-based model fitted in this cell.
review_2017_jan['2017_01_pred_xgboost'] = y_pred_mean
print(r2_score(review_2017_jan['2017_01_rate'], y_pred_mean))

0.08274422750529409
