In [1]:
import numpy as np
import time
import spacy
import pandas as pd
import seaborn as sb
from tqdm import tqdm
import matplotlib.pyplot as plt
from spacy import displacy
from nltk.stem.snowball import SnowballStemmer
import math
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm
from sklearn.datasets import load_svmlight_file
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold
import sklearn.preprocessing as preprocessing
from readability import Readability
import enchant
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.ensemble import VotingClassifier

In [2]:
###Do Tokenization and stemming###
DATA_PATH = '/home/wangbo/桌面/sampled_data.csv'

data = pd.read_csv(DATA_PATH)
data.rename(columns={'Unnamed: 0':'index'}, inplace=True)
print(data.head())
# print(data.columns)

nlp = spacy.load('en_core_web_sm')

def tokenizeReviews(data):
    token_nums = []
    stem_nums = []
    start_time = time.time()
    reviews = data['text'].values
    total_len = data.shape[0]
    stemmer = SnowballStemmer('english')
    print('Total reviews in this 200 business sub-dataset: {}'.format(total_len))
    pro_bar = tqdm(total=total_len)
    for doc in nlp.pipe(reviews, batch_size=200, n_process=8, disable=["parser", "ner", "textcat"]):
        _token = set()
        _stem = set()
        for token in doc:
            _token.add(token.text)
            _stem.add(stemmer.stem(token.text))
        token_nums.append(len(_token))
        stem_nums.append(len(_stem))
        pro_bar.update(1)
    pro_bar.close()
    data['token_nums'] = token_nums
    data['stem_nums'] = stem_nums
    assert isinstance(data, pd.DataFrame)
    data.to_csv('/home/wangbo/桌面/token_stemming.csv')

   index               review_id                 user_id  \
0    524  8bTpgBiYAmuveHF_avX0SA  8vIK6ndl8yzIdmSDnGp0tw   
1    602  PdqGgSuZcgPwIpPI5AyPtw  H9d0x9EuU63KPeRb2gCHfQ   
2    627  8Xy5yRMsUSPvNv_lpCHsEg  aYt6yemtPLX1RbX7UT6KCw   
3   1472  e3R37IGpvCvqWt0eWlhnbQ  2lmBAfQWAI06ziEfN5NkCQ   
4   1743  8zDbR_DxgxFPN2EMTGKkdw  RdMD77Uwe8Mjy_EptbPa3g   

              business_id  stars  useful  funny  cool  \
0  nQjuBVbcRFyX3trnuf_8CQ    3.0       0      0     0   
1  33N-oL25LxumJNft-ULXTA    3.0       0      0     0   
2  nQjuBVbcRFyX3trnuf_8CQ    4.0       0      0     0   
3  OGNv45615-ni06lUvkV62w    2.0       2      0     0   
4  Ul0EYexHUOptT9L2GUZJug    5.0       1      1     2   

                                                text                 date  
0  Notes: 1 visit, late lunch.\nOverall: Food had...  2010-01-08 03:17:23  
1  Used to be really good with previous owner. Si...  2017-08-28 20:45:03  
2  Being in Vegas for nearly 1 week, I was cravin...  2011-03-20 04:

In [3]:
#####Find business ID with most review#####
data=pd.read_csv('/home/wangbo/桌面/token_stemming.csv')
data['business_id'].value_counts().max()
data['business_id'].value_counts().idxmax()
business_id1 = data['business_id']=='igHYkXZMLAc9UdV5VnR_AA'
business_id1
data1=data[business_id1]
data1["business_id"]
data1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3086 entries, 6045 to 9855
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   3086 non-null   int64  
 1   index        3086 non-null   int64  
 2   review_id    3086 non-null   object 
 3   user_id      3086 non-null   object 
 4   business_id  3086 non-null   object 
 5   stars        3086 non-null   float64
 6   useful       3086 non-null   int64  
 7   funny        3086 non-null   int64  
 8   cool         3086 non-null   int64  
 9   text         3086 non-null   object 
 10  date         3086 non-null   object 
 11  token_nums   3086 non-null   int64  
 12  stem_nums    3086 non-null   int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 337.5+ KB


In [4]:
#####Cut review with count<30 & >200####
text1 = (data1['stem_nums']<200)& (data1['stem_nums']>30)
data2=data1[text1]
data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2418 entries, 6045 to 9855
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Unnamed: 0   2418 non-null   int64  
 1   index        2418 non-null   int64  
 2   review_id    2418 non-null   object 
 3   user_id      2418 non-null   object 
 4   business_id  2418 non-null   object 
 5   stars        2418 non-null   float64
 6   useful       2418 non-null   int64  
 7   funny        2418 non-null   int64  
 8   cool         2418 non-null   int64  
 9   text         2418 non-null   object 
 10  date         2418 non-null   object 
 11  token_nums   2418 non-null   int64  
 12  stem_nums    2418 non-null   int64  
dtypes: float64(1), int64(7), object(5)
memory usage: 264.5+ KB


In [5]:
#####Use Useful index as ylabel for further prediction#####
useful_nums = data2['useful'].value_counts()
i=0
data2['ylabel']=0
while i<=2417:
    if data2['useful'].iloc[i]>0:
        data2['ylabel'].iloc[i]=1.0
    else:
        data2['ylabel'].iloc[i]=0.0
    i+=1
ylabel_num=data2['ylabel'].value_counts()
ylabel_num
data2.to_csv('data2.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['ylabel']=0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['ylabel'].iloc[i]=0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['ylabel'].iloc[i

In [6]:
### Get ADJ, Verb and Noun Number for review######
import numpy as np
import pandas as pd
import math
from tqdm import tqdm
import spacy

nlp = spacy.load('en_core_web_sm')
####Get ADJ#####
def get_adj(review_list):
    review_text = review_list['text'].values
    total_len = len(review_list)
    print('len of review list', total_len)
    adj_list=[]
    pbar = tqdm(total=total_len)
    for doc in nlp.pipe(review_text, batch_size=400, n_process=16):
        adj=''
        for token in doc:
            if(token.pos_ is 'ADJ' and (token.dep_ is 'amod' or token.dep_ is 'acomp')): #amod: adjectival modifier
                adj+=token.lemma_                                                        #acomp: adjectival complement
                adj+=' '
        adj=adj.strip(' ')
        adj_list.append(adj)
        pbar.update(1)
    pbar.close()
    adj_series=pd.Series(adj_list)
    review_list.insert(0,'adj',adj_series)
    review_list.to_csv('adj_output.csv')
#####Get Verb#####
def get_verb(review_list):
    review_text = review_list['text'].values
    total_len = len(review_list)
    print('len of review list', total_len)
    verb_list=[]
    pbar = tqdm(total=total_len)
    for doc in nlp.pipe(review_text, batch_size=400, n_process=16):
        verb=''
        for token in doc:
            if token.pos_ is 'VERB':
                verb+=token.lemma_                                                        
                verb+=' '
        verb=verb.strip(' ')
        verb_list.append(verb)
        pbar.update(1)
    pbar.close()
    verb_series=pd.Series(verb_list)
    review_list.insert(0,'verb',verb_series)
#####Get Noun#####
def get_noun(review_list):
    review_text = review_list['text'].values
    total_len = len(review_list)
    print('len of review list', total_len)
    noun_list=[]
    pbar = tqdm(total=total_len)
    for doc in nlp.pipe(review_text, batch_size=400, n_process=16):
        noun=''
        for token in doc:
            if token.pos_ is 'NOUN':
                noun+=token.lemma_                                                        
                noun+=' '
        noun=noun.strip(' ')
        noun_list.append(noun)
        pbar.update(1)
    pbar.close()
    noun_series=pd.Series(noun_list)
    review_list.insert(0,'noun',noun_series)
if __name__ == '__main__':
    review_list = pd.read_csv('data2.csv')
    get_adj(review_list)
    get_verb(review_list)
    get_noun(review_list)

  if(token.pos_ is 'ADJ' and (token.dep_ is 'amod' or token.dep_ is 'acomp')): #amod: adjectival modifier
  if(token.pos_ is 'ADJ' and (token.dep_ is 'amod' or token.dep_ is 'acomp')): #amod: adjectival modifier
  if(token.pos_ is 'ADJ' and (token.dep_ is 'amod' or token.dep_ is 'acomp')): #amod: adjectival modifier
  if token.pos_ is 'VERB':
  if token.pos_ is 'NOUN':
  0%|          | 0/2418 [00:00<?, ?it/s]

len of review list 2418


100%|██████████| 2418/2418 [00:18<00:00, 127.60it/s]
  0%|          | 0/2418 [00:00<?, ?it/s]

len of review list 2418


100%|██████████| 2418/2418 [00:23<00:00, 102.72it/s]
  0%|          | 0/2418 [00:00<?, ?it/s]

len of review list 2418


100%|██████████| 2418/2418 [00:22<00:00, 108.92it/s]


In [8]:
#### ADD ADJ, VERB, NOUN Words COUNT#####
review_list['adj_count']=review_list.adj.apply(lambda x: len(str(x).split(' ')))
review_list['verb_count']=review_list.verb.apply(lambda x: len(str(x).split(' ')))
review_list['noun_count']=review_list.noun.apply(lambda x: len(str(x).split(' ')))
review_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2418 entries, 0 to 2417
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   noun          2418 non-null   object 
 1   verb          2418 non-null   object 
 2   adj           2418 non-null   object 
 3   Unnamed: 0    2418 non-null   int64  
 4   Unnamed: 0.1  2418 non-null   int64  
 5   index         2418 non-null   int64  
 6   review_id     2418 non-null   object 
 7   user_id       2418 non-null   object 
 8   business_id   2418 non-null   object 
 9   stars         2418 non-null   float64
 10  useful        2418 non-null   int64  
 11  funny         2418 non-null   int64  
 12  cool          2418 non-null   int64  
 13  text          2418 non-null   object 
 14  date          2418 non-null   object 
 15  token_nums    2418 non-null   int64  
 16  stem_nums     2418 non-null   int64  
 17  ylabel        2418 non-null   float64
 18  adj_count     2418 non-null 

In [9]:
###################################################################################################
## Step 2: compute indivativeness and get top-10 most indicative adjectives for each rating star ##
###################################################################################################

# the probability of observing word in all reviews
def compute_prob_in_all(adj, word): 
    word_num = 0
    for i in adj:
        if(word in i):
            word_num += 1
    prob_in_all = word_num / len(adj)
    return prob_in_all

# the probability of observing word in all reviews with rating star
def compute_prob_in_stars(adj, stars, current_star, word): 
    word_num = 0
    stars_num = 0
    for i in range(len(adj)):
        if(stars[i] == current_star):
            stars_num += 1
            if(word in adj[i]):
                word_num += 1
    prob_in_stars = word_num / stars_num
    return prob_in_stars

reviews = pd.read_csv('adj_output.csv')
adj_reviews = reviews['adj'].values

adj = [] # each line represents adjectives extracted from each review
for i in adj_reviews:
    if(pd.isnull(i)):
        adj.append([])
    else:
        adj.append(i.split(' '))

all_adj_word = set()  # all non-repeated adjectives that appear in reviews
for i in adj:
    for j in i:
        all_adj_word.add(j)

stars = reviews['stars'].values
all_stars=[1.0,2.0,3.0,4.0,5.0]

# compute indivativeness and get top-10 results
for current_star in all_stars:
    word_IA = dict() # key:word value:IA(indicative adjectives)
    for word in all_adj_word:
        prob_in_stars = compute_prob_in_stars(adj, stars, current_star, word)
        prob_in_all = compute_prob_in_all(adj, word)
        if(prob_in_stars == 0):
            prob_indicative = 0
        else:
            prob_indicative = prob_in_stars * math.log(prob_in_stars / prob_in_all)
        word_IA[word] = prob_indicative
    result = sorted(word_IA.items(), key=lambda d:d[1], reverse=True)
    print('top-10 most indicative adjectives for star', current_star, ':')
    print(result[0:10],'\n')

top-10 most indicative adjectives for star 1.0 :
[('bad', 0.3349702275150748), ('cold', 0.2622636536469612), ('terrible', 0.20721020153997782), ('rude', 0.16097466149000178), ('horrible', 0.15203793554082995), ('ready', 0.13486744561065778), ('wrong', 0.12711965558701296), ('available', 0.12529620624176843), ('rare', 0.12029021636907426), ('cooked', 0.10787581795771095)] 

top-10 most indicative adjectives for star 2.0 :
[('cold', 0.21429655300961564), ('slow', 0.19222949289018987), ('disappointing', 0.1891926608774909), ('bad', 0.17802966894431), ('many', 0.1753209711653264), ('horrible', 0.16493433372145236), ('raw', 0.13207620667760558), ('few', 0.11967811264724608), ('bland', 0.11569061903964695), ('well', 0.11035205903787124)] 

top-10 most indicative adjectives for star 3.0 :
[('ok', 0.1709997132094282), ('good', 0.15434128813254755), ('nice', 0.13735409872116927), ('same', 0.1095573134485608), ('mediocre', 0.10212780202347821), ('bland', 0.0995592122031467), ('busy', 0.096471244

In [10]:
###indicative adjective counts###
star1=['bad','cold','terrible','rude','horrible','ready','wrong','avaiable','rare','cooked']
star2=['cold','slow','disappointing','bad','many','horrible','raw','few','bland','well']
star3=['ok','good','nice','same','mediocre','bland','busy','average','cold','disappointed']
star4=['good','little','nice','only','small','great','large','first','quick','open']
star5=['amazing','favorite','delicious','perfect','great','wonderful','incredible','fantastic','top','beautiful']
review_list['adj_new']=review_list.adj.apply(lambda x: str(x).split(' '))
i=0
review_list['ind_adj']=0
while i<2418:
    if review_list['stars'].iloc[i]==1.0:
        for x in review_list['adj_new'].iloc[i]:
            if x in star1:
                review_list['ind_adj'].iloc[i]+=1
                
    elif review_list['stars'].iloc[i]==2.0:
        for x in review_list['adj_new'].iloc[i]:
            if x in star2:
                review_list['ind_adj'].iloc[i]+=1
    
    elif review_list['stars'].iloc[i]==3.0:
        for x in review_list['adj_new'].iloc[i]:
            if x in star3:
                review_list['ind_adj'].iloc[i]+=1     
                
    elif review_list['stars'].iloc[i]==4.0:
        for x in review_list['adj_new'].iloc[i]:
            if x in star4:
                review_list['ind_adj'].iloc[i]+=1
    
    elif review_list['stars'].iloc[i]==5.0:
        for x in review_list['adj_new'].iloc[i]:
            if x in star5:
                review_list['ind_adj'].iloc[i]+=1
                
    i+=1
review_list['ind_adj']

0       2
1       2
2       2
3       3
4       2
       ..
2413    3
2414    3
2415    0
2416    1
2417    1
Name: ind_adj, Length: 2418, dtype: int64

In [11]:
####Flesh Kincaid Readability Test####
def FleschKincaidTest(text):
	score = 0.0
	if len(text) > 0:
		score = (0.39 * len(text.split()) / len(text.split('.')) ) + 11.8 * ( sum(list(map(lambda x: 1 if x in ["a","i","e","o","u","y","A","E","I","O","U","y"] else 0,text))) / len(text.split())) - 15.59
		return score if score > 0 else 0
i=0
review_list['Readability']=0
while i<=2417:
    review_list['Readability'].iloc[i]=FleschKincaidTest(review_list['text'].iloc[i])
    i+=1
review_list['Readability']

0       10.941576
1       10.747027
2        8.727809
3        8.338019
4       13.749167
          ...    
2413    10.779925
2414    10.249024
2415    10.859082
2416     8.968662
2417    11.782741
Name: Readability, Length: 2418, dtype: float64

In [12]:
###wrong words counts in review###
d = enchant.Dict("en_US")
review_list['word']=review_list.text.apply(lambda x: str(x).split(' '))
i=0
review_list['notindic']=0
while i<=2417:
    for x in review_list['word'].iloc[i]:
        if len(x)==0:
            continue
        elif d.check(x)==False:
            review_list['notindic'].iloc[i]+=1
    i+=1
review_list['notindic']

0        8
1        0
2        7
3        4
4       12
        ..
2413     2
2414     8
2415    10
2416     2
2417    20
Name: notindic, Length: 2418, dtype: int64

In [13]:
review_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2418 entries, 0 to 2417
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   noun          2418 non-null   object 
 1   verb          2418 non-null   object 
 2   adj           2418 non-null   object 
 3   Unnamed: 0    2418 non-null   int64  
 4   Unnamed: 0.1  2418 non-null   int64  
 5   index         2418 non-null   int64  
 6   review_id     2418 non-null   object 
 7   user_id       2418 non-null   object 
 8   business_id   2418 non-null   object 
 9   stars         2418 non-null   float64
 10  useful        2418 non-null   int64  
 11  funny         2418 non-null   int64  
 12  cool          2418 non-null   int64  
 13  text          2418 non-null   object 
 14  date          2418 non-null   object 
 15  token_nums    2418 non-null   int64  
 16  stem_nums     2418 non-null   int64  
 17  ylabel        2418 non-null   float64
 18  adj_count     2418 non-null 

In [14]:
####Feature Engineering####
scaler = preprocessing.StandardScaler()
stem_nums_param = scaler.fit(review_list[['stem_nums']])
review_list['Scaled_stem_nums'] = scaler.fit_transform(review_list[['stem_nums']], stem_nums_param)
verb_count_param = scaler.fit(review_list[['verb_count']])
review_list['Scaled_verb_count'] = scaler.fit_transform(review_list[['verb_count']], verb_count_param)
noun_count_param = scaler.fit(review_list[['noun_count']])
review_list['Scaled_noun_count'] = scaler.fit_transform(review_list[['noun_count']], noun_count_param)
ind_adj_param = scaler.fit(review_list[['ind_adj']])
review_list['Scaled_ind_adj'] = scaler.fit_transform(review_list[['ind_adj']], ind_adj_param)
Readability_param = scaler.fit(review_list[['Readability']])
review_list['Scaled_Readability'] = scaler.fit_transform(review_list[['Readability']], Readability_param)
notindic_param = scaler.fit(review_list[['notindic']])
review_list['Scaled_notindic'] = scaler.fit_transform(review_list[['notindic']], notindic_param)
review_list.drop(['stem_nums','verb_count','noun_count','ind_adj','Readability','notindic'],axis=1, inplace=True)
review_list.drop(['review_id','user_id','business_id','stars','noun','verb','adj'],axis=1, inplace=True)
review_list.drop(['index','useful','funny','cool','text','date','token_nums'],axis=1, inplace=True)
review_list.drop(['adj_count','adj_new','word','Unnamed: 0', 'Unnamed: 0.1'],axis=1, inplace=True)
review_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2418 entries, 0 to 2417
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ylabel              2418 non-null   float64
 1   Scaled_stem_nums    2418 non-null   float64
 2   Scaled_verb_count   2418 non-null   float64
 3   Scaled_noun_count   2418 non-null   float64
 4   Scaled_ind_adj      2418 non-null   float64
 5   Scaled_Readability  2418 non-null   float64
 6   Scaled_notindic     2418 non-null   float64
dtypes: float64(7)
memory usage: 132.4 KB


In [15]:
###Classifier Model Traning###
y = review_list['ylabel']
X = review_list.drop('ylabel',axis=1)
kfold=StratifiedKFold(n_splits=10)
classifiers=[]
classifiers.append(SVC())
classifiers.append(DecisionTreeClassifier())
classifiers.append(RandomForestClassifier(random_state=1))
classifiers.append(GradientBoostingClassifier(random_state=1))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression())
classifiers.append(AdaBoostClassifier())
classifiers.append(GaussianNB())
for classifier in classifiers:
    cv_results = (cross_val_score(classifier,X,
                                  y,scoring='accuracy'
                                  ,cv=kfold,n_jobs=-1))
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % 
          ((cv_results.mean(), cv_results.std(), classifier)))

Accuracy: 0.6667 (+/- 0.0187) [SVC()]
Accuracy: 0.5778 (+/- 0.0292) [DecisionTreeClassifier()]
Accuracy: 0.6381 (+/- 0.0202) [RandomForestClassifier(random_state=1)]
Accuracy: 0.6576 (+/- 0.0238) [GradientBoostingClassifier(random_state=1)]
Accuracy: 0.6460 (+/- 0.0291) [KNeighborsClassifier()]
Accuracy: 0.6716 (+/- 0.0199) [LogisticRegression()]
Accuracy: 0.6655 (+/- 0.0327) [AdaBoostClassifier()]
Accuracy: 0.6564 (+/- 0.0321) [GaussianNB()]


In [16]:
###hyper parameter adjustment###
#Logistic Regression
modelLR=LogisticRegression()
LR_param_grid = {'C' : [1,2,3],
                'penalty':['l1','l2']}
modelgsLR = GridSearchCV(modelLR,param_grid = LR_param_grid, cv=kfold, 
                                     scoring="accuracy", n_jobs= -1, verbose = 1)
modelgsLR.fit(X,y)
print('modelgsLR score is：%.4f'%modelgsLR.best_score_)

Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.2s finished


modelgsLR score is：0.6716


In [17]:
#GBC###
def fit_model(alg,parameters):
    scorer=make_scorer(roc_auc_score)
    grid = GridSearchCV(alg,parameters,scoring=scorer,cv=5)
    grid=grid.fit(X,y)
    print (grid.best_params_)
    return grid
alg6=GradientBoostingClassifier(random_state=1)
parameters6_1 = {'n_estimators':[100,200,300]}
parameters6_2 = {'max_depth':[4,8]}
parameters6_3 = {'min_samples_leaf': [100,150],'max_features': [0.3, 0.1]}
parameters6_4 = {'learning_rate': [0.1, 0.05, 0.01]}
clf6_m1=fit_model(alg6,parameters6_1)

{'n_estimators': 100}


In [18]:
alg6=GradientBoostingClassifier(n_estimators=100,random_state=1)
clf6_m2=fit_model(alg6,parameters6_2)

{'max_depth': 4}


In [19]:
alg6=GradientBoostingClassifier(n_estimators=100,max_depth=4,random_state=1)
clf6_m3=fit_model(alg6,parameters6_3)

{'max_features': 0.3, 'min_samples_leaf': 100}


In [20]:
alg6=GradientBoostingClassifier(n_estimators=100,max_depth=4,min_samples_leaf=100,max_features=0.3,random_state=1)
clf6_m4=fit_model(alg6,parameters6_4)

{'learning_rate': 0.1}


In [21]:
cv_results = (cross_val_score(alg6,X,y,scoring='accuracy',cv=kfold,n_jobs=-1))
print("Accuracy: %0.4f (+/- %0.4f)" % ((cv_results.mean(), cv_results.std())))

Accuracy: 0.6696 (+/- 0.0281)


In [22]:
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(random_state=2)
clf3 = GradientBoostingClassifier(random_state=1)
clf4 = SVC()
clf5 = AdaBoostClassifier()
clf6 = KNeighborsClassifier()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), 
                                    ('gnb', clf3), ('svc',clf4), ('Ada',clf5), 
                                    ('KNN', clf6)], voting='hard')
for clf, label in zip([clf1, clf2, clf3, clf4, clf5, clf6, eclf], 
                      ['Logistic Regression', 'rf', 'GradientBoost', 'svc'
                       , 'Ada', 'KNN', 'Ensemble']):
    scores = cross_val_score(clf,X,y,
                             scoring='accuracy',cv=kfold,n_jobs=-1)
    print("Accuracy: %0.4f (+/- %0.4f) [%s]" % 
          (scores.mean(), scores.std(), label))

Accuracy: 0.6716 (+/- 0.0199) [Logistic Regression]
Accuracy: 0.6369 (+/- 0.0219) [rf]
Accuracy: 0.6576 (+/- 0.0238) [GradientBoost]
Accuracy: 0.6667 (+/- 0.0187) [svc]
Accuracy: 0.6655 (+/- 0.0327) [Ada]
Accuracy: 0.6460 (+/- 0.0291) [KNN]
Accuracy: 0.6704 (+/- 0.0217) [Ensemble]


In [23]:
###Final Result####
clf1.fit(X,y)
preData=clf1.predict(X)
preData=preData.astype(int)
LRpreResultDf=pd.DataFrame()
LRpreResultDf['text']=data2['text']
LRpreResultDf['helpfulness']=preData
LRpreResultDf
i=0
LRpreResultDf['ranking']=0
while i <=2417:
    LRpreResultDf['ranking'].iloc[i]=review_list['Scaled_ind_adj'].iloc[i]-review_list['Scaled_Readability'].iloc[i]-review_list['Scaled_notindic'].iloc[i]
    i+=1
helpfulness1 = LRpreResultDf['helpfulness']==1
helpfulness1
LRpreResultDf1=LRpreResultDf[helpfulness1]
final_LRpreResultDf= LRpreResultDf1.sort_values(by=['ranking'], ascending=False)
pd.set_option('display.max_colwidth', None)
final_LRpreResultDf.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,text,helpfulness,ranking
6418,"One of the best things about this place is the pricing compared to steakhouses on the on the Strip. Its also a great place in that it's a steakhouse but it also has other items, great for couples where one enjoys a good steak and the other is less of a carnivore!\n\nThe service was great and the hostess immediately asked if it was our first time or if we were celebrating something special. Yes to the first, no to the second. But that information was clearly passed on, as our waited and a manager-type both acknowledged that we had not been in before and took the time to explain the menu/food/ect. So there is a long line of communication going on between employees and that's a good thing. \n\nNow to the food: \n\nShort-rib grilled cheese - Order it! Just do it. It comes with thick bread, egg battered and then grilled with tender, savory short-rib meat, gruyere cheese and white cheddar. It's savory, flavorful and filling. \n\nRibeye - I'm not much of a steak eater, but I enjoyed the seasoning and flavors of this steak and the way it was cooked. \n\nGarlic Chips - come with each entree and are the classic home-made style of kettle cooked potato chips. They are salty and garlicky and good. \n\nMac and cheese - pretty standard. Creamy with breadcrumbs on the top. Not the best mac and cheese you've ever had, but still good. \n\nBrocillini - pretty basic, but good. \n\nLamb cigars - almost like an eggroll, but with lamb filling. Unique but worth trying. I could have done with out the wrapping and just had the strips of meat, but the eggroll aspect did give it a unique flavor and style that I haven't had before. \n\nBaked brie - I'd probably skip next time. I love cheeses but this brie did not have much flavor and the bread was over cooked. Just wasn't my favorite. \n\nEach of those items plus a bottle of wine and the bill was about $150 which is a nice price when you compare it to what you would pay at a steakhouse on the Strip. The menu is large enough that you could go back several times and never order the same thing. The service was great, food was high quality and well prepared and the atmosphere was nice.",1,3.746124
8700,"I most recently went here for dinner to celebrate my friend's birthday. This is one of those restaurants that is just ok. An average restaurant. The food is not bad, but not too memorable either. I've been here a couple times before for brunch. The reason I go for brunch here is not because of the food, but because they have bottomless mimosas (for $16. Bottomless bloody mary for $22) and they're not stingy with the champagne :) Also, the place is nice and easily accessible, so it makes a good place to meet up with friends. I tried their short rib hash and steakhouse scramble for brunch. Both were ok. Not special enough for me to want to order again. I do really like their portobello fries! \n\nFor my friend's birthday dinner, I shared a 48 Oz. tomahawk steak with 3 other people. I think the steak was cooked a bit too charred on the outside, but the inside was a nice medium. However, the steak was bland and not tender. It was so bland that I had to eat it with a sauce. I recommend the brandied mushroom and the red wine demi (they offer 7 different sauces). The fact that I had to eat the steak with a sauce is quite sad. To me, a sign of a good steak is when you can eat it by itself and you can savor that juicy meaty taste alone, not covered by some sauces. For sides, I tried the cauliflower (this is good), brussels sprouts (just ok), and mashes potatoes (again, just ok). I also tried the steak tartare for appetizer. Nothing wrong with it, but I've had better. Service was kind of on the slow side that night. \n\nSide notes: you can bring your own bottles of wine and they don't charge a corkage fee. If you have a big party, they automatically add a 20% tip. \n\nOverall, this is a nice place away from the strip to have some drinks and appetizers (portobello fries!). If you're looking for a good steak though, there are other local places that offer better tasting steaks.",1,2.902208
7551,"Been here three times and all three times I really enjoyed it. First time, mainly because I was with my two awesome girlfriends I have not seen in years and we always manage to order the best dishes together and have a blast. \n\nOther two times were with my man and the dog and well.. I could be eating a frozen chicken drumstick in an NYC subway stop and have the most awesomest time ever. \n\nOther than great company, the food is also pretty good. I love a good off the strip steakhouse.\n\nThey used to have this gazpacho that is just ridiculous. It's off of the menu now. I recently \ne-mailed the GM asking for the recipe and they gave it to us! My love and I had an amazing time preparing it at home. Got loaded on some first class whites and enjoyed this amazing, cold and crisp treat. Just perfect. Thank you guys so so much!! I made two gallons of it, probably. Gazpacho for days, people.\n\nThe fried spinach salad is pretty amazing. Great texture paired with tangy and sweet Asian vinaigrette.\n\nThe grits and pork belly is just amazing. My steak was as pretty damn good too. Everything was lovely except the ceviche.... Interesting flavor, off putting color. Did not care for it, but only because I was craving the traditional chopped shrimp, onion, cilantro, lime situation. \n\nSit on the patio, the evening and nightfall are great :-)",1,2.600994
7256,"The ambiance is lovely and many of the dishes above average but the service was so terrible, it can guarantee you'll have a bad night ....so why bother?\n\nEverything started off fine even though the wait for a table ended up not being the '10 minutes"" they promised but well over 30 minutes. They sit you down and get your order and everything seems fine. And then you wait.\n\nOver 45 minutes later, without a peep from our waitress to check in and let us know what is going, then dump food on our table. I say ""dump"" because we had ordered several courses - soups, salad and main dishes and side dishes andmost of it came out at once...other than the soup and a veggie dish, which we then had to wait another 15 minutes for. How the heck does the soup come out last?! The worst part about it is that most of the dishes were cold. \n\nRather than serving us dishes as they were ready, it was obvious they waited or forgot about our order for a very long time.\n\nWe mentioned our frustration to the waitress in a nice way and she said the kitchen was backed up. Then management pops up while we are still waiting for some of our food and asking how is everything. We mention having some issues with the wait and he makes a snarky comment rather than apologizing or trying to make things a bit better. We weren't at all rude but he handled it all very badly. \n\nI think the popularity of the restaurant has made management think that it's ok to treat people badly and that they can afford to lose a few customers. I was appalled and will never go back.\n\nIt was one of the worst evenings we've had in a long time. We can't suggest anything other than just avoid the place until they work out the service kinks unless you are in the mood to take the risk of really bad and rude service.",1,2.562474
6402,So I went here on a date recently.. the atmosphere was perfect for a romantic date! the host was very friendly and sweet. We walked up the stairs to a nice table by the windows. its far enough from other tables that you feel like you can have a nice conversation with the person you're with. The overall feel of the place is classy yet relaxing feel to it as well. The waiter we got was very attentive and was awesome. He was an older bald gentleman.. don't remember his name though. He seemed like he worked there for a while and knows his stuff. My date brought a couple of bottle of wines and the waiter was awesome at making sure the wine was poured into the glass decanter to bring out the flavor of the wine and he even poured it back into the bottle when we couldn't finish it all before we left. Service was amazing! top notch\n\nSo for starters we ordered the bone marrow carne asada.. my dates favorite dish! It was alright for me.. we also had Maryland lump crab cake as well.. which was tasty but nothing amazing. Just a normal crab cake. \n\nSo for dinner my date ordered the ribeye cap for $30 and I ordered the bavette which is supposed to be american kobe.. for $28. Price point for steaks is not bad.. the sauces they have for the steaks are pretty tasty too! So.. as far as my steak goes.. I was expecting a soft filet type of kobe steak... the steak I got was on the chewy side and definitely can't be cut with a fork.. I was not blown away by this steak.. I was expecting a better tasting steak with a better cut for what it was... maybe I should have ordered a the ribeye cap??.. which might of been more flavorful.. we also ordered sides too.. the mac and cheese.. was so good! as well as broccolini i believe... \n\nOverall.. I would like to come back again.. this is a perfect date spot or a nice restaurant to come to with friends.. Service is great.. and food is good. I would recommend!,1,2.286715
