In [2]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn import preprocessing
import numpy as np
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
import pandas as pd
import nltk
import random
from sklearn.externals import joblib
from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,GradientBoostingClassifier
import re
import time
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn import decomposition
from nltk.stem.wordnet import WordNetLemmatizer

## Read Data

Read the data from 6 csv file, each corresponds to data of one place.

ny: New York mi: Miami bf: Buffalo fw: Fort Worth ys: Yellow Stone National Park gc: Grand Canyon

In [3]:
ny = pd.read_csv('ny_new.csv')
mi = pd.read_csv('mi_new.csv')
bf = pd.read_csv('bf_new.csv')
fw = pd.read_csv('fw_new.csv')
ys = pd.read_csv('ys_new.csv')
gc = pd.read_csv('gc_new.csv')

ny = shuffle(ny)
mi = shuffle(mi)
bf = shuffle(bf)
fw = shuffle(fw)
ys = shuffle(ys)
gc = shuffle(gc)

## Data Processing

Divide the whole dataset into train set and test set, and define all the functions needed.

In [4]:
df = [ny[:17000],mi[:17000],bf[:6000],fw[:13000],ys[:8000],gc[:8000]]
sample = pd.concat(df)
df2 = [ny[17000:],mi[17000:],bf[6000:],fw[13000:],ys[8000:],gc[8000:]]
test_text = pd.concat(df2)

In [5]:
input_text = list(sample['text'])
rate = list(sample['overall_rate'])
test_input = list(test_text['text'])
test_rate = list(test_text['overall_rate'])

In [6]:
sent = []
for review in input_text:
    sent.append(len(sent_tokenize(review)))

In [7]:
sent2 = []
for review in test_input:
    sent2.append(len(sent_tokenize(review)))

In [8]:
def clean(text): #define function to clean the data set
    wordnet_lemmatizer = WordNetLemmatizer()
    theText = text.split()
    tokens = [re.sub(r'[^a-zA-Z!]+', ' ',word) for word in theText]
    words = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    words = [word.lower() for word in words if not word.isupper()] + [word for word in words if word.isupper()]
    tokens = " ".join(words)
    return tokens

In [9]:
def clean2(text): #define function to clean the data set
    wordnet_lemmatizer = WordNetLemmatizer()
    theText = text.split()
    tokens = [re.sub(r'[^a-zA-Z!]+', ' ',word) for word in theText]
    words = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
    words = [word.lower() for word in words if not word.isupper()] + [word for word in words if word.isupper()]
    return words

In [10]:
#define function to call the model
thePathLut = '/Users/AngelaChi/Documents/2017-9/projects files/'
theLUT = pd.read_csv(thePathLut + 'classifierLUT.csv',index_col=0)
def algoArray(theAlgo):     
    theAlgoOut = theLUT.loc[theAlgo,'functionCall']
    return theAlgoOut
def optFunc(theAlgo,theParams):
    theModel = theLUT.loc[theAlgo,'optimizedCall']
    tempParam = list()
    for key, value in theParams.iteritems(): 
        tempParam.append(str(key) + "=" + str(value)) 
    theParams = ",".join(tempParam)
    theModel = theModel + theParams + ")"
    return theModel

In [11]:
#define funtion to select certain kinds of words as features
def pos_select(input_list):
    total_pos = []
    for i in input_list:
        total_pos.append(nltk.pos_tag(i,tagset='universal'))
    total_words = []
    for words in total_pos:
        lis = []
        for i in words:
            if str(i[1]) == 'ADJ' or str(i[1]) == 'ADV':
                if len(str(i[0])) > 1:
                    lis.append(str(i[0]))
        total_words.append(lis)
    words = []
    for word in total_words:
        word2 = " ".join(word)
        words.append(word2)
    return words

In [12]:
#define funtion to select certain features
def gram_select(total_terms):
    one_gram = []
    two_gram = []
    three_gram = []
    for term in total_terms:
        if len(term[0].split()) == 1 and ((str(term[1]) == 'ADJ' or str(term[1]) == 'ADV') or str(term[0]).isupper()):
            one_gram.append(term)
        if len(term[0].split()) == 2 and (str(term[0].split()[0]) == 'no' or str(term[0].split()[0]) == 'not'):
            two_gram.append(term)
        if len(term[0].split()) == 3 and (str(term[0].split()[0]) == 'no' or str(term[0].split()[0]) == 'not'):
            three_gram.append(term)
    useful_word = one_gram + two_gram + three_gram
    useful_word = [str(a) for a in list(zip(*useful_word))[0]]
    return useful_word

In [13]:
#define funtion to train the model
def train(res, rate):
    theModels = ['RF','ABDT','BAG','GBC']
    theResults = pd.DataFrame(0,index=theModels,columns=['accuracy','confidence','runtime'])
    for theModel in theModels:
        startTime = time.time()
        model = eval(algoArray(theModel))
        print(model)
        #cross validation    
        cvPerf = cross_val_score(model,res,rate,cv=10) 
        theResults.loc[theModel,'accuracy'] = round(cvPerf.mean(),2)
        theResults.loc[theModel,'confidence'] = round(cvPerf.std() * 2,2)
        endTime = time.time()
        theResults.loc[theModel,'runtime'] = round(endTime - startTime,0)
    return theResults

In [14]:
#define function to perform grid-search on parameters
def grid_search(theResults, res, rate):
    modelChoice = theResults['accuracy'].idxmax()              
    startTime = time.time()
    model = eval(algoArray(modelChoice))
    grid = GridSearchCV(estimator=model, param_grid={"n_estimators": [10,30,50,100]})#eval(gridSearch(modelChoice))
    grid.fit(res,rate)
    bestScore = round(grid.best_score_,4)
    parameters = grid.best_params_
    endTime = time.time()
    print("Best Score: " + str(bestScore) + " and Grid Search Time: " + str(round(endTime - startTime,0)))
    return modelChoice, parameters

In [15]:
#define function to perform feature selection
def feature_selection(input_list, rate):
    vectorizer = CountVectorizer(min_df = 40, max_df = 0.8, ngram_range = (1,3))
    dtm = pd.DataFrame(vectorizer.fit_transform(input_list).toarray())
    names = vectorizer.get_feature_names()
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(dtm, rate)
    model_s = SelectFromModel(lsvc, prefit=True)
    X_new = pd.DataFrame(model_s.transform(dtm))
    lis = list(model_s.get_support())
    features = []
    for i in range(len(lis)):
        if lis[i] == True:
            features.append(names[i])
    X_new.columns = features
    return X_new, vectorizer, lsvc

In [16]:
#define function to read in emotion dictionary
def get_nrc_data():
    nrc = "NRC-emotion.txt"
    count=0
    emotion_dict=dict()
    with open(nrc,'r') as f:
        all_lines = list()
        for line in f:
            if count < 46:
                count+=1
                continue
            line = line.strip().split('\t')
            if int(line[2]) == 1:
                if emotion_dict.get(line[0]):
                    emotion_dict[line[0]].append(line[1])
                else:
                    emotion_dict[line[0]] = [line[1]]
    return emotion_dict

In [17]:
#define function to count numbers of words of different categories in review
def emotion_analyzer(text):
    #Set up the result dictionary
    emotions = {x for y in emotion_dict.values() for x in y}
    emotion_count = dict()
    for emotion in emotions:
        emotion_count[emotion] = 0
    for word in text.split():
        if emotion_dict.get(word):
            for emotion in emotion_dict.get(word):
                emotion_count[emotion] += 1.0
    return emotion_count

In [18]:
#define function to choose model
def model_choice(result,reducedTDM,rate):
    para = 0
    modelChoice = result['accuracy'].idxmax()
    if modelChoice == 'RF':
        model, para = grid_search(result, reducedTDM, rate)
    else:
        model = modelChoice
    return model, para

In [19]:
input_list = []
for i in input_text:
    input_list.append(clean(i))

In [20]:
test_list = []
for i in test_input:
    test_list.append(clean(i))

### Model 1

Use all words

In [20]:
vectorizer1 = TfidfVectorizer(max_features=1000,ngram_range=(1,3))
tdm1 = pd.DataFrame(vectorizer1.fit_transform(input_list).toarray())
tdm1.columns=vectorizer1.get_feature_names()
pca1 = decomposition.PCA(n_components=.95)
pca1.fit(tdm1)
reducedTDM1 = pd.DataFrame(pca1.transform(tdm1))

In [21]:
result1 = train(reducedTDM1, rate)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=50,
            verbose=0, warm_start=False)


In [22]:
startTime = time.time()
model, para = model_choice(result1,reducedTDM1,rate)
if para == 0:
    model = eval(algoArray(model))
else:
    model = eval(optFunc(model,para))
model.fit(reducedTDM1,rate)
joblib.dump(model, 'basic_model.pkl') #save model
endTime = time.time()
print("Model Save Time: " + str(round(endTime - startTime,0)))

Best Score: 0.5133 and Grid Search Time: 1325.0
Model Save Time: 1611.0


In [23]:
test1 = vectorizer1.transform(test_list)
X2_new1 = pca1.transform(test1.toarray())

In [24]:
accuracy_score(test_rate, model.predict(X2_new1))

0.50358518518518514

### Model 2

Only use adjective and adverbs in the input

In [25]:
input_list2 = []
for i in input_text:
    input_list2.append(list(clean2(i)))

In [26]:
words = pos_select(input_list2)

In [27]:
vectorizer2 = CountVectorizer(min_df = 40, max_df = 0.85, ngram_range = (1,1))
dtm2 = pd.DataFrame(vectorizer2.fit_transform(words).toarray())
dtm2.columns=vectorizer2.get_feature_names()
pca2 = decomposition.PCA(n_components=.95)
pca2.fit(dtm2)
reducedTDM2 = pd.DataFrame(pca2.transform(dtm2))

In [28]:
result2 = train(reducedTDM2, rate)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=50,
            verbose=0, warm_start=False)


In [29]:
startTime = time.time()
model, para  = model_choice(result2,reducedTDM2,rate)
if para == 0:
    model = eval(algoArray(model))
else:
    model = eval(optFunc(model,para))
model.fit(reducedTDM2,rate)
joblib.dump(model, 'adj_model.pkl') #save model
endTime = time.time()
print("Model Save Time: " + str(round(endTime - startTime,0)))

Best Score: 0.5202 and Grid Search Time: 1334.0
Model Save Time: 1631.0


In [30]:
test_list2 = []
for i in test_input:
    test_list2.append(clean2(i))

In [31]:
test_input = pos_select(test_list2)

In [32]:
test2 = vectorizer2.transform(test_input)
X2_new2 = pca2.transform(test2.toarray())
accuracy_score(test_rate, model.predict(X2_new2))

0.50761481481481485

### Model 3

Use features with one to three word, and then use SVC to select features, and eliminate some useless features by hand.

For features with only one word, only keep those that are adjectives or adverbs.

For features with two or three word, only keep those with 'not' or 'no'.

In [21]:
X_new, vectorizer3, lsvc1 = feature_selection(input_list, rate)
features = X_new.columns

In [22]:
total_terms1 = nltk.pos_tag(features,tagset='universal')
useful_word = gram_select(total_terms1)

In [23]:
len(useful_word)

308

In [24]:
df_feature1 = X_new[useful_word]
df_feature1 = df_feature1.as_matrix()

In [79]:
useful_word

['able',
 'absolutely',
 'ac',
 'actually',
 'affordable',
 'again',
 'almost',
 'already',
 'also',
 'always',
 'amaury',
 'amazing',
 'anniversary',
 'anywhere',
 'atmosphere',
 'attentive',
 'available',
 'average',
 'away',
 'awesome',
 'awful',
 'back',
 'bad',
 'barely',
 'bartender',
 'basic',
 'basically',
 'bath',
 'beautiful',
 'beautifully',
 'bell',
 'best',
 'better',
 'big',
 'bigger',
 'biggest',
 'birthday',
 'bottle',
 'brian',
 'bright',
 'broadway',
 'broken',
 'busy',
 'center',
 'central',
 'cheap',
 'choose',
 'classy',
 'clean',
 'clearly',
 'close',
 'closed',
 'comfortable',
 'complimentary',
 'conveniently',
 'convention',
 'courteous',
 'decent',
 'definitely',
 'delicious',
 'different',
 'difficult',
 'disappointing',
 'doe',
 'double',
 'downside',
 'downtown',
 'due',
 'early',
 'easy',
 'efficient',
 'elegant',
 'else',
 'elsewhere',
 'enough',
 'entire',
 'especially',
 'etc',
 'even',
 'ever',
 'everywhere',
 'exactly',
 'excellent',
 'exceptional',
 '

In [41]:
result3 = train(df_feature1, rate)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=50,
            verbose=0, warm_start=False)
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=300, random_state=None)
BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, rand

In [42]:
startTime = time.time()
model, para = model_choice(result3,df_feature1,rate)
if para == 0:
    model = eval(algoArray(model))
else:
    model = eval(optFunc(model,para))
model.fit(df_feature1, rate)
joblib.dump(model, 'self_model.pkl') #save model
endTime = time.time()
print("Model Save Time: " + str(round(endTime - startTime,0)))

Model Save Time: 261.0


In [49]:
test3 = pd.DataFrame(vectorizer3.transform(test_list).toarray())
test_names = vectorizer3.get_feature_names()
test3.columns = test_names

In [54]:
model_t = SelectFromModel(lsvc1, prefit=True)
X_new_test = pd.DataFrame(model_t.transform(test3))

In [55]:
lis2 = list(model_t.get_support())

In [56]:
features2 = []
for i in range(len(lis2)):
    if lis2[i] == True:
        features2.append(test_names[i])
X_new_test.columns = features2

In [57]:
total_terms_test = nltk.pos_tag(features2,tagset='universal')
useful_word_test = gram_select(total_terms_test)
len(useful_word_test)

308

In [48]:
accuracy_score(test_rate, model.predict(X_new_test[useful_word_test]))

0.57351111111111108

In [49]:
result3

Unnamed: 0,accuracy,confidence,runtime
RF,0.54,0.07,39.0
ABDT,0.55,0.05,1006.0
BAG,0.52,0.08,348.0
GBC,0.58,0.06,3592.0


### Model 4

Based on model 3, add over-sampling and under-sampling to the model.

For those with rating under-represented in the data, over-sample with replacement.

For those with rating over-represented in the data, under-sample without replacement.

In [25]:
dis = sample.groupby(['overall_rate']).size().reset_index(name='counts')

In [26]:
bar = sample.shape[0]
bar = bar/5

In [27]:
min_list = list(dis[dis['counts'] < bar]['overall_rate'])
max_list = list(dis[dis['counts'] >= bar]['overall_rate'])
df_min = sample[sample.overall_rate.isin(min_list)]
df_max = sample[sample.overall_rate.isin(max_list)]
sample1 = df_max.groupby('overall_rate').apply(lambda s: s.sample(bar)) 
sample2 = df_min.groupby('overall_rate').apply(lambda s: s.sample(bar, replace = True))
sample_b = sample1.append(sample2)

In [28]:
input_text_b = list(sample_b['text'])
rate_b = list(sample_b['overall_rate'])

In [29]:
input_list_b = []
for i in input_text_b:
    input_list_b.append(clean(i))

In [30]:
X_n, vectorizer4, lsvc2 = feature_selection(input_list_b, rate_b)
features_n = X_n.columns

In [31]:
total_terms2 = nltk.pos_tag(features_n,tagset='universal')
useful_word_b = gram_select(total_terms2)

In [32]:
len(useful_word_b)

446

In [33]:
df_feature2 = X_n[useful_word_b]
df_feature2 = df_feature2.as_matrix()

In [34]:
result4 = train(df_feature2, rate_b)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=50,
            verbose=0, warm_start=False)
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=300, random_state=None)
BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, rand

In [35]:
startTime = time.time()
model, para = model_choice(result4,df_feature2,rate_b)
if para == 0:
    model = eval(algoArray(model))
else:
    model = eval(optFunc(model,para))
model.fit(df_feature2,rate_b)
joblib.dump(model, 'resample.pkl') #save model
endTime = time.time()
print("Model Save Time: " + str(round(endTime - startTime,0)))

Best Score: 0.7772 and Grid Search Time: 231.0
Model Save Time: 266.0


In [36]:
test4 = pd.DataFrame(vectorizer4.transform(test_list).toarray())
test_names4 = vectorizer4.get_feature_names()
test4.columns = test_names4

In [37]:
model_t2 = SelectFromModel(lsvc2, prefit=True)
X_new_test2 = pd.DataFrame(model_t2.transform(test4))

In [38]:
lis3 = list(model_t2.get_support())
len(lis3)

50543

In [39]:
features3 = []
for i in range(len(lis3)):
    if lis3[i] == True:
        features3.append(test_names4[i])
X_new_test2.columns = features3

In [42]:
total_terms_test2 = nltk.pos_tag(features3,tagset='universal')
useful_word_test2 = gram_select(total_terms_test2)
len(useful_word_test2)

446

In [43]:
accuracy_score(test_rate, model.predict(X_new_test2[useful_word_test2]))

0.5510518518518519

### Model 5

Based on model 3, add the number of sentences of review as a new input.

In [44]:
df_feature1 = pd.DataFrame(df_feature1)

In [45]:
df_feature1['length'] = sent

In [46]:
result5 = train(df_feature1, rate)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=50,
            verbose=0, warm_start=False)
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=300, random_state=None)
BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, rand

In [47]:
startTime = time.time()
model, para = model_choice(result5,df_feature1,rate)
if para == 0:
    model = eval(algoArray(model))
else:
    model = eval(optFunc(model,para))
model.fit(df_feature1,rate)
joblib.dump(model, 'length.pkl') #save model
endTime = time.time()
print("Model Save Time: " + str(round(endTime - startTime,0)))

Model Save Time: 205.0


In [58]:
X_new_test3 = X_new_test[useful_word_test]
X_new_test3['length'] = sent2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [59]:
accuracy_score(test_rate, model.predict(X_new_test3))

0.57019259259259258

In [76]:
result5

Unnamed: 0,accuracy,confidence,runtime
RF,0.54,0.07,32.0
ABDT,0.55,0.06,364.0
BAG,0.53,0.08,329.0
GBC,0.58,0.05,1848.0


### Model 6

Use the sentiment polarity scores of review text as input.

In [60]:
analyser = SentimentIntensityAnalyzer()

In [61]:
res = []
for review in input_text:
    res.append(analyser.polarity_scores(review))

In [62]:
res = pd.DataFrame.from_records(res)
res = res.drop(['compound'], axis=1)

In [63]:
result6 = train(res, rate)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=50,
            verbose=0, warm_start=False)
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=300, random_state=None)
BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, rand

In [64]:
startTime = time.time()
model, para = model_choice(result6,res,rate) #train fully validated and optimized model
if para == 0:
    model = eval(algoArray(model))
else:
    model = eval(optFunc(model,para))
model.fit(res,rate)
joblib.dump(model, 'rate.pkl') #save model
endTime = time.time()
print("Model Save Time: " + str(round(endTime - startTime,0)))

Model Save Time: 5.0


In [65]:
res2 = []
for review in test_input:
    res2.append(analyser.polarity_scores(review))

In [66]:
res2 = pd.DataFrame.from_records(res2)
res2 = res2.drop(['compound'], axis=1)

In [67]:
accuracy_score(test_rate, model.predict(res2))

0.52272592592592593

### Model 7

Use the number of words in different sentiment categories of each review text as input.

In [68]:
emotion_dict = get_nrc_data()
review_emotion_dic = {1:{},2:{},3:{},4:{},5:{}}

In [69]:
res = []
for review in input_text:
    res.append(emotion_analyzer(review))

In [70]:
res = pd.DataFrame.from_records(res)

In [71]:
result7 = train(res, rate)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=50,
            verbose=0, warm_start=False)
AdaBoostClassifier(algorithm='SAMME',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
          learning_rate=1, n_estimators=300, random_state=None)
BaggingClassifier(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, rand

In [72]:
startTime = time.time()
model, para = model_choice(result7,res,rate) #train fully validated and optimized model
if para == 0:
    model = eval(algoArray(model))
else:
    model = eval(optFunc(model,para))
model.fit(res,rate)
joblib.dump(model, 'rate2.pkl') #save model
endTime = time.time()
print("Model Save Time: " + str(round(endTime - startTime,0)))

Model Save Time: 21.0


In [73]:
res2 = []
for review in test_input:
    res2.append(emotion_analyzer(review))

In [74]:
res2 = pd.DataFrame.from_records(res2)

In [75]:
accuracy_score(test_rate, model.predict(res2))

0.49315555555555557