In [98]:
import pandas as pd
import nltk
import re
import gensim
import numpy as np
from nltk import FreqDist
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier # import random forest
from sklearn import metrics
from nltk.corpus import stopwords

In [94]:
df = pd.read_csv('DisneylandReviews.csv')

In [100]:
df1 = df[['Review_Text', 'Performance']]
Paris = df[df['Branch'] == 'Disneyland_Paris']
California = df[df['Branch'] == 'Disneyland_California']
HongKong = df[df['Branch'] == 'HongKong']

In [96]:
content_list = df['Review_Text']
content = ' '.join(content_list)
tokens = nltk.word_tokenize(content)

In [99]:
#Stopwords Removal and only keep text data then change to lowercase
mystopwords = stopwords.words('english')
words = [w.lower() for w in tokens if w.isalpha() if w.lower()not in mystopwords]

In [101]:
POS_tags = nltk.pos_tag(tokens) #use unprocessed 'tokens', not 'words'
#Generate a list of POS tags
POS_tag_list = [(words,tag) for (words,tag) in POS_tags if (tag == 'NNP' and words.isalnum())]
#Generate a frequency distribution of all the POS tags
tag_freq = nltk.FreqDist(POS_tag_list)
#Sort the result 
sorted_tag_freq = sorted(tag_freq.items(), key = lambda k:k[1], reverse = True)

In [43]:
sorted_tag_freq[0:50]

[(('Disney', 'NNP'), 14152),
 (('Disneyland', 'NNP'), 7516),
 (('Paris', 'NNP'), 5878),
 (('Florida', 'NNP'), 1909),
 (('Park', 'NNP'), 1893),
 (('Mountain', 'NNP'), 1805),
 (('Mickey', 'NNP'), 1215),
 (('Studios', 'NNP'), 1050),
 (('Space', 'NNP'), 1029),
 (('Christmas', 'NNP'), 977),
 (('Thunder', 'NNP'), 910),
 (('World', 'NNP'), 903),
 (('Hotel', 'NNP'), 867),
 (('Walt', 'NNP'), 830),
 (('Orlando', 'NNP'), 819),
 (('Buzz', 'NNP'), 753),
 (('Peter', 'NNP'), 697),
 (('Caribbean', 'NNP'), 685),
 (('Main', 'NNP'), 623),
 (('US', 'NNP'), 622),
 (('Food', 'NNP'), 615),
 (('Pan', 'NNP'), 614),
 (('Jones', 'NNP'), 609),
 (('Indiana', 'NNP'), 600),
 (('Magic', 'NNP'), 598),
 (('California', 'NNP'), 593),
 (('DLP', 'NNP'), 566),
 (('Fast', 'NNP'), 544),
 (('Street', 'NNP'), 526),
 (('Star', 'NNP'), 525),
 (('France', 'NNP'), 522),
 (('Village', 'NNP'), 484),
 (('USA', 'NNP'), 477),
 (('Lightyear', 'NNP'), 471),
 (('Just', 'NNP'), 446),
 (('Dreams', 'NNP'), 442),
 (('Euro', 'NNP'), 441),
 (('

In [44]:
content_list = California['Review_Text']
content = ' '.join(content_list)
tokens = nltk.word_tokenize(content)

#change all tokens into lower case 
words1 = [w.lower() for w in tokens]   #list comprehension 
#only keep text words, no numbers 
words2 = [w for w in words1 if w.isalpha()]

POS_tags = nltk.pos_tag(tokens) #use unprocessed 'tokens', not 'words'
#Generate a list of POS tags
POS_tag_list = [(words2,tag) for (words2,tag) in POS_tags if (tag == 'NNP' and words2.isalnum())]
#Generate a frequency distribution of all the POS tags
tag_freq = nltk.FreqDist(POS_tag_list)
#Sort the result 
sorted_tag_freq = sorted(tag_freq.items(), key = lambda k:k[1], reverse = True)

In [45]:
sorted_tag_freq[0:50]

[(('Disneyland', 'NNP'), 15889),
 (('Disney', 'NNP'), 12396),
 (('California', 'NNP'), 4172),
 (('World', 'NNP'), 3004),
 (('Mountain', 'NNP'), 2672),
 (('Adventure', 'NNP'), 2496),
 (('Park', 'NNP'), 1847),
 (('Space', 'NNP'), 1430),
 (('Indiana', 'NNP'), 1356),
 (('Jones', 'NNP'), 1319),
 (('Christmas', 'NNP'), 1288),
 (('Fast', 'NNP'), 1243),
 (('Florida', 'NNP'), 1171),
 (('Star', 'NNP'), 1130),
 (('Mickey', 'NNP'), 1104),
 (('Walt', 'NNP'), 1093),
 (('Pass', 'NNP'), 951),
 (('Halloween', 'NNP'), 905),
 (('Splash', 'NNP'), 891),
 (('Magic', 'NNP'), 858),
 (('Matterhorn', 'NNP'), 840),
 (('Main', 'NNP'), 811),
 (('Mansion', 'NNP'), 787),
 (('Caribbean', 'NNP'), 770),
 (('Haunted', 'NNP'), 754),
 (('Anaheim', 'NNP'), 745),
 (('Orlando', 'NNP'), 719),
 (('Street', 'NNP'), 714),
 (('Land', 'NNP'), 668),
 (('Thunder', 'NNP'), 658),
 (('Fantasmic', 'NNP'), 651),
 (('Pirates', 'NNP'), 604),
 (('Disneyworld', 'NNP'), 604),
 (('Wars', 'NNP'), 592),
 (('Tours', 'NNP'), 563),
 (('Great', 'NNP

In [46]:
def count_pos_neg(data, positive_dict, negative_dict):
# count of positive and negative words that appeared in each message
# net count which is calculated by positive count subtracting negative count. 
    poscnt = []
    negcnt = []
    netcnt = []

    for nrow in range(0,len(data)):
        text = data[nrow]
        
        qa = 0
        qb = 0

        for word in positive_dict :
            if (word in text) :
                qa = qa + 1

        for word in negative_dict :
            if (word in text) :
                qb = qb + 1

        qc = qa - qb

        poscnt.append(qa)
        negcnt.append(qb)
        netcnt.append(qc)

    return (poscnt, negcnt, netcnt)

In [47]:
data = df.Review_Text.str.lower()

In [48]:
data

0        if you've ever been to disneyland anywhere you...
1        its been a while since d last time we visit hk...
2        thanks god it wasn   t too hot or too humid wh...
3        hk disneyland is a great compact park. unfortu...
4        the location is not in the city, took around 1...
                               ...                        
42651    i went to disneyland paris in july 03 and thou...
42652    2 adults and 1 child of 11 visited disneyland ...
42653    my eleven year old daughter and myself went to...
42654    this hotel, part of the disneyland paris compl...
42655    i went to the disneyparis resort, in 1996, wit...
Name: Review_Text, Length: 42656, dtype: object

In [49]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [50]:
analyzer = SentimentIntensityAnalyzer()
scores = [analyzer.polarity_scores(sentence) for sentence in data]
neg_s = [i["neg"] for i in scores]
neu_s = [i["neu"] for i in scores]
pos_s = [i["pos"] for i in scores]
compound_s = [i["compound"] for i in scores]

In [54]:
df1['compound_Vader'] = compound_s
df1['Vader'] = df1['compound_Vader'].apply(lambda x:1 if x > 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['compound_Vader'] = compound_s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Vader'] = df1['compound_Vader'].apply(lambda x:1 if x >= 0 else 0)


In [55]:
df1

Unnamed: 0,Review_Text,Performance,compound_Vader,Vader
0,If you've ever been to Disneyland anywhere you...,1,0.7069,1
1,Its been a while since d last time we visit HK...,1,0.9866,1
2,Thanks God it wasn t too hot or too humid wh...,1,0.9920,1
3,HK Disneyland is a great compact park. Unfortu...,1,0.8425,1
4,"the location is not in the city, took around 1...",1,0.2846,1
...,...,...,...,...
42651,i went to disneyland paris in july 03 and thou...,1,0.9894,1
42652,2 adults and 1 child of 11 visited Disneyland ...,1,0.9906,1
42653,My eleven year old daughter and myself went to...,1,0.8402,1
42654,"This hotel, part of the Disneyland Paris compl...",1,0.9538,1


In [58]:
from sklearn.metrics import confusion_matrix
confusion_matrix = pd.crosstab(df1['Performance'], df1['Vader'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2880,5855
1,1399,32522


In [59]:
from sklearn.metrics import classification_report
print(classification_report(df1['Performance'], df1['Vader']))

              precision    recall  f1-score   support

           0       0.67      0.33      0.44      8735
           1       0.85      0.96      0.90     33921

    accuracy                           0.83     42656
   macro avg       0.76      0.64      0.67     42656
weighted avg       0.81      0.83      0.81     42656



In [60]:
from textblob import TextBlob

In [61]:
df1["score_TextBlob"] = df1["Review_Text"].map(lambda x:TextBlob(x).sentiment.polarity)
df1['TB'] = df1['score_TextBlob'].apply(lambda x:1 if x >= 0 else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1["score_TextBlob"] = df1["Review_Text"].map(lambda x:TextBlob(x).sentiment.polarity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['TB'] = df1['score_TextBlob'].apply(lambda x:1 if x >= 0 else 0)


In [62]:
df1

Unnamed: 0,Review_Text,Performance,compound_Vader,Vader,score_TextBlob,TB
0,If you've ever been to Disneyland anywhere you...,1,0.7069,1,0.243981,1
1,Its been a while since d last time we visit HK...,1,0.9866,1,0.236131,1
2,Thanks God it wasn t too hot or too humid wh...,1,0.9920,1,0.160498,1
3,HK Disneyland is a great compact park. Unfortu...,1,0.8425,1,0.189286,1
4,"the location is not in the city, took around 1...",1,0.2846,1,0.266667,1
...,...,...,...,...,...,...
42651,i went to disneyland paris in july 03 and thou...,1,0.9894,1,0.252273,1
42652,2 adults and 1 child of 11 visited Disneyland ...,1,0.9906,1,0.179250,1
42653,My eleven year old daughter and myself went to...,1,0.8402,1,0.153205,1
42654,"This hotel, part of the Disneyland Paris compl...",1,0.9538,1,0.265327,1


In [63]:
from sklearn.metrics import confusion_matrix
confusion_matrix = pd.crosstab(df1['Performance'], df1['TB'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2396,6339
1,1390,32531


In [68]:
# remove non alphabets
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

# tokenn alphabets-only list
tokenize = lambda x: word_tokenize(x)

# assign ps to a lambda function to run on each line of value
ps = PorterStemmer()
stem = lambda w: [ ps.stem(x) for x in w ]

# assign lemmatizer to a lambda function to run on each line of value
lemmatizer = WordNetLemmatizer()
leammtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

In [73]:
# apply all above methods to the column ''
print('Processing : [=', end='')
df1['Review_Text'] = df1['Review_Text'].apply(remove_non_alphabets)
print('=', end='')
df1['Review_Text'] = df1['Review_Text'].apply(tokenize)
print('=', end='')
df1['Review_Text'] = df1['Review_Text'].apply(stem)
print('=', end='')
df1['Review_Text'] = df1['Review_Text'].apply(leammtizer)
print('=', end='')
df1['Review_Text'] = df1['Review_Text'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
df1

Processing : [=

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Review_Text'] = df1['Review_Text'].apply(remove_non_alphabets)


=

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Review_Text'] = df1['Review_Text'].apply(tokenize)


=

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Review_Text'] = df1['Review_Text'].apply(stem)


=

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Review_Text'] = df1['Review_Text'].apply(leammtizer)


=] : Completed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['Review_Text'] = df1['Review_Text'].apply(lambda x: ' '.join(x))


Unnamed: 0,Review_Text,Performance,compound_Vader,Vader,score_TextBlob,TB
0,If you ve ever been to disneyland anywher you ...,1,0.7069,1,0.243981,1
1,it been a while sinc d last time we visit HK d...,1,0.9866,1,0.236131,1
2,thank god it wasn t too hot or too humid when ...,1,0.9920,1,0.160498,1
3,HK disneyland is a great compact park unfortun...,1,0.8425,1,0.189286,1
4,the locat is not in the citi took around hour ...,1,0.2846,1,0.266667,1
...,...,...,...,...,...,...
42651,i went to disneyland pari in juli and thought ...,1,0.9894,1,0.252273,1
42652,adult and child of visit disneyland pari begin...,1,0.9906,1,0.179250,1
42653,My eleven year old daughter and myself went to...,1,0.8402,1,0.153205,1
42654,thi hotel part of the disneyland pari complex ...,1,0.9538,1,0.265327,1


In [77]:
# split to 30 percent test data and 70 percent train data
# labels can be seen as y, an dependent variable
train_corpus, test_corpus, train_labels, test_labels = train_test_split(df1['Review_Text'],
                                                                        df1["Performance"],
                                                                        test_size=0.3)

In [82]:
# tokenize documents for word2vec
tokenized_train = [nltk.word_tokenize(text)
                   for text in train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                   for text in test_corpus]  

# build word2vec model                   
wv_model = gensim.models.Word2Vec(tokenized_train,
                               size=200,                          #set the size or dimension for the word vectors 
                               window=60,                        #specify the length of the window of words taken as context
                               min_count=10)                   #ignores all words with total frequency lower than 10

def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector 
   

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

# averaged word vector features from word2vec
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=wv_model,
                                                 num_features=200)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=wv_model,
                                                num_features=200) 

  feature_vector = np.add(feature_vector, model[word])


In [86]:
# define a function that trains the model, performs predictions and evaluates the predictions
def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    # evaluate model prediction performance   
    '''get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)'''
    print(metrics.classification_report(test_labels,predictions))
    return predictions, get_metrics(true_labels=test_labels, predicted_labels=predictions)   

In [90]:
# define a function to evaluate our classification models based on four metrics
# This defined function is also useful in other cases. This is comparing test_y and pred_y. 
# Both contain 1s and 0s.
def get_metrics(true_labels, predicted_labels):
    metrics_dict = dict(zip(["accuracy", "precision", "recall", "f1"], [None]*4))
    #metrics_dict = {i:None for i in ["accuracy", "precision", "recall", "f1"]}
    for m in metrics_dict.keys():
        exec('''metrics_dict['{}'] = np.round(                                                    
                        metrics.{}_score(true_labels, 
                                               predicted_labels),
                        2)'''.format(m, m))
    return metrics_dict

In [91]:
# assign random forest function to an object
rf = RandomForestClassifier(criterion="entropy")

# predict and evaluate random forest
rf_avgwv_predictions, rf_avgwv_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=avg_wv_train_features,
                                           train_labels=train_labels,
                                           test_features=avg_wv_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.79      0.41      0.54      2609
           1       0.87      0.97      0.92     10188

    accuracy                           0.86     12797
   macro avg       0.83      0.69      0.73     12797
weighted avg       0.85      0.86      0.84     12797



In [92]:
confusion_matrix = pd.crosstab(test_labels, rf_avgwv_predictions, rownames=['Actual'], colnames=['Predicted'])
confusion_matrix

Predicted,0,1
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1064,1545
1,285,9903
