### MMAI 5400 Assignment 2 - Sentiment Classification

#### Jiawen Li

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

In [2]:
review = pd.read_csv('reviews.csv', delimiter='\t')
review.head()

Unnamed: 0,Name,RatingValue,DatePublished,Review
0,Playa Cabana,4,2020-02-26,I was tasked with finding a spot for a group d...
1,Playa Cabana,3,2019-08-04,Went here with my friends and family. I liked ...
2,Playa Cabana,3,2019-08-24,Surprisingly good Flautas! They came as 3 roll...
3,Playa Cabana,4,2019-06-06,As a Mexican I always crave authentic Mexican ...
4,Playa Cabana,5,2020-05-25,Best tacos I've ever had. Both locations are g...


### Categorize RatingValue into negative (1, 2), neutral (3), and positive (4, 5) emotions.
#### In the "Sentiment" column that we created, code negative, neutral, and positive as 0, 1, and 2 respectively.
#### Thus, negative (1, 2) turns into 0; neutral (3) turns into 1; and positive (4, 5) turns into 2.

In [3]:
def binned_rating(rating):
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    elif rating >= 4:
        return 2
    
review['Sentiment'] = review['RatingValue'].apply(binned_rating)
review = review[['Sentiment', 'Review']].reset_index(drop=True)

### Balance the data by reducing the number of positive comments to match the number of negative and neutral comments.

Check out how many reviews there are for positive, neutral and negative sentiments.

In [4]:
sen_counts = review['Sentiment'].value_counts()
sen_counts

2    1465
1     297
0     158
Name: Sentiment, dtype: int64

In [5]:
min_sen_counts = sen_counts.min()

In [6]:
# Draw samples from the sentiment review dataset and make sure each seniment would have equal amount of review
review_balanced = pd.concat([review[review['Sentiment'] == 0].sample(min_sen_counts, random_state=0),
                        review[review['Sentiment'] == 1].sample(min_sen_counts, random_state=0),
                        review[review['Sentiment'] == 2].sample(min_sen_counts, random_state=0)])

review_balanced_counts = review_balanced['Sentiment'].value_counts()
review_balanced_counts

0    158
1    158
2    158
Name: Sentiment, dtype: int64

In [7]:
review_balanced.head()

Unnamed: 0,Sentiment,Review
112,0,Food was awful - worst tacos I've had in my en...
583,0,"II really, really wanted to just love this pla..."
1512,0,"Terrible service. Despite seeing my party, the..."
1588,0,"Not for everyone, maybe an acquired taste. Had..."
788,0,This restaurant is really for the experience. ...


In [8]:
# Shuffle the balanced DataFrame
review_balanced = review_balanced.sample(frac=1, random_state=1).reset_index(drop=True)

review_balanced.insert(0, 'Number', range(1, len(review_balanced) + 1))

review_balanced

Unnamed: 0,Number,Sentiment,Review
0,1,1,"The entrance was a bit hard to find, since the..."
1,2,1,I think I set my expectations too high for thi...
2,3,0,This place is overhyped.... the food was quite...
3,4,2,Really delicious food that takes a twist on tr...
4,5,1,I'm really not sure why this place has such gr...
...,...,...,...
469,470,1,Authentic italian food served in the same fash...
470,471,0,I love Ramen and every winter I'm in Japan . W...
471,472,2,"Well, damn. This place is just too good to be ..."
472,473,1,TL;DR Delicious Chinese food but not exactly a...


### Split data set into training and validation sets and save them as train.csv and valid.csv

In [9]:
from sklearn.model_selection import train_test_split

train_df, valid_df = train_test_split(review_balanced, test_size=0.2, random_state=1)

# Save the training and validation sets to CSV files
train_path = './train.csv'
valid_path = './valid.csv'

train_df.to_csv(train_path, index=False)
valid_df.to_csv(valid_path, index=False)

In [10]:
train_df['Sentiment'].value_counts()

0    136
2    122
1    121
Name: Sentiment, dtype: int64

In [11]:
valid_df['Sentiment'].value_counts()

1    37
2    36
0    22
Name: Sentiment, dtype: int64

### Modeling BoW text classification with data from training

#### Extracting features from text files (Bag of Words)
#### Tokenizing text with ```scikit-learn```

Use **CountVectorizer**, which builds a dictionary of features and transforms documents to feature vectors to perform text preprocessing, tokenizing and filtering of stopwords

#### Use **CountVectorizer** to transform the **'Review'** column from the train data set. It will sperate the text messages into many different tokens (i.e. characters, words)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
train_df_counts = count_vect.fit_transform(train_df['Review'])
train_df_counts.shape

(379, 4780)

##### This means that we have 379 corpus (reviews) in total, and 4780 vocabularies (tokens). The result is correct.

In [13]:
train_df_counts.max() # the most frequent word in this document occurs 60 times across the whole files

60

In [14]:
count_vect.vocabulary_

{'in': 2148,
 'word': 4707,
 'mediocre': 2581,
 'always': 216,
 'hit': 2047,
 'or': 2888,
 'miss': 2648,
 'cramped': 1103,
 'over': 2920,
 'priced': 3230,
 'and': 237,
 'not': 2794,
 'overly': 2935,
 'well': 4633,
 'trained': 4360,
 'servers': 3670,
 'stick': 4008,
 'to': 4302,
 'the': 4228,
 'basics': 430,
 'when': 4648,
 'ordering': 2892,
 'pass': 3002,
 'on': 2859,
 'deserts': 1244,
 'try': 4392,
 'this': 4253,
 'place': 3115,
 'you': 4762,
 'may': 2553,
 'get': 1841,
 'lucky': 2472,
 'but': 651,
 'it': 2223,
 'gamble': 1817,
 'sometimes': 3862,
 'pays': 3025,
 'off': 2833,
 'bust': 647,
 'hands': 1967,
 'down': 1356,
 'one': 2861,
 'of': 2832,
 'my': 2721,
 'favourite': 1619,
 'italian': 2225,
 'places': 3120,
 'city': 878,
 'isn': 2220,
 'your': 4767,
 'typical': 4415,
 'rustic': 3548,
 'joint': 2252,
 'its': 2230,
 'clean': 892,
 'modern': 2661,
 'decor': 1200,
 'doesn': 1334,
 'just': 2264,
 'feel': 1626,
 'classy': 889,
 'upscale': 4485,
 'we': 4617,
 'decided': 1196,
 'family'

### Occurence has it's own limit (longer documents tends to have higher average count values), therefore we need to turn occurence to frequencies.

Calculate Term Frequencies (``TF``): Divide the number of occurrences of each word in a document by the total number of words in the document

Also, downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion (fewer documents) of the corpus.

Therefore, we calculate the **tf–idf** for “Term Frequency times Inverse Document Frequency”.

**Converting the raw frequency counts obtained from CountVectorizer to a more informative metric called TF-IDF (Term Frequency-Inverse Document Frequency). This metric helps to reflect the importance of a word to a document in a collection or corpus.**

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_df_counts)
train_tfidf.shape

(379, 4780)

In [16]:
print(train_tfidf)

  (0, 4762)	0.06721113617995302
  (0, 4707)	0.20149380873309078
  (0, 4648)	0.08544643463263649
  (0, 4633)	0.09563753044454328
  (0, 4392)	0.09018531497988175
  (0, 4360)	0.21193087055745422
  (0, 4302)	0.04320125437051756
  (0, 4253)	0.056439695510484264
  (0, 4228)	0.07668463203836308
  (0, 4008)	0.186783596293084
  (0, 3862)	0.3526930689374411
  (0, 3670)	0.14133354637504197
  (0, 3230)	0.16163632202871378
  (0, 3115)	0.06676597908733335
  (0, 3025)	0.22664108299746102
  (0, 3002)	0.1933981946375521
  (0, 2935)	0.17207338385307724
  (0, 2920)	0.1105937088970067
  (0, 2892)	0.13964580728475467
  (0, 2888)	0.09419575736671264
  (0, 2859)	0.06200453243478066
  (0, 2833)	0.11981728476173095
  (0, 2794)	0.05627365472696401
  (0, 2648)	0.18119103307941897
  (0, 2581)	0.14692610958870703
  :	:
  (377, 1732)	0.08163209004213283
  (377, 1725)	0.08962231851744334
  (377, 1585)	0.24575144004514846
  (377, 1478)	0.21176450902015617
  (377, 1313)	0.35542225685888745
  (377, 1200)	0.236026660330

**Interpreting the TF-IDF matrix:**
In the context of a TF-IDF matrix, for example, an entry like (0, 4762) 0.06721113617995302 means:

- 0: This is the index of the document in your dataset (in our case, first review), starting from 0. So it corresponds to the first document or review.
- 4762: This is the index of a specific word or token in the vocabulary that CountVectorizer has learned from your corpus.
- 0.06721113617995302: This is the TF-IDF score for that particular word in the first document (review).

**The token at position 4762 inside the first review has a TF-IDF value of 0. 0672, indicating that the word is not very important, unrepresentative, and not unique enough。**

### Training a classifier

Now that we have our features, we can train a classifier to try to predict the category of a review. We will use **naïve Bayes** classifier. Note: the one most suitable for word counts is the **multinomial variant**.

The x variable here is the TF-IDF matrix, and the target is ``Sentiment`` column from train_df

In [17]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(train_tfidf, train_df['Sentiment'])

Now, try to predict the outcome on a new document. Firstly, we need to extract the features of the new documents using almost the same feature extracting chain as before. We call ```transform``` instead of ```fit_transform``` on the transformers, since they have already been fit to the training set:

In [18]:
docs_new = ['Worst experience', 'I like it.']
new_counts = count_vect.transform(docs_new)
new_tfidf = tfidf_transformer.transform(new_counts)

predicted = clf.predict(new_tfidf)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, train_df['Sentiment']))

'Worst experience' => 406    1
236    2
294    1
318    0
18     2
      ..
255    1
72     0
396    1
235    2
37     0
Name: Sentiment, Length: 379, dtype: int64
'I like it.' => 406    1
236    2
294    1
318    0
18     2
      ..
255    1
72     0
396    1
235    2
37     0
Name: Sentiment, Length: 379, dtype: int64


#### With our self written reviews, clearly the model is not good enough. Since `Worst experience` is definitely a negative (0) review; and `I like it` is a positive review (2).

#### Next, we will try to build a pipeline to make the whole process easier to execute, and then evaluate the model based on accuracy and then tuning the model to make it more efficient.

### Building a pipeline

Establish a pipeline to automate the process of vectorizer => transformer => classifier.

In [19]:
from sklearn.pipeline import Pipeline
text_clf_nb = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [20]:
text_clf_nb.fit(train_df['Review'], train_df['Sentiment'])

### Evaluate the performance of our text classifier on the validation set

In [21]:
review_test = valid_df['Review']

predicted = text_clf_nb.predict(review_test)
accuracy = np.mean(predicted == valid_df['Sentiment'])
print(f"accuracy: {accuracy:.2%}")
print('\n')

average_f1 = f1_score(valid_df['Sentiment'], predicted, average='macro')
print(f"Average F1 score: {average_f1:.2%}")
print('\n')

class_f1 = f1_score(valid_df['Sentiment'], predicted, average=None)
print("Class-wise F1 scores:")
print(f"negative: {class_f1[0]:.2%}")
print(f"neutral: {class_f1[1]:.2%}")
print(f"positive: {class_f1[2]:.2%}")
print('\n')

conf_matrix = confusion_matrix(valid_df['Sentiment'], predicted)
conf_matrix_df = pd.DataFrame(conf_matrix, 
                              index=['negative', 'neutral', 'positive'],
                              columns=['negative', 'neutral', 'positive'])
print("Confusion Matrix:")
print(conf_matrix_df)

accuracy: 47.37%


Average F1 score: 46.71%


Class-wise F1 scores:
negative: 55.07%
neutral: 45.07%
positive: 40.00%


Confusion Matrix:
          negative  neutral  positive
negative        19        3         0
neutral         17       16         4
positive        11       15        10


**We achieved 47.36% accuracy using MultinomialNB. Average F1 score is 46.71%.** Let’s see if we can do better with a linear **support vector machine (SVM)**, which is one of the best text classification algorithms (although it’s also a bit slower than naïve Bayes). We can change the learner by simply plugging a different classifier object into our pipeline:

In [22]:
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

text_clf_svm.fit(train_df['Review'], train_df['Sentiment'])

predicted = text_clf_svm.predict(review_test)
accuracy = np.mean(predicted == valid_df['Sentiment'])

print(f"accuracy: {accuracy:.2%}")
print('\n')

average_f1 = f1_score(valid_df['Sentiment'], predicted, average='macro')
print(f"Average F1 score: {average_f1:.2%}")
print('\n')

class_f1 = f1_score(valid_df['Sentiment'], predicted, average=None)
print("Class-wise F1 scores:")
print(f"negative: {class_f1[0]:.2%}")
print(f"neutral: {class_f1[1]:.2%}")
print(f"positive: {class_f1[2]:.2%}")
print('\n')

conf_matrix = confusion_matrix(valid_df['Sentiment'], predicted)
conf_matrix_df = pd.DataFrame(conf_matrix, 
                              index=['negative', 'neutral', 'positive'],
                              columns=['negative', 'neutral', 'positive'])
print("Confusion Matrix:")
print(conf_matrix_df)

accuracy: 57.89%


Average F1 score: 58.46%


Class-wise F1 scores:
negative: 71.43%
neutral: 43.08%
positive: 60.87%


Confusion Matrix:
          negative  neutral  positive
negative        20        2         0
neutral         11       14        12
positive         3       12        21


**We achieved 57.89% accuracy using the support vector machine (SVM). And the average F1 score is 58.46%.**

### Parameter tuning using grid search

Run an exhaustive search of the best parameters on a grid of possible values. Try out all classifiers on either words or bigrams, with or without idf, and with a penalty parameter of either 0.01 or 0.001 for the linear SVM

In [23]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

#### Use grid searched text classification MultinomialNB model to test on a self-created review to see if the tuned model are able to make the right classification

In [24]:
gs_clf_nb = GridSearchCV(text_clf_nb, parameters, cv=5, n_jobs=-1)

gs_clf_nb = gs_clf_nb.fit(train_df['Review'], train_df['Sentiment'])

target_names = {0: 'negative', 1: 'neutral', 2: 'positive'}

predicted_sentiment_code = gs_clf_nb.predict(['I love it'])[0]

predicted_sentiment_name = target_names[predicted_sentiment_code]

print(predicted_sentiment_name)

positive


In [25]:
predicted_sentiment_code = gs_clf_nb.predict(['This is my worst experience.'])[0]

predicted_sentiment_name = target_names[predicted_sentiment_code]

print(predicted_sentiment_name)

negative


In [26]:
predicted_sentiment_code = gs_clf_nb.predict(['Overall, I think the food is great, but there are still improvements to be made.'])[0]

predicted_sentiment_name = target_names[predicted_sentiment_code]

print(predicted_sentiment_name)

neutral


#### Print out the optimized parameters for MultinomialNB model

In [27]:
gs_clf_nb.best_score_

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf_nb.best_params_[param_name]))

clf__alpha: 0.01
tfidf__use_idf: False
vect__ngram_range: (1, 2)


### Now, re-establish the multinomial Naive Bayes model using the parameters found by grid search

In [28]:
text_clf_nb_optimized = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB(alpha=0.01))
])

text_clf_nb_optimized.fit(train_df['Review'], train_df['Sentiment'])

#### Evaluate the model again using the validation data set by assessing the performance metrics

In [29]:
review_test = valid_df['Review']

predicted = text_clf_nb_optimized.predict(review_test)
accuracy = np.mean(predicted == valid_df['Sentiment'])
print(f"accuracy: {accuracy:.2%}")
print('\n')

average_f1 = f1_score(valid_df['Sentiment'], predicted, average='macro')
print(f"Average F1 score: {average_f1:.2%}")
print('\n')

class_f1 = f1_score(valid_df['Sentiment'], predicted, average=None)
print("Class-wise F1 scores:")
print(f"negative: {class_f1[0]:.2%}")
print(f"neutral: {class_f1[1]:.2%}")
print(f"positive: {class_f1[2]:.2%}")
print('\n')

conf_matrix = confusion_matrix(valid_df['Sentiment'], predicted)
conf_matrix_df = pd.DataFrame(conf_matrix, 
                              index=['negative', 'neutral', 'positive'],
                              columns=['negative', 'neutral', 'positive'])
print("Confusion Matrix:")
print(conf_matrix_df)

accuracy: 55.79%


Average F1 score: 56.88%


Class-wise F1 scores:
negative: 66.67%
neutral: 50.63%
positive: 53.33%


Confusion Matrix:
          negative  neutral  positive
negative        17        5         0
neutral          9       20         8
positive         3       17        16
