In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
data = pd.read_csv('/Users/mehradghassemi/sentiment_classification/data/raw/reviews_res.csv', delimiter='\t')

In [3]:
data.head()

Unnamed: 0,Name,RatingValue,DatePublished,Review
0,Playa Cabana,4,2020-02-26,I was tasked with finding a spot for a group d...
1,Playa Cabana,3,2019-08-04,Went here with my friends and family. I liked ...
2,Playa Cabana,3,2019-08-24,Surprisingly good Flautas! They came as 3 roll...
3,Playa Cabana,4,2019-06-06,As a Mexican I always crave authentic Mexican ...
4,Playa Cabana,5,2020-05-25,Best tacos I've ever had. Both locations are g...


In [4]:
def rating_to_sentiment(RatingValue):
    if RatingValue in [1, 2]:
        return 0
    elif RatingValue == 3:
        return 1
    elif RatingValue in [4, 5]:
        return 2

In [5]:
data['sentiment'] = data['RatingValue'].apply(rating_to_sentiment)

In [6]:
data = data.drop(['RatingValue', 'Name', 'DatePublished'], axis=1)

In [7]:
sentiment_counts = data['sentiment'].value_counts()

In [8]:
extra_data = data[data.sentiment == 2].sample(n=1165)
data_new = data.drop(extra_data.index)

In [9]:
sentiment_counts = data_new['sentiment'].value_counts()

In [13]:
train, valid = train_test_split(data_new, test_size=0.2, random_state=42)
train.to_csv("train.csv", index=False)
valid.to_csv("valid.csv", index=False)

In [16]:
train.to_csv("../data/interim/train.csv")
valid.to_csv("../data/interim/valid.csv")

In [20]:
train = pd.read_csv("../data/interim/train.csv")
valid = pd.read_csv("../data/interim/valid.csv")

In [21]:
valid.shape

(151, 3)

In [22]:
train.shape

(604, 3)

In [23]:
count_vec = CountVectorizer()
tfidf = TfidfTransformer(use_idf=True)

In [24]:
classifiers = {
    'MultinomialNB': MultinomialNB(),
    'SGDClassifier': SGDClassifier()
}

In [25]:
param_grid = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': [1e-2, 1e-3]
}

In [26]:
X_valid = valid['Review']
y_valid = valid['sentiment']

In [27]:
for name, clf in classifiers.items():
    text_clf = Pipeline([
        ('vect', count_vec),
        ('tfidf', tfidf),
        ('clf', clf),
    ])

    grid_search = GridSearchCV(
        text_clf,
        param_grid,
        cv=5,
        n_jobs=-1,
        scoring='f1_macro')

    grid_search.fit(train.Review, train.sentiment)

    print(f"Best parameters for {name}: ", grid_search.best_params_)
    print(f"Best cross-validation score for {name}: ", grid_search.best_score_)

    y_pred = grid_search.predict(X_valid)
    cm = confusion_matrix(y_valid, y_pred, normalize='true')
    label_names = ['negative', 'neutral', 'positive']
    cm_df = pd.DataFrame(cm, index=label_names, columns=label_names)
    # Compute accuracy and F1-score
    accuracy = metrics.accuracy_score(y_valid, y_pred)
    f1_score = metrics.f1_score(y_valid, y_pred, average='macro')

    # Print results
    print(f"Accuracy on the test set {name}: {accuracy}")
    print(f"Average F1-score on the test set {name}: {f1_score}")
    # print(f"Confusion matrix for {name}:")
    report = classification_report(
        y_valid, y_pred, target_names=[
            'Negative', 'Neutral', 'Positive'])
    print(f"Classification report for {name}:\n{report}")
    print("Confusion matrix:")
    print(cm_df)

Best parameters for MultinomialNB:  {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}
Best cross-validation score for MultinomialNB:  0.5600529013879209
Accuracy on the test set MultinomialNB: 0.543046357615894
Average F1-score on the test set MultinomialNB: 0.5071471329241739
Classification report for MultinomialNB:
              precision    recall  f1-score   support

    Negative       0.89      0.24      0.38        33
     Neutral       0.44      0.66      0.53        53
    Positive       0.63      0.60      0.61        65

    accuracy                           0.54       151
   macro avg       0.65      0.50      0.51       151
weighted avg       0.62      0.54      0.53       151

Confusion matrix:
          negative   neutral  positive
negative  0.242424  0.606061  0.151515
neutral   0.000000  0.660377  0.339623
positive  0.015385  0.384615  0.600000
Best parameters for SGDClassifier:  {'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range':