In [1]:
# import neccessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# show plot intermediately under the calling cell
%matplotlib inline

# disable some warnings
import warnings
warnings.simplefilter(action = "ignore")

In [2]:
df = pd.read_csv('reviews_Digital_Music_5-cleaned.csv')
df.loc[df.reviewText.isna(), 'reviewText']=''
df.head(2)

Unnamed: 0,reviewText,summary,overall
0,"It's hard to believe ""Memory of Trees"" came ou...",Enya's last great album,good
1,"A clasically-styled and introverted album, Mem...",Enya at her most elegant,good


In [3]:
# split df into 3 df of each columns:
revs, sums, ovas = (df.reviewText, df.summary, df.overall)

from sklearn.model_selection import train_test_split

# prepare the train and test data.
rev_x_train, rev_x_test, rev_y_train, rev_y_test = train_test_split(
    revs, ovas, test_size=0.2, random_state=25)
sum_x_train, sum_x_test, sum_y_train, sum_y_test = train_test_split(
    sums, ovas, test_size=0.2, random_state=25)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# tf-idf vectorizer
vectorizer = TfidfVectorizer(max_df = 0.5, stop_words='english',
                             sublinear_tf=True, use_idf=True,
                            lowercase=True)
# create vectorized x 
rev_x_train_vectors = vectorizer.fit_transform(rev_x_train)
rev_x_test_vectors = vectorizer.transform(rev_x_test)


sum_x_train_vectors = vectorizer.fit_transform(sum_x_train)
sum_x_test_vectors = vectorizer.transform(sum_x_test)

# these 2 vectors are for k-fold validation
rev_x_vectors = vectorizer.fit_transform(revs)
sum_x_vectors = vectorizer.fit_transform(sums)

In [22]:
df.overall.value_counts()

good       52116
neutral     6789
bad         5801
Name: overall, dtype: int64

# Classify using Naive Bayes models

In [13]:
# tuning using class_weight
# Classify overall using:
    # reviewText data
    # Naive Bayes model
    # hold-out validation and k-fold validation and clf_report
from sklearn.naive_bayes import MultinomialNB
rev_nb_clf = MultinomialNB()
rev_nb_clf.fit(rev_x_train_vectors, rev_y_train)
print('hold-out:', rev_nb_clf.score(rev_x_test_vectors, rev_y_test))

from sklearn.model_selection import cross_val_score
print('cross validation (mean of 10 cv):', 
      cross_val_score(rev_nb_clf, rev_x_vectors, ovas, cv=10).mean())

from sklearn.metrics import classification_report
print('Classification_report: \n', 
      classification_report(
          y_true=rev_y_test,
          y_pred=rev_nb_clf.predict(rev_x_test_vectors)))

hold-out: 0.8072168134755061
cross validation (mean of 10 cv): 0.8047012750461725
Classification_report: 
              precision    recall  f1-score   support

        bad       1.00      0.00      0.00      1134
       good       0.81      1.00      0.89     10447
    neutral       0.00      0.00      0.00      1361

avg / total       0.74      0.81      0.72     12942



In [12]:
# Classify overall using:
    # Summary data 
    # Naive Bayes model
    # hold-out validation and k-fold validation and clf_report
sum_nb_clf = MultinomialNB()
sum_nb_clf.fit(sum_x_train_vectors, sum_y_train)
print('hold-out:', sum_nb_clf.score(sum_x_test_vectors, sum_y_test))
print('cross validation (mean of 10 cv):', 
      cross_val_score(sum_nb_clf, sum_x_vectors, ovas, cv=10).mean())
print('Classification_report: \n', 
      classification_report(
          y_true=sum_y_test,
          y_pred=sum_nb_clf.predict(sum_x_test_vectors)))

hold-out: 0.8149435944985319
cross validation (mean of 10 cv): 0.8111921461671352
Classification_report: 
              precision    recall  f1-score   support

        bad       0.77      0.09      0.16      1134
       good       0.82      1.00      0.90     10447
    neutral       0.40      0.02      0.04      1361

avg / total       0.77      0.81      0.74     12942



# Classify using SVM

In [8]:
# Classify overall using:
    # reviewText data
    # SVM model
    # hold-out validation and k-fold validation
from sklearn.svm import SVC
rev_svm_clf = SVC()
rev_svm_clf.fit(rev_x_train_vectors, rev_y_train)
print('hold-out:', rev_svm_clf.score(rev_x_test_vectors, rev_y_test))
print('cross validation (mean of 10 cv):', 
      cross_val_score(rev_svm_clf, rev_x_vectors, ovas, cv=10).mean())

hold-out: 0.8072168134755061
cross validation (mean of 10 cv): 0.8054276306863761


In [9]:
# Classify overall using:
    # summary data 
    # SVM model
    # hold-out validation and k-fold validation
sum_svm_clf = SVC()
sum_svm_clf.fit(sum_x_train_vectors, sum_y_train)
print('hold-out:', sum_svm_clf.score(sum_x_test_vectors, sum_y_test))
print('cross validation (mean of 10 cv):', 
      cross_val_score(sum_svm_clf, sum_x_vectors, ovas, cv=10).mean())

hold-out: 0.8072168134755061
cross validation (mean of 10 cv): 0.8054276306863761


# Conclusion
From the data gotten from 4 models, the results show that classying using Summary features, naive bayes model gives highest score from both hold-out validation (0.8149) and k-fold validation (0.8111).