In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

t = time.time()

In [2]:
review = pd.read_pickle('data/CD_review_stop_lem.pickle')
review

6          recall love album maybe one forgot figure arti...
32         keith green song shepherd previous album focus...
37         keith green passionate love jesus evident life...
117        buy replace original purchase many year ago so...
150        love cd always part christmas music drive rhyt...
                                 ...                        
4543140                                             good job
4543226                                           love heart
4543228    ann still get doubt seriously ever ever lose v...
4543263    first hear guy pick ocean avenue three album g...
4543359                        really good fun quality stuff
Name: reviewText, Length: 453866, dtype: object

In [3]:
ratings = pd.read_pickle('data/CD_ratings.pickle')
ratings

6          5
32         5
37         5
117        5
150        5
          ..
4543140    4
4543226    5
4543228    5
4543263    5
4543359    5
Name: overall, Length: 453866, dtype: int64

In [4]:
X = review
y = ratings

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

In [5]:
pipeline = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

parameters = {'vect__max_df': (0.5, 0.75, 1),
              'vect__max_features': (5000, 10000, 50000), 
              'vect__ngram_range': ((1,1), (1,2)),
              'tfidf__use_idf': (True, False),
              'clf__alpha': [0.001, 0.01, 0.1, 0.2, 0.5, 1]
}
              
grid_search = GridSearchCV(pipeline, parameters, scoring='neg_root_mean_squared_error', n_jobs=3, verbose=1)
    
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
t0 = time.time()
grid_search = grid_search.fit(X_train, y_train)
print("done in %0.3fs" % (time.time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))         

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'vect__max_df': (0.5, 0.75, 1), 'vect__max_features': (5000, 10000, 50000), 'vect__ngram_range': ((1, 1), (1, 2)), 'tfidf__use_idf': (True, False), 'clf__alpha': [0.001, 0.01, 0.1, 0.2, 0.5, 1]}
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
done in 21469.413s

Best score: -0.993
Best parameters set:
	clf__alpha: 0.01
	tfidf__use_idf: True
	vect__max_df: 0.5
	vect__max_features: 50000
	vect__ngram_range: (1, 2)


In [6]:
y_pred = grid_search.predict(X_test)

print("RMSE:", metrics.mean_squared_error(y_test, y_pred, squared=False))

RMSE: 0.9905117119388835


In [7]:
print("MAE:", metrics.mean_absolute_error(y_test, y_pred))

MAE: 0.500691831105079


In [8]:
print(f"Execution time : {time.strftime('%H:%M:%S', time.gmtime(time.time()-t))}")

Execution time : 05:58:06
