In [7]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from warnings import filterwarnings
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

In [5]:
reviews = pd.read_csv('AmazonReviews.csv')
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
text = " ".join(review for review in reviews.Text)
print ("There are {} words in the combination of all review.".format(len(text)))

There are 248540641 words in the combination of all review.


In [8]:
pipeline = Pipeline([('CVec', CountVectorizer(stop_words='english')),
                     ('Tfidf', TfidfTransformer()),
                     ('MNB', MultinomialNB())])

In [10]:
%%time
cv_pred = cross_validate(pipeline,
                             reviews['Text'],   ## X
                             reviews['Score'],   ## y
                             cv=5,
                             scoring=('roc_auc_ovr'), n_jobs=-1, verbose =10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  1.4min remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.4min remaining:   55.9s


CPU times: user 3.41 s, sys: 3.89 s, total: 7.31 s
Wall time: 1min 57s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  2.0min finished


In [11]:
cv_pred['test_score']

array([0.80588185, 0.81448439, 0.8088359 , 0.81728556, 0.81103624])