In [1]:
import numpy as np                                  # For large and multi-dimensional arrays
import pandas as pd                                 # For data manipulation and analysis

from sklearn.feature_extraction.text import TfidfVectorizer          #For TF-IDF
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
dataset_path = "Reviews_cleaning.csv"
data = pd.read_csv(dataset_path)
data.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,TextClean
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought sever vital can dog food product found ...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arriv label jumbo salt peanut peanut a...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confect around centuri light pillowi citrus ge...


In [3]:
train, test = train_test_split(data, test_size=0.2)

In [4]:
print("%d items in training data, %d in test data" % (len(train), len(test)))

291338 items in training data, 72835 in test data


In [5]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [6]:
X_train_tfidf = tfidf_vectorizer.fit_transform(train["TextClean"])
y_train = train["Score"]

In [7]:
X_test_tfidf = tfidf_vectorizer.fit_transform(test["TextClean"])
y_test = test["Score"]

In [8]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [9]:
prediction = dict()
prediction['model'] = model.predict(X_test_tfidf)

In [10]:
print(metrics.classification_report(y_test, prediction['model'], target_names = ["positive", "negative"]))

              precision    recall  f1-score   support

    positive       0.26      0.19      0.22     11397
    negative       0.86      0.90      0.88     61438

   micro avg       0.79      0.79      0.79     72835
   macro avg       0.56      0.54      0.55     72835
weighted avg       0.76      0.79      0.77     72835

