# Scikit-Learn for Text Analysis of Amazon Fine Food Reviews

[Original URL](https://towardsdatascience.com/scikit-learn-for-text-analysis-of-amazon-fine-food-reviews-ea3b232c2c1b)

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
df = pd.read_csv('./tmp/dataset/amazon-fine-food-reviews/Reviews.csv')
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Drop the NaN values and create a new column indicating whether the score is larger than 3 or not.
df.dropna(inplace=True)
df[df['Score'] != 3]
df['Positivity'] = np.where(df['Score'] > 3, 1, 0)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Positivity
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,0
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,1
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,0
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,1


In [4]:
# Split the data into random training adn test subsets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Positivity'], random_state = 0)

print('X_train first entry: \n\n', X_train[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry: 

 I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.


X_train shape:  (426308,)


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [6]:
vect.get_feature_names()[::2000]

['00',
 '255g',
 '843mg',
 'aftertraste',
 'anticarcinogens',
 'average',
 'b000mrd5jo',
 'b001rqemwi',
 'b005jd60wk',
 'beleive',
 'boobs',
 'buttersworth',
 'cc',
 'chuy',
 'compresses',
 'cramper',
 'decap',
 'difficulkt',
 'dreamy',
 'enchanted',
 'expedited',
 'fists',
 'frother',
 'gloved',
 'gurantees',
 'hiking_',
 'images',
 'intruder',
 'kavanagh',
 'lawry',
 'lowry',
 'matured',
 'misnomer',
 'mythreads',
 'numorous',
 'osco',
 'paupua',
 'pittston',
 'preshave',
 'quart',
 'refrigerante',
 'ringworm',
 'savedge',
 'sheer',
 'smiths',
 'sprklng',
 'subtotal',
 'taos',
 'tiis',
 'tubed',
 'unsuccessful',
 'vomitar',
 'wintery',
 'zest']

In [7]:
len(vect.get_feature_names())

106260

In [8]:
# The result is stored in a SciPy sparse matrix, where each row corresponds to a document, and each column is a word from our training vocabulary.
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<426308x106260 sparse matrix of type '<class 'numpy.int64'>'
	with 22990341 stored elements in Compressed Sparse Row format>

In [9]:
# X_train_vectorized.toarray()

# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
from sklearn.metrics import roc_auc_score

predictions = model.predict(vect.transform(X_test))
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.8440793828222962


In [13]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))

print('Largest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:-11:-1]]))


Smallest Coefs:
['downhill' 'quickness' 'dissapointing' 'realllly' 'limpest' 'bbb'
 'tastless' 'reformulate' 'weiner' 'vh']

Largest Coefs:
['emeraldforest' 'blowout' 'chedder' 'botch' 'antelop' 'bertie'
 'b001rvfdoo' 'tribute' 'hears' 'kenzi']



In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df = 5).fit(X_train)
len(vect.get_feature_names())

36692

In [None]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

In [17]:
predictions = model.predict(vect.transform(X_test))
print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.8303786896159779


In [19]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print("Smallest Tfidf: \n{}\n".format(feature_names[sorted_tfidf_index[:10]]))
print("Largest Tfidf: \n{}\n".format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest Tfidf: 
['0373' 'amortized' '0004' '1534' '231' 'amortization' '0377'
 'furnishings' '368' 'serine']

Largest Tfidf: 
['carmel' '98' 'yum' 'good' 'filler' 'word' 'love' 'mmm' 'awesome'
 'banana']



In [20]:
print(model.predict(vect.transform(["The candy is not good, I will never buy them again.", "The candy is not bad, I will buy them again."])))

[1 0]


In [21]:
vect = CountVectorizer(min_df = 5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

564038

In [22]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print("AUC: ", roc_auc_score(y_test, predictions))



AUC:  0.9115621431625667


In [23]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()
print('Smallest Coef: \n{}\n'.format(feature_names[sorted_coef_index][:10]))
print('Largest Coef: \n{}\n'.format(feature_names[sorted_coef_index][:-11:-1]))

Smallest Coef: 
['three stars' 'two stars' 'not worth' 'not recommend' 'worst'
 'disappointing' 'not happy' 'disappointment' 'no thanks' 'at best']

Largest Coef: 
['not disappointed' 'four stars' 'be disappointed' 'hooked'
 'not disappoint' 'be sorry' 'just right' 'not bitter' 'not overpowering'
 'addicting']



In [24]:
print(model.predict(vect.transform(["The candy is not good, I will never buy them again.", "The candy is not bad, I will buy them again."])))

[0 1]
