# Sentiment analysis of Amazon product reviews
### Using TF-IDF (Term Frequency-Inverse Document Frequency)

In [66]:
import numpy as np
import pandas as pd

In [67]:
# Dataset used : https://www.kaggle.com/datasets/jillanisofttech/amazon-product-reviews/
df = pd.read_csv("Reviews.csv")

In [68]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [69]:
df = df.rename({'Score':'Rating','Text':'Review'},axis=1)

In [70]:
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Rating,Time,Summary,Review
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [71]:
df=df[['ProductId','Rating','Review']]

In [72]:
df.head()

Unnamed: 0,ProductId,Rating,Review
0,B001E4KFG0,5,I have bought several of the Vitality canned d...
1,B00813GRG4,1,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,4,This is a confection that has been around a fe...
3,B000UA0QIQ,2,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,5,Great taffy at a great price. There was a wid...


In [73]:
# lets check how many rows and columns we have
df.shape

(568454, 3)

In [74]:
# now lets check each column has how many null values
df.isnull().sum()

ProductId    0
Rating       0
Review       0
dtype: int64

In [75]:
# we will drop the rows or data that is having a null value
df.dropna(inplace=True)

In [76]:
# lets verify we are not having null values remaining
df.isnull().sum()

ProductId    0
Rating       0
Review       0
dtype: int64

In [77]:
df['Rating'].value_counts()

Rating
5    363122
4     80655
1     52268
3     42640
2     29769
Name: count, dtype: int64

In [78]:
# rating '3' represents a neutral review so lets not take it in account right now
df = df[df['Rating']!=3]

In [79]:
df['Rating'].value_counts()

Rating
5    363122
4     80655
1     52268
2     29769
Name: count, dtype: int64

In [80]:
# we will be using rating as target that will tell us whether the review is positive or negative
df['label'] = np.where(df['Rating']>=4,1,0) #1-Positve,0-Negative

In [81]:
df.head()

Unnamed: 0,ProductId,Rating,Review,label
0,B001E4KFG0,5,I have bought several of the Vitality canned d...,1
1,B00813GRG4,1,Product arrived labeled as Jumbo Salted Peanut...,0
2,B000LQOCH0,4,This is a confection that has been around a fe...,1
3,B000UA0QIQ,2,If you are looking for the secret ingredient i...,0
4,B006K2ZZ7K,5,Great taffy at a great price. There was a wid...,1


### Pre-processing

In [88]:
# convert the all reviews into the lower case
df['pre_process'] = df['Review'].apply(lambda x: " ".join(x.lower() for x in str(x).split()))

In [89]:
df.head()

Unnamed: 0,ProductId,Rating,Review,label,pre_process
0,B001E4KFG0,5,I have bought several of the Vitality canned d...,1,i have bought several of the vitality canned d...
1,B00813GRG4,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrived labeled as jumbo salted peanut...
2,B000LQOCH0,4,This is a confection that has been around a fe...,1,this is a confection that has been around a fe...
3,B000UA0QIQ,2,If you are looking for the secret ingredient i...,0,if you are looking for the secret ingredient i...
4,B006K2ZZ7K,5,Great taffy at a great price. There was a wid...,1,great taffy at a great price. there was a wide...


In [90]:
# Perform the Contractions on the reviews
# r -> means raw string
import re
def contractions(text) :
    text = re.sub(r"won't","will not",text)
    text = re.sub(r"would't","would not",text)
    text = re.sub(r"could't","could not",text)
    text = re.sub(r"\'d", " would",text)
    text = re.sub(r"can\'t", "can not",text)
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are",text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    return text

df['pre_process'] = df['pre_process'].apply(contractions)

In [91]:
df.head()

Unnamed: 0,ProductId,Rating,Review,label,pre_process
0,B001E4KFG0,5,I have bought several of the Vitality canned d...,1,i have bought several of the vitality canned d...
1,B00813GRG4,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrived labeled as jumbo salted peanut...
2,B000LQOCH0,4,This is a confection that has been around a fe...,1,this is a confection that has been around a fe...
3,B000UA0QIQ,2,If you are looking for the secret ingredient i...,0,if you are looking for the secret ingredient i...
4,B006K2ZZ7K,5,Great taffy at a great price. There was a wid...,1,great taffy at a great price. there was a wide...


In [92]:
# lets clean the reviews
def cleantxt(text) :
    text = re.sub('[^A-Za-z ]+','', text) # remove non-alpha characters
    text = re.sub('  +',' ', text) # remove extra spaces
    return text

df['pre_process'] = df['pre_process'].apply(cleantxt)

In [93]:
df.head()

Unnamed: 0,ProductId,Rating,Review,label,pre_process
0,B001E4KFG0,5,I have bought several of the Vitality canned d...,1,i have bought several of the vitality canned d...
1,B00813GRG4,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrived labeled as jumbo salted peanut...
2,B000LQOCH0,4,This is a confection that has been around a fe...,1,this is a confection that has been around a fe...
3,B000UA0QIQ,2,If you are looking for the secret ingredient i...,0,if you are looking for the secret ingredient i...
4,B006K2ZZ7K,5,Great taffy at a great price. There was a wid...,1,great taffy at a great price there was a wide ...


In [94]:
# lets remove stopwords now
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['pre_process']=df['pre_process'].apply(lambda x: " ".join([x for x in x.split() if x not in stop]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sriva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [95]:
df.head()

Unnamed: 0,ProductId,Rating,Review,label,pre_process
0,B001E4KFG0,5,I have bought several of the Vitality canned d...,1,bought several vitality canned dog food produc...
1,B00813GRG4,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrived labeled jumbo salted peanutsth...
2,B000LQOCH0,4,This is a confection that has been around a fe...,1,confection around centuries light pillowy citr...
3,B000UA0QIQ,2,If you are looking for the secret ingredient i...,0,looking secret ingredient robitussin believe f...
4,B006K2ZZ7K,5,Great taffy at a great price. There was a wid...,1,great taffy great price wide assortment yummy ...


In [101]:
# lets do lemmatization (stemming can give incorrect meanings and spelling)
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['pre_process']=df['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in nltk.word_tokenize(x)]))
df['pre_process']=df['pre_process'].apply(lambda x: " ".join([lemmatizer.lemmatize(w,pos = "v") for w in nltk.word_tokenize(x)]))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sriva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [102]:
df.head()

Unnamed: 0,ProductId,Rating,Review,label,pre_process
0,B001E4KFG0,5,I have bought several of the Vitality canned d...,1,buy several vitality can dog food product find...
1,B00813GRG4,1,Product arrived labeled as Jumbo Salted Peanut...,0,product arrive label jumbo salt peanutsthe pea...
2,B000LQOCH0,4,This is a confection that has been around a fe...,1,confection around century light pillowy citrus...
3,B000UA0QIQ,2,If you are looking for the secret ingredient i...,0,look secret ingredient robitussin believe find...
4,B006K2ZZ7K,5,Great taffy at a great price. There was a wid...,1,great taffy great price wide assortment yummy ...


### Feature Extraction

In [104]:
from sklearn.model_selection import train_test_split

In [107]:
X_train,X_test,Y_train, Y_test = train_test_split(df['pre_process'], df['label'], test_size=0.25, random_state=30)

In [108]:
print(X_train)

49699     great source protein absolutely delicious flav...
211234    cat adore cat food can not eat quick enough ha...
530966    long story shorti think buy paint old tongue s...
452839    prosbr taste great hot coldbr universally enjo...
544584    able interest new puppy treat except one get o...
                                ...                        
546832                             dog food three year well
241104    coffee hit spotjust righta light flavorful roa...
536641    sooo good even usually like coffee love cafe d...
355658    dog wild many treat love greenies resist never...
467075                       use saffron enjoy aroma flavor
Name: pre_process, Length: 394360, dtype: object


In [109]:
print(Y_train)

49699     1
211234    1
530966    1
452839    1
544584    1
         ..
546832    1
241104    1
536641    1
355658    1
467075    1
Name: label, Length: 394360, dtype: int32


In [110]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer().fit(X_train)

In [112]:
len(vect.get_feature_names_out())

164532

In [113]:
X_train_vectorized = vect.transform(X_train)

In [116]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train_vectorized,Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [117]:
predictions = model.predict(vect.transform(X_test))

In [118]:
print(predictions)

[1 1 0 ... 1 1 1]


In [119]:
# Compute Area under the curve (AUC)
from sklearn.metrics import roc_auc_score
print ("AUC:",roc_auc_score(Y_test,predictions))

AUC: 0.8287116948875246
