## M6 - W5 Assignment: NLP -  Sentiment Analysis of Amazon Reviews.
Luke Barry

In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
import nltk
# Importing the required library
from sklearn.feature_extraction.text import CountVectorizer
import langdetect
import bz2
from stop_words import get_stop_words
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import eli5
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

***

## Download and import the training and testing data sets

In [2]:
train_file = bz2.BZ2File("train.ft.txt.bz2")

# Load and decode
    
lines = [x.decode('utf-8') for x in train_file.readlines()]

# Split in two: sentiment and review

score_review_list = [l.strip('__label__').split(' ', 1) for l in lines]

train_df = pd.DataFrame(score_review_list, columns = ['score', 'review'] )

In [3]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3600000 entries, 0 to 3599999
Data columns (total 2 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   score   object
 1   review  object
dtypes: object(2)
memory usage: 54.9+ MB


In [4]:
train_df.head()

Unnamed: 0,score,review
0,2,Stuning even for the non-gamer: This sound tra...
1,2,The best soundtrack ever to anything.: I'm rea...
2,2,Amazing!: This soundtrack is my favorite music...
3,2,Excellent Soundtrack: I truly like this soundt...
4,2,"Remember, Pull Your Jaw Off The Floor After He..."


In [8]:
df = train_df.sample(50000, random_state=12)

***

## Creating n_tokens column

In [9]:
# List comprehension to tokenize the reviews: the result is a list
tokens = [word_tokenize(review) for review in df['review']]

# Tokens will be a list of lists, where each inner list is a single review. Let's check whether that's indeed the case.
print('The first review in the data set: ', df.iloc[0,0])
print('The first list of the tokens: ', tokens[0])

The first review in the data set:  1
The first list of the tokens:  ['Very', 'bad', 'album', ':', 'This', 'album', 'is', 'really', 'very', 'bad', '.', 'The', 'singer', 'here', 'just', 'ca', "n't", 'sing', '!', 'I', "''", 'm', 'afraid', 'he', "'ll", 'just', 'last', '3', 'or', '4', 'albums', 'at', 'the', 'most', '.', 'Take', 'for', 'example', 'the', 'latest', 'cross-over', 'singing', 'Mario', 'Langoulis', '.', 'He', 'has', 'a', 'better', 'voice', ',', 'better', 'singing', 'skills', 'and', 'better', 'looks', 'than', 'Mr', 'Watson', '.', 'Look', 'wider', 'and', 'you', "'ll", 'see', 'lots', 'of', 'better', 'things', 'on', 'offer', '!']


In [10]:
# We create an empty list for the length of tokens in each review, then loop over the tokens list, which remember is a list
# of lists, count how many tokens we have in each inner list and append it to the length_tokens. 
length_tokens = []
for item in range(len(tokens)):
    length_tokens.append(len(tokens[item]))

print(length_tokens[0], len(tokens[0]))
df['n_tokens']= length_tokens
#nw = pd.Series(length_tokens)
df.head()

74 74


Unnamed: 0,score,review,n_tokens
2367606,1,Very bad album: This album is really very bad....,74
1974555,2,"Works for me: Ms. Vanessa, I don't know if you...",84
1301861,2,"Sour Grapes: Fantastic Video, just ignore all ...",43
643190,2,Feed back: I received this product in a timely...,39
2825239,2,Very Good Album: i heard the song girl's not g...,173


In [11]:
df.n_tokens.describe()

count    50000.000000
mean        91.285700
std         49.201907
min         15.000000
25%         50.000000
50%         81.000000
75%        125.000000
max        309.000000
Name: n_tokens, dtype: float64

***

## Creating Language Column

In [12]:
langdetect.detect_langs(df.iloc[0, 1])

[en:0.9999956937873493]

In [13]:
# Testing Method
languages = []
languages.append(str(langdetect.detect_langs(df.iloc[1,1])).split(':')[0][1:])
languages

['en']

In [14]:
languages = []

for i in range(len(df)):
        languages.append(str(langdetect.detect_langs(df.iloc[i,1])).split(':')[0][1:])

languages[0:10]

['en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en', 'en']

In [16]:
df['language'] = languages
df.language.value_counts()

en    49872
es       85
fr       14
pt       10
de        8
it        2
id        2
ca        1
hr        1
so        1
pl        1
vi        1
sq        1
af        1
Name: language, dtype: int64

***

## Vectorizing and Model Training

In [19]:
# setting stop words
stop_words = set(get_stop_words('english'))

# initializing vectorizer
vectorizer = CountVectorizer(max_features=1000, stop_words = stop_words).fit(df.review)

# transforming output
vect_trans = vectorizer.transform(df.review)

vect_trans

<50000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 1061253 stored elements in Compressed Sparse Row format>

In [20]:
# putting output into a database
X_df = pd.DataFrame(vect_trans.toarray() , columns = vectorizer.get_feature_names())
X_df.sample(10)

Unnamed: 0,10,100,11,12,15,20,30,50,80,able,...,writer,writing,written,wrong,wrote,year,years,yes,yet,young
42999,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
37320,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
34356,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35704,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
23756,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
38434,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
47879,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1336,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44559,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32794,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# slicing for later concat
df_slice = df.drop('review', axis=1)
# Recode the language to be binary, so that it is == 1 if the language is en, 0 otherwise
df_slice['language'] = df_slice['language'].apply(lambda x: 1 if x=='en' else 0)
df_slice.head()

Unnamed: 0,score,n_tokens,language
2367606,1,74,1
1974555,2,84,1
1301861,2,43,1
643190,2,39,1
2825239,2,173,1


In [24]:
# resetting index for proper concatination
df_slice.reset_index(drop=True, inplace=True)

In [25]:
# Concatenating the sliced and new datasets

model_df = pd.concat([X_df, df_slice], axis=1)
model_df.sample(10)

Unnamed: 0,10,100,11,12,15,20,30,50,80,able,...,wrong,wrote,year,years,yes,yet,young,score,n_tokens,language
11174,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,35,1
41181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,62,1
37989,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,52,1
12030,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,33,1
11033,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,40,1
16124,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,50,1
23222,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,76,1
7283,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,177,1
45842,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,119,1
4010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,47,1


In [35]:
# specifying X and y
X = model_df.drop('score', axis=1)
y = model_df.score

In [265]:
## training the model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                      random_state=12)
model = LogisticRegression(random_state=12)
model.fit(X_train, y_train)

LogisticRegression(random_state=12)

In [286]:
# evaluating the model
def evaluation(model):
    """This fuction fit the train set to the model, predict on the test set then plot the train score and val score"""
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
evaluation(model)

[[4215  742]
 [ 714 4329]]
              precision    recall  f1-score   support

           1       0.86      0.85      0.85      4957
           2       0.85      0.86      0.86      5043

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [268]:
# inspecting the weights of the features
eli5.show_weights(estimator=model, 
                  feature_names= list(model_df.columns.drop('score')),
                    top=(50,50))

Weight?,Feature
+2.058,excellent
+1.534,perfect
+1.311,wonderful
+1.300,awesome
+1.271,amazing
+1.255,highly
+1.133,favorite
+1.051,loves
+1.007,easy
+0.974,pleased


In [269]:
# creating a database out of the weighted features
weight_df = eli5.explain_weights_df(model, feature_names= list(model_df.columns.drop('score')))
weight_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1003 entries, 0 to 1002
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   target   1003 non-null   object 
 1   feature  1003 non-null   object 
 2   weight   1003 non-null   float64
dtypes: float64(1), object(2)
memory usage: 23.6+ KB


In [270]:
# specifying negative and positive sentiment
sentiment = []
for i in weight_df['weight']:
    if i < 0:
        sentiment.append('negative')
    else:
        sentiment.append('positive')
weight_df['sentiment'] = sentiment
weight_df['sentiment'].value_counts()

positive    505
negative    498
Name: sentiment, dtype: int64

In [271]:
# positive and negative lists
pos = weight_df[weight_df['sentiment'] == 'positive']['feature'].tolist()
neg = weight_df[weight_df['sentiment'] == 'negative']['feature'].tolist()

In [273]:
# sentiment guesser
def sentiment_guesser(review):
    """ This function checks if input sentiment is positive or negative based on the features of the above model """
    
    check_pos =  any(item in review.split() for item in pos)
    check_neg =  any(item in review.split() for item in neg)
    if check_pos is True and check_neg is False:
        print('Positive')    
    elif check_neg is True and check_pos is False:
        print("Negative")
    else:
        print('uncertain')

In [282]:
sentiment_guesser('this is an awesome chair.')

Positive


In [284]:
sentiment_guesser('this book was trash.')

Negative


In [285]:
sentiment_guesser('A bunch of random words.')

uncertain


It should be noted that the above function does not take weight in to consideration and simply responds 'uncertain' when positive and negative words are included in the argument. I would like to come back to it but didn't have time for this assignment.

***