### This exercise aims to train a model to classify amazon product reviews as positive or negative

### __label1__ : negative
### __label2__ : positive

In [22]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score

np.random.seed(500)

In [23]:
data = pd.read_csv("../data/amazon-reviews.csv",encoding='latin-1')

In [24]:
data.head(30)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tr...,__label__2
1,The best soundtrack ever to anything.: I'm re...,__label__2
2,Amazing!: This soundtrack is my favorite musi...,__label__2
3,Excellent Soundtrack: I truly like this sound...,__label__2
4,"Remember, Pull Your Jaw Off The Floor After H...",__label__2
5,an absolute masterpiece: I am quite sure any ...,__label__2
6,"Buyer beware: This is a self-published book, ...",__label__1
7,Glorious story: I loved Whisper of the wicked...,__label__2
8,A FIVE STAR BOOK: I just finished reading Whi...,__label__2
9,Whispers of the Wicked Saints: This was a eas...,__label__2


In [25]:
# remove blank rows, lower case and perform tokenization
data['text'] = [word_tokenize(entry.lower()) for entry in data['text'].dropna()]

# defaultdict is a dictionary that provides a default value if the index is not found
# in this example, the dictionary defaults to nouns
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

stopWords = stopwords.words('english')

word_Lemmatized = WordNetLemmatizer()

for index,entry in enumerate(data['text']):
    final_words = []
    
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopWords and word.isalpha():
            final_words.append(word_Lemmatized.lemmatize(word,tag_map[tag[0]]))
    # The final processed set of words for each iteration will be stored in 'text_final'
    data.loc[index,'text_final'] = str(final_words)

In [26]:
data['text_final']

0       ['stun', 'even', 'sound', 'track', 'beautiful'...
1       ['best', 'soundtrack', 'ever', 'anything', 're...
2       ['amaze', 'soundtrack', 'favorite', 'music', '...
3       ['excellent', 'soundtrack', 'truly', 'like', '...
4       ['remember', 'pull', 'jaw', 'floor', 'hear', '...
                              ...                        
9995    ['revelation', 'life', 'small', 'town', 'ameri...
9996    ['great', 'biography', 'interesting', 'journal...
9997    ['interest', 'subject', 'poor', 'presentation'...
9998    ['buy', 'box', 'look', 'use', 'obviously', 'ne...
9999    ['beautiful', 'pen', 'fast', 'delivery', 'pen'...
Name: text_final, Length: 10000, dtype: object

In [27]:
entry

['beautiful',
 'pen',
 'and',
 'fast',
 'delivery',
 '.',
 ':',
 'the',
 'pen',
 'was',
 'shipped',
 'promptly',
 '.',
 'this',
 'is',
 'the',
 'classic',
 'montblanc',
 'pen',
 'that',
 'everyone',
 'raves',
 'about',
 '.',
 'it',
 'is',
 'black',
 'in',
 'color',
 'with',
 'golden',
 'trim',
 '.',
 'it',
 'holds',
 'it',
 "'s",
 'own',
 'and',
 'i',
 'am',
 'thankful',
 'to',
 'my',
 'parents',
 'to',
 'gift',
 'this',
 'to',
 'me',
 'as',
 'a',
 'graduation',
 'present',
 '.',
 'someday',
 'i',
 'plan',
 'on',
 'buying',
 'more',
 'of',
 'these',
 'pens',
 'from',
 'amazon.the',
 'vendors',
 'at',
 'amazon',
 'are',
 'amazing',
 '.',
 'i',
 'have',
 'been',
 'a',
 'long',
 'time',
 'shopper',
 'at',
 'amazon',
 ',',
 'mostly',
 'pens',
 'and',
 'electronics',
 '.',
 'however',
 'this',
 'pen',
 'was',
 'gifted',
 'to',
 'me',
 'by',
 'my',
 'mom',
 'and',
 'she',
 'has',
 'nothing',
 'but',
 'good',
 'things',
 'to',
 'say',
 '.',
 'i',
 'on',
 'my',
 'part',
 'love',
 'this',
 'pen

In [28]:
tag_map

defaultdict(<function __main__.<lambda>()>,
            {'J': 'a',
             'V': 'v',
             'R': 'r',
             'N': 'n',
             'M': 'n',
             'D': 'n',
             'I': 'n',
             'C': 'n',
             'S': 'n',
             'P': 'n',
             '$': 'n',
             'U': 'n',
             'E': 'n',
             'F': 'n',
             'W': 'n',
             'T': 'n',
             "'": 'n'})

In [29]:
data.head()

Unnamed: 0,text,label,text_final
0,"[stuning, even, for, the, non-gamer, :, this, ...",__label__2,"['stun', 'even', 'sound', 'track', 'beautiful'..."
1,"[the, best, soundtrack, ever, to, anything, .,...",__label__2,"['best', 'soundtrack', 'ever', 'anything', 're..."
2,"[amazing, !, :, this, soundtrack, is, my, favo...",__label__2,"['amaze', 'soundtrack', 'favorite', 'music', '..."
3,"[excellent, soundtrack, :, i, truly, like, thi...",__label__2,"['excellent', 'soundtrack', 'truly', 'like', '..."
4,"[remember, ,, pull, your, jaw, off, the, floor...",__label__2,"['remember', 'pull', 'jaw', 'floor', 'hear', '..."


In [30]:
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['text_final'],data['label'],test_size=0.3)

In [31]:
encoder = LabelEncoder()
train_y_encoded = encoder.fit_transform(train_y)
test_y_encoded = encoder.fit_transform(test_y)

In [32]:
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_vect.fit(data['text_final'])
train_x_tfidf = tfidf_vect.transform(train_x)
test_x_tfidf = tfidf_vect.transform(test_x)

In [33]:
print(tfidf_vect.vocabulary_)



In [34]:
# fit the training dataset on the NB classifier
naive = naive_bayes.MultinomialNB()
naive.fit(train_x_tfidf,train_y)
# predict the labels on validation dataset
predictions_NB = naive.predict(test_x_tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score: ",accuracy_score(predictions_NB, test_y)*100)

Naive Bayes Accuracy Score:  83.06666666666666


### Now we should be able to provide input sentences for the model to classify

In [35]:
sentence_to_classify = "This product is nice"
naive.predict(tfidf_vect.transform([sentence_to_classify]))

array(['__label__2 '], dtype='<U11')