In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/nathan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
col_names = ['marketplace','customer_id','review_id','product_id','product_parent','product_title','product_category','star_rating','helpful_votes','total_votes','vine','verified_purchase','review_headline','review_body','review_date']
cols = {}
for i in range(len(col_names)):
    print (str(i)+': '+col_names[i])
    cols[col_names[i]] = i 

0: marketplace
1: customer_id
2: review_id
3: product_id
4: product_parent
5: product_title
6: product_category
7: star_rating
8: helpful_votes
9: total_votes
10: vine
11: verified_purchase
12: review_headline
13: review_body
14: review_date


In [3]:
np.random.seed(500)

In [4]:
df = pd.read_csv('data/sample_02.csv')

df = df.sample(frac=.1, random_state=1)

In [5]:
helpful_percentage = []
for pos, total in zip(df['8'],df['9']):
    if total>0:
        helpful_percentage.append(float(pos)/float(total))
    else:
        helpful_percentage.append(0)
df['15'] = helpful_percentage

In [6]:
print(len(df))
df.head(3)

6247


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
53304,US,49786746,R217S4F2FV9SRT,965824233,173013649,The NAET Guide Book (4th Ed.),Books,5,13,29,N,N,The best Guide Book for a patient,It's a very good guide-book where you can get ...,11050,0.448276
21985,US,52661944,R295CEHQLEXS5G,515120871,434295698,Finding the Dream (Dream Trilogy),Books,3,1,3,N,N,Sloppy writing,"This book, like all of Nora Roberts' books, is...",12200,0.333333
58996,US,52651244,RNAI0W4D8WWWL,821761455,199563422,With All My Heart,Books,3,2,2,N,N,Not as good as the other two in the trilogy,I had been waiting patiently (sometimes unpati...,10650,1.0


In [7]:
Corpus = pd.DataFrame()
Corpus['text'] = df['13']
lst = []
for x in df['15']:
    if x>=.66:
        lst.append('good')
    else:
        lst.append('bad')
Corpus['label'] = lst

In [8]:
len(Corpus['label'])

6247

In [9]:
# Step - a : Remove blank rows if any.
Corpus['text'].dropna(inplace=True)
# Step - b : Change all the text to lower case. This is required as python interprets 'dog' and 'DOG' differently
Corpus['text'] = [entry.lower() for entry in Corpus['text']]
# Step - c : Tokenization : In this each entry in the corpus will be broken into set of words
Corpus['text']= [word_tokenize(entry) for entry in Corpus['text']]
# Step - d : Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
lst = []
for index,entry in enumerate(Corpus['text']):
    
    if index%100 ==0:
        print(index)
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    
    #Corpus.loc[index,'text_final'] = str(Final_words)
    lst.append(str(Final_words))
    
Corpus['text_final'] = lst

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200


In [10]:
len(lst)

6247

In [11]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text_final'],Corpus['label'],test_size=0.3)




In [12]:
Train_Y

11285    good
38726    good
50913    good
5398     good
1045     good
         ... 
15064    good
46551    good
43690    good
26100    good
8675     good
Name: label, Length: 4372, dtype: object

In [13]:
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.fit_transform(Test_Y)

In [14]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text_final'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [15]:
print(Tfidf_vect.vocabulary_)



In [16]:
print(Train_X_Tfidf)

  (0, 4982)	0.03764144096594896
  (0, 4974)	0.08209487111510068
  (0, 4963)	0.03250120702387735
  (0, 4851)	0.05388536704427414
  (0, 4837)	0.08742626490318463
  (0, 4803)	0.07831218702381412
  (0, 4794)	0.27244925192695074
  (0, 4771)	0.060462986454817826
  (0, 4663)	0.05727003505585017
  (0, 4658)	0.04919084519156253
  (0, 4592)	0.08023985191035618
  (0, 4556)	0.06328114281020443
  (0, 4531)	0.07924072576030546
  (0, 4520)	0.035053477477412824
  (0, 4500)	0.09336934269913459
  (0, 4461)	0.08509212050916093
  (0, 4446)	0.06904953315730133
  (0, 4427)	0.11021197271763773
  (0, 4423)	0.036059637284275796
  (0, 4404)	0.10072763833506068
  (0, 4285)	0.06426107622109453
  (0, 4263)	0.0634741836708721
  (0, 4250)	0.07772769813414053
  (0, 4240)	0.061502645420439164
  (0, 4202)	0.09539624454697675
  :	:
  (4370, 153)	0.10431769383491837
  (4370, 139)	0.10534618289984156
  (4370, 12)	0.154644611853282
  (4371, 4957)	0.09692072470263545
  (4371, 4681)	0.09919740256941276
  (4371, 4635)	0.20951

In [17]:
# fit the training dataset on the NB classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_NB = Naive.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("Naive Bayes Accuracy Score -> ",accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score ->  62.45333333333334


In [18]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X_Tfidf,Train_Y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score ->  63.519999999999996


In [None]:
tes