In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
import nltk
from sklearn.model_selection import GridSearchCV
nltk.download('wordnet')
from sklearn.svm import SVC

[nltk_data] Downloading package wordnet to /Users/nathan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Column names of the review dataset, in file order.
col_names = [
    'marketplace', 'customer_id', 'review_id', 'product_id', 'product_parent',
    'product_title', 'product_category', 'star_rating', 'helpful_votes',
    'total_votes', 'vine', 'verified_purchase', 'review_headline',
    'review_body', 'review_date',
]
# Print an "index: name" legend and build the name -> position lookup.
cols = {}
for i, name in enumerate(col_names):
    print('%d: %s' % (i, name))
    cols[name] = i

0: marketplace
1: customer_id
2: review_id
3: product_id
4: product_parent
5: product_title
6: product_category
7: star_rating
8: helpful_votes
9: total_votes
10: vine
11: verified_purchase
12: review_headline
13: review_body
14: review_date


In [3]:
# Seed NumPy's global random generator so steps that draw from it are repeatable.
# NOTE(review): train_test_split further down is called without random_state, so it
# presumably relies on this global seed for a reproducible split - confirm.
np.random.seed(500)

In [4]:
# Load the review sample. Later cells address its columns by their positional
# index written as a string (e.g. '8', '9', '13').
df = pd.read_csv('data/sample_02.csv')

# Optional: uncomment to work on a fixed 10% subsample instead of the full file.
#df = df.sample(frac=.1, random_state=1)

In [5]:
# Share of votes that were "helpful" for every review: column '8' (helpful_votes)
# divided by column '9' (total_votes). Reviews without any votes get 0.
helpful_percentage = [
    float(pos) / float(total) if total > 0 else 0
    for pos, total in zip(df['8'], df['9'])
]
df['15'] = helpful_percentage

In [6]:
# Sanity check: number of rows loaded, then a look at the first three rows.
print(len(df))
df.head(3)

62467


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,US,38487002,R3LZ2GKGS9YSN4,0198228635,764842125,Brigands with a Cause: Brigandage and Irredent...,Books,5,2,2,N,Y,BRIGANDS WITH A CAUSE :BRIGANDAGE AND IRREDENT...,THE AUTHOR HAS DONE A WONDERFUL JOB IN RESEARC...,13050,1.0
1,US,52219048,R1214RCFCW1QLH,080211797X,728208781,Silent Snow: The Slow Poisoning of the Arctic,Books,5,19,20,N,N,More compelling evidence that we continue to w...,In the tradition of Rachel Carson's landmark 1...,13050,0.95
2,US,27557367,R1S26QN21ILKA9,141205124X,765322347,Maynard the Mannerly Monkey,Books,5,0,0,N,N,Maynard the sweetest little monkey,Ms. Murphy has created such a lovely character...,13050,0.0


In [7]:
# Binary target derived from the helpful-vote ratio in column '15':
# a ratio of at least .66 is labelled 'good', anything else 'bad'.
labels = ['good' if ratio >= .66 else 'bad' for ratio in df['15']]
# Working frame: the raw review body (column '13') next to its label.
Corpus = pd.DataFrame({'text': df['13'], 'label': labels})

In [8]:
# Number of labelled rows in the working frame.
len(Corpus['label'])

62467

In [9]:
# Text pre-processing: clean, tokenize and lemmatize every review, storing the
# result in Corpus['text_final'].

# Step a: drop rows whose review text is missing. dropna has to run on the frame
# (not on the 'text' Series) so the row - label included - is really removed;
# otherwise a NaN would reach .lower() below and raise AttributeError.
# .copy() gives an independent frame so the column assignments below are unambiguous.
Corpus = Corpus.dropna(subset=['text']).copy()
# Steps b + c: lower-case each review (so 'dog' and 'DOG' become the same token)
# and split it into a list of word tokens.
Corpus['text'] = [word_tokenize(entry.lower()) for entry in Corpus['text']]
# Step d: remove stop words and non-alphabetic tokens, then lemmatize.
# WordNetLemmatizer takes a part-of-speech hint; the first letter of the tag
# returned by pos_tag selects adjective / verb / adverb, anything else falls
# back to noun.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
# Loop invariants, built once: the stop-word list as a set (constant-time lookup
# instead of re-reading and scanning the list for every token) and a single
# lemmatizer instance instead of one per review.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()
lst = []
for index, entry in enumerate(Corpus['text']):
    # Progress marker every 1000 reviews.
    if index % 1000 == 0:
        print(index)
    # Keep alphabetic, non-stop-word tokens and lemmatize each with its POS hint.
    Final_words = [
        word_Lemmatized.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(entry)
        if word.isalpha() and word not in stop_words
    ]
    # Stored as the string form of the token list, one string per review.
    lst.append(str(Final_words))

# One processed string per remaining row, assigned by position.
Corpus['text_final'] = lst

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000


KeyboardInterrupt: 

In [None]:
# Number of reviews whose processed text has been collected in lst.
len(lst)

In [None]:
# Persist the pre-processed corpus. index=False keeps the row index out of the
# file, so reading it back does not add a spurious 'Unnamed: 0' column.
Corpus.to_csv('data/corpus.csv', index=False)


In [None]:
# Reload the saved corpus from disk; presumably this lets the modelling cells run
# without repeating the pre-processing loop - confirm the file is up to date.
# NOTE(review): the token lists written from the 'text' column come back as plain
# strings after the CSV round trip.
Corpus = pd.read_csv('data/corpus.csv')

In [None]:
# Hold out 30% of the rows for evaluation; features are the processed text,
# targets are the 'good'/'bad' labels.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus['text_final'],
    Corpus['label'],
    test_size=0.3,
)




In [None]:
# Inspect the training labels (still the raw 'good'/'bad' strings at this point).
Train_Y

In [None]:
# Encode the string labels as integers. The mapping is learned from the training
# labels only and then reused for the test labels, so both sets are guaranteed to
# share the same class -> integer assignment (re-fitting on the test labels could
# silently produce a different mapping if a class were missing there).
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

In [None]:
# TF-IDF features limited to the 10000 highest-ranked terms.
# The vocabulary and IDF weights are learned from the training split only;
# fitting on the whole corpus would leak information from the held-out reviews
# into the features and inflate the evaluation scores.
Tfidf_vect = TfidfVectorizer(max_features=10000)
Tfidf_vect.fit(Train_X)
# Both splits are projected with the same fitted vectorizer.
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [None]:
# Dump the fitted vectorizer's vocabulary_ attribute.
# NOTE(review): with max_features=10000 this prints a very large dict; consider
# showing only a small sample when sharing the notebook.
print(Tfidf_vect.vocabulary_)

In [None]:
# Show the TF-IDF entry at position 3 of the transformed training data
# (presumably the sparse row for the fourth training review - confirm).
print(Train_X_Tfidf[3])

In [None]:
# Classifier - Algorithm - Naive Bayes
# Fit a multinomial Naive Bayes model on the TF-IDF training features.
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X_Tfidf, Train_Y)
# Predict the labels of the held-out set.
predictions_NB = Naive.predict(Test_X_Tfidf)
# Report accuracy in percent. accuracy_score's documented argument order is
# (y_true, y_pred); accuracy itself is symmetric, but keeping the order right
# avoids wrong numbers if the metric is later swapped for precision or recall.
print("Naive Bayes Accuracy Score -> ", accuracy_score(Test_Y, predictions_NB)*100)

In [None]:
# Classifier - Algorithm - SVM
# Linear-kernel support vector classifier. `degree` (polynomial kernel only) and
# `gamma` (rbf / poly / sigmoid kernels only) have no effect with kernel='linear',
# so they are not passed; the fitted model is unchanged.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf, Train_Y)
# Predict the labels of the held-out set.
predictions_SVM = SVM.predict(Test_X_Tfidf)
# Report accuracy in percent, using the documented (y_true, y_pred) argument order.
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, predictions_SVM)*100)

In [None]:
# Share of reviews labelled 'good' (the mean of a boolean mask is the fraction of True).
(Corpus['label'] == 'good').mean()

In [None]:
# Absolute number of reviews labelled 'good'.
(Corpus['label'] == 'good').sum()

## Grid Search SVM

In [None]:
# Search space for the SVM grid search: one sub-grid per kernel.
rbf_grid = {
    'kernel': ['rbf'],
    'gamma': [1e-3, 1e-4],
    'C': [1, 10, 100, 1000],
}
linear_grid = {
    'kernel': ['linear'],
    'C': [1, 2, 3, 4, 10, 100, 1000],
}
tuned_parameters = [rbf_grid, linear_grid]

# Metrics of interest; the grid-search cell uses the first entry.
scores = ['precision', 'recall']


In [None]:
# 5-fold cross-validated grid search over tuned_parameters, ranked by the
# macro-averaged version of the first metric in `scores`.
scoring_name = '{}_macro'.format(scores[0])
clf = GridSearchCV(
    estimator=SVC(),
    param_grid=tuned_parameters,
    cv=5,
    scoring=scoring_name,
)
clf.fit(Train_X_Tfidf, Train_Y)

In [None]:
# Show the parameter combination the grid search selected.
print(clf.best_params_)