In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score ,f1_score,roc_curve,precision_recall_curve
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2
from data_prep import df_prep  
from data_prep import  NLP_Vectorizer
from data_prep import parse_line
from model_src import NLP_model
import time


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/nathan/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/nathan/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/nathan/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/nathan/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/nathan/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/nathan/nltk_data...
[nltk_data]    |   Package movie_reviews is already

In [2]:
# Column names of the review data set, in positional order.
col_names = [
    'marketplace',
    'customer_id',
    'review_id',
    'product_id',
    'product_parent',
    'product_title',
    'product_category',
    'star_rating',
    'helpful_votes',
    'total_votes',
    'vine',
    'verified_purchase',
    'review_headline',
    'review_body',
    'review_date',
]

# Echo "<position>: <name>" for every column as a quick reference.
for position, column in enumerate(col_names):
    print('{}: {}'.format(position, column))

# Reverse lookup: column name -> positional index.
cols = {column: position for position, column in enumerate(col_names)}

0: marketplace
1: customer_id
2: review_id
3: product_id
4: product_parent
5: product_title
6: product_category
7: star_rating
8: helpful_votes
9: total_votes
10: vine
11: verified_purchase
12: review_headline
13: review_body
14: review_date


In [3]:
# Seed NumPy's global random state for the rest of the notebook.
np.random.seed(500)

# Read the raw review pull and keep a 10% sample; the sample itself is
# made repeatable through its own random_state.
df = pd.read_csv('data/Spark_Pulls/us_Books_v1_02.csv').sample(frac=0.1, random_state=1)

# Number of rows kept after sampling.
len(df)

6672

In [4]:
# Disabled alternative: build the corpus directly from the sampled reviews
# instead of reading the saved CSV.
#Corpus = df_prep(df,np.mean(df['8']/df['9']),.0)

# Load the saved corpus and replace missing 'text_final' values with a
# single space so the column contains no NaN.
Corpus = pd.read_csv('data/Books_02_Corpus.csv').assign(
    text_final=lambda frame: frame['text_final'].fillna(' ')
)

In [5]:
# Hold out 30% of the corpus for evaluation.
# random_state is pinned to the same value the notebook passes to
# np.random.seed earlier, so re-running this cell on its own reproduces the
# same partition instead of drawing a new one from the global RNG state.
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    Corpus[['text','text_final','help_score','help_votes','stars']],
    Corpus['label'],
    test_size=0.3,
    random_state=500,
)



In [6]:
# Sanity check: count of missing values in each training feature column.
Train_X.isna().sum(axis=0)

text          0
text_final    0
help_score    0
help_votes    0
stars         0
dtype: int64

In [7]:
def _fit_and_vectorize(vectorizer, train_text, test_text):
    """Fit ``vectorizer`` on the training text, then transform both splits.

    Parameters
    ----------
    vectorizer : object exposing ``fit`` and ``transform`` (an NLP_Vectorizer here)
    train_text : training documents; used for fitting and transformed first
    test_text : held-out documents; only transformed

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` as returned by ``vectorizer.transform``.
    """
    vectorizer.fit(train_text)
    return vectorizer.transform(train_text), vectorizer.transform(test_text)


# One vectorizer per (kind, vocabulary size) combination; all use 1- to 3-grams.
# NOTE(review): 'CV' / 'TF_IDF' presumably select count vs. TF-IDF weighting
# inside NLP_Vectorizer — confirm in data_prep.
CV_2K = NLP_Vectorizer('CV',{'max_features':2000,'ngram_range':(1,3)})
TF_IDF_2K = NLP_Vectorizer('TF_IDF',{'max_features':2000,'ngram_range':(1,3)})

CV_4K = NLP_Vectorizer('CV',{'max_features':4000,'ngram_range':(1,3)})
TF_IDF_4K = NLP_Vectorizer('TF_IDF',{'max_features':4000,'ngram_range':(1,3)})

CV_400 = NLP_Vectorizer('CV',{'max_features':400,'ngram_range':(1,3)})
TF_IDF_400 = NLP_Vectorizer('TF_IDF',{'max_features':400,'ngram_range':(1,3)})

# Keep a reference to the labels as they were before encoding, then encode
# both splits through the vectorizer's encode_Y helper.
orig_Train_Y = Train_Y
Test_Y = CV_2K.encode_Y(Test_Y)
Train_Y = CV_2K.encode_Y(Train_Y)

# Every vectorizer is fitted on the training text only and then applied to
# both splits; the order below matches the original cell.
_train_text = Train_X['text_final']
_test_text = Test_X['text_final']

Train_X_Vector_CV_2K, Test_X_Vector_CV_2K = _fit_and_vectorize(CV_2K, _train_text, _test_text)
Train_X_Vector_CV_4K, Test_X_Vector_CV_4K = _fit_and_vectorize(CV_4K, _train_text, _test_text)
Train_X_Vector_CV_400, Test_X_Vector_CV_400 = _fit_and_vectorize(CV_400, _train_text, _test_text)

Train_X_Vector_TF_IDF_2K, Test_X_Vector_TF_IDF_2K = _fit_and_vectorize(TF_IDF_2K, _train_text, _test_text)
Train_X_Vector_TF_IDF_4K, Test_X_Vector_TF_IDF_4K = _fit_and_vectorize(TF_IDF_4K, _train_text, _test_text)
Train_X_Vector_TF_IDF_400, Test_X_Vector_TF_IDF_400 = _fit_and_vectorize(TF_IDF_400, _train_text, _test_text)


In [8]:
# Baseline configuration for each classifier family. The hyper-parameter
# dicts are handed to NLP_model unchanged; how they are consumed is defined
# in model_src.
SVC_Standard = NLP_model('SVC', dict(degree=3, gamma='auto', kernel='linear', C=1.0))

RF_Standard = NLP_model('RF', dict(n_estimators=1500))

# Logistic regression is created without an explicit parameter dict.
LR_Standard = NLP_model('LR')

XG_Standard = NLP_model('XGBoost', dict(max_depth=5, n_estimators=1000, learning_rate=0.01))

In [9]:

# Default train/test matrices: the 400-feature TF-IDF encoding.
data_set_train, data_set_test = Train_X_Vector_TF_IDF_400, Test_X_Vector_TF_IDF_400


In [10]:
# One row per TF-IDF vocabulary size: (label, train matrix, test matrix).
# Keeping them in a single table guarantees the three lists stay aligned.
_tfidf_variants = [
    ('TF_IDF_400', Train_X_Vector_TF_IDF_400, Test_X_Vector_TF_IDF_400),
    ('TF_IDF_2K', Train_X_Vector_TF_IDF_2K, Test_X_Vector_TF_IDF_2K),
    ('TF_IDF_4K', Train_X_Vector_TF_IDF_4K, Test_X_Vector_TF_IDF_4K),
]
names, trains, tests = (list(column) for column in zip(*_tfidf_variants))

In [None]:
def _fit_and_score(model, train_X, train_y, test_X, test_y):
    """Fit ``model`` and compute its ROC and precision/recall curves.

    Parameters
    ----------
    model : object exposing ``fit`` and ``predict_proba`` (an NLP_model here)
    train_X, train_y : training features and labels
    test_X, test_y : held-out features and labels

    Returns
    -------
    tuple
        ``(proba, fpr, tpr, precision, recall, pr_thresholds)`` where
        ``proba`` is the raw ``predict_proba`` output, ``fpr``/``tpr`` come
        from ``roc_curve`` and the last three from ``precision_recall_curve``.
    """
    model.fit(train_X, train_y)
    proba = model.predict_proba(test_X)
    # Column 1 is used as the score for both curves.
    # NOTE(review): presumably the positive-class probability — confirm
    # against NLP_model.predict_proba.
    scores = proba[:, 1]
    fpr, tpr, _roc_thresholds = roc_curve(test_y, scores)
    precision, recall, pr_thresholds = precision_recall_curve(test_y, scores)
    return proba, fpr, tpr, precision, recall, pr_thresholds


# For every TF-IDF variant: fit the four baseline models, compute their
# curves, and save one ROC comparison figure per variant. The single-letter
# prints are progress markers; the elapsed seconds are printed at the end of
# each iteration.
for data_set_train,data_set_test,name in zip(trains,tests,names):
    ts = time.time()

    print('a')
    (SVC_Standard_predict_proba, SVC_fpr, SVC_tpr,
     SVC_precision, SVC_recall, SVC_thresholds) = _fit_and_score(
        SVC_Standard, data_set_train, Train_Y, data_set_test, Test_Y)

    print('b')
    (RF_Standard_predict_proba, RF_fpr, RF_tpr,
     RF_precision, RF_recall, thresholds) = _fit_and_score(
        RF_Standard, data_set_train, Train_Y, data_set_test, Test_Y)

    print('c')
    (XG_Standard_predict_proba, XG_fpr, XG_tpr,
     XG_precision, XG_recall, thresholds) = _fit_and_score(
        XG_Standard, data_set_train, Train_Y, data_set_test, Test_Y)

    print('d')
    (LR_Standard_predict_proba, LR_fpr, LR_tpr,
     LR_precision, LR_recall, thresholds) = _fit_and_score(
        LR_Standard, data_set_train, Train_Y, data_set_test, Test_Y)

    print('f')
    plt.figure(figsize=(10,10))
    for _fpr, _tpr, _label in (
        (LR_fpr, LR_tpr, 'Logistic Regression'),
        (XG_fpr, XG_tpr, 'XGBoost'),
        (RF_fpr, RF_tpr, 'Random Forest'),
        (SVC_fpr, SVC_tpr, 'Support Vector Classifier'),
    ):
        plt.plot(_fpr, _tpr, label=_label)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0,1],[0,1])
    plt.xlabel('False Positive')
    plt.ylabel('True Positive')
    plt.legend()
    plt.savefig('Viz/'+name+'_ROC_Curve.png')

    print(time.time()-ts)




a


In [None]:
# Precision/recall comparison of the four baseline models.
# The *_precision / *_recall arrays are not computed in this cell; they are
# whatever the most recent model-comparison run left in the namespace.
plt.figure(figsize=(10,10))
for _precision, _recall, _label in (
    (LR_precision, LR_recall, 'Logistic Regression'),
    (XG_precision, XG_recall, 'XGBoost'),
    (RF_precision, RF_recall, 'Random Forest'),
    (SVC_precision, SVC_recall, 'Support Vector Classifier'),
):
    # Precision goes on the x-axis and recall on the y-axis.
    plt.plot(_precision, _recall, label=_label)

plt.xlabel('precision')
plt.ylabel('recall')
plt.ylim(.1,1)
plt.xlim(.5,1)
plt.legend()
plt.savefig('Viz/Precision_Recall_Curve.png')

In [None]:
# Collect one logistic-regression ROC curve per TF-IDF vocabulary size.
fprs, tprs = [], []

for data_set_train, data_set_test, name in zip(trains, tests, names):
    # Refit the same LR model on the current feature matrix.
    LR_Standard.fit(data_set_train, Train_Y)
    LR_Standard_predict_proba = LR_Standard.predict_proba(data_set_test)

    # Both curves are computed from column 1 of the predicted probabilities.
    LR_fpr, LR_tpr, thresholds = roc_curve(
        Test_Y, LR_Standard_predict_proba[:, 1]
    )
    LR_precision, LR_recall, thresholds = precision_recall_curve(
        Test_Y, LR_Standard_predict_proba[:, 1]
    )

    fprs += [LR_fpr]
    tprs += [LR_tpr]
    

In [None]:
# ROC curves of the logistic-regression model for the three TF-IDF sizes.
plt.figure(figsize=(10,10))
for _idx in range(3):
    plt.plot(fprs[_idx], tprs[_idx], label=names[_idx])

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0,1],[0,1])
plt.xlabel('False Positive')
plt.ylabel('True Positive')
plt.legend()
# NOTE(review): 'TD_IDF' looks like a typo for 'TF_IDF'; the file name is kept
# unchanged so anything that already references the saved image still works.
plt.savefig('Viz/TD_IDF_Size_ROC_Curve.png')
    

In [None]:
# Histogram (365 bins) of column '14' reduced modulo 365.
# NOTE(review): column '14' presumably corresponds to review_date (index 14
# in col_names) stored as a number — confirm against the CSV.
_values_mod_365 = df['14'] % 365
plt.hist(_values_mod_365, bins=365)