In [1]:
from utils import getCorpus
from modeling import Models
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer 
from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from input_fn import Dataset
import pandas as pd
import numpy as np

In [2]:
positivecorpus = getCorpus('https://storm.cis.fordham.edu/~yli/data/electronics/positive.review','review_text')
negativecorpus = getCorpus('https://storm.cis.fordham.edu/~yli/data/electronics/negative.review','review_text')
ps = PorterStemmer() 
vectorizer = CountVectorizer()

In [3]:
p = Dataset(corpus=positivecorpus, tokenizer=word_tokenize, stemmer=ps,keep_pantuation= False)
n = Dataset(corpus=negativecorpus, tokenizer=word_tokenize, stemmer=ps,keep_pantuation= False)

In [4]:
p.getmostwords(20)

[('works', 184),
 ('ipod', 185),
 ('ive', 193),
 ('used', 202),
 ('phone', 203),
 ('bought', 206),
 ('price', 217),
 ('dont', 224),
 ('product', 230),
 ('well', 236),
 ('also', 240),
 ('get', 256),
 ('would', 258),
 ('quality', 274),
 ('like', 284),
 ('sound', 355),
 ('good', 399),
 ('great', 402),
 ('one', 429),
 ('use', 434)]

In [5]:
# Get Matrix
df_positive = p.getdata(p.X,y = 1)
df_negative = n.getdata(n.X,y = 0)
df = pd.concat([df_positive,df_negative],axis=0)
df = df.sample(frac=1).reset_index(drop=True)
df['review'] = df['review'].apply(lambda x : " ".join(x))

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,bought backup internal hard drive laptop exper...,0
1,accident caused one bud stop emitting sound fa...,1
2,wanted case small slim carried laptop perfect ...,1
3,gave gift excellent value tune capacity suffic...,1
4,complaints flash drive easy use enough storage...,1


In [7]:
## Modeling
x_train,x_test,y_train,y_test = train_test_split(df.review,df.sentiment,test_size = 0.3)

vectorizer.fit(df.review)
v_x_train = vectorizer.transform(x_train)
v_x_test = vectorizer.transform(x_test)
kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
models = Models(v_x_train,y_train.values,kfolds)
log,nb = models.main()

START Fit
2019-10-08 21:04:15.200968 NB
Training done !!
2019-10-08 21:04:15.224034 logistics
Training done !!


In [8]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_pred=nb.predict(v_x_test),y_true=y_test.values))

              precision    recall  f1-score   support

           0       0.79      0.82      0.80       292
           1       0.82      0.79      0.81       308

   micro avg       0.81      0.81      0.81       600
   macro avg       0.81      0.81      0.80       600
weighted avg       0.81      0.81      0.81       600



In [9]:
print(confusion_matrix(y_pred=nb.predict(v_x_test),y_true=y_test.values))

[[239  53]
 [ 64 244]]


In [10]:
def getSigwordsCV(model,cate = 1,vectorizer = vectorizer):
    
    if cate == 1:
        idx = np.argmax(model.named_steps['logisticregressioncv'].coef_)
    else:
        idx = np.argmin(model.named_steps['logisticregressioncv'].coef_)
        
    return vectorizer.get_feature_names()[idx]

def getSigwords(model,cate = 1,vectorizer = vectorizer):
    
    if cate == 1:
        idx = np.argmax(model.coef_)
    else:
        idx = np.argmin(model.coef_)
        
    return vectorizer.get_feature_names()[idx]



In [11]:
print()
print('Good word in logistic model: ',getSigwordsCV(log))

print()
print('Bad word in logistic model: ',getSigwordsCV(log,0))


Good word in logistic model:  great

Bad word in logistic model:  poor


In [12]:
print()
print('Good word in nbmodel: ',getSigwords(nb))

print()
print('Bad word in nb model: ',getSigwords(nb,0))


Good word in nbmodel:  one

Bad word in nb model:  0101036100
