In [1]:
import re
import numpy as np
import json
import string
import math
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Raw review table plus the pre-computed row positions that define the
# train/test split (used further down with `products.iloc[...]`).
products = pd.read_csv('Week_6/amazon_baby.csv')

with open('Week_6/module-9-assignment-train-idx.json') as f:
    train_idx = json.loads(f.read())

with open('Week_6/module-9-assignment-test-idx.json') as f:
    test_idx = json.loads(f.read())
    


In [3]:
# Built once: the function is applied to every review, so rebuilding the
# table on each call is wasted work.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return `text` with every ASCII punctuation character removed.

    Parameters
    ----------
    text : str or None or float NaN
        The text to clean. Missing values (None or NaN) are treated as an
        empty review, matching the notebook's `fillna({'review': ''})` step.

    Returns
    -------
    str
        `text` without any character listed in `string.punctuation`;
        `''` for a missing value.

    Raises
    ------
    AttributeError
        If `text` is neither a string nor a missing value.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    return text.translate(_PUNCTUATION_TABLE)

In [4]:
# Missing reviews become empty strings so the cleaning step always gets text.
products = products.fillna({'review': ''})
products['review_clean'] = products['review'].map(remove_punctuation)
# Ratings above 3 are labelled +1; every other rating (3 included) gets -1.
products['sentiment'] = (products['rating'] > 3).map({True: +1, False: -1})



In [5]:
# The index files hold row positions, so select rows positionally
# (all columns are kept).
train_data = products.iloc[train_idx, :]
test_data = products.iloc[test_idx, :]

In [6]:
# Keep only reviews whose rating is not exactly 3.
train_data = train_data.loc[train_data['rating'] != 3]
test_data = test_data.loc[test_data['rating'] != 3]

In [7]:
# Discard rows without a product name.
train_data = train_data[train_data['name'].notna()]
test_data = test_data[test_data['name'].notna()]

In [8]:
# Word-count features for the cleaned reviews.
# The token pattern matches any run of one or more word characters, so
# single-character tokens are counted as words too.
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
# The vectorizer is fitted on the training reviews only ...
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
# ... and the already-fitted vectorizer is reused, without refitting, for the test reviews.
test_matrix = vectorizer.transform(test_data['review_clean'])


In [9]:
# Keep the word-count features sparse instead of calling `.todense()`.
# A dense copy allocates one cell per (review, vocabulary word) pair and is
# a `numpy.matrix`, which recent scikit-learn releases refuse as input.
# The classifier accepts SciPy sparse input directly; this notebook already
# passes the sparse `baby_matrix` to `predict_proba` further down.
train_matrix = train_matrix.tocsr()


In [10]:
# Keep the test features sparse instead of calling `.todense()`.
# A dense copy allocates one cell per (review, vocabulary word) pair and is
# a `numpy.matrix`, which recent scikit-learn releases refuse as input.
# `predict` / `predict_proba` accept SciPy sparse input directly.
test_matrix = test_matrix.tocsr()

In [12]:
# Sentiment labels copied out of the frames as (n, 1) column vectors.
train_data_Y = np.array(train_data['sentiment'])[:, np.newaxis]
test_data_Y = np.array(test_data['sentiment'])[:, np.newaxis]

In [None]:
# Logistic-regression sentiment classifier; `random_state=0` fixes the
# shuffling done by the 'sag' solver.
model = LogisticRegression(solver='sag', random_state=0)
# `train_data_Y` is an (n, 1) column (it is built with reshape(-1, 1)), but
# scikit-learn expects a 1-D label vector and emits a DataConversionWarning
# for a column. Flatten at the call site so the variable keeps its shape.
model.fit(train_matrix, np.ravel(train_data_Y))

In [None]:
# Fraction of test reviews whose predicted sentiment equals the true label.
accuracy = accuracy_score(test_data['sentiment'].to_numpy(),
                          model.predict(test_matrix))
print("Test Accuracy: %s" % (accuracy,))

In [None]:
# Share of test reviews labelled +1, i.e. the accuracy of a classifier that
# always predicts +1 (this cell takes +1 to be the majority class).
positive_count = int((test_data['sentiment'] == 1).sum())
baseline = positive_count / len(test_data)
print("Baseline accuracy (majority class classifier): %s" % (baseline,))

In [None]:
from sklearn.metrics import confusion_matrix

# Passing `labels=model.classes_` orders rows (true label) and columns
# (predicted label) the same way the fitted model orders its classes.
cmat = confusion_matrix(test_data['sentiment'].to_numpy(),
                        model.predict(test_matrix),
                        labels=model.classes_)

print (' target_label | predicted_label | count ')
print ('--------------+-----------------+-------')
# One table line per (true label, predicted label) pair.
row_template = '{0:^13} | {1:^15} | {2:5d}'
for target_label, counts_for_target in zip(model.classes_, cmat):
    for predicted_label, count in zip(model.classes_, counts_for_target):
        print (row_template.format(target_label, predicted_label, count))

In [None]:
from sklearn.metrics import precision_score

# Precision of the model's predictions on the test reviews.
precision = precision_score(test_data['sentiment'].to_numpy(),
                            model.predict(test_matrix))
print("Precision on test data: %s" % (precision,))

In [None]:
from sklearn.metrics import recall_score

# Recall of the model's predictions on the test reviews.
recall = recall_score(test_data['sentiment'].to_numpy(),
                      model.predict(test_matrix))
print("Recall on test data: %s" % (recall,))

In [None]:
# Per-review class probabilities for the test set; only the second column is kept.
# NOTE(review): with the -1/+1 labels used here that column should be the
# probability of the +1 class (second entry of `model.classes_`) — confirm.
probabilities = model.predict_proba(test_matrix)[:,1]

In [None]:
# 100 evenly spaced probability cut-offs from 0.5 to 1 (both ends included).
threshold_values = np.linspace(0.5, 1, 100)
print(threshold_values)

In [None]:
def plot_pr_curve(precision, recall, title):
    """Draw a precision-recall curve on the current pyplot axes.

    Parameters
    ----------
    precision : sequence of float
        Values plotted on the x axis.
    recall : sequence of float
        Values plotted on the y axis; same length as `precision`.
    title : str
        Figure title.

    Notes
    -----
    Sets `figure.figsize` to (7, 5) and `font.size` to 16 in the global
    `plt.rcParams`, so later figures are affected as well.
    """
    # Style settings go first so they apply to the text drawn by this call;
    # setting the font size after plotting only styled later figures.
    plt.rcParams['figure.figsize'] = 7, 5
    plt.rcParams.update({'font.size': 16})
    plt.locator_params(axis='x', nbins=5)
    # The colour is given once, via `color`. Combining a 'b-' format string
    # with `color=` is redundant and makes Matplotlib warn.
    plt.plot(precision, recall, linestyle='-', linewidth=4.0, color='#B0017F')
    plt.title(title)
    plt.xlabel('Precision')
    plt.ylabel('Recall')

# `precision_all` / `recall_all` are not assigned anywhere earlier in this
# notebook, so on a fresh kernel this call raised NameError. Build them here
# by sweeping the probability cut-offs in `threshold_values` over the test
# set probabilities.
test_labels = test_data['sentiment'].to_numpy()
precision_all = []
recall_all = []
for threshold in threshold_values:
    # Predict +1 only when the kept probability reaches the cut-off.
    thresholded = np.where(probabilities >= threshold, 1, -1)
    precision_all.append(precision_score(y_true=test_labels, y_pred=thresholded))
    recall_all.append(recall_score(y_true=test_labels, y_pred=thresholded))

plot_pr_curve(precision_all, recall_all, 'Precision recall curve (all)')

In [None]:
# Test reviews whose product name contains "baby" (case-insensitive).
mentions_baby = test_data['name'].map(lambda product_name: 'baby' in product_name.lower())
baby_reviews = test_data.loc[mentions_baby]

In [None]:
# Encode the "baby" subset with the vectorizer fitted earlier (no refit).
baby_matrix = vectorizer.transform(baby_reviews['review_clean'])
# NOTE: this rebinds `probabilities`; from here on it holds the values for
# `baby_matrix`, not the ones computed from `test_matrix` in an earlier cell.
probabilities = model.predict_proba(baby_matrix)[:,1]

In [None]:
# 100 evenly spaced probability cut-offs from 0.5 to 1 (both ends included).
threshold_values = np.linspace(0.5, 1, 100)