In [1]:
import re
import numpy as np
import pandas as pd
import turicreate as tc
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
import json

In [2]:
pd.set_option('display.max_columns',25)

In [3]:
products = pd.read_csv('Week_2/amazon_baby_subset.csv')
with open('Week_2/important_words.json') as f:
    important_words = json.load(f)
important_words = [str(s) for s in important_words]

In [4]:
products = products.fillna({'review':''})

In [5]:
def remove_punctuation(text):
    import string
    trans_dict = text.maketrans('','', string.punctuation)
    return text.translate(trans_dict)

In [6]:
products['review_clean'] = products['review'].apply(remove_punctuation)

In [7]:
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s : s.split().count(word))

In [8]:
contains_perfect = products[products['perfect'] >= 1]
contains_perfect.shape

(2955, 198)

In [9]:
def get_numpy_data(dataframe, features, label):
    dataframe['constant'] = 1
    features = ['constant'] + features
    features_frame = dataframe[features]
    feature_matrix = np.asmatrix(features_frame)
    label_sarray = dataframe[label]
    label_array = np.asmatrix(label_sarray)
    return(feature_matrix, label_array)

In [47]:
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
feature_matrix.shape

(53072, 194)

In [11]:
'''
produces probablistic estimate for P(y_i = +1 | x_i, w).
estimate ranges between 0 and 1.
'''
def predict_probability(feature_matrix, coefficients):
    # Take dot product of feature_matrix and coefficients 
    # YOUR CODE HERE
    score = np.dot(feature_matrix, coefficients)
    
    # Compute P(y_i = +1 | x_i, w) using the link function
    # YOUR CODE HERE
    predictions = 1.0/(1+np.exp(-score))
    
    # return predictions
    return predictions

In [12]:
def feature_derivative(errors, feature):     
    # Compute the dot product of errors and feature
    derivative = np.dot(errors, feature)
        # Return the derivative
    return derivative

In [37]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    #lp = np.sum(np.subtract(np.array((indicator-1)*scores), np.log(1. + np.exp(-scores))))
    
    ev = np.multiply((indicator-1.), scores)
    nv = np.log(1. + np.exp(-scores))
    
    lp = np.sum(np.subtract(ev, nv))
    return lp

In [38]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_1,w) using your predict_probability() function
        # YOUR CODE HERE
        predictions = predict_probability(feature_matrix, coefficients)

        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)

        # Compute the errors as indicator - predictions
        errors = indicator - predictions

        for j in range(len(coefficients)): # loop over each coefficient
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j]
            # compute the derivative for coefficients[j]. Save it in a variable called derivative
            # YOUR CODE HERE
            derivative = feature_derivative(errors, feature_matrix[:, j])
            
            # add the step size times the derivative to the current coefficient
            # YOUR CODE HERE
            coefficients[j] += step_size * derivative
            
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            
            print ('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [39]:
initial_coefficients = np.zeros(feature_matrix.shape[1])
step_size = 1e-7
max_iter = 301

In [40]:
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [41]:
np.asarray(predict_probability(feature_matrix,coefficients) >= 0.5,dtype=int).sum()

25126

In [76]:
predictions = np.asarray(predict_probability(feature_matrix, coefficients) >= 0.5,dtype=int)
predictions_p = np.array(predictions[0], dtype = int)
sentiment_p = np.asarray(sentiment, dtype = int)[0]
def my_mapper(lst):
    for x in range(len(lst)):
        if lst[x] == -1:
            lst[x] = 0
    return lst
sentiment_pv = my_mapper(sentiment_p)
print(sentiment_pv)

[1 1 1 ... 0 0 0]


In [87]:
correct = np.asarray(predictions_p == sentiment_pv, dtype=int).sum()
print(predictions_p)
print(sentiment_pv)
measure_metrics = correct / len(predictions_p)
print(measure_metrics)

[1 0 1 ... 0 1 0]
[1 1 1 ... 0 0 0]
0.7518653904130238


In [88]:
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

In [89]:
word_coefficient_tuples[:10]

[('great', 0.0665460841704577),
 ('love', 0.06589076292212327),
 ('easy', 0.0647945868025784),
 ('little', 0.04543562630842138),
 ('loves', 0.04497640139490604),
 ('well', 0.030135001092107074),
 ('perfect', 0.02973993710496846),
 ('old', 0.02007754103477538),
 ('nice', 0.018408707995268992),
 ('daughter', 0.017703199905701694)]

In [91]:
word_coefficient_tuples[-10:]

[('monitor', -0.024482100545891717),
 ('return', -0.026592778462247283),
 ('back', -0.02774269723066133),
 ('get', -0.028711552980192585),
 ('disappointed', -0.028978976142317068),
 ('even', -0.030051249236035808),
 ('work', -0.03306951529475273),
 ('money', -0.03898203728648711),
 ('product', -0.041511033392108904),
 ('would', -0.053860148445203114)]

In [92]:
##Part 2
from sklearn.model_selection import train_test_split

In [107]:
sentiment_processed = np.transpose(sentiment)

In [109]:
print(feature_matrix.shape)
print(sentiment_processed.shape)

(53072, 194)
(53072, 1)


In [110]:
#train_data, validation_data = products.random_split(.8, seed=2)
train_data, validation_data, train_output, validation_output = train_test_split(np.array(feature_matrix), np.array(sentiment_processed), test_size=0.2, random_state=2)

In [140]:
pred = predict_probability(train_data, new_init_weight)
indi = (train_output==+1).reshape((1,-1))[0]
err = indi - pred
derv = feature_derivative_with_L2(err, train_data[:, 1], new_init_weight[1], l2_penalties[1], False)
lpp = compute_log_likelihood_with_L2(train_data, train_output.reshape((1,-1))[0], new_init_weight, l2_penalties[1])

In [111]:
def feature_derivative_with_L2(errors, feature, coefficient, l2_penalty, feature_is_constant): 
    
    # Compute the dot product of errors and feature
    ## YOUR CODE HERE
    derivative = np.dot(errors, feature)

    # add L2 penalty term for any feature that isn't the intercept.
    if not feature_is_constant: 
        ## YOUR CODE HERE
        derivative -= 2 * l2_penalty * coefficient
        
    return derivative

In [112]:
def compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty):
    indicator = (sentiment==+1)
    scores = np.dot(feature_matrix, coefficients)
    
    ev = np.multiply((indicator-1.), scores)
    nv = np.log(1. + np.exp(-scores))
    l2 = np.multiply(l2_penalty, np.sum(coefficients[1:]**2))
    
    lp = np.sum(np.subtract(np.subtract(ev, nv), l2))
    
    #lp = np.sum((indicator-1)*scores - np.log(1. + np.exp(-scores))) - l2_penalty*np.sum(coefficients[1:]**2)
    
    return lp

In [144]:
def logistic_regression_with_L2(feature_matrix, sentiment, initial_coefficients, step_size, l2_penalty, max_iter):
    coefficients = np.array(initial_coefficients) # make sure it's a numpy array
    sentiment = sentiment.reshape((1,-1))[0]
    for itr in range(max_iter):
        # Predict P(y_i = +1|x_i,w) using your predict_probability() function
        ## YOUR CODE HERE
        predictions = predict_probability(feature_matrix, coefficients)
        
        # Compute indicator value for (y_i = +1)
        indicator = (sentiment==+1)
        
        # Compute the errors as indicator - predictions
        errors = indicator - predictions
        
        for j in range(len(coefficients)): # loop over each coefficient
            is_intercept = (j == 0)
            # Recall that feature_matrix[:,j] is the feature column associated with coefficients[j].
            # Compute the derivative for coefficients[j]. Save it in a variable called derivative
            ## YOUR CODE HERE
            derivative = feature_derivative_with_L2(errors, feature_matrix[:, j], coefficients[j], l2_penalty, is_intercept)
            
            # add the step size times the derivative to the current coefficient
            ## YOUR CODE HERE
            coefficients[j] += step_size * derivative
            
        # Checking whether log likelihood is increasing
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood_with_L2(feature_matrix, sentiment, coefficients, l2_penalty)
            
            print ('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [142]:
new_step_size = 5e-6
new_max_iter = 501
l2_penalties = [0, 4, 10, 100, 1e3, 1e5]
new_init_weight = np.zeros(train_data.shape[1])

In [145]:
coefficients_0_penalty = logistic_regression_with_L2(train_data, train_output, initial_coefficients, new_step_size, l2_penalties[0], new_max_iter)

iteration   0: log likelihood of observed labels = -29244.82697959
iteration   1: log likelihood of observed labels = -29068.30359513
iteration   2: log likelihood of observed labels = -28898.56887514
iteration   3: log likelihood of observed labels = -28735.03180945
iteration   4: log likelihood of observed labels = -28577.23777802
iteration   5: log likelihood of observed labels = -28424.81887564
iteration   6: log likelihood of observed labels = -28277.46424082
iteration   7: log likelihood of observed labels = -28134.90216992
iteration   8: log likelihood of observed labels = -27996.88918670
iteration   9: log likelihood of observed labels = -27863.20324128
iteration  10: log likelihood of observed labels = -27733.63937739
iteration  11: log likelihood of observed labels = -27608.00688484
iteration  12: log likelihood of observed labels = -27486.12735202
iteration  13: log likelihood of observed labels = -27367.83326901
iteration  14: log likelihood of observed labels = -27252.9669

In [None]:
coefficients_4_penalty = logistic_regression_with_L2(train_data, train_output, initial_coefficients, new_step_size, l2_penalties[1], new_max_iter)

iteration   0: log likelihood of observed labels = -29402.88901766
iteration   1: log likelihood of observed labels = -29686.95964644
iteration   2: log likelihood of observed labels = -30262.05447085
iteration   3: log likelihood of observed labels = -31111.41488188
iteration   4: log likelihood of observed labels = -32219.97629854
iteration   5: log likelihood of observed labels = -33573.96154239
iteration   6: log likelihood of observed labels = -35160.62290537
iteration   7: log likelihood of observed labels = -36968.07383400
iteration   8: log likelihood of observed labels = -38985.17499716
iteration   9: log likelihood of observed labels = -41201.45353457
iteration  10: log likelihood of observed labels = -43607.04265992
iteration  11: log likelihood of observed labels = -46192.63388105
iteration  12: log likelihood of observed labels = -48949.43720919
iteration  13: log likelihood of observed labels = -51869.14661896
iteration  14: log likelihood of observed labels = -54943.9091

In [None]:
coefficients_10_penalty = logistic_regression_with_L2(train_data, train_output, initial_coefficients, new_step_size, l2_penalties[2], new_max_iter)

In [None]:
coefficients_1e2_penalty = logistic_regression_with_L2(train_data, train_output, initial_coefficients, new_step_size, l2_penalties[3], new_max_iter)

In [None]:
coefficients_1e3_penalty = logistic_regression_with_L2(train_data, train_output, initial_coefficients, new_step_size, l2_penalties[4], new_max_iter)

In [None]:
coefficients_1e5_penalty = logistic_regression_with_L2(train_data, train_output, initial_coefficients, new_step_size, l2_penalties[5], new_max_iter)