In [8]:
import pandas as pd
import numpy as np
import json

In [4]:
products=pd.read_csv("amazon_baby_subset.csv")

In [6]:
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [7]:
products["sentiment"].value_counts()

 1    26579
-1    26493
Name: sentiment, dtype: int64

In [11]:
imp_words=json.load(open("important_words.json"))

In [22]:
type(imp_words[3])

str

# data cleaning

In [14]:


products=products.fillna({"review":""})

In [17]:
def remove_punctuation(text):
    import string
    import re
    
    punctuation=string.punctuation #list of puncts
    clean=re.sub(r"[{}]".format(punctuation),"",text)
    
    return clean

In [18]:
products["clean_review"]=products["review"].apply(remove_punctuation)

In [19]:
# count the words of imp_words that occur in clear_review
for word in imp_words:
    products[word] = products['clean_review'].apply(lambda s : s.split().count(word))


In [24]:
products["perfect"].sum()

3207

In [25]:
products["perfect"].value_counts()

0    50117
1     2731
2      202
3       16
4        6
Name: perfect, dtype: int64

In [26]:
# Now, write some code to compute the number of product reviews that contain the word perfect.

In [27]:
products["contains_perfect"]=products["perfect"].apply(lambda x: 1 if x>=1 else 0)

In [29]:
products["contains_perfect"].sum()

2955

# Q1 How many reviews in amazon_baby_subset.gl contain the word perfect?

ans=2955

#  convert data frame to a multi-dimensional array.

In [31]:
def get_data(dataframe, features, label):
    
    dataframe["constant"]=1
    features = ["constant"] + features
    
    features_frame=dataframe[features]
    feature_matrix=features_frame.as_matrix()
    
    label_sarray=dataframe[label]
    label_array= label_sarray.as_matrix()
    
    return(feature_matrix, label_array)

In [49]:
feature_matrix, sentiment= get_data(products,imp_words,"sentiment")

In [36]:
feature_matrix.shape

(53072, 194)

In [37]:
type(feature_matrix)

numpy.ndarray

# Q2 How many features are there in the feature_matrix?

ans=194

# Estimating conditional probability with link function

$$
P(y_i = +1 | \mathbf{x}_i,\mathbf{w}) = \frac{1}{1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))},
$$

In [42]:
def predict_probability(feature_matrix, coefficients):
    
    score=np.dot(feature_matrix,coefficients)
    prediction= 1. /(1 +np.exp(-score)) #link funcution
    
    return prediction

    

In [43]:
# check point

dummy_feature_matrix = np.array([[1.,2.,3.], [1.,-1.,-1]])
dummy_coefficients = np.array([1., 3., -1.])

correct_scores      = np.array( [ 1.*1. + 2.*3. + 3.*(-1.),          1.*1. + (-1.)*3. + (-1.)*(-1.) ] )
correct_predictions = np.array( [ 1./(1+np.exp(-correct_scores[0])), 1./(1+np.exp(-correct_scores[1])) ] )

print('The following outputs must match ')
print('------------------------------------------------')
print('correct_predictions           =', correct_predictions)
print('output of predict_probability =', predict_probability(dummy_feature_matrix, dummy_coefficients))

The following outputs must match 
------------------------------------------------
correct_predictions           = [ 0.98201379  0.26894142]
output of predict_probability = [ 0.98201379  0.26894142]


# Compute derivative of log likelihood with respect to a single coefficient

$$
\frac{\partial\ell}{\partial w_j} = \sum_{i=1}^N h_j(\mathbf{x}_i)\left(\mathbf{1}[y_i = +1] - P(y_i = +1 | \mathbf{x}_i, \mathbf{w})\right)
$$

* `errors` vector containing $\mathbf{1}[y_i = +1] - P(y_i = +1 | \mathbf{x}_i, \mathbf{w})$ for all $i$.
* `feature` vector containing $h_j(\mathbf{x}_i)$  for all $i$. 


In [44]:
def feature_derivative(errors, feature):
    derivative=np.dot(errors, feature)
    
    return derivative

# Write a function compute_log_likelihood that implements the equation

$$\ell\ell(\mathbf{w}) = \sum_{i=1}^N \Big( (\mathbf{1}[y_i = +1] - 1)\mathbf{w}^T h(\mathbf{x}_i) - \ln\left(1 + \exp(-\mathbf{w}^T h(\mathbf{x}_i))\right) \Big) $$

In [55]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    indicator= (sentiment==+1)
    scores= np.dot(feature_matrix, coefficients)
    lp=np.sum(indicator-1)*scores -np.log(1. + np.exp(-scores))
    return lp

# Write a function logistic_regression to fit a logistic regression model using gradient ascent.

In [118]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    
    coeff=np.array(initial_coefficients)
    for itr in range(max_iter):
        predictions=predict_probability(feature_matrix, coeff)
        
        indicator= (sentiment==+1)
        errors=indicator - predictions
        
        for j in range(len(coeff)): # derivaTIVES for coeff[j]
            derivative=np.dot(errors, feature_matrix[:,j])
            
            coeff[j]+= step_size*derivative #add step size
            
        # check if log likelyhood is increading
        if itr <=15 or (itr<=100 and itr % 10 == 0) or (itr <=1000 and itr % 100==0) or (itr <=10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coeff)
            print("iteration {}: log likelihood of observed labels = {}".format( itr, lp))
    return coeff

#caqnt print out obs labels properly.... so

In [165]:
feature_matrix, sentiment= get_data(products,imp_words,"sentiment")
coeff=logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(194),
                         step_size=1e-7, max_iter=301)

iteration 0: log likelihood of observed labels = [ -4.24976531   3.01182894  -1.24286656 ...,  41.41535663   0.84606707
   6.78965943]
iteration 1: log likelihood of observed labels = [ -7.81863918   6.70476282  -1.81120528 ...,  83.46928044   2.35452812
  14.25632162]
iteration 2: log likelihood of observed labels = [ -11.39968057   10.38572071   -2.39804466 ...,  125.46882548    3.83241215
   21.70692469]
iteration 3: log likelihood of observed labels = [ -14.99280179   14.0547685    -3.00326664 ...,  167.41419196    5.27989438
   29.14155349]
iteration 4: log likelihood of observed labels = [ -18.5979156    17.71197174   -3.62675386 ...,  209.3055791     6.69714911
   36.5602924 ]
iteration 5: log likelihood of observed labels = [ -22.21493528   21.3573956    -4.26838958 ...,  251.14318509    8.08434966
   43.96322536]
iteration 6: log likelihood of observed labels = [ -25.84377459   24.99110488   -4.92805773 ...,  292.92720717    9.44166845
   51.35043582]
iteration 7: log likeliho

## q4 it increases 

# predicting sentiments

In [132]:
scores=pd.DataFrame(np.dot(feature_matrix, coeff), columns=["score"])
class_predictions=scores["score"].apply(lambda x: 1 if x>0 else -1)

In [134]:
type(class_predictions)

pandas.core.series.Series

## Q5 How many reviews were predicted to have positive sentiment?

In [135]:
class_predictions.value_counts()

-1    27946
 1    25126
Name: score, dtype: int64

# Measuring accuracy

accu= correct/total

In [140]:
mistakes=(class_predictions != sentiment).sum()
correct= len(sentiment) - mistakes
accuracy=correct/float(len(sentiment))

print("-----------------------------------------------------")
print('# Reviews   correctly classified =', correct)
print('# Reviews incorrectly classified =', mistakes)
print('# Reviews total                  =', len(products))
print("-----------------------------------------------------")
print('Accuracy = {:.2}'.format(accuracy))

-----------------------------------------------------
# Reviews   correctly classified = 39903
# Reviews incorrectly classified = 13169
# Reviews total                  = 53072
-----------------------------------------------------
Accuracy = 0.75


## q6 What is the accuracy of the model on predictions made above? (round to 2 digits of accuracy)

ans=0.75

# Which words contribute most to positive & negative sentiments


In [166]:
coeffs = list(coeff[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(imp_words, coeffs)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)

## Q7 Which of the following words is not present in the top 10 "most positive" words?

In [167]:
word_coefficient_tuples[:10]

[('great', 0.066546084170457681),
 ('love', 0.065890762922123258),
 ('easy', 0.06479458680257838),
 ('little', 0.045435626308421372),
 ('loves', 0.044976401394906038),
 ('well', 0.03013500109210707),
 ('perfect', 0.029739937104968459),
 ('old', 0.020077541034775385),
 ('nice', 0.018408707995268992),
 ('daughter', 0.017703199905701694)]

# Q8 ten most neg words

In [168]:
word_coefficient_tuples[-10:]

[('monitor', -0.024482100545891724),
 ('return', -0.026592778462247287),
 ('back', -0.027742697230661334),
 ('get', -0.028711552980192581),
 ('disappointed', -0.028978976142317068),
 ('even', -0.030051249236035808),
 ('work', -0.03306951529475273),
 ('money', -0.038982037286487116),
 ('product', -0.041511033392108904),
 ('would', -0.053860148445203128)]