# load data set

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the review data from a CSV file (path relative to the working
# directory) into a DataFrame.
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
# Show the column labels of the loaded frame.
products.columns

Index(['name', 'review', 'rating', 'sentiment'], dtype='object')

In [4]:
# Preview the first ten entries of the 'name' column.
products['name'].head(10)

0    Stop Pacifier Sucking without tears with Thumb...
1      Nature's Lullabies Second Year Sticker Calendar
2      Nature's Lullabies Second Year Sticker Calendar
3                          Lamaze Peekaboo, I Love You
4    SoftPlay Peek-A-Boo Where's Elmo A Children's ...
5                            Our Baby Girl Memory Book
6    Hunnt&reg; Falling Flowers and Birds Kids Nurs...
7    Blessed By Pope Benedict XVI Divine Mercy Full...
8    Cloth Diaper Pins Stainless Steel Traditional ...
9    Cloth Diaper Pins Stainless Steel Traditional ...
Name: name, dtype: object

In [5]:
# Inspect the last ten rows of the 'rating' and 'sentiment' columns.
products[['rating','sentiment']].tail(10)

Unnamed: 0,rating,sentiment
53062,2,-1
53063,1,-1
53064,1,-1
53065,2,-1
53066,1,-1
53067,1,-1
53068,1,-1
53069,1,-1
53070,2,-1
53071,2,-1


In [6]:
# Tally the labels with a single vectorized comparison.  The previous
# row-by-row loop passed index *labels* to the positional ``iloc`` accessor
# (only correct for a default 0..n-1 index) and built a full row Series for
# every review.
is_positive = products['sentiment'] > 0
num_positive = int(is_positive.sum())
num_negative = len(products) - num_positive
# Net count (positives minus everything else), kept under its original name.
num = num_positive - num_negative
# The counts are recovered from the net count and printed as floats, exactly
# as before, so the report format is unchanged.
print('positive reviews:',(num+len(products))/2,'negative reviews:',(len(products)-num)/2, 'in totla',products.shape[0])

positive reviews: 26579.0 negative reviews: 26493.0 in totla 53072


# text cleaning

In [7]:
import json

# Parse the JSON file straight from the open handle; the result is the word
# list used below to build the per-word count features.
with open('important_words.json', 'r') as words_file:
    important_words = json.load(words_file)
print(important_words, len(important_words))

['baby', 'one', 'great', 'love', 'use', 'would', 'like', 'easy', 'little', 'seat', 'old', 'well', 'get', 'also', 'really', 'son', 'time', 'bought', 'product', 'good', 'daughter', 'much', 'loves', 'stroller', 'put', 'months', 'car', 'still', 'back', 'used', 'recommend', 'first', 'even', 'perfect', 'nice', 'bag', 'two', 'using', 'got', 'fit', 'around', 'diaper', 'enough', 'month', 'price', 'go', 'could', 'soft', 'since', 'buy', 'room', 'works', 'made', 'child', 'keep', 'size', 'small', 'need', 'year', 'big', 'make', 'take', 'easily', 'think', 'crib', 'clean', 'way', 'quality', 'thing', 'better', 'without', 'set', 'new', 'every', 'cute', 'best', 'bottles', 'work', 'purchased', 'right', 'lot', 'side', 'happy', 'comfortable', 'toy', 'able', 'kids', 'bit', 'night', 'long', 'fits', 'see', 'us', 'another', 'play', 'day', 'money', 'monitor', 'tried', 'thought', 'never', 'item', 'hard', 'plastic', 'however', 'disappointed', 'reviews', 'something', 'going', 'pump', 'bottle', 'cup', 'waste', 'retu

In [8]:
def remove_punctuation(text):
    """Strip punctuation from every string in a pandas Series.

    Parameters
    ----------
    text : pandas.Series
        Series of strings; it is accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        A new Series in which every character that is neither a word
        character (letters, digits, underscore) nor whitespace has been
        removed.  The input Series is left untouched.
    """
    # The pattern is a raw string so the backslashes reach the regex engine
    # unchanged, and ``regex=True`` is explicit because newer pandas releases
    # treat the pattern as a literal substring by default, which would make
    # this function a silent no-op.
    return text.str.replace(r'[^\w\s]', '', regex=True)


In [9]:
# Missing review text becomes an empty string so the string operations below
# never see NaN; the punctuation-free text is stored in 'review_clean'.
products = products.fillna(value={'review': ''})
products = products.assign(review_clean=remove_punctuation(products['review']))
products.columns

Index(['name', 'review', 'rating', 'sentiment', 'review_clean'], dtype='object')

In [10]:
# Split every cleaned review into whitespace-separated tokens once, up front.
# The previous version re-split each review for every important word, which
# repeated the same tokenization len(important_words) times.
review_tokens = products['review_clean'].str.split()
for word in important_words:
    # One float column per important word: how often it occurs in the review.
    products[word] = review_tokens.apply(lambda tokens: float(tokens.count(word)))

In [11]:
# check nan is filled: every entry of 'review_clean' should be a str.
# The check runs on the column itself instead of calling ``products.iloc[i]``
# per row, which used index labels as positions and materialized a complete
# row Series twice for every review.
reviews_clean = products['review_clean']
non_string_mask = ~reviews_clean.map(lambda value: isinstance(value, str)).astype(bool)
# Print any offending values so they can be inspected.
for value in reviews_clean[non_string_mask]:
    print(value)
num = int(non_string_mask.sum())
print(num, products.shape)

0 (53072, 198)


In [12]:
# Boolean flag: True where the 'perfect' count column is positive.
products['contains_perfect'] = products['perfect'].gt(0)
# Summing the boolean column counts the flagged reviews.
print(products['contains_perfect'].sum())

2955


# convert data to numpy array

In [13]:
# Print the version string of the pandas installation in use.
print(pd.__version__)

0.23.0


In [14]:
def get_numpy_data(dataframe, features, label):
    """Extract a feature matrix (with intercept column) and a label array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table holding the feature columns and the label column.
    features : list of str
        Names of the columns to use as features, in output order.
    label : str
        Name of the column holding the labels.

    Returns
    -------
    feature_matrix : numpy.ndarray
        One row per row of ``dataframe``; column 0 is the constant 1 and the
        remaining columns follow the order of ``features``.
    label_array : numpy.ndarray
        Values of the ``label`` column.

    Notes
    -----
    The intercept column is added to a copy of the selected feature columns,
    so the caller's ``dataframe`` is not modified (previously a 'constant'
    column was written into it as a side effect).
    """
    # Work on an independent copy of just the requested columns.
    design = dataframe[list(features)].copy()
    # Prepend the intercept term as the first column.
    design.insert(0, 'constant', 1)
    feature_matrix = design.values
    label_array = dataframe[label].values
    return feature_matrix, label_array

In [15]:
# Build the model inputs: features are the important-word columns and the
# target is the 'sentiment' column.
feature_matrix, labels = get_numpy_data(
    dataframe=products,
    features=important_words,
    label='sentiment',
)
print(feature_matrix.shape, labels.shape)

(53072, 194) (53072,)


# compute conditional probability using sigmoid function

In [16]:
def predict_probability(feature_matrix, coefficients):
    """Apply the sigmoid function to the linear scores.

    The score is the dot product of ``feature_matrix`` and ``coefficients``;
    the returned value is ``1 / (1 + exp(-score))`` evaluated element-wise.
    """
    linear_scores = np.dot(feature_matrix, coefficients)
    return 1 / (1 + np.exp(-linear_scores))

In [17]:
def feature_derivative(errors, feature):
    """Return the dot product of ``errors`` and ``feature``.

    The training loop uses this as the derivative of the log likelihood with
    respect to one coefficient.
    """
    derivative = np.dot(errors, feature)
    return derivative

In [18]:
def compute_log_likelihood(feature_matrix, labels, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Feature rows; combined with ``coefficients`` via a dot product.
    labels : numpy.ndarray
        Observed labels; entries equal to +1 are treated as the positive
        class, everything else as the negative class.
    coefficients : numpy.ndarray
        Model coefficients.

    Returns
    -------
    float
        Sum over rows of ``(1[y=+1] - 1) * score - log(1 + exp(-score))``.
    """
    indicator = (labels == +1)
    scores = np.dot(feature_matrix, coefficients)
    # log(1 + exp(-s)) is evaluated as logaddexp(0, -s): the naive form
    # overflows to inf for large negative scores and would drive the whole
    # sum to -inf.
    log_one_plus_exp = np.logaddexp(0, -scores)
    return np.sum((indicator - 1) * scores - log_one_plus_exp)

In [19]:
def logistic_regression_mle(feature_matrix, labels, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent.

    Parameters
    ----------
    feature_matrix : numpy.ndarray
        Feature rows, one column per coefficient.
    labels : numpy.ndarray
        Observed labels; entries equal to 1 form the positive class.
    initial_coefficients : sequence of numbers
        Starting point; copied into a float array, so the argument itself is
        never modified.
    step_size : float
        Multiplier applied to the gradient at every update.
    max_iter : int
        Number of gradient-ascent iterations to run.

    Returns
    -------
    numpy.ndarray
        The coefficient vector after ``max_iter`` updates.

    Progress (the log likelihood) is printed on a thinning schedule: every
    iteration up to 15, then every 10th, 100th, 1000th and 10000th.
    """
    coef = np.array(initial_coefficients, dtype=float)
    indicator = (labels == 1).astype(float)
    # Report the dtypes taking part in the arithmetic.
    print(feature_matrix.dtype, indicator.dtype, coef.dtype)
    # Field width of the iteration counter in the progress lines.  Computed
    # once; the guard keeps log10 away from non-positive max_iter.
    width = int(np.ceil(np.log10(max_iter))) if max_iter > 0 else 1
    for itr in range(max_iter):
        pred = predict_probability(feature_matrix, coef)
        errors = indicator - pred
        # Passing the whole matrix yields every partial derivative in one
        # product (errors . feature_matrix), replacing the per-coefficient
        # Python loop; all updates still use the same ``errors`` vector.
        coef += step_size * feature_derivative(errors, feature_matrix)
        # Checking whether log likelihood is increasing
        should_report = (
            itr <= 15
            or (itr <= 100 and itr % 10 == 0)
            or (itr <= 1000 and itr % 100 == 0)
            or (itr <= 10000 and itr % 1000 == 0)
            or itr % 10000 == 0
        )
        if should_report:
            lp = compute_log_likelihood(feature_matrix, labels, coef)
            print ('iteration %*d: log likelihood of observed labels = %.8f' %(width, itr, lp))
    return coef

In [20]:
# Start from an all-zero coefficient vector whose length is taken from the
# feature matrix rather than hard-coded, so it stays correct if the feature
# set changes.
coef = logistic_regression_mle(feature_matrix, labels, np.zeros(feature_matrix.shape[1]), 1e-7, 301)

float64 float64 float64
iteration   0: log likelihood of observed labels = -36780.91765822
iteration   1: log likelihood of observed labels = -36775.13429407
iteration   2: log likelihood of observed labels = -36769.35705617
iteration   3: log likelihood of observed labels = -36763.58592657
iteration   4: log likelihood of observed labels = -36757.82088750
iteration   5: log likelihood of observed labels = -36752.06192130
iteration   6: log likelihood of observed labels = -36746.30901047
iteration   7: log likelihood of observed labels = -36740.56213761
iteration   8: log likelihood of observed labels = -36734.82128550
iteration   9: log likelihood of observed labels = -36729.08643701
iteration  10: log likelihood of observed labels = -36723.35757516
iteration  11: log likelihood of observed labels = -36717.63468309
iteration  12: log likelihood of observed labels = -36711.91774409
iteration  13: log likelihood of observed labels = -36706.20674154
iteration  14: log likelihood of obser

# predict sentiments

In [21]:
# A review is predicted positive when its linear score is above zero.
score = np.dot(feature_matrix, coef)
num_pos = int(np.count_nonzero(score > 0))
print('# positive ', num_pos)

# positive  25127


In [25]:
# Threshold the scores at zero to get +1 / -1 predictions, then compare them
# with the observed labels.
score = np.dot(feature_matrix, coef)
predicted_labels = np.where(score > 0, 1, -1)
correct = int(np.sum(predicted_labels == labels))
print('accuracy ', correct / labels.shape[0])

accuracy  0.7518465480856196


In [26]:
# Drop the first coefficient (the intercept) so the remaining ones line up
# with important_words, then rank the (word, coefficient) pairs from largest
# to smallest coefficient.
coefficients = list(coef[1:])
word_coefficient_tuples = sorted(
    zip(important_words, coefficients),
    key=lambda pair: pair[1],
    reverse=True,
)

In [27]:
# First ten entries of the descending ranking: the largest coefficients.
word_coefficient_tuples[0:10]

[('great', 0.06654611812696795),
 ('love', 0.06589085630451576),
 ('easy', 0.06479467430583866),
 ('little', 0.04543575641799634),
 ('loves', 0.044976404990584126),
 ('well', 0.030134968617416664),
 ('perfect', 0.02973996481683957),
 ('old', 0.020077453090480573),
 ('nice', 0.018408789054742972),
 ('daughter', 0.017703241856832346)]

In [28]:
# Re-rank in ascending order so the smallest (most negative) coefficients
# come first, and show the first ten.
negarive_words = sorted(word_coefficient_tuples, key=lambda pair: pair[1])
print(negarive_words[:10])

[('would', -0.053860003347359055), ('product', -0.041510960132838635), ('money', -0.038982013325180455), ('work', -0.03306944956701274), ('even', -0.030051149961455727), ('disappointed', -0.02897894895589544), ('get', -0.028711410346220555), ('back', -0.027742584349641455), ('return', -0.026592754205541476), ('monitor', -0.024482031154393426)]
