In [1]:
import numpy as np
import pandas as pd
import os


if __name__ == '__main__':
    data_dir = 'data_reviews'
    x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
    y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

    N, n_cols = x_train_df.shape
    print("Shape of x_train_df: (%d, %d)" % (N,n_cols))
    print("Shape of y_train_df: %s" % str(y_train_df.shape))

    # Print out the first five rows and last five rows
    tr_text_list = x_train_df['text'].values.tolist()
    rows = np.arange(0, 5)
    for row_id in rows:
        text = tr_text_list[row_id]
        print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))

    print("...")
    rows = np.arange(N - 5, N)
    for row_id in rows:
        text = tr_text_list[row_id]
        print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))


Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)
row     0 | y = 0 | Oh and I forgot to also mention the weird color effect it has on your phone.
row     1 | y = 0 | THAT one didn't work either.
row     2 | y = 0 | Waste of 13 bucks.
row     3 | y = 0 | Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.
row     4 | y = 0 | None of the three sizes they sent with the headset would stay in my ears.
...
row  2395 | y = 1 | The sweet potato fries were very good and seasoned well.
row  2396 | y = 1 | I could eat their bruschetta all day it is devine.
row  2397 | y = 1 | Ambience is perfect.
row  2398 | y = 1 | We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.
row  2399 | y = 1 | Service was good and the company was better!


In [2]:
#steps
#load data into python
x_train_df = pd.read_csv('data_reviews/x_train.csv')
tr_list_of_sentences = x_train_df['text'].values.tolist()
tr_list_of_sentences

['Oh and I forgot to also mention the weird color effect it has on your phone.',
 "THAT one didn't work either.",
 'Waste of 13 bucks.',
 'Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.',
 'None of the three sizes they sent with the headset would stay in my ears.',
 'Worst customer service.',
 'The Ngage is still lacking in earbuds.',
 'It always cuts out and makes a beep beep beep sound then says signal failed.',
 'the only VERY DISAPPOINTING thing was there was NO SPEAKERPHONE!!!!',
 'Very disappointed in AccessoryOne.',
 'Basically the service was very bad.',
 'Bad Choice.',
 'The only thing that disappoint me is the infra red port (irda).',
 'horrible, had to switch 3 times.',
 'It feels poorly constructed, the menus are difficult to navigate, and the buttons are so recessed that it is difficult to push them.',
 "Don't make the same mistake I did.",
 "Muddy, low quality sound, and the casing around the wi

In [56]:
#using the approach from lab
def tokenize_text(raw_text):
    ''' Transform a plain-text string into a list of tokens
    
    We assume that *whitespace* divides tokens.
    
    Args
    ----
    raw_text : string
    
    Returns
    -------
    list_of_tokens : list of strings
        Each element is one token in the provided text
    '''
    list_of_tokens = raw_text.split() # split method divides on whitespace by default
    for pp in range(len(list_of_tokens)):
        cur_token = list_of_tokens[pp]
        # Remove punctuation
        for punc in ['?', '!', '_', '.', ',', '"', '/', '(', ')', ';', ':']:
            cur_token = cur_token.replace(punc, "")
        # Turn to lower case
        clean_token = cur_token.lower()
        # Replace the cleaned token into the original list
        list_of_tokens[pp] = clean_token
    return list_of_tokens

In [57]:
tokenize_text(tr_list_of_sentences[0])

['oh',
 'and',
 'i',
 'forgot',
 'to',
 'also',
 'mention',
 'the',
 'weird',
 'color',
 'effect',
 'it',
 'has',
 'on',
 'your',
 'phone']

In [58]:
#the following code was also taken from the lab
tok_count_dict = dict()

for line in tr_list_of_sentences:
    tok_list = tokenize_text(line)
    for tok in tok_list:
        if tok in tok_count_dict:
            tok_count_dict[tok] += 1
        else:
            tok_count_dict[tok] = 1

In [59]:
sorted_tokens = list(sorted(tok_count_dict, key=tok_count_dict.get, reverse=True))
for w in sorted_tokens[:10]:
    print("%5d %s" % (tok_count_dict[w], w))

 1561 the
  921 and
  708 a
  702 i
  609 is
  543 to
  537 it
  494 this
  493 of
  447 was


In [60]:
sorted_tokens[-10:]

['khao',
 'soi',
 'andddd',
 'unbelievably',
 'efficient',
 'gooodd',
 'dinners',
 'bruschetta',
 'devine',
 'pink']

In [61]:
for w in sorted_tokens[-10:]:
    print("%5d %s" % (tok_count_dict[w], w))

    1 khao
    1 soi
    1 andddd
    1 unbelievably
    1 efficient
    1 gooodd
    1 dinners
    1 bruschetta
    1 devine
    1 pink


In [62]:
#filtering out the 10 most common tokens (since they don't do much to influence the positivity or negativity of a review)
#sorted_tokens[10:]

In [63]:
#vocab_list = sorted_tokens[10:]

In [64]:
#need to decide whether or not we want to filter out the tokens that aren't used very much
vocab_list = [w for w in sorted_tokens[10:] if tok_count_dict[w] >= 4]

In [65]:
vocab_dict = dict()
for vocab_id, tok in enumerate(vocab_list):
    vocab_dict[tok] = vocab_id

In [66]:
#vocab_dict

In [67]:
def transform_text_into_feature_vector(text, vocab_dict):
    ''' Produce count feature vector for provided text
    
    Args
    ----
    text : string
        A string of raw text, representing a single 'review'
    vocab_dict : dict with string keys
        If token is in vocabulary, will exist as key in the dict
        If token is not in vocabulary, will not be in the dict

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Count vector, indicating how often each vocab word
        appears in the provided text string
    '''
    V = len(vocab_dict.keys())
    count_V = np.zeros(V)
    for tok in tokenize_text(text):
        if tok in vocab_dict:
            vv = vocab_dict[tok]
            count_V[vv] += 1
    return count_V

In [68]:
# Positive words (should produce a few positive entries!)
transform_text_into_feature_vector("good great fantastic excellent good", vocab_dict)

array([0., 0., 0., 0., 0., 0., 0., 2., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [69]:
#Logistic Regression classifier

In [70]:
#number of reviews
N = len(tr_list_of_sentences)
N

2400

In [71]:
#size of vocabulary
V = len(vocab_list)
V

879

In [None]:
# Need to train a classifier

In [82]:
y_tr_N = np.hstack([np.zeros(N//2), np.ones(N//2)])
x_tr_NV = np.zeros((N, V))
for nn, raw_text_line in enumerate(tr_list_of_sentences):
    x_tr_NV[nn] = transform_text_into_feature_vector(raw_text_line, vocab_dict)

(2400, 879)


In [77]:
import sklearn.linear_model
import sklearn.pipeline

In [78]:
# Just pick reasonable choices for quick demo
# We may see a "ConvergenceWarning". That's fine for this demo.
clf = sklearn.linear_model.LogisticRegression(
    C=1000.0, max_iter=20) 

In [101]:
#clf.fit(x_tr_NV, y_tr_N)

In [94]:
from cross_validation_copy import make_train_and_test_row_ids_for_n_fold_cv
from cross_validation_copy import train_models_and_calc_scores_for_n_fold_cv

train_ids_per_fold, test_ids_per_fold = make_train_and_test_row_ids_for_n_fold_cv(N, 3, 0)
# train_ids_per_fold

[array([2232, 1735, 1739, ...,  763,  835, 1653]),
 array([1950,  252, 1936, ...,  763,  835, 1653]),
 array([1950,  252, 1936, ...,  824, 1909, 1763])]

In [99]:
x_tr_NV.shape
y_tr_N.shape

(2400,)

In [104]:
#splitting data into 5 folds
#searching for a variety of hyperparameter configurations

In [109]:
SEED = 12345
FOLDS = 5
tr_error_K, valid_error_K = train_models_and_calc_scores_for_n_fold_cv(clf, x_tr_NV, y_tr_N, FOLDS, SEED)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [110]:
#need to split up the data into 5 folds.

In [111]:
cv_train_err_list = []
cv_valid_err_list = []
err_tr = np.mean(tr_error_K)
err_va = np.mean(valid_error_K)

In [113]:
err_va

0.46582526

In [102]:
yhat_tr_N = clf.predict(x_tr_NV)
acc = np.mean( y_tr_N == yhat_tr_N )

print("Training accuracy: %.3f" % acc)

Training accuracy: 0.932


In [103]:
weights_V = clf.coef_[0]
sorted_tok_ids_V = np.argsort(weights_V)

for vv in sorted_tok_ids_V:
    print("% 7.3f %s" % (weights_V[vv], vocab_list[vv]))

-46.097 phone
-22.290 headset
-21.150 battery
-19.169 product
-17.285 these
-16.989 garbage
-15.408 keep
-14.838 ear
-13.390 pictures
-13.211 sending
-12.250 stupid
-12.156 worked
-12.065 looks
-11.870 couldn't
-11.474 sound
-11.362 obviously
-11.256 less
-11.091 fast
-10.931 item
-10.896 finally
-10.784 walked
-10.605 break
-10.534 important
-10.459 software
-10.405 gets
-10.373 sucked
-10.364 wasted
-10.295 useless
-10.166 audio
-10.100 away
 -9.866 calls
 -9.865 mess
 -9.791 second
 -9.744 annoying
 -9.475 work
 -9.406 or
 -9.371 lot
 -9.290 reception
 -9.171 volume
 -9.143 whole
 -9.065 device
 -8.999 case
 -8.909 fails
 -8.819 wrong
 -8.815 plantronics
 -8.813 then
 -8.686 buy
 -8.664 bother
 -8.605 little
 -8.568 use
 -8.523 signal
 -8.452 crap
 -8.439 fits
 -8.348 free
 -8.308 ended
 -8.170 least
 -8.076 charger
 -8.061 started
 -7.927 shipping
 -7.917 working
 -7.701 rating
 -7.656 turns
 -7.656 make
 -7.570 problems
 -7.511 nokia
 -7.403 3
 -7.380 unreliable
 -7.321 junk
 -7.2