In [2]:
import numpy as np
import pandas as pd
import os


if __name__ == '__main__':
    data_dir = 'data_reviews'
    x_train_df = pd.read_csv(os.path.join(data_dir, 'x_train.csv'))
    y_train_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))

    N, n_cols = x_train_df.shape
    print("Shape of x_train_df: (%d, %d)" % (N,n_cols))
    print("Shape of y_train_df: %s" % str(y_train_df.shape))

    # Print out the first five rows and last five rows
    tr_text_list = x_train_df['text'].values.tolist()
    rows = np.arange(0, 5)
    for row_id in rows:
        text = tr_text_list[row_id]
        print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))

    print("...")
    rows = np.arange(N - 5, N)
    for row_id in rows:
        text = tr_text_list[row_id]
        print("row %5d | y = %d | %s" % (row_id, y_train_df.values[row_id,0], text))


Shape of x_train_df: (2400, 2)
Shape of y_train_df: (2400, 1)
row     0 | y = 0 | Oh and I forgot to also mention the weird color effect it has on your phone.
row     1 | y = 0 | THAT one didn't work either.
row     2 | y = 0 | Waste of 13 bucks.
row     3 | y = 0 | Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.
row     4 | y = 0 | None of the three sizes they sent with the headset would stay in my ears.
...
row  2395 | y = 1 | The sweet potato fries were very good and seasoned well.
row  2396 | y = 1 | I could eat their bruschetta all day it is devine.
row  2397 | y = 1 | Ambience is perfect.
row  2398 | y = 1 | We ordered the duck rare and it was pink and tender on the inside with a nice char on the outside.
row  2399 | y = 1 | Service was good and the company was better!


In [16]:
#steps
#load data into python
x_train_df = pd.read_csv('data_reviews/x_train.csv')
tr_list_of_sentences = x_train_df['text'].values.tolist()
tr_list_of_sentences

['Oh and I forgot to also mention the weird color effect it has on your phone.',
 "THAT one didn't work either.",
 'Waste of 13 bucks.',
 'Product is useless, since it does not have enough charging current to charge the 2 cellphones I was planning to use it with.',
 'None of the three sizes they sent with the headset would stay in my ears.',
 'Worst customer service.',
 'The Ngage is still lacking in earbuds.',
 'It always cuts out and makes a beep beep beep sound then says signal failed.',
 'the only VERY DISAPPOINTING thing was there was NO SPEAKERPHONE!!!!',
 'Very disappointed in AccessoryOne.',
 'Basically the service was very bad.',
 'Bad Choice.',
 'The only thing that disappoint me is the infra red port (irda).',
 'horrible, had to switch 3 times.',
 'It feels poorly constructed, the menus are difficult to navigate, and the buttons are so recessed that it is difficult to push them.',
 "Don't make the same mistake I did.",
 "Muddy, low quality sound, and the casing around the wi

In [26]:
#using the approach from lab
def tokenize_text(raw_text):
    ''' Transform a plain-text string into a list of tokens
    
    We assume that *whitespace* divides tokens.
    
    Args
    ----
    raw_text : string
    
    Returns
    -------
    list_of_tokens : list of strings
        Each element is one token in the provided text
    '''
    list_of_tokens = raw_text.split() # split method divides on whitespace by default
    for pp in range(len(list_of_tokens)):
        cur_token = list_of_tokens[pp]
        # Remove punctuation
        for punc in ['?', '!', '_', '.', ',', '"', '/', '(', ')', ';']:
            cur_token = cur_token.replace(punc, "")
        # Turn to lower case
        clean_token = cur_token.lower()
        # Replace the cleaned token into the original list
        list_of_tokens[pp] = clean_token
    return list_of_tokens

In [27]:
tokenize_text(tr_list_of_sentences[0])

['oh',
 'and',
 'i',
 'forgot',
 'to',
 'also',
 'mention',
 'the',
 'weird',
 'color',
 'effect',
 'it',
 'has',
 'on',
 'your',
 'phone']

In [28]:
#the following code was also taken from the lab
tok_count_dict = dict()

for line in tr_list_of_sentences:
    tok_list = tokenize_text(line)
    for tok in tok_list:
        if tok in tok_count_dict:
            tok_count_dict[tok] += 1
        else:
            tok_count_dict[tok] = 1

In [29]:
sorted_tokens = list(sorted(tok_count_dict, key=tok_count_dict.get, reverse=True))
for w in sorted_tokens[:10]:
    print("%5d %s" % (tok_count_dict[w], w))

 1561 the
  921 and
  708 a
  702 i
  609 is
  543 to
  537 it
  494 this
  493 of
  447 was


In [30]:
for w in sorted_tokens[-10:]:
    print("%5d %s" % (tok_count_dict[w], w))

    1 khao
    1 soi
    1 andddd
    1 unbelievably
    1 efficient
    1 gooodd
    1 dinners
    1 bruschetta
    1 devine
    1 pink


In [37]:
#filtering out the 10 most common tokens (since they don't do much to influence the positivity or negativity of a review)
#sorted_tokens[10:]

In [40]:
vocab_dict = dict()
for vocab_id, tok in enumerate(vocab_list[10:]):
    vocab_dict[tok] = vocab_id

In [42]:
#vocab_dict

In [44]:
def transform_text_into_feature_vector(text, vocab_dict):
    ''' Produce count feature vector for provided text
    
    Args
    ----
    text : string
        A string of raw text, representing a single 'review'
    vocab_dict : dict with string keys
        If token is in vocabulary, will exist as key in the dict
        If token is not in vocabulary, will not be in the dict

    Returns
    -------
    count_V : 1D numpy array, shape (V,) = (n_vocab,)
        Count vector, indicating how often each vocab word
        appears in the provided text string
    '''
    V = len(vocab_dict.keys())
    count_V = np.zeros(V)
    for tok in tokenize_text(text):
        if tok in vocab_dict:
            vv = vocab_dict[tok]
            count_V[vv] += 1
    return count_V

In [46]:
# Positive words (should produce a few positive entries!)
#transform_text_into_feature_vector("good great fantastic excellent good", vocab_dict)

In [3]:
#Logistic Regression classifier