# Sentiment Analysis

## Data Prep

In [1]:
import pandas as pd
import numpy as np

import warnings

warnings.filterwarnings("ignore")

# Read in the data
df = pd.read_csv('data/Amazon_Unlocked_Mobile.csv')

# Sample the data to speed up computation
# Comment out this line to match with lecture
df = df.sample(frac=0.1, random_state=10)

df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [2]:
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head(10)

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1
100311,Blackberry Torch 2 9810 Unlocked Phone with 1....,BlackBerry,77.49,5,I am pleased with this Blackberry phone! The p...,0.0,1
251669,Motorola Moto E (1st Generation) - Black - 4 G...,Motorola,89.99,5,"Great product, best value for money smartphone...",0.0,1
279878,OtterBox 77-29864 Defender Series Hybrid Case ...,OtterBox,9.99,5,I've bought 3 no problems. Fast delivery.,0.0,1
406017,Verizon HTC Rezound 4G Android Smarphone - 8MP...,HTC,74.99,4,Great phone for the price...,0.0,1
302567,"RCA M1 Unlocked Cell Phone, Dual Sim, 5Mp Came...",RCA,159.99,5,My mom is not good with new technoloy but this...,4.0,1


In [3]:
# Most ratings are positive
df['Positively Rated'].mean()

0.7471776686078667

In [4]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [5]:
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 Everything about it is awesome!


X_train shape:  (23052,)


## CountVectorizer

- The bag-of-words approach is a simple and commonly used way to represennt text for use in machine leraning, which ignores structure and only counts how often each word occurs.
- CountVectorizer allows us to use the bag-of-words approach by converting a collection of text documents into a matrix of token counts.

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Fitting the CountVectorizer tokenizes each document by finding all sequences of characters of at
# least two letters or numbers separated by word boundaries. Converts everything to lowercase and
# buils a vocabulary using these tokens
vect = CountVectorizer().fit(X_train)

In [7]:
# We can get the vocabulary by using the get_feature_names method
vect.get_feature_names()[:50]

['00',
 '000',
 '0000',
 '000000',
 '000mah',
 '002',
 '00am',
 '00k',
 '00pm',
 '00us',
 '01',
 '011',
 '013287002557427',
 '016',
 '02',
 '03',
 '032g',
 '04',
 '0412',
 '044',
 '05',
 '06',
 '0630',
 '07',
 '0700',
 '07gb',
 '08',
 '09',
 '09on',
 '0a',
 '0c',
 '0ghz',
 '0hd',
 '0i',
 '0l',
 '0lte',
 '0mp',
 '0mpthis',
 '0social',
 '0v',
 '10',
 '100',
 '1000',
 '10000',
 '100000',
 '1000000',
 '10000mah',
 '1001multi',
 '10050',
 '100gb']

In [8]:
# The vocabuary is built on any tokens that ocurred in the training data
len(vect.get_feature_names())

19601

In [9]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

# Each row corresponds to a document
# Each column corresponds to a word from our training vocabulary
# Each entry is the times a word appears in a document
X_train_vectorized

<23052x19601 sparse matrix of type '<class 'numpy.int64'>'
	with 613289 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression()

In [11]:
from sklearn.metrics import roc_auc_score

predictions = model.predict(vect.transform(X_test))

print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.8970876860933052


In [12]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1]
# The list returned is in order of largest to smallest
print(f"Smallest Coefs:\n{feature_names[sorted_coef_index[:10]]}\n")
print(f"Smallest Coefs:\n{feature_names[sorted_coef_index[:-11:-1]]}\n")

Smallest Coefs:
['worst' 'terrible' 'slow' 'junk' 'sucks' 'waste' 'poor' 'broke'
 'disappointed' 'useless']

Smallest Coefs:
['excelent' 'excellent' 'excelente' 'perfectly' 'love' 'perfect' 'exactly'
 'great' 'best' 'awesome']



## Tfidf

- Tf-idf -> Term frequency-inverse document frequency
- It allos us to weight terms based on how important they are to a document

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifying a minimum document frequency of 5
# This is the min_df and specifies the number of documents a word needs to appear on to be
# taken into account
vect = TfidfVectorizer(min_df = 5).fit(X_train)
len(vect.get_feature_names())

5442

- Height weight is given to terms that appear often in a particular document, but don't appear often in the corpus.
- Features with low tf-idf are either commonly used across all documents or rarely used and only occur in long documents
- Features with high tf-idf are frequenly used within specific documents, but rarely used across all documents

In [14]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print("AUC: ", roc_auc_score(y_test, predictions))

AUC:  0.889951006492175


In [15]:
# Checking features with largest/smaller tf-idf

feature_names = np.array(vect.get_feature_names())

sorted_coef_index = X_train_vectorized.max(0).toarray()[0].argsort()

print(f"Smallest tfidf:\n{feature_names[sorted_coef_index[:10]]}\n")
print(f"Largest tfidf:\n{feature_names[sorted_coef_index[:-11:-1]]}\n")

Smallest tfidf:
['61' 'printer' 'approach' 'adjustment' 'consequences' 'length' 'emailing'
 'degrees' 'handsfree' 'chipset']

Largest tfidf:
['unlocked' 'handy' 'useless' 'cheat' 'up' 'original' 'exelent' 'exelente'
 'exellent' 'satisfied']



In [16]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'slow' 'disappointed' 'worst' 'terrible' 'never' 'return' 'doesn'
 'horrible' 'waste']

Largest Coefs: 
['great' 'love' 'excellent' 'good' 'best' 'perfect' 'price' 'awesome'
 'far' 'perfectly']


In [17]:
# These reviews are treated the same by our current model
# This is obviously an error
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


## n-grams

- One way to add context is by adding sequences of word features known as n-grams
    - bigrams count pairs of adjacent words
    - trigrams give us triples of adjacent words
    - ...
- Although n-grams can be powerful in capturing meaning, longer sequences can cause an explosion of the number of features

In [18]:
# Fit the CountVectorizer to the training data specifying a minimum document frequency of 5
# and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df = 5, ngram_range = (1, 2)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

29072

In [19]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.9104640361714084


In [20]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'junk' 'poor' 'slow' 'worst' 'broken' 'not good' 'terrible'
 'defective' 'horrible']

Largest Coefs: 
['excellent' 'excelente' 'perfect' 'excelent' 'great' 'love' 'awesome'
 'no problems' 'good' 'best']


In [21]:
# These reviews are now correctly identified
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]
