---

_You are currently looking at **version 1.0** of this notebook. To download notebooks and datafiles, as well as get help on Jupyter notebooks in the Coursera platform, visit the [Jupyter Notebook FAQ](https://www.coursera.org/learn/python-text-mining/resources/d9pwm) course resource._

---

<a target="_blank" href="https://colab.research.google.com/github/LuisAngelMendozaVelasco/Applied_Data_Science_with_Python_Specialization/blob/main/Applied_Text_Mining_in_Python/Week3/Labs/Case Study - Sentiment Analysis.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>

In [1]:

%%bash
# If running in Google Colab

# mkdir data
# wget https://raw.githubusercontent.com/LuisAngelMendozaVelasco/Applied_Data_Science_with_Python_Specialization/main/Applied_Text_Mining_in_Python/Week3/Labs/data/Amazon_Unlocked_Mobile.csv.gz -P ./data

*Note: Some of the cells in this notebook are computationally expensive. To reduce runtime, this notebook is using a subset of the data.*

# Case Study: Sentiment Analysis

### Data Prep

In [2]:
%%bash
gunzip -k ./data/Amazon_Unlocked_Mobile.csv.gz

In [3]:
import pandas as pd
import numpy as np

# Read in the data
df = pd.read_csv('./data/Amazon_Unlocked_Mobile.csv', encoding="unicode_escape")

# Sample the data to speed up computation
# Comment out this line to match with lecture
# df = df.sample(frac=0.1, random_state=10)
df.head()

  df = pd.read_csv('./data/Amazon_Unlocked_Mobile.csv', encoding="unicode_escape")


Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5.0,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4.0,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5.0,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4.0,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4.0,Great phone to replace my lost phone. The only...,0.0


In [4]:
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5.0,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4.0,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5.0,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4.0,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4.0,Great phone to replace my lost phone. The only...,0.0,1


In [5]:
# Most ratings are positive
df['Positively Rated'].mean()

0.7441474169763745

In [6]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively Rated'], random_state=0)

In [7]:
print('X_train first entry:\n', X_train.iloc[0])
print('\nX_train shape:', X_train.shape)

X_train first entry:
 Good telephone and easy to use however the battery discharges VERY FAST. You can use the telephone half a day only. You have to keep the cable and charger in your pocket!!!!. you download two applications and loose 20% of your battery.

X_train shape: (188282,)


# CountVectorizer

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data
vect = CountVectorizer().fit(X_train)

In [9]:
new_var = vect.get_feature_names_out()
new_var[::2000]

array(['00', '5months', 'alleviating', 'backwith', 'buscas', 'colour',
       'cyclopse', 'dito', 'euphemism', 'forseen', 'hablale',
       'indicative', 'l24', 'maturing', 'negtive', 'p40', 'powersnappy',
       'realplayer', 'rtfpd', 'sistema', 'subcontracted', 'tirado',
       'usaria', 'wonkadoodle'], dtype=object)

In [10]:
len(vect.get_feature_names_out())

46925

In [11]:
# transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<188282x46925 sparse matrix of type '<class 'numpy.int64'>'
	with 4971746 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)

In [13]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(vect.transform(X_test))
print('AUC:', roc_auc_score(y_test, predictions))

AUC: 0.9314785936688758


In [14]:
pd.DataFrame({'y_test': y_test, 'y_pred': predictions}).head()

Unnamed: 0,y_test,y_pred
187574,1,1
187982,0,0
63890,0,1
42948,1,1
298002,1,1


In [15]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names_out())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs: \n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs: 
['mony' 'lemon' 'worthless' 'worst' 'false' 'junk' 'useless' 'superthin'
 'horrible' 'terrible']

Largest Coefs: 
['excelent' 'excelente' '4eeeks' 'ofamanda' 'worried' 'excellent' 'lovely'
 'exelente' 'loves' 'flawlessly']


# Tfidf

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names_out())

16350

In [17]:
X_train_vectorized = vect.transform(X_train)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))

print('AUC:', roc_auc_score(y_test, predictions))

AUC: 0.9278791036617637


In [18]:
pd.DataFrame({'y_test': y_test, 'y_pred': predictions}).head()

Unnamed: 0,y_test,y_pred
187574,1,1
187982,0,0
63890,0,0
42948,1,1
298002,1,1


In [19]:
feature_names = np.array(vect.get_feature_names_out())
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['1300' '___update' 'bridging' 'macbookâ' '34ghz' '___thank' 'cultures'
 'excites' 'messiah' 'brawns']

Largest tfidf: 
['scam' 'scammers' 'unusable' 'flimsy' 'dislike' 'swag' 'food' 'thx'
 'comfortable' 'bjvjjbkvjvj']


In [20]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'worst' 'useless' 'terrible' 'horrible' 'waste' 'disappointed'
 'return' 'poor' 'doesn']

Largest Coefs: 
['love' 'great' 'excellent' 'perfect' 'amazing' 'perfectly' 'awesome'
 'best' 'easy' 'loves']


In [21]:
# These reviews are treated the same by our current model
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


# n-grams

In [22]:
# Fit the CountVectorizer to the training data specifiying a minimum 
# document frequency of 5 and extracting 1-grams and 2-grams
vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names_out())

173238

In [23]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)
predictions = model.predict(vect.transform(X_test))

print('AUC:', roc_auc_score(y_test, predictions))

AUC: 0.967608932616231


In [24]:
pd.DataFrame({'y_test': y_test, 'y_pred': predictions}).head()

Unnamed: 0,y_test,y_pred
187574,1,1
187982,0,0
63890,0,0
42948,1,1
298002,1,1


In [25]:
feature_names = np.array(vect.get_feature_names_out())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'junk' 'horrible' 'worst' 'terrible' 'not good' 'not very'
 'product good' 'not happy' 'nope']

Largest Coefs: 
['not bad' 'excellent' 'excelente' 'excelent' 'perfect' 'no problems'
 'awesome' 'exelente' 'no issues' 'great']


In [26]:
# These reviews are now correctly identified
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]


In [27]:
%%bash
rm ./data/Amazon_Unlocked_Mobile.csv