In [4]:
import pandas as pd
import numpy as np

In [7]:
df = pd.read_csv('../Data/Amazon_Unlocked_Mobile.csv',encoding="utf8")
df = df.sample(frac=1, random_state=10)

In [3]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
394349,Sony XPERIA Z2 D6503 FACTORY UNLOCKED Internat...,,244.95,5,Very good one! Better than Samsung S and iphon...,0.0
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0


In [8]:
len(df)

413840

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413840 entries, 394349 to 345353
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Product Name  413840 non-null  object 
 1   Brand Name    348669 non-null  object 
 2   Price         407907 non-null  float64
 3   Rating        413840 non-null  int64  
 4   Reviews       413778 non-null  object 
 5   Review Votes  401544 non-null  float64
dtypes: float64(2), int64(1), object(3)
memory usage: 22.1+ MB


In [10]:
# Drop missing values
df.dropna(inplace=True)

# Remove any 'neutral' ratings equal to 3
df = df[df['Rating'] != 3]

# Encode 4s and 5s as 1 (rated positively)
# Encode 1s and 2s as 0 (rated poorly)
df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)

In [11]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively Rated
34377,Apple iPhone 5c 8GB (Pink) - Verizon Wireless,Apple,194.99,1,"The phone needed a SIM card, would have been n...",1.0,0
248521,Motorola Droid RAZR MAXX XT912 M Verizon Smart...,Motorola,174.99,5,I was 3 months away from my upgrade and my Str...,3.0,1
167661,CNPGD [U.S. Office Extended Warranty] Smartwat...,CNPGD,49.99,1,an experience i want to forget,0.0,0
73287,Apple iPhone 7 Unlocked Phone 256 GB - US Vers...,Apple,922.0,5,GREAT PHONE WORK ACCORDING MY EXPECTATIONS.,1.0,1
277158,Nokia N8 Unlocked GSM Touch Screen Phone Featu...,Nokia,95.0,5,I fell in love with this phone because it did ...,0.0,1


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 308277 entries, 34377 to 345353
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Product Name      308277 non-null  object 
 1   Brand Name        308277 non-null  object 
 2   Price             308277 non-null  float64
 3   Rating            308277 non-null  int64  
 4   Reviews           308277 non-null  object 
 5   Review Votes      308277 non-null  float64
 6   Positively Rated  308277 non-null  int32  
dtypes: float64(2), int32(1), int64(1), object(3)
memory usage: 17.6+ MB


# Model Selection in scikit-learn

In [13]:
from sklearn.model_selection import train_test_split

# Split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], 
                                                    df['Positively Rated'], 
                                                    random_state=0)

In [15]:
print( X_train.iloc[12])   

Phone is a piece of work. Constantly goes out when talking on phone. I can hear other party but they can't hear me. Not happy ):


In [16]:
X_train.shape

(231207,)

In [17]:
X_test.shape

(77070,)

# Extracting features from text files


Text files are actually series of words (ordered). In order to run machine learning algorithms we need to convert the text files into numerical feature vectors. We will be using bag of words model.

## Bag-of-words (BOW)
BOW model allows us to represent text as numerical feature vectors. The idea behind BOW is quite simple and can be summarized as follows:
- 1) Create a vocabulary of unique tokens (or words) from the entire set 
    of documents.
- 2) Construct a feature vector from each document that contains the counts of how often each word occurs in the particular document.

Since the unique words in each document represent only a small subset of all the words in the bag-of-words vocabulary, the feature vectors will consist of mostly zeros, which is why we call them sparse. For this reason we say that bags of words are typically <b>high-dimensional sparse datasets</b>.

{for our example. Briefly, we segment each text file into words (for English splitting by space), and count # of times each word occurs in each document and finally assign each word an integer id. Each unique word in our dictionary will correspond to a feature (descriptive feature).}


### Transform words into vectors (CountVectorizer)
To construct a bag-of-words model based on the word counts in the respective documents, we can use the `CountVectorizer` class implemented in `scikit-learn`. As we will see in the following codes, the `CountVectorizer` class takes an array of text data, which can be documents or just sentences, and constructs the bag-of-words model for us:

Scikit-learn has a high level component which will create feature vectors for us <b>‘CountVectorizer’</b>

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'])

# Fit the CountVectorizer to the training data 
vect1=CountVectorizer().fit(docs)

# transform the documents in the training data to a document-term matrix. 
bag = vect1.transform(docs)

In [19]:
vect1.vocabulary_

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}

In [21]:
vect1.get_feature_names_out()

array(['and', 'is', 'shining', 'sun', 'sweet', 'the', 'weather'],
      dtype=object)

## <font color=green> Exercise A</font>

1) Do CountVectorizer for training data

2) Determine: 
- The number of features 
- The shape of sparse matrix

In [22]:
# Fit the CountVectorizer to the training data 
review1 = CountVectorizer().fit(X_train)

# transform the documents in the training data to a document-term matrix. 
X_train_vectorized1 = review1.transform(X_train) #bag of words (unique token in vectors format)


print(len(review1.get_feature_names()))
X_train_vectorized1.shape

53301


(231207, 53301)

In [23]:
review1.vocabulary_

{'it': 26074,
 'an': 4929,
 'otterbox': 33635,
 'what': 51628,
 'do': 15833,
 'you': 52893,
 'expect': 18613,
 'sturdy': 45292,
 'and': 4962,
 'makes': 28973,
 'the': 47008,
 'tablet': 46182,
 'easier': 16644,
 'to': 47699,
 'hold': 23637,
 'on': 33050,
 'very': 50519,
 'good': 21809,
 'phone': 35063,
 'was': 51235,
 'in': 24683,
 'great': 22209,
 'condition': 11918,
 'functions': 20870,
 'well': 51539,
 'only': 33132,
 'problem': 36816,
 'is': 25971,
 'with': 52022,
 'battery': 7124,
 'life': 27853,
 'but': 9055,
 'that': 46980,
 'given': 21579,
 'when': 51670,
 'buy': 9116,
 'used': 49923,
 'previous': 36614,
 'owner': 33956,
 'most': 30794,
 'likely': 28008,
 'wore': 52220,
 'down': 16069,
 'all': 4501,
 'also': 4635,
 'shipped': 42554,
 'extreamly': 18902,
 'fast': 19255,
 'amazing': 4754,
 'had': 22671,
 'two': 48731,
 'of': 32800,
 'them': 47064,
 'from': 20646,
 'how': 23878,
 'much': 31030,
 'enjoyed': 17389,
 'simplicity': 42939,
 'highly': 23481,
 'recommended': 38849,
 'for'

In [None]:
review1.get_feature_names()



['00',
 '000',
 '0000',
 '00000',
 '000000',
 '0000000',
 '00000000000',
 '0000from',
 '0001',
 '0004',
 '000ma',
 '000mah',
 '000mh',
 '000restricted',
 '0051',
 '006',
 '007',
 '00am',
 '00bucks',
 '00for',
 '00it',
 '00k',
 '00now',
 '00pm',
 '00x2',
 '01',
 '011',
 '013435003182980',
 '014',
 '0155379',
 '016',
 '016s',
 '019s',
 '02',
 '02may13',
 '02mbps',
 '03',
 '032g',
 '0330',
 '03pm',
 '04',
 '0400',
 '044',
 '04pm',
 '04th',
 '04the',
 '05',
 '050',
 '0500tkx',
 '050mms',
 '050prot',
 '051',
 '056',
 '0572013',
 '0577454',
 '05788690',
 '05th',
 '05the',
 '05using',
 '06',
 '061',
 '062',
 '0630',
 '066',
 '06pm',
 '07',
 '0780',
 '07am',
 '07gb',
 '07nov2015',
 '08',
 '0804245',
 '0808',
 '0825',
 '0829',
 '087',
 '087581287',
 '08in',
 '08mms',
 '08this',
 '09',
 '0909853',
 '09on',
 '0_1439_7',
 '0_150511',
 '0_print_120716',
 '0_user_manual',
 '0a',
 '0also',
 '0b3tbzlidhq7dce1bv05qdefaota',
 '0bj7255rf1f1a1118w65',
 '0c',
 '0cant',
 '0cesqfjad',
 '0dislikes',
 '0expand

# Logistic Regression classification

We will train a logistic regression model to classify the  Amazon reviews into positive and negative reviews by using feature matrix. 

In [24]:
from sklearn.linear_model import LogisticRegression

# Train the model
model = LogisticRegression()
model.fit(X_train_vectorized1, y_train)

#x_train
#vectors (unique tokens)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [25]:
from sklearn.metrics import roc_auc_score

# Predict the transformed test documents
predictions = model.predict(review1.transform(X_test))
y_proba = model.predict_proba(review1.transform(X_test))
                              
print('AUC: ', roc_auc_score(y_test, y_proba[:,1]))       

AUC:  0.9750933673031394


In [27]:
model.coef_

array([[-0.92874463, -0.17081246,  0.01935562, ...,  0.10821892,
         0.08218676,  0.08218676]])

In [28]:
model.coef_[0].argsort()

array([52365, 26658, 21201, ..., 18482, 18306, 18305], dtype=int64)

In [30]:
# get the feature names as numpy array
feature_names = np.array(review1.get_feature_names_out())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
print('Smallest Coefs:' )
print(feature_names[sorted_coef_index[:10]])
      
print('\n Largest Coefs:')      
print(feature_names[sorted_coef_index[:-11:-1]])

Smallest Coefs:
['worst' 'junk' 'garbage' 'unusable' 'useless' 'waste' 'freezes'
 'terrible' 'horrible' 'awful']

 Largest Coefs:
['excelent' 'excelente' 'exelente' 'loves' 'excellent' 'loving' 'perfecto'
 'complaints' 'minor' 'fantastic']


# Tfidf

When we are analyzing text data, we often encounter words that occur across multiple documents from both classes. Those frequently occurring words typically don't contain useful or discriminatory information. In this subsection, we will learn about a useful technique called **term frequency-inverse document frequency** (*tf-idf*) that can be used to downweight those frequently occurring words in the feature vectors. On the other words by tf-idf we can reduce the weightage of more common words like (the, is, an etc.) which occurs in all document.

The *tf-idf* can be defined as the product of the term frequency and the inverse document frequency:

\begin{align}
\textit{tf-idf}(t,d) = tf(t,d) \times idf(t,d)
\end{align}

Here the <font color=green><b> *tf(t,d)* </b></font> is the term frequency that equal to **Count of word / Total words, in each document**. The inverse document frequency *idf(t,d)* can be calculated as:

\begin{align}
idf(t,d) = log\frac{n_d}{\text{df(d,t)}}
\end{align}

where <font color=green><b> $n_d$ </b></font> is **the total number of documents**, and <font color=green><b>*df(d,t)*</b></font> is **the number of documents *d* that contain the term t**. Note that the log is used to ensure that low document frequencies are not given too much weight.


scikit-learn implements yet another vectorizer, the TfidfVectorizer, that creates feature vectors as tf-idfs.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'])

vect2 = TfidfVectorizer().fit(docs)
bag2 = vect2.transform(docs)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer to the training data 
review2 = TfidfVectorizer(min_df=5).fit(X_train) #raw unique tokens

In [33]:
len(review2.get_feature_names())

18025

In [34]:
X_train_vectorized2 = review2.transform(X_train) #unique tokens in vector format

model = LogisticRegression()
model.fit(X_train_vectorized2, y_train)

predictions = model.predict(review2.transform(X_test))
y_proba = model.predict_proba(review2.transform(X_test))
                              
print('AUC: ', roc_auc_score(y_test, y_proba[:,1])) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AUC:  0.98255131869347


## <font color=green> Exercise B</font> 
- Predict two below reviews as negative or positive using our model: 

      ['no an issue, phone is working', 'an issue, phone is not working']       

In [None]:
model.predict(review2.transform(['amazing, phone is working', 'an issue, phone is not working']))

array([1, 0])

Try this!

In [None]:
model.predict(review2.transform(['no an issue, phone is working','an issue, phone is not working']))

array([0, 0])

# n-grams

The sequence of items in the bag-of-words model that we just created is also called the 1-gram or unigram model — each item or token in the vocabulary represents a single word. Generally, <b>the contiguous sequences of items in NLP</b> — words, letters, or symbols— is also called an n-gram. The choice of the number n in the n-gram model depends on the particular application. For instance, spam filtering applications tend to use n=3 or n=4 for good performances.
To summarize the concept of the n-gram representation, the 1-gram and 2-gram representations of our first document "the sun is shining" would be constructed as follows:
- 1-gram: "the", "sun", "is", "shining"
- 2-gram: "the sun", "sun is", "is shining"

The CountVectorizer class in scikit-learn allows us to use different n-gram models via its ngram_range parameter. By default, it uses a 1-gram representation.

In [None]:
# Try 2-gram representation
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'])

vect3=CountVectorizer(ngram_range=(1,2)).fit(docs)
bag3=vect3.transform(docs)

In [None]:
vect3.vocabulary_

{'the': 10,
 'sun': 7,
 'is': 2,
 'shining': 5,
 'the sun': 11,
 'sun is': 8,
 'is shining': 3,
 'weather': 13,
 'sweet': 9,
 'the weather': 12,
 'weather is': 14,
 'is sweet': 4,
 'and': 0,
 'shining and': 6,
 'and the': 1}

In [35]:
review3 = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train) #raw unique tokens

X_train_vectorized3 = review3.transform(X_train) #vectorized unique tokens

In [36]:
len(review3.get_feature_names())



201035

In [37]:
model = LogisticRegression()
model.fit(X_train_vectorized3, y_train)

predictions = model.predict(review3.transform(X_test))
y_proba = model.predict_proba(review3.transform(X_test))
                              
print('AUC: ', roc_auc_score(y_test, y_proba[:,1])) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AUC:  0.9891133896935007


In [38]:
feature_names = np.array(review3.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:' )
print(feature_names[sorted_coef_index[:10]])
      
print('\n Largest Coefs:')      
print(feature_names[sorted_coef_index[:-11:-1]])

Smallest Coefs:
['no good' 'not satisfied' 'not happy' 'not worth' 'worst' 'garbage'
 'junk' 'not good' 'at best' 'nope']

 Largest Coefs:
['excelent' 'excelente' 'not bad' 'excellent' 'exelente' 'perfect'
 'awesome' 'no issues' 'perfecto' 'great']




In [39]:
# These reviews are now correctly identified
print(model.predict(review1.transform(['no an issue, phone is working',
                                    'an issue, phone is not working'])))

ValueError: X has 53301 features, but LogisticRegression is expecting 201035 features as input.

In [None]:
# These reviews are now correctly identified
print(model.predict(review2.transform(['no an issue, phone is working',
                                    'an issue, phone is not working'])))

In [40]:
# These reviews are now correctly identified
print(model.predict(review3.transform(['no an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]


In [41]:
# These reviews are now correctly identified
print(model.predict(review3.transform(['not bad, okay',
                                    'worst experience ever'])))

[1 0]


# Text Classification

## Using sklearn's NaiveBayes Classifier

### <font color=green> Exercise C</font> 
1. Do text classification for the Amazon reviews dataset using NaiveBayes Classifier
2. Evaluate your model classifier

In [None]:
from sklearn import naive_bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

In [None]:
review4 = TfidfVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized4 = review4.transform(X_train)

In [None]:
model_clsfNB = naive_bayes.MultinomialNB()
model_clsfNB.fit(X_train_vectorized4,y_train)
test_predictions = model_clsfNB.predict(review4.transform(X_test))

In [None]:
metrics.f1_score(y_test,test_predictions,average='micro')

0.949357726741923

In [None]:
metrics.accuracy_score(y_test,test_predictions)

0.949357726741923

In [None]:
metrics.recall_score(y_test,test_predictions,average='micro')

0.949357726741923

In [None]:
metrics.precision_score(y_test,test_predictions,average='micro')

0.949357726741923