In [1]:
%matplotlib inline

import time
import functools
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import numpy as np
import scipy.sparse as sparse

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from nltk.corpus import stopwords
from sklearn.naive_bayes import BernoulliNB
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split

### Pipeline

![Pipeline](pipeline.png)

### Get Familiar with dataset

In [2]:
# Load csv file into DataFrame
kindle_data = pd.read_csv('sampled_data.csv')
type(kindle_data)

pandas.core.frame.DataFrame

In [3]:
# Print first row
# Format: data_frame.col_nam[row]
print("overall    :", kindle_data.overall[0])
print("reviewText :", kindle_data.reviewText[0])

overall    : pos
reviewText : This book ended even before it started and it made me want for more. Oh oh such a teaser. I want the book now please. So exciting.


In [4]:
# Length of kindle_data
len(kindle_data)

126871

In [5]:
# Get a sample (head) of the data frame
kindle_data.head()

Unnamed: 0,overall,reviewText
0,pos,This book ended even before it started and it ...
1,pos,This is a great read with so much emotion you ...
2,pos,"It&#8217;s Christmas Eve and miraculously, Sal..."
3,pos,I enjoyed meeting the character of Cassandra. ...
4,pos,"Can I be the next Hunter wife? Again, I have ..."


> Try `HTMLParser` to un-escape the text as a stage of preprocessing

In [6]:
import html
html.unescape(kindle_data.reviewText[2])[:100]

'It’s Christmas Eve and miraculously, Sally Moss has got her twins tucked up in bed, presents all wra'

In [7]:
# Statics on tags
kindle_data.overall.value_counts()

pos    64559
neg    62312
Name: overall, dtype: int64

In [8]:
# Split complete data set into [pos, neg]
def splitPosNeg(data_):
    neg = data_.loc[data_.overall=='neg']
    pos = data_.loc[data_.overall=='pos']
    return [pos,neg]

[pos,neg] = splitPosNeg(kindle_data)

In [9]:
print(type(pos))
print("pos:", len(pos), ", neg:", len(neg))

<class 'pandas.core.frame.DataFrame'>
pos: 64559 , neg: 62312


### Preprocessing

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/ywu58/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
lemmatizer = nltk.WordNetLemmatizer()
stop = stopwords.words('english')
translation = str.maketrans(string.punctuation,' '*len(string.punctuation))

In [16]:
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'no

In [17]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [18]:
# test. Translate abc into def
transtbl = str.maketrans('abc','def')
'ababc'.translate(transtbl) 

'dedef'

In [19]:
def preprocessing(line):
    tokens=[]
    line = str(line).translate(translation)  # Replace punctuation
    line = nltk.word_tokenize(line.lower())  # Tokenize
    
    for t in line:
        # Remove stopwords
        if t not in stop:
            stemmed = lemmatizer.lemmatize(t)
            tokens.append(stemmed)
    
    return ' '.join(tokens)

In [41]:
# Yet a more compact way to write the code
def preprocessing(line: str) -> str:
    line = str(line).translate(translation)
    line = nltk.word_tokenize(line.lower())
    
    line = [lemmatizer.lemmatize(t, pos = 'v') for t in line if t not in stop]
    line = [lemmatizer.lemmatize(t, pos = 'n') for t in line if t not in stop]
    return ' '.join(line)

In [42]:
test_str = "I bought it yesterday and I really love apples!"
preprocessing(test_str)

'buy yesterday really love apple'

In [43]:
# Preprocess all data
pos_data = [preprocessing(p) for p in pos['reviewText']]
neg_data = [preprocessing(p) for p in neg['reviewText']]

In [26]:
# Yet a more modern way to write code
pos_data = list(map(preprocessing, pos['reviewText']))
neg_data = list(map(preprocessing, neg['reviewText']))

### Some modern functions to introduce
- map
- reduce
- filter

They are very useful when running the project on a cluster or distributed compute system like Hadoop or Spark.

In [44]:
# Some useful modern functions
l = [0,1,2,3,4,5,6,7,8,9]

# Map
def square(x: int) -> int:
    return x * x

print( list(map(square, l)) )

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [45]:
# Using lambda function

print( list(map(lambda x: x * x, l)) )

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]


In [46]:
[x * x for x in l]

[0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

In [47]:
# Reduce
# reduce function is moved to functools
def add(x: int, y: int) -> int:
    return x + y

rst = functools.reduce(add, l)
print ("reduce", l, "by add:", rst)

reduce [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] by add: 45


In [48]:
# Using lambda function
# reduce is moved to functools in Python 3
rst = functools.reduce(lambda x, y: x + y, l)
print ("reduce", l, "by add:", rst)

reduce [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] by add: 45


In [49]:
rst = functools.reduce(lambda x, y: min(x, y), l)
print ("reduce", l, "by min:", rst)

reduce [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] by min: 0


In [50]:
# Filter
# Much faster than loop, similar with list comprehension
list(filter(lambda x: x < 5, l))

[0, 1, 2, 3, 4]

### Split Training Data & Test Data

In [56]:
pos.head()

Unnamed: 0,overall,reviewText
0,pos,This book ended even before it started and it ...
1,pos,This is a great read with so much emotion you ...
2,pos,"It&#8217;s Christmas Eve and miraculously, Sal..."
3,pos,I enjoyed meeting the character of Cassandra. ...
4,pos,"Can I be the next Hunter wife? Again, I have ..."


In [51]:
data = pos_data + neg_data
labels = np.concatenate((pos['overall'].values,neg['overall'].values))

In [54]:
labels

array(['pos', 'pos', 'pos', ..., 'neg', 'neg', 'neg'], dtype=object)

In [52]:
# Split data into training set and testing set (20:80)
# stratify: make sure pos/neg remains the same in training set and testing set
train_data, test_data, train_labels, test_labels = \
train_test_split(
    data, 
    labels, 
    test_size=0.2, 
    stratify=labels, 
    random_state=1234
)

In [53]:
print("training size = ", len(train_data), "testing size = ", len(test_data))

training size =  101496 testing size =  25375


#### Underfitting vs Overfitting
![](http://scikit-learn.org/stable/_images/sphx_glr_plot_underfitting_overfitting_001.png)

Common Method:
- 20:80 Split
- K-fold

To estimate accuracy (f-score):
- 20:20:60 Split
- 10:20:70 Split

In [57]:
# Push all tokens and compute frequency of words
t = []
for line in train_data:
    l = nltk.word_tokenize(line)
    for w in l:
        t.append(w)
        
word_features = nltk.FreqDist(t)

In [58]:
# Yet another more python-y style
tokens = [word for line in train_data \
               for word in nltk.word_tokenize(line)]

word_features = nltk.FreqDist(tokens)

In [59]:
print(word_features)

<FreqDist with 76266 samples and 4188749 outcomes>


In [60]:
word_features.most_common(10)

[('book', 124119),
 ('read', 74580),
 ('story', 62132),
 ('like', 39143),
 ('one', 36880),
 ('love', 35376),
 ('get', 33777),
 ('character', 30268),
 ('good', 28930),
 ('would', 27660)]

In [61]:
topwords = [fpair[0] for fpair in list(word_features.most_common(10000))]

### Vectorizer

In [62]:
cnt_vec = CountVectorizer()
cnt_vec

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [63]:
# Create our BAG of words (specify words we care about)
cnt_fit = cnt_vec.fit_transform([' '.join(topwords)])
cnt_fit

<1x9968 sparse matrix of type '<class 'numpy.int64'>'
	with 9968 stored elements in Compressed Sparse Row format>

#### Tf–idf term weighting

- Tf: term-frequency
- idf: inverse document-frequency
- Tf-idf = $tf(t,d) \times idf(t)$

$$
idf(t) = log{\frac{1 + nd}{1 + df(d, t)}} + 1
$$

![](http://www.onemathematicalcat.org/Math/Algebra_II_obj/Graphics/log_base_gt1.gif)

> Sentent 1: The boy **love** the toy

> Sentent 2: The boy **hate** the toy

In [64]:
transformer = TfidfTransformer()
transformer

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [65]:
# Assume vocabulary = ['The', 'boy', 'the', 'toy', 'love', 'hate']
counts = [[1, 1, 1, 1, 1, 0],
          [1, 1, 1, 1, 0, 1]]
tfidf = transformer.fit_transform(counts)
tfidf

<2x6 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [66]:
tfidf.toarray()

array([[ 0.4090901 ,  0.4090901 ,  0.4090901 ,  0.4090901 ,  0.57496187,
         0.        ],
       [ 0.4090901 ,  0.4090901 ,  0.4090901 ,  0.4090901 ,  0.        ,
         0.57496187]])

In [67]:
tf_trans = TfidfTransformer()
tf_fit = tf_trans.fit_transform(cnt_fit)
tf_fit

<1x9968 sparse matrix of type '<class 'numpy.float64'>'
	with 9968 stored elements in Compressed Sparse Row format>

In [68]:
# Since CountVectorizer and TfidTransformer are often used together
# There is a class named TfidfVectorizer that combine these two steps
tf_vec = TfidfVectorizer()
tf_fit = tf_vec.fit_transform([' '.join(topwords)])
tf_fit

<1x9968 sparse matrix of type '<class 'numpy.float64'>'
	with 9968 stored elements in Compressed Sparse Row format>

### Feature Extraction

In [69]:
# Extract features from training set
# Vocabulary is from topwords
train_features = tf_vec.transform(train_data)

# cnt_train_features = cnt_vec.transform(train_data)
# train_features = tf_trans.transform(cnt_train_features)

In [70]:
# Array[n_train_data * n_features]
train_features.shape

(101496, 9968)

In [71]:
# Extract features from test set
test_features = tf_vec.transform(test_data)

# cnt_test_features = cnt_vec.transform(test_data)
# test_features = tf_trans.transform(cnt_test_features)

In [72]:
# (Uni+Bi)-Gram
bg_tf_vec = TfidfVectorizer(ngram_range=(1,2))
bg_tf_vec.fit([' '.join(topwords)])
bg_train_features = bg_tf_vec.transform(train_data)

bg_train_features.shape

# Array[n_train_data * (uni_gram_features + bi_gram_features)]

(101496, 19935)

In [73]:
# Extract (uni+bi)-gram test features
bg_test_features = bg_tf_vec.transform(test_data)

### [Multinomial NB](http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

The multinomial Naive Bayes classifier is suitable for **classification with discrete features** (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [74]:
from sklearn.naive_bayes import MultinomialNB

In [75]:
mnb_model = MultinomialNB()
mnb_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [76]:
# Train Model
start = time.time()
mnb_model.fit(train_features, train_labels)
end = time.time()

print("Multinomial NB model trained in %f seconds" % (end-start))

Multinomial NB model trained in 0.268600 seconds


In [77]:
# Predict
pred = mnb_model.predict(test_features)
print(pred)

['neg' 'pos' 'pos' ..., 'neg' 'pos' 'neg']


In [78]:
# Metrics
# metrics.accuracy_score(y_true, y_pred)
accuracy = metrics.accuracy_score(pred,test_labels)
print(accuracy)

0.816945812808


In [79]:
# Use keyword arguments to set arguments explicitly
print(metrics.classification_report(y_true=test_labels, y_pred=pred))

             precision    recall  f1-score   support

        neg       0.82      0.80      0.81     12463
        pos       0.81      0.84      0.82     12912

avg / total       0.82      0.82      0.82     25375



In [80]:
# Example from sklearn documentation

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ['class 0', 'class 1', 'class 2']
print(metrics.classification_report(y_true, y_pred, target_names=target_names))

             precision    recall  f1-score   support

    class 0       0.50      1.00      0.67         1
    class 1       0.00      0.00      0.00         1
    class 2       1.00      0.67      0.80         3

avg / total       0.70      0.60      0.61         5



#### Train & test using Uni-Gram + Bi-Gram features

In [81]:
# Train & test using (uni+bi)-gram features
bg_mnb_model = MultinomialNB()
bg_mnb_model.fit(bg_train_features, train_labels)
bg_pred = bg_mnb_model.predict(bg_test_features)
print(bg_pred)

['neg' 'pos' 'pos' ..., 'neg' 'pos' 'neg']


In [82]:
# Statistics
bg_accuracy = metrics.accuracy_score(bg_pred,test_labels)
print(bg_accuracy)

0.816157635468


In [83]:
print(metrics.classification_report(y_true=test_labels, y_pred=bg_pred))

             precision    recall  f1-score   support

        neg       0.83      0.79      0.81     12463
        pos       0.81      0.84      0.82     12912

avg / total       0.82      0.82      0.82     25375



### Other possible models

#### Linear SVM

In [84]:
from sklearn.svm import LinearSVC

svc_model = LinearSVC()
print(svc_model, end='\n'*2)

start = time.time()
svc_model.fit(train_features, train_labels)
end = time.time()
print('SVC model trained in: %.2fs' % (end - start), end='\n'*2)
svc_pred = svc_model.predict(test_features)

print('Accuracy = %.2f' % metrics.accuracy_score(svc_pred, test_labels))
print(metrics.classification_report(y_pred=svc_pred, y_true=test_labels))

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

SVC model trained in: 1.96s

Accuracy = 0.83
             precision    recall  f1-score   support

        neg       0.83      0.82      0.82     12463
        pos       0.83      0.83      0.83     12912

avg / total       0.83      0.83      0.83     25375



#### Logistic Regression

In [85]:
from sklearn.linear_model import LogisticRegression

lr_model = LinearSVC()
print(lr_model, end='\n'*2)

start = time.time()
lr_model.fit(train_features, train_labels)
end = time.time()
print('SVC model trained in: %.2fs' % (end - start), end='\n'*2)
lr_pred = lr_model.predict(test_features)

print('Accuracy = %.2f' % metrics.accuracy_score(lr_pred, test_labels))
print(metrics.classification_report(y_pred=lr_pred, y_true=test_labels))

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

SVC model trained in: 1.87s

Accuracy = 0.83
             precision    recall  f1-score   support

        neg       0.83      0.82      0.82     12463
        pos       0.83      0.83      0.83     12912

avg / total       0.83      0.83      0.83     25375



### Predict new sentences

In [86]:
# Predict a new sentence
# vectorizer needs to be pre-fitted
# At the end of the project, the function signature should be something like:
# predict_new(sentent: str, vec, model) -> str

def predict_new(sentence: str):
    sentence = preprocessing(sentence)
    features = tf_vec.transform([sentence])
    pred = mnb_model.predict(features)
    return pred[0]

In [89]:
predict_new("not good")

'neg'

### Save model

In [88]:
import pickle

# Save vectorizer
with open('tf_vec.pkl', 'wb') as pkl_file:
    pickle.dump(tf_vec, pkl_file)

In [None]:
# Save model
with open('mnb_model.pkl', 'wb') as pkl_file:
    pickle.dump(mnb_model, pkl_file)