## Sparse matrix representation of the corpus using `CountVectorizer`

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Toy corpus: four short sentences that share most of their words
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Default CountVectorizer: learn the vocabulary and build the
# document-term count matrix in a single call
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# The matrix is sparse; displaying it shows shape / dtype / stored elements
X

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [3]:
# Feature names in column order, read from the fitted vocabulary
# (a dict mapping term -> column index).
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# sorting the vocabulary by column index gives the same list on old and new
# releases alike.
sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

In [4]:
# Show the vectorizer's configuration; it was constructed with no arguments,
# so the values displayed are the defaults
vectorizer.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.int64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': None,
 'min_df': 1,
 'ngram_range': (1, 1),
 'preprocessor': None,
 'stop_words': None,
 'strip_accents': None,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'vocabulary': None}

In [5]:
# Bag of words representation: densify the sparse count matrix so it can be
# read directly (one row per document, one column per vocabulary term)
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [6]:
# Learned vocabulary: maps each term to its column index in the count matrix
vectorizer.vocabulary_

{'this': 8,
 'is': 3,
 'the': 6,
 'first': 2,
 'document': 1,
 'second': 5,
 'and': 0,
 'third': 7,
 'one': 4}

## Tokenize the corpus

In [8]:
from nltk.tokenize import word_tokenize

In [11]:
# Count vectorizer that delegates tokenization to NLTK's word_tokenize.
# Unlike the default regex tokenizer, this keeps punctuation marks as tokens,
# so special characters get their own columns. token_pattern is set to None
# because a custom tokenizer is supplied.
vectorizer = CountVectorizer(token_pattern=None, tokenizer=word_tokenize)

# Learn the vocabulary from the corpus, then encode that same corpus;
# fit() returns the fitted vectorizer, so the two steps can be chained
corpus_transformed = vectorizer.fit(corpus).transform(corpus)

# Dense view of the resulting count matrix
corpus_transformed.toarray()

array([[1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1],
       [1, 0, 0, 2, 0, 1, 0, 1, 1, 0, 1],
       [1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [12]:
# Vocabulary learned with the NLTK tokenizer: term -> column index
vectorizer.vocabulary_

{'this': 10,
 'is': 5,
 'the': 8,
 'first': 4,
 'document': 3,
 '.': 0,
 'second': 7,
 'and': 2,
 'third': 9,
 'one': 6,
 '?': 1}

## Using the same methodology on the IMDB dataset

In [22]:
import time
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_extraction.text import CountVectorizer

In [62]:
# Load the IMDB reviews from the CSV file into a dataframe
imdb_csv_path = "../data/IMDB Dataset.csv"
df = pd.read_csv(imdb_csv_path)

# Peek at the first row to confirm the columns
df.head(1)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive


In [63]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0
sentiment_to_int = {"positive": 1, "negative": 0}
df["sentiment"] = df["sentiment"].map(sentiment_to_int)

# Placeholder fold id; -1 marks rows not yet assigned to a fold
df["kfold"] = -1

df.head()

Unnamed: 0,review,sentiment,kfold
0,One of the other reviewers has mentioned that ...,1,-1
1,A wonderful little production. <br /><br />The...,1,-1
2,I thought this was a wonderful way to spend ti...,1,-1
3,Basically there's a family where a little boy ...,0,-1
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,-1


In [64]:
# Shuffle the dataframe. The fixed seed keeps the row order, and therefore the
# fold assignment and every score computed from it, reproducible across
# kernel restarts.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# fetch the labels used for stratification
y = df.sentiment.values

# instantiate kfold CV
kf = model_selection.StratifiedKFold(n_splits=5)

# create stratified folds: every row is tagged with the index of the split
# in which it belongs to the validation part
for idx, (t_, v_) in enumerate(kf.split(X=df, y=y)):
    df.loc[v_, "kfold"] = idx

df.head()

Unnamed: 0,review,sentiment,kfold
0,It is a real shame that nearly no one under 30...,0,0
1,I have read the novel Reaper of Ben Mezrich a ...,0,0
2,This movie was very very mediocre and very ver...,0,0
3,"Okay, some other people have commented that th...",0,0
4,somewhere i'd read that this film is supposed ...,0,0


In [65]:
# Sanity check: count the rows assigned to each fold; no row should still
# carry the -1 placeholder
df.kfold.value_counts()

4    10000
3    10000
2    10000
1    10000
0    10000
Name: kfold, dtype: int64

In [23]:
# Cross-validate bag-of-words counts + logistic regression over the 5 folds
for f_ in range(5):
    start_time = time.time()

    # rows tagged with fold f_ form the validation set, all others train
    train_df = df[df["kfold"] != f_].reset_index(drop=True)
    valid_df = df[df["kfold"] == f_].reset_index(drop=True)

    # initialize the count vectorizer using word_tokenize as the tokenizer
    vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

    # learn the vocabulary and encode the training reviews in one pass;
    # calling fit() and then transform() on the same text would run the
    # tokenizer over every training review twice
    xtrain = vectorizer.fit_transform(train_df.review)

    # validation reviews are only transformed, so the vocabulary comes from
    # the training split alone
    xvalid = vectorizer.transform(valid_df.review)

    # initialize logistic regression
    model = linear_model.LogisticRegression(n_jobs=-1)

    # fit the model on training data
    model.fit(xtrain, train_df.sentiment)

    # hard predictions for accuracy, positive-class probabilities for ROC AUC
    preds = model.predict(xvalid)
    preds_proba = model.predict_proba(xvalid)[:, 1]

    # calculate accuracy and roc score
    accuracy = metrics.accuracy_score(valid_df.sentiment, preds)
    roc = metrics.roc_auc_score(valid_df.sentiment, preds_proba)

    # display results
    print(f"Fold: {f_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC score: {roc:.4f}")
    print(f"Time elapsed: {(time.time() - start_time):.4f} seconds.")
    print("")

Fold: 0
Accuracy: 0.8943
ROC AUC score: 0.9558
Time elapsed: 311.2196 seconds.

Fold: 1
Accuracy: 0.8941
ROC AUC score: 0.9547
Time elapsed: 319.5518 seconds.

Fold: 2
Accuracy: 0.8942
ROC AUC score: 0.9566
Time elapsed: 308.2627 seconds.

Fold: 3
Accuracy: 0.8928
ROC AUC score: 0.9559
Time elapsed: 307.9073 seconds.

Fold: 4
Accuracy: 0.8925
ROC AUC score: 0.9562
Time elapsed: 306.1910 seconds.



## Trying some other models

In [25]:
from sklearn import naive_bayes

# Cross-validate bag-of-words counts + multinomial naive Bayes over the 5 folds
for f_ in range(5):
    start_time = time.time()

    # rows tagged with fold f_ form the validation set, all others train
    train_df = df[df["kfold"] != f_].reset_index(drop=True)
    valid_df = df[df["kfold"] == f_].reset_index(drop=True)

    # initialize the count vectorizer using word_tokenize as the tokenizer
    vectorizer = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

    # learn the vocabulary and encode the training reviews in one pass;
    # calling fit() and then transform() on the same text would run the
    # tokenizer over every training review twice
    xtrain = vectorizer.fit_transform(train_df.review)

    # validation reviews are only transformed, so the vocabulary comes from
    # the training split alone
    xvalid = vectorizer.transform(valid_df.review)

    # initialize naive bayes model
    model = naive_bayes.MultinomialNB()

    # fit the model on training data
    model.fit(xtrain, train_df.sentiment)

    # make predictions on validation set
    preds = model.predict(xvalid)

    # calculate accuracy (only accuracy is reported for this model)
    accuracy = metrics.accuracy_score(valid_df.sentiment, preds)

    # display results
    print(f"Fold: {f_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Time elapsed: {(time.time() - start_time):.4f} seconds.")
    print("")

Fold: 0
Accuracy: 0.8384
Time elapsed: 296.3420 seconds.

Fold: 1
Accuracy: 0.8350
Time elapsed: 296.7659 seconds.

Fold: 2
Accuracy: 0.8516
Time elapsed: 300.0909 seconds.

Fold: 3
Accuracy: 0.8469
Time elapsed: 308.1134 seconds.

Fold: 4
Accuracy: 0.8460
Time elapsed: 295.8493 seconds.



## Using TF-IDF (`TfidfVectorizer`) instead of the count vectorizer

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Same toy corpus as in the CountVectorizer example
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# TF-IDF vectorizer that tokenizes with NLTK's word_tokenize
tfidf = TfidfVectorizer(token_pattern=None, tokenizer=word_tokenize)

# Learn the vocabulary / IDF weights and encode the corpus in one call
X = tfidf.fit_transform(corpus)

In [34]:
# Dense view of the TF-IDF matrix: one row per document, one column per term
X.toarray()

array([[0.42520648, 0.        , 0.        , 0.42520648, 0.5252146 ,
        0.34763416, 0.        , 0.        , 0.34763416, 0.        ,
        0.34763416],
       [0.32513203, 0.        , 0.        , 0.65026407, 0.        ,
        0.26581674, 0.        , 0.50938216, 0.26581674, 0.        ,
        0.26581674],
       [0.31055267, 0.        , 0.48654076, 0.        , 0.        ,
        0.25389715, 0.48654076, 0.        , 0.25389715, 0.48654076,
        0.25389715],
       [0.        , 0.59276931, 0.        , 0.37835697, 0.46734613,
        0.30933162, 0.        , 0.        , 0.30933162, 0.        ,
        0.30933162]])

In [35]:
# Vocabulary learned by the TF-IDF vectorizer: term -> column index
tfidf.vocabulary_

{'this': 10,
 'is': 5,
 'the': 8,
 'first': 4,
 'document': 3,
 '.': 0,
 'second': 7,
 'and': 2,
 'third': 9,
 'one': 6,
 '?': 1}

In [38]:
# (row, column) positions of the non-zero TF-IDF weights.
# Dedicated names are used instead of `x, y` so that the label array `y`
# created when the folds were built is not silently overwritten.
nz_rows, nz_cols = np.where(X.toarray())
print(tuple(zip(nz_rows, nz_cols)))

((0, 0), (0, 3), (0, 4), (0, 5), (0, 8), (0, 10), (1, 0), (1, 3), (1, 5), (1, 7), (1, 8), (1, 10), (2, 0), (2, 2), (2, 5), (2, 6), (2, 8), (2, 9), (2, 10), (3, 1), (3, 3), (3, 4), (3, 5), (3, 8), (3, 10))


In [39]:
# Cross-validate TF-IDF features + logistic regression over the 5 folds
for f_ in range(5):
    start_time = time.time()

    # rows tagged with fold f_ form the validation set, all others train
    train_df = df[df["kfold"] != f_].reset_index(drop=True)
    valid_df = df[df["kfold"] == f_].reset_index(drop=True)

    # initialize the TF-IDF vectorizer using word_tokenize as the tokenizer
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

    # learn the vocabulary / IDF weights and encode the training reviews in
    # one pass; calling fit() and then transform() on the same text would
    # run the tokenizer over every training review twice
    xtrain = vectorizer.fit_transform(train_df.review)

    # validation reviews are only transformed, so vocabulary and IDF weights
    # come from the training split alone
    xvalid = vectorizer.transform(valid_df.review)

    # initialize logistic regression
    model = linear_model.LogisticRegression(n_jobs=-1)

    # fit the model on training data
    model.fit(xtrain, train_df.sentiment)

    # hard predictions for accuracy, positive-class probabilities for ROC AUC
    preds = model.predict(xvalid)
    preds_proba = model.predict_proba(xvalid)[:, 1]

    # calculate accuracy and roc score
    accuracy = metrics.accuracy_score(valid_df.sentiment, preds)
    roc = metrics.roc_auc_score(valid_df.sentiment, preds_proba)

    # display results
    print(f"Fold: {f_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC score: {roc:.4f}")
    print(f"Time elapsed: {(time.time() - start_time):.2f} seconds.")
    print("")

Fold: 0
Accuracy: 0.8961
ROC AUC score: 0.9605
Time elapsed: 307.82 seconds.

Fold: 1
Accuracy: 0.8944
ROC AUC score: 0.9595
Time elapsed: 320.43 seconds.

Fold: 2
Accuracy: 0.8991
ROC AUC score: 0.9613
Time elapsed: 305.38 seconds.

Fold: 3
Accuracy: 0.8975
ROC AUC score: 0.9607
Time elapsed: 308.70 seconds.

Fold: 4
Accuracy: 0.8977
ROC AUC score: 0.9634
Time elapsed: 310.83 seconds.



In [40]:
# Cross-validate TF-IDF (unigrams + bigrams) + logistic regression over the 5 folds
for f_ in range(5):
    start_time = time.time()

    # rows tagged with fold f_ form the validation set, all others train
    train_df = df[df["kfold"] != f_].reset_index(drop=True)
    valid_df = df[df["kfold"] == f_].reset_index(drop=True)

    # initialize the TF-IDF vectorizer using word_tokenize as the tokenizer;
    # ngram_range=(1, 2) adds word bigrams next to the single words
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                                 token_pattern=None,
                                 ngram_range=(1, 2))

    # learn the vocabulary / IDF weights and encode the training reviews in
    # one pass; calling fit() and then transform() on the same text would
    # run the tokenizer over every training review twice
    xtrain = vectorizer.fit_transform(train_df.review)

    # validation reviews are only transformed, so vocabulary and IDF weights
    # come from the training split alone
    xvalid = vectorizer.transform(valid_df.review)

    # initialize logistic regression
    model = linear_model.LogisticRegression(n_jobs=-1)

    # fit the model on training data
    model.fit(xtrain, train_df.sentiment)

    # hard predictions for accuracy, positive-class probabilities for ROC AUC
    preds = model.predict(xvalid)
    preds_proba = model.predict_proba(xvalid)[:, 1]

    # calculate accuracy and roc score
    accuracy = metrics.accuracy_score(valid_df.sentiment, preds)
    roc = metrics.roc_auc_score(valid_df.sentiment, preds_proba)

    # display results
    print(f"Fold: {f_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC score: {roc:.4f}")
    print(f"Time elapsed: {(time.time() - start_time):.2f} seconds.")
    print("")

Fold: 0
Accuracy: 0.8982
ROC AUC score: 0.9625
Time elapsed: 443.80 seconds.

Fold: 1
Accuracy: 0.8993
ROC AUC score: 0.9617
Time elapsed: 443.94 seconds.

Fold: 2
Accuracy: 0.9010
ROC AUC score: 0.9638
Time elapsed: 412.41 seconds.

Fold: 3
Accuracy: 0.9015
ROC AUC score: 0.9629
Time elapsed: 420.32 seconds.

Fold: 4
Accuracy: 0.8991
ROC AUC score: 0.9650
Time elapsed: 443.36 seconds.



## Stemming and Lemmatization

In [41]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [43]:
# WordNet lemmatizer.
# Note: lemmatization keeps the meaning of the sentences intact.
lemma = WordNetLemmatizer()

# Snowball stemmer for English.
# Note: stemming doesn't guarantee the final word would still have some meaning.
stemmer = SnowballStemmer("english")

words = ["go", "goa", "gone", "going", "broad", "broaden", "broadening"]

# Print each word next to its stem and its lemma, one blank line between words
for word in words:
    print(
        f"word: {word}",
        f"stemed word={stemmer.stem(word)}",
        f"lemmatized word={lemma.lemmatize(word)}",
        "",
        sep="\n",
    )

word: go
stemed word=go
lemmatized word=go

word: goa
stemed word=goa
lemmatized word=goa

word: gone
stemed word=gone
lemmatized word=gone

word: going
stemed word=go
lemmatized word=going

word: broad
stemed word=broad
lemmatized word=broad

word: broaden
stemed word=broaden
lemmatized word=broaden

word: broadening
stemed word=broaden
lemmatized word=broadening



In [60]:
# Create functions to lemmatize and stem the reviews
def lemmatize_text(text, pos="n"):
    """Lemmatize every whitespace-separated token of ``text``.

    Parameters
    ----------
    text : str
        Input text. Tokens come from ``str.split()``, so punctuation stays
        attached to the neighbouring word.
    pos : str, default "n"
        WordNet part-of-speech tag passed to the lemmatizer for every token.
        The default ("n", noun) matches what the lemmatizer does when no tag
        is given; with it, verbs are treated as nouns, which is why "was"
        comes out as "wa". Pass "v" to lemmatize the tokens as verbs instead.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    return [lemma.lemmatize(w, pos=pos) for w in text.split()]

def stem_text(text):
    """Return the stem of each whitespace-separated token of ``text``, in order."""
    stems = []
    for token in text.split():
        stems.append(stemmer.stem(token))
    return stems

# Three short sentences to compare stemming and lemmatization side by side
text_df = pd.DataFrame(
    {"text": ["this was cheesy", "she likes these books", "wow this is great"]}
)

# Process each sentence token by token, then join the tokens back with spaces
text_df["text_stemmed"] = text_df["text"].map(
    lambda sentence: " ".join(stem_text(sentence))
)
text_df["text_lemmatized"] = text_df["text"].map(
    lambda sentence: " ".join(lemmatize_text(sentence))
)
text_df

Unnamed: 0,text,text_stemmed,text_lemmatized
0,this was cheesy,this was cheesi,this wa cheesy
1,she likes these books,she like these book,she like these book
2,wow this is great,wow this is great,wow this is great


In [66]:
# Start the clock for the preprocessing step
start = time.time()

# Stemmed version of every review: stem token by token, re-join with spaces
df["review_stemmed"] = df["review"].map(lambda review: " ".join(stem_text(review)))

# Lemmatized version of every review, built the same way
df["review_lemmatized"] = df["review"].map(
    lambda review: " ".join(lemmatize_text(review))
)

print(f"Time taken to stem and lemmatize the original dataframe: {(time.time() - start):.2f} seconds.")

# Cross-validate TF-IDF (unigrams + bigrams) on the stemmed reviews over the 5 folds
for f_ in range(5):
    start_time = time.time()

    # rows tagged with fold f_ form the validation set, all others train
    train_df = df[df["kfold"] != f_].reset_index(drop=True)
    valid_df = df[df["kfold"] == f_].reset_index(drop=True)

    # initialize the TF-IDF vectorizer using word_tokenize as the tokenizer;
    # ngram_range=(1, 2) adds word bigrams next to the single words
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                                 token_pattern=None,
                                 ngram_range=(1, 2))

    # learn the vocabulary / IDF weights and encode the stemmed training
    # reviews in one pass; calling fit() and then transform() on the same
    # text would run the tokenizer over every training review twice
    xtrain = vectorizer.fit_transform(train_df.review_stemmed)

    # validation reviews are only transformed, so vocabulary and IDF weights
    # come from the training split alone
    xvalid = vectorizer.transform(valid_df.review_stemmed)

    # initialize logistic regression
    model = linear_model.LogisticRegression(n_jobs=-1)

    # fit the model on training data
    model.fit(xtrain, train_df.sentiment)

    # hard predictions for accuracy, positive-class probabilities for ROC AUC
    preds = model.predict(xvalid)
    preds_proba = model.predict_proba(xvalid)[:, 1]

    # calculate accuracy and roc score
    accuracy = metrics.accuracy_score(valid_df.sentiment, preds)
    roc = metrics.roc_auc_score(valid_df.sentiment, preds_proba)

    # display results
    print(f"Fold: {f_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC score: {roc:.4f}")
    print(f"Time elapsed: {(time.time() - start_time):.2f} seconds.")
    print("")

Time taken to stem and lemmatize the original dataframe: 262.71 seconds.
Fold: 0
Accuracy: 0.8985
ROC AUC score: 0.9617
Time elapsed: 406.04 seconds.

Fold: 1
Accuracy: 0.8964
ROC AUC score: 0.9626
Time elapsed: 423.30 seconds.

Fold: 2
Accuracy: 0.9053
ROC AUC score: 0.9632
Time elapsed: 431.30 seconds.

Fold: 3
Accuracy: 0.8970
ROC AUC score: 0.9611
Time elapsed: 425.73 seconds.

Fold: 4
Accuracy: 0.8935
ROC AUC score: 0.9603
Time elapsed: 405.86 seconds.



In [67]:
# Cross-validate TF-IDF (unigrams + bigrams) on the lemmatized reviews over the 5 folds
for f_ in range(5):
    start_time = time.time()

    # rows tagged with fold f_ form the validation set, all others train
    train_df = df[df["kfold"] != f_].reset_index(drop=True)
    valid_df = df[df["kfold"] == f_].reset_index(drop=True)

    # initialize the TF-IDF vectorizer using word_tokenize as the tokenizer;
    # ngram_range=(1, 2) adds word bigrams next to the single words
    vectorizer = TfidfVectorizer(tokenizer=word_tokenize,
                                 token_pattern=None,
                                 ngram_range=(1, 2))

    # learn the vocabulary / IDF weights and encode the lemmatized training
    # reviews in one pass; calling fit() and then transform() on the same
    # text would run the tokenizer over every training review twice
    xtrain = vectorizer.fit_transform(train_df.review_lemmatized)

    # validation reviews are only transformed, so vocabulary and IDF weights
    # come from the training split alone
    xvalid = vectorizer.transform(valid_df.review_lemmatized)

    # initialize logistic regression
    model = linear_model.LogisticRegression(n_jobs=-1)

    # fit the model on training data
    model.fit(xtrain, train_df.sentiment)

    # hard predictions for accuracy, positive-class probabilities for ROC AUC
    preds = model.predict(xvalid)
    preds_proba = model.predict_proba(xvalid)[:, 1]

    # calculate accuracy and roc score
    accuracy = metrics.accuracy_score(valid_df.sentiment, preds)
    roc = metrics.roc_auc_score(valid_df.sentiment, preds_proba)

    # display results
    print(f"Fold: {f_}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"ROC AUC score: {roc:.4f}")
    print(f"Time elapsed: {(time.time() - start_time):.2f} seconds.")
    print("")

Fold: 0
Accuracy: 0.8986
ROC AUC score: 0.9623
Time elapsed: 417.90 seconds.

Fold: 1
Accuracy: 0.8997
ROC AUC score: 0.9633
Time elapsed: 423.84 seconds.

Fold: 2
Accuracy: 0.9035
ROC AUC score: 0.9641
Time elapsed: 437.06 seconds.

Fold: 3
Accuracy: 0.8981
ROC AUC score: 0.9625
Time elapsed: 432.23 seconds.

Fold: 4
Accuracy: 0.8950
ROC AUC score: 0.9617
Time elapsed: 438.82 seconds.

