In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, Masking, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,cross_val_score,cross_val_predict
from sklearn.metrics import accuracy_score
import xgboost
import gensim
from gensim.models import Word2Vec,KeyedVectors
from nltk import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import StratifiedKFold,KFold,StratifiedShuffleSplit
import re,string,unicodedata
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import warnings
warnings.filterwarnings("ignore")
import nltk

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

##### 1. Download data from Lab 2 folder on Canvas.

In [4]:
data = pd.read_csv("Part4_Dataset.csv")

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [6]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
data.describe().transpose()

Unnamed: 0,count,unique,top,freq
review,50000,49582,Loved today's show!!! It was a variety and not...,5
sentiment,50000,2,positive,25000


In [8]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

 - We see the data is balanced

In [9]:
df = data.copy()

In [10]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [11]:
# converting the sentiments to binary for reference

train_li=[]
for i in range(len(df)):
    if (df['sentiment'][i]=='positive'):
        train_li.append(1)
    else:
        train_li.append(0)
df['Binary']=train_li
df.head()

Unnamed: 0,review,sentiment,Binary
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


Data Preprocessing

In [12]:
#Removes Punctuations
def remove_punctuations(data):
    punct_tag=re.compile(r'[^\w\s]')
    data=punct_tag.sub(r'',data)
    return data

#Removes HTML syntaxes
def remove_html(data):
    html_tag=re.compile(r'<.*?>')
    data=html_tag.sub(r'',data)
    return data

#Removes URL data
def remove_url(data):
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

#Removes Emojis
def remove_emoji(data):
    emoji_clean= re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    data=emoji_clean.sub(r'',data)
    url_clean= re.compile(r"https://\S+|www\.\S+")
    data=url_clean.sub(r'',data)
    return data

In [13]:
#Lemmatize the corpus for root words
def lemma_traincorpus(data):
    lemmatizer=WordNetLemmatizer()
    out_data=""
    for words in data:
        out_data+= lemmatizer.lemmatize(words)
    return out_data

In [14]:
df['review']=df['review'].apply(lambda z: remove_punctuations(z))
df['review']=df['review'].apply(lambda z: remove_html(z))
df['review']=df['review'].apply(lambda z: remove_url(z))
df['review']=df['review'].apply(lambda z: remove_emoji(z))
count_good=df[df['sentiment']=='positive']
count_bad=df[df['sentiment']=='negative']
df['review']=df['review'].apply(lambda z: lemma_traincorpus(z))

##### TFIDF Vectorizer

TFIDF vectorization is non semantic frequency based algorithm which uses a logarithmic distribution over document frequencies to embed vectors based on normalized frequency of occurence of words in the corpus.

In [15]:
#TFIDF Vectorize the Data

def tfidf(data):
    tfidfv = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)
    fit_data_tfidf=tfidfv.fit_transform(data)
    return fit_data_tfidf

train_set=tfidf(df['review'])

In [16]:
#Train Test Split

train_y=df['sentiment']
train_x,test_x,train_y,test_y=train_test_split(train_set,train_y,test_size=0.2,random_state=42)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((40000, 150000), (40000,), (10000, 150000), (10000,))

In [17]:
#Applying Logistic Regression on split tfidf baseline
model=LogisticRegression()
model.fit(train_x,train_y)
pred=model.predict(test_x)
print("Evaluate confusion matrix for LR")
print(confusion_matrix(test_y,pred))
print(f"Accuracy Score for LR with C=1.0  ={accuracy_score(test_y,pred)}")

Evaluate confusion matrix for LR
[[4367  594]
 [ 419 4620]]
Accuracy Score for LR with C=1.0  =0.8987


In [18]:
models=[]
models.append(('LogisticRregression',LogisticRegression(C=1.0,penalty='l2')))
models.append(('KNearestNeighbors',KNeighborsClassifier()))
models.append(('DecisionTree',DecisionTreeClassifier(criterion='entropy')))

In [19]:
scoring='accuracy'
print("Statistical Model TFIDF- Baseline Evaluation")
for name,model in models:
    kfold=KFold(n_splits=10)
    results=cross_val_score(model,train_x,train_y,cv=kfold)
    predictions=cross_val_predict(model,test_x,test_y)
    accuracy = accuracy_score(predictions,test_y)
    print("=======================")
    print("Classifiers: ",name, "Has a training score of", round(results.mean(), 2) * 100, "% accuracy score")
    print("Classifiers: ",name, "Has a testing score of", round(accuracy, 2) * 100, "% accuracy score")

Statistical Model TFIDF- Baseline Evaluation
Classifiers:  LogisticRregression Has a training score of 89.0 % accuracy score
Classifiers:  LogisticRregression Has a testing score of 87.0 % accuracy score
Classifiers:  KNearestNeighbors Has a training score of 78.0 % accuracy score
Classifiers:  KNearestNeighbors Has a testing score of 74.0 % accuracy score
Classifiers:  DecisionTree Has a training score of 72.0 % accuracy score
Classifiers:  DecisionTree Has a testing score of 71.0 % accuracy score


From the Kfold cross validation results we can infer logistic regression model suits best for the TFIDF vectorized data. 

##### Count Vectorizer

This is a simpler vectorization technique which relies on frequency of occurence of a particular term in a document or corpus.

In [20]:
#count Vectorize the Data

def count_vec(data):
    count_v = CountVectorizer(stop_words='english', ngram_range=(1, 2), lowercase=True, max_features=150000)
    fit_data_count_v=count_v.fit_transform(data)
    return fit_data_count_v

train_set=count_vec(df['review'])

In [21]:
#Train Test Split

train_y=df['sentiment']
train_x,test_x,train_y,test_y=train_test_split(train_set,train_y,test_size=0.2,random_state=42)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((40000, 150000), (40000,), (10000, 150000), (10000,))

In [22]:
#Applying Logistic Regression on split 
model=LogisticRegression()
model.fit(train_x,train_y)
pred=model.predict(test_x)
print("Evaluate confusion matrix for LR")
print(confusion_matrix(test_y,pred))
print(f"Accuracy Score for LR with C=1.0  ={accuracy_score(test_y,pred)}")

Evaluate confusion matrix for LR
[[4402  559]
 [ 482 4557]]
Accuracy Score for LR with C=1.0  =0.8959


In [23]:
#Applying Linear SVC on split
model = LinearSVC(C=0.5, random_state=42)
model.fit(train_x,train_y)
pred=model.predict(test_x)
print("Evaluate confusion matrix for SVC")
print(confusion_matrix(test_y,pred))
print(f"Accuracy Score for SVC with C=0.5  ={accuracy_score(test_y,pred)}")

Evaluate confusion matrix for SVC
[[4389  572]
 [ 552 4487]]
Accuracy Score for SVC with C=0.5  =0.8876


Another variation of CountVectorizer with binary=True and in that case all zero entries will have 1

In [25]:
def count_vec(dat):
    count_v = CountVectorizer(stop_words='english', ngram_range=(1, 3), binary=True, lowercase=True, max_features=150000)
    fit_data_count_v=count_v.fit_transform(dat)
    return fit_data_count_v

train_set=count_vec(df['review'])

In [26]:
#Train Test Split

train_y=df['sentiment']
train_x,test_x,train_y,test_y=train_test_split(train_set,train_y,test_size=0.2,random_state=42)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((40000, 150000), (40000,), (10000, 150000), (10000,))

In [27]:
#Applying Logistic Regression on split 
model=LogisticRegression()
model.fit(train_x,train_y)
pred=model.predict(test_x)
print("Evaluate confusion matrix for LR")
print(confusion_matrix(test_y,pred))
print(f"Accuracy Score for LR ={accuracy_score(test_y,pred)}")

Evaluate confusion matrix for LR
[[4422  539]
 [ 507 4532]]
Accuracy Score for LR =0.8954


In [28]:
#Applying Linear SVC on split
model = LinearSVC(C=0.5, random_state=42)
model.fit(train_x,train_y)
pred=model.predict(test_x)
print("Evaluate confusion matrix for SVC")
print(confusion_matrix(test_y,pred))
print(f"Accuracy Score for SVC with C=0.5  ={accuracy_score(test_y,pred)}")

Evaluate confusion matrix for SVC
[[4401  560]
 [ 567 4472]]
Accuracy Score for SVC with C=0.5  =0.8873


Not much difference between counter vectorizer and TFIDF vectorizer, both provided similar results,

##### LSTM Model

We can apply Recurrent Neural Networks like LSTM to perform sentiment analysis and we have a different vectorization technique called Word Embeddings.
Word embeddings give us a way to use an efficient, dense representation in which similar words have a similar encoding. Importantly, we do not have to specify this encoding by hand. An embedding is a dense vector of floating point values (the length of the vector is a parameter you specify). Instead of specifying the values for the embedding manually, they are trainable parameters (weights learned by the model during training, in the same way a model learns weights for a dense layer). It is common to see word embeddings that are 8-dimensional (for small datasets), up to 1024-dimensions when working with large datasets. A higher dimensional embedding can capture fine-grained relationships between words, but takes more data to learn.
Reference: https://www.tensorflow.org/tutorials/text/word_embeddings

In [29]:
max_features = 20000
maxlen = 200
tokenizer = Tokenizer(num_words=max_features)

In [30]:
#Train Test Split
train_set = pd.DataFrame(df['review'])
train_y=df['Binary']
train_x,test_x,train_y,test_y=train_test_split(train_set,train_y,test_size=0.2,random_state=42)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((40000, 1), (40000,), (10000, 1), (10000,))

In [31]:
tokenizer.fit_on_texts(train_x['review'])
X_train_token = tokenizer.texts_to_sequences(train_x['review'])

tokenizer.fit_on_texts(test_x['review'])
X_test_token = tokenizer.texts_to_sequences(test_x['review'])

In [32]:
X_train = pad_sequences(X_train_token, maxlen=maxlen, padding='post')
X_test  = pad_sequences(X_test_token, maxlen=maxlen, padding='post')
print(X_train.shape, X_test.shape)

(40000, 200) (10000, 200)


In [33]:
y_train = train_y.copy()
y_test  = test_y.copy()

In [34]:
model = Sequential([Embedding(max_features, 64, mask_zero=True),
                    Bidirectional(LSTM(64, dropout=0.2)),
                    Dense(64, activation='sigmoid'),
                    Dense(1)])

In [35]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [36]:
history = model.fit(X_train, y_train,
                    batch_size=50,
                    epochs=3,
                    validation_data=(X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [37]:
history.history

{'accuracy': [0.5009750127792358, 0.5009750127792358, 0.5009750127792358],
 'loss': [7.697445392608643, 7.697450160980225, 7.697446823120117],
 'val_accuracy': [0.4961000084877014, 0.4961000084877014, 0.4961000084877014],
 'val_loss': [7.77263069152832, 7.77263069152832, 7.77263069152832]}

We can conclude that Bi-directional LSTM takes more time to train and is performing poorly compared to TF-IDF vectorization, Counter Vectorization.

##### Word2Vec (CBOW)

Word2Vec is one of the traditional algorithms which was emphasized based on Heirarchical Softmax as well as with simplistic RNNs. Gensim provides a great way to use and start with Word2Vec. The Word2Vec algorithm builds by using the Skipgram model as well as the Common Bag of Words Model.

In [38]:
# Converting input dataframe into list
check_df=list(df['review'].str.split())

In [39]:
# Creating CBOW model
w2v_model=Word2Vec(check_df,min_count=1)

In [40]:
#Label Encode the labels
from sklearn.preprocessing import LabelEncoder
label_y= LabelEncoder()
labels=label_y.fit_transform(df['sentiment'])
labels

array([1, 1, 1, ..., 0, 0, 0])

In [41]:
w2v_model.wv.most_similar('good')

[('decent', 0.7890726327896118),
 ('great', 0.7716896533966064),
 ('bad', 0.7483441829681396),
 ('nice', 0.7346286177635193),
 ('cool', 0.7161738872528076),
 ('fine', 0.708539605140686),
 ('solid', 0.6666469573974609),
 ('fantastic', 0.6275328397750854),
 ('mediocre', 0.6257743835449219),
 ('poor', 0.6251050233840942)]

In [42]:
#Word2Vec model returns the similar words for a given word
w2v_model.wv.most_similar('interesting', topn=10)

[('intriguing', 0.8061264157295227),
 ('exciting', 0.7619497776031494),
 ('important', 0.7323001623153687),
 ('entertaining', 0.7215830087661743),
 ('engaging', 0.708055853843689),
 ('unusual', 0.7034302949905396),
 ('enjoyable', 0.6961750984191895),
 ('fascinating', 0.6960465908050537),
 ('amusing', 0.687725305557251),
 ('odd', 0.686927318572998)]

In [43]:
w2v_model.wv.doesnt_match(["king", "george","stephen","truck"])

'truck'

Since Word2Vec creates vector embeddings for individual words in a corpus by transforming them to a manifold, we need effective document /sentence vectors from these individual vectorized words.

In [44]:
#Convert word vectors to sentence vectors/sentence vectors and apply mean pooling

def convert_sentence(data):
    vocab=[w for w in data if w in w2v_model.wv.vocab]
    avg_pool=np.mean(w2v_model[vocab],axis=0)
    return avg_pool

df['Vectorized_Reviews']=df['review'].apply(convert_sentence)

#Split the dataset into training and testing sets
train_y=df['sentiment']
train_x,test_x,train_y,test_y=train_test_split(df['Vectorized_Reviews'],train_y,test_size=0.2,random_state=42)
train_x.shape,train_y.shape,test_x.shape,test_y.shape

((40000,), (40000,), (10000,), (10000,))

In [47]:
test_x=list(test_x)
train_x=list(train_x)

In [45]:
models=[]
models.append(('LogisticRregression',LogisticRegression(C=1.0,penalty='l2')))
models.append(('KNearestNeighbors',KNeighborsClassifier()))
models.append(('LinearSVC',LinearSVC(C=0.5)))

In [48]:
scoring='accuracy'
print("Statistical Model Word2Vec- Baseline Evaluation")
for name,model in models:
    kfold=KFold(n_splits=10)
    results=cross_val_score(model,train_x,train_y,cv=kfold)
    predictions=cross_val_predict(model,test_x,test_y)
    accuracy = accuracy_score(predictions,test_y)
    print("=======================")
    print("Classifiers: ",name, "Has a training score of", round(results.mean(), 2) * 100, "% accuracy score")
    print("Classifiers: ",name, "Has a testing score of", round(accuracy, 2) * 100, "% accuracy score")

Statistical Model Word2Vec- Baseline Evaluation
Classifiers:  LogisticRregression Has a training score of 62.0 % accuracy score
Classifiers:  LogisticRregression Has a testing score of 60.0 % accuracy score
Classifiers:  KNearestNeighbors Has a training score of 54.0 % accuracy score
Classifiers:  KNearestNeighbors Has a testing score of 55.00000000000001 % accuracy score
Classifiers:  LinearSVC Has a training score of 63.0 % accuracy score
Classifiers:  LinearSVC Has a testing score of 61.0 % accuracy score


In [49]:
train_x = np.array(train_x)
test_x = np.array(test_x)

In [50]:
#Evaluating XGBoost on the dataset
from xgboost import XGBClassifier 
model_xgb= XGBClassifier()
model_xgb.fit(train_x,train_y)
y_pred_xgb=model_xgb.predict(test_x)

In [51]:
print("Accuracy score: ",(accuracy_score(test_y,y_pred_xgb)))
print("Confusion matrix")
print(confusion_matrix(test_y,y_pred_xgb))

Accuracy score:  0.6058
Confusion matrix
[[2996 1965]
 [1977 3062]]


When trained with multiple models, the best results was from Count vectorization with 89 testing scores repectively. Since the data had binary targets and the best results was from logistic regression and LinearSVC we can infer the data was linearly distributed.

Final Results:
- TFIDF Vectorization:
    - Logistic Regression: 87% 
    - KNearestNeighbour  : 74%
    - Decision Tree      : 71%
- Count Vectorization: 
    - Logistic Regression: 89% 
    - LinearSVC          : 88%
- Genism Word2Vec(CBOW):
    - Logistic Regression: 60% 
    - KNearestNeighbour  : 55%
    - LinearSVC          : 61%
    - XGBoost            : 60%
- Bidrectional LSTM: Mean Accuracy of 50% 