### Part 2 - Sentiment Classification of IMDB Dataset

Created by Liam Butler

Uses the IMDb datasets to create a model which is able to predict sentiment of a given review. 

Import packages

In [32]:
# Common imports
import numpy as np
import os
import pandas as pd 
import nltk
from nltk.corpus import stopwords
from nltk.classify import SklearnClassifier
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import re
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import operator
import requests

from textblob import TextBlob

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
## Import Data

#Import data from GitHub Repo

root='https://raw.githubusercontent.com/LiamButler2/IMDB-Sentiment-classifier/master/datasets_coursework1/IMDb'

path= root + '/train/imdb_train_pos.txt'
path1= root + '/train/imdb_train_neg.txt'
path2= root + '/test/imdb_test_pos.txt'
path3= root + '/test/imdb_test_neg.txt'
path4= root + '/dev/imdb_dev_pos.txt'
path5= root + '/dev/imdb_dev_neg.txt'

response = requests.get(path)
train_pos = response.text.split("\n")

response = requests.get(path1)
train_neg = response.text.split("\n")

response = requests.get(path2)
test_pos = response.text.split("\n")

response = requests.get(path3)
test_neg = response.text.split("\n") 

response = requests.get(path4)
dev_pos = response.text.split("\n")

response = requests.get(path5)
dev_neg = response.text.split("\n")   




In [0]:
# Convert to pandas DF

df_train_pos = pd.DataFrame(train_pos)
df_train_neg = pd.DataFrame(train_neg)
df_test_pos = pd.DataFrame(test_pos)
df_test_neg = pd.DataFrame(test_neg)
df_dev_pos = pd.DataFrame(dev_pos)
df_dev_neg = pd.DataFrame(dev_neg)

In [35]:
# Add sentiment
df_train_pos['sentiment'] = 1
df_train_neg['sentiment'] = 0
df_test_pos['sentiment'] = 1
df_test_neg['sentiment'] = 0
df_dev_pos['sentiment'] = 1
df_dev_neg['sentiment'] = 0

# Rename columns
df_train_pos.columns = ['review', 'sentiment']
df_train_neg.columns = ['review', 'sentiment']
df_test_pos.columns = ['review', 'sentiment']
df_test_neg.columns = ['review', 'sentiment']
df_dev_pos.columns = ['review', 'sentiment']
df_dev_neg.columns = ['review', 'sentiment']

print(len(df_train_neg))
print(len(df_train_pos))


7518
7484


In [36]:
# Concatenate positive and negative dataframes
df_train = pd.concat([df_train_pos, df_train_neg], ignore_index = True)
print(len(df_train))
df_test = pd.concat([df_test_pos, df_test_neg], ignore_index = True)
print(len(df_test))
df_val = pd.concat([df_dev_pos, df_dev_neg], ignore_index = True)
print(len(df_val))

df_train.columns = ['review', 'sentiment']
df_test.columns = ['review', 'sentiment']
df_val.columns = ['review', 'sentiment']


15002
5002
5002


## Functions

In [0]:
# First, we get the stopwords list from nltk
stopwords=set(nltk.corpus.stopwords.words('english'))
# We can add more words to the stopword list, like punctuation marks
stopwords.add(".")
stopwords.add(",")
stopwords.add("-")
stopwords.add("``")

In [0]:
wn = WordNetLemmatizer()
import string

def StopWords(token):
    return  token not in stopwords and token not in list(string.punctuation)  and len(token)>2 


def clean_text(text):
  clean_text = []
  clean_text2 = []
  text = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', text) # Removing urls
  text = re.sub(r'[^\w\s]', '',text)
  text = re.sub("'", "",text)
  text=re.sub("(\\d|\\W)+"," ",text)    
  clean_text = [ wn.lemmatize(word, pos="v") for word in word_tokenize(text.lower()) if StopWords(word)]
  clean_text2 = [word for word in clean_text if StopWords(word)]
  return " ".join(clean_text2)

In [0]:
import textblob

## Add length of review as a feature
def len_text(review):
  if len(review.split())>0:
    return len(set(clean_text(review).split())) /len(review.split())
  else:
    return 0

In [0]:
# returns whether the text is objective or subjective
def subjectivity_text(review):
  return TextBlob(review).sentiment[1]

In [0]:
# returns polarity
def polarity_text(review):
  return TextBlob(review).sentiment[0]

In [0]:
## Add full length of review as a feature
def full_len_text(review):
  if len(review.split())>0:
    return len(set(clean_text(review).split()))
  else:
    return 0

In [0]:
lemmatizer = nltk.stem.WordNetLemmatizer()

# returns sentence as tokens
def get_list_tokens(string):
  sentence_split=nltk.tokenize.sent_tokenize(string)
  list_tokens=[]
  for sentence in sentence_split:
    sentence = clean_text(sentence)
    
    list_tokens_sentence=nltk.tokenize.word_tokenize(sentence)
    for token in list_tokens_sentence:
      list_tokens.append(lemmatizer.lemmatize(token).lower())
  return list_tokens

In [0]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_vocabulary(data, num_features): # Function to retrieve vocabulary

  dict_word_frequency={}
  for sentence in data:
    sentence_tokens=get_list_tokens(sentence)
    for word1 in sentence_tokens:
      word = lemmatizer.lemmatize(word1)
      if word in stopwords: continue
      if word not in dict_word_frequency: dict_word_frequency[word]=1
      else: dict_word_frequency[word]+=1
        
  # Now we create a sorted frequency list with the top N words, using the function "sorted". Let's see the 15 most frequent words
  sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:num_features]
  i=0
  for word,frequency in sorted_list:
    i+=1
    
    
  # Finally, we create our vocabulary based on the sorted frequency list 
  vocabulary=[]
  for word,frequency in sorted_list:
    vocabulary.append(word)
  return vocabulary

In [0]:

def get_vector_text(list_vocab,string):
  vector_text=np.zeros(len(list_vocab))
  list_tokens_string=get_list_tokens(string)
  for i, word in enumerate(list_vocab):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(lemmatizer.lemmatize(word))
  return vector_text

In [0]:
def get_negative_vector_text(string):
  vector_text=np.zeros(len(neg_vocabulary))
  list_tokens_string=get_list_tokens(string)
  for i, word in enumerate(neg_vocabulary):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(lemmatizer.lemmatize(word))
  return vector_text

def get_positive_vector_text(string):
  vector_text=np.zeros(len(pos_vocabulary))
  list_tokens_string=get_list_tokens(string)
  for i, word in enumerate(pos_vocabulary):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(lemmatizer.lemmatize(word))
  return vector_text

# Data Pre Processing

In [47]:
# Add Polarity
df_train['polarity'] = df_train['review'].apply(polarity_text)
df_test['polarity'] = df_test['review'].apply(polarity_text)
df_val['polarity'] = df_val['review'].apply(polarity_text)

df_train.head(10)

Unnamed: 0,review,sentiment,polarity
0,"For fans of Chris Farley, this is probably his...",1,0.181676
1,"Fantastic, Madonna at her finest, the film is ...",1,0.378125
2,From a perspective that it is possible to make...,1,0.111111
3,What is often neglected about Harold Lloyd is ...,1,0.122631
4,You'll either love or hate movies such as this...,1,-0.019647
5,Good (not great) little horror film with a hig...,1,0.124583
6,The word Ghilli actually means a small sharp w...,1,0.060473
7,This is one of THE century's best tv-series ev...,1,0.344444
8,It's pretty surprising that this wonderful fil...,1,0.251122
9,A fantastic cinema experience. I really enjoye...,1,0.108333


In [0]:
# Add subjectivity
df_train['subjectivity'] = df_train['review'].apply(subjectivity_text)
df_test['subjectivity'] = df_test['review'].apply(subjectivity_text)
df_val['subjectivity'] = df_val['review'].apply(subjectivity_text)

In [0]:
# relative length
df_train['length'] = df_train['review'].apply(len_text)
df_test['length'] = df_test['review'].apply(len_text)
df_val['length'] = df_val['review'].apply(len_text)


In [0]:
# Full_Length of the cleaned data
df_train['full_length'] = df_train['review'].apply(full_len_text)
df_test['full_length'] = df_test['review'].apply(full_len_text)
df_val['full_length'] = df_val['review'].apply(full_len_text)

In [51]:
df_train.head(10)

Unnamed: 0,review,sentiment,polarity,subjectivity,length,full_length
0,"For fans of Chris Farley, this is probably his...",1,0.181676,0.692045,0.605769,63
1,"Fantastic, Madonna at her finest, the film is ...",1,0.378125,0.7125,0.410714,23
2,From a perspective that it is possible to make...,1,0.111111,0.408333,0.42029,87
3,What is often neglected about Harold Lloyd is ...,1,0.122631,0.49002,0.351425,259
4,You'll either love or hate movies such as this...,1,-0.019647,0.488426,0.508197,62
5,Good (not great) little horror film with a hig...,1,0.124583,0.528782,0.538462,77
6,The word Ghilli actually means a small sharp w...,1,0.060473,0.492848,0.41954,73
7,This is one of THE century's best tv-series ev...,1,0.344444,0.361111,0.375,24
8,It's pretty surprising that this wonderful fil...,1,0.251122,0.532872,0.405263,154
9,A fantastic cinema experience. I really enjoye...,1,0.108333,0.437698,0.455752,103


Make the Custom class for feature union Transformer of sklearn

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]


class TextStatistics(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, data):
        return [{'pos':  row['polarity'], 'sub': row['subjectivity'],  'length': row['length'], 'full_length': row['full_length']} for _, row in data.iterrows()]

Create a pipeline

In [0]:
 from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # Pipeline for pulling features from the text
            ('review', Pipeline([
                ('selector', ItemSelector(key='review')),
                ('tfidf', TfidfVectorizer( min_df =3, max_df=0.3, max_features=1000, 
                    strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                    ngram_range=(1, 10), use_idf=1,smooth_idf=1,sublinear_tf=1,
                    stop_words = None, preprocessor=clean_text)),
            ])),

            # Pipeline for pulling features
            ('statistics', Pipeline([
                ('selector', ItemSelector(key=['polarity', 'subjectivity', 'length','full_length'])),
                ('statistics', TextStatistics()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],

        # weight components in FeatureUnion
        transformer_weights={
            'review': 1,
            'statistics': 1,
        },
    ))
])

Build the pipeline

In [0]:
x_train = df_train[['review', 'polarity', 'subjectivity','length', 'full_length']]
y_train =df_train['sentiment']

x_test = df_test[['review', 'polarity', 'subjectivity','length', 'full_length']]
y_test =df_test['sentiment']

x_val = df_val[['review', 'polarity', 'subjectivity','length', 'full_length']]
y_val =df_val['sentiment']

In [55]:
pipeline.fit(x_train)

Pipeline(memory=None,
         steps=[('union',
                 FeatureUnion(n_jobs=None,
                              transformer_list=[('review',
                                                 Pipeline(memory=None,
                                                          steps=[('selector',
                                                                  ItemSelector(key='review')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer(analyzer='word',
                                                                                  binary=False,
                                                                                  decode_error='strict',
                                                                                  dtype=<class 'numpy.float64'>,
                                                                                  encoding='utf-8',
                  

In [0]:
# Create our set of Vocabulary   
vocabulary=get_vocabulary(x_train.review , 500)

In [0]:
train_vector1 = pipeline.transform(x_train)
test_vector1 = pipeline.transform(x_test)
val_vector1 = pipeline.transform(x_val)


In [0]:
train_vector2=[]
test_vector2=[]
val_vector2=[]

# Create a vector for using our vocabulary

for row in x_train.review:
    vector_instance=get_vector_text(vocabulary,row)
    train_vector2.append(vector_instance)

for row in x_test.review:
    vector_instance=get_vector_text(vocabulary,row)
    test_vector2.append(vector_instance)

for row in x_val.review:
    vector_instance=get_vector_text(vocabulary,row)
    val_vector2.append(vector_instance)

In [0]:
# Merge the two vectors 
from scipy.sparse import csr_matrix, hstack
train_vector = hstack((csr_matrix(train_vector2),train_vector1))
test_vector = hstack((csr_matrix(test_vector2),test_vector1))
val_vector = hstack((csr_matrix(val_vector2),val_vector1))



Build the models

In [0]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB


# SVM classifier
svm_clf=SVC(gamma='auto') # Initialize the SVM model

# Logistic regression
lr_clf = LogisticRegression()

# Decision Tree Classifier
dt_clf = DecisionTreeClassifier()

# naive bayes
nb_clf = GaussianNB()

# random forest
rf_clf=RandomForestClassifier(n_estimators=100)


In [61]:
import pandas as pd 
from pandas import DataFrame
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

# Set up scoring
lst = []
list_result =[]
cols = ['Model', 'Fold','Accuracy']
cols1 = ['Model', 'Accuracy', 'Recall', 'Precision', 'F1 - Score']

clfs = [svm_clf, lr_clf, dt_clf, rf_clf]
cv = 5
scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro',
           'f1_score': 'f1'}

# Run cross validation on build table of results
for clf in clfs:
    scores = cross_validate(clf,train_vector, y_train, cv=cv, scoring=scoring )

    lst.append([str(clf), str(scores['test_acc'].mean()) ,str(scores['test_rec_micro'].mean()) ,str(scores['test_prec_macro'].mean()) ,str(scores['test_f1_score'].mean()) ]) 
    

    # Predict validation set
    clf.fit(train_vector, y_train )
    y_pred = clf.predict(val_vector)
    list_result.append((str(clf),accuracy_score(y_val, y_pred),recall_score(y_val, y_pred),precision_score(y_val, y_pred),f1_score(y_val, y_pred),))

    print('Model ' + str(clf) + ' Complete' )


Model SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False) Complete


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

Model LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False) Complete
Model DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best') Complete
Model RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None

In [62]:
print('Cross validation results')
dfResults1 = pd.DataFrame(lst, columns = cols1)
dfResults1


Cross validation results


Unnamed: 0,Model,Accuracy,Recall,Precision,F1 - Score
0,"SVC(C=1.0, break_ties=False, cache_size=200, c...",0.8065591691658336,0.8066768655609534,0.8100223432109704,0.8158639689087099
1,"LogisticRegression(C=1.0, class_weight=None, d...",0.8526859713428857,0.8527017579402856,0.8527844024625564,0.8534745750596084
2,"DecisionTreeClassifier(ccp_alpha=0.0, class_we...",0.7304359880039988,0.7304259196817384,0.7305156313121085,0.7287458056315205
3,"RandomForestClassifier(bootstrap=True, ccp_alp...",0.8341552815728092,0.8341593076919583,0.8343800787974057,0.8341437060276086


In [63]:
print('Predict Validation set - Scores')
dfResultsValPredict = pd.DataFrame(list_result, columns = cols1)
dfResultsValPredict

Predict Validation set - Scores


Unnamed: 0,Model,Accuracy,Recall,Precision,F1 - Score
0,"SVC(C=1.0, break_ties=False, cache_size=200, c...",0.80088,0.847956,0.77701,0.810934
1,"LogisticRegression(C=1.0, class_weight=None, d...",0.845462,0.857086,0.839425,0.848163
2,"DecisionTreeClassifier(ccp_alpha=0.0, class_we...",0.733507,0.732037,0.73701,0.734515
3,"RandomForestClassifier(bootstrap=True, ccp_alp...",0.833467,0.836443,0.833465,0.834951


Logistic regression is the best model so we can validate this against our test results

In [0]:
# Final model validation

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

list_result =[]



lr_clf.fit(train_vector, y_train )
y_pred = lr_clf.predict(val_vector)
list_result.append(("Logistic Regression",accuracy_score(y_test, y_pred)))

array = confusion_matrix(y_test, y_pred)

print(list_result)
