### Part 2

Import packages

In [0]:
# Common imports
import operator
import os
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
from nltk.classify import SklearnClassifier
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

# Fetch the NLTK resources used by the cells below
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
## Import Data

# Import data from drive
from google.colab import drive
drive.mount('/content/drive/')

# Every split lives under the same Drive folder, so the folder is spelled out once.
imdb_dir = '/content/drive/My Drive/CMT307 Applied Machine Learning/Coursework/datasets_coursework1/IMDb'
path = f'{imdb_dir}/train/imdb_train_pos.txt'
path1 = f'{imdb_dir}/train/imdb_train_neg.txt'
path2 = f'{imdb_dir}/test/imdb_test_pos.txt'
path3 = f'{imdb_dir}/test/imdb_test_neg.txt'
path4 = f'{imdb_dir}/dev/imdb_dev_pos.txt'
path5 = f'{imdb_dir}/dev/imdb_dev_neg.txt'


def _read_lines(file_path):
  """Return the lines of `file_path` (line endings kept) and close the file.

  The encoding is explicit so the result does not depend on the machine's
  locale default.
  """
  with open(file_path, encoding='utf-8') as handle:
    return handle.readlines()


train_pos = _read_lines(path)
train_neg = _read_lines(path1)
test_pos = _read_lines(path2)
test_neg = _read_lines(path3)
dev_pos = _read_lines(path4)
dev_neg = _read_lines(path5)




In [0]:
# Wrap each list of raw review lines in its own single-column DataFrame
(df_train_pos, df_train_neg,
 df_test_pos, df_test_neg,
 df_dev_pos, df_dev_neg) = (
    pd.DataFrame(review_lines)
    for review_lines in (train_pos, train_neg, test_pos, test_neg, dev_pos, dev_neg)
)

In [0]:
# Attach the class label (1 for the *_pos frames, 0 for the *_neg frames)
# and give both columns descriptive names.
for frame, label in (
    (df_train_pos, 1), (df_train_neg, 0),
    (df_test_pos, 1), (df_test_neg, 0),
    (df_dev_pos, 1), (df_dev_neg, 0),
):
  frame['sentiment'] = label
  frame.columns = ['review', 'sentiment']

# Class sizes of the training split: negatives first, then positives
print(len(df_train_neg))
print(len(df_train_pos))


In [0]:
def _merge_pos_neg(pos_frame, neg_frame):
  """Stack a positive and a negative frame, print the row count, return the result."""
  merged = pd.concat([pos_frame, neg_frame], ignore_index = True)
  merged.columns = ['review', 'sentiment']
  print(len(merged))
  return merged


# One combined frame per split; the index is renumbered from 0
df_train = _merge_pos_neg(df_train_pos, df_train_neg)
df_test = _merge_pos_neg(df_test_pos, df_test_neg)
df_val = _merge_pos_neg(df_dev_pos, df_dev_neg)


## Functions

In [0]:
# NLTK's English stop-word list, extended with a few punctuation marks.
# Note: this rebinds the name `stopwords` (imported from nltk.corpus above)
# to a plain set of strings.
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.update((".", ",", "-", "``"))

In [0]:
# Lemmatizer instance used by clean_text below
wn = WordNetLemmatizer()


# Punctuation characters as a set, built once instead of on every call.
_PUNCTUATION = frozenset(string.punctuation)


def StopWords(token):
    """Return True when `token` should be KEPT.

    Despite the name, this is a keep-filter: a token passes when it is longer
    than two characters, is not in the module-level `stopwords` set and is not
    a single punctuation character.
    """
    return (
        len(token) > 2
        and token not in stopwords
        and token not in _PUNCTUATION
    )


def clean_text(text):
  """Normalise raw text into a space-separated string of lemmatised tokens.

  Steps, in order: remove URLs, remove every character that is neither a word
  character nor whitespace, collapse runs of digits / non-word characters into
  a single space, lower-case, tokenise, lemmatise each kept token as a verb,
  then filter once more with StopWords.

  Args:
    text: raw text.

  Returns:
    The cleaned tokens joined by single spaces.
  """
  # Raw strings: `\(` and `\)` are regex escapes, not Python string escapes.
  text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)  # Removing urls
  # This also strips apostrophes, so no separate apostrophe pass is needed.
  text = re.sub(r'[^\w\s]', '', text)
  text = re.sub(r'(\d|\W)+', ' ', text)
  lemmas = [wn.lemmatize(word, pos="v")
            for word in word_tokenize(text.lower()) if StopWords(word)]
  # Second pass: a lemma can itself be a stop word or too short.
  return " ".join(lemma for lemma in lemmas if StopWords(lemma))

In [0]:
import textblob

## Lexical-richness feature: distinct cleaned tokens / raw word count
def len_text(review):
  """Return the number of distinct tokens in clean_text(review) divided by the
  number of whitespace-separated words in `review`; 0 when `review` has no words.
  """
  raw_words = review.split()
  if not raw_words:
    return 0
  distinct_clean_tokens = set(clean_text(review).split())
  return len(distinct_clean_tokens) / len(raw_words)

In [0]:
# returns the subjectivity component of TextBlob's sentiment for the review
def subjectivity_text(review):
  """Return the second element of `TextBlob(review).sentiment`."""
  _polarity, subjectivity = TextBlob(review).sentiment
  return subjectivity

In [0]:
# returns the polarity component of TextBlob's sentiment for the review
def polarity_text(review):
  """Return the first element of `TextBlob(review).sentiment`."""
  polarity, _subjectivity = TextBlob(review).sentiment
  return polarity

In [0]:
## Vocabulary-size feature: number of distinct tokens left after cleaning
def full_len_text(review):
  """Return how many distinct tokens clean_text(review) contains; 0 when
  `review` has no whitespace-separated words.
  """
  if not review.split():
    return 0
  return len(set(clean_text(review).split()))

In [0]:
# Lemmatizer used by get_list_tokens and by the vocabulary / vector helpers below
lemmatizer = nltk.stem.WordNetLemmatizer()

# returns the cleaned, lemmatised, lower-cased tokens of a text
def get_list_tokens(string):
  """Split `string` into sentences, clean each one with clean_text, tokenise it,
  and return every token lemmatised and lower-cased, in reading order.
  """
  return [
      lemmatizer.lemmatize(token).lower()
      for sentence in nltk.tokenize.sent_tokenize(string)
      for token in nltk.tokenize.word_tokenize(clean_text(sentence))
  ]

In [0]:
# Re-creates the lemmatizer already defined in the previous cell (same class, no arguments)
lemmatizer = nltk.stem.WordNetLemmatizer()

def get_vocabulary(data, num_features):
  """Return the `num_features` most frequent tokens found in `data`.

  Each text is tokenised with get_list_tokens, every token is passed through
  `lemmatizer.lemmatize` and tokens in the module-level `stopwords` set are
  skipped.

  Args:
    data: iterable of raw text strings.
    num_features: maximum number of tokens to return.

  Returns:
    List of tokens ordered by decreasing frequency. Tokens with equal counts
    keep the order in which they were first seen (the sort is stable).
  """
  dict_word_frequency = {}
  for sentence in data:
    for token in get_list_tokens(sentence):
      word = lemmatizer.lemmatize(token)
      if word in stopwords:
        continue
      dict_word_frequency[word] = dict_word_frequency.get(word, 0) + 1

  # Most frequent first, truncated to the requested vocabulary size
  sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:num_features]
  return [word for word, _frequency in sorted_list]

In [0]:

def get_vector_text(list_vocab,string):
  """Return the count vector of `string` over `list_vocab`.

  Args:
    list_vocab: sequence of vocabulary tokens; position i of the result
      belongs to list_vocab[i].
    string: raw text, tokenised with get_list_tokens.

  Returns:
    1-D numpy float array of len(list_vocab). Entry i is 0 unless
    list_vocab[i] occurs among the tokens, in which case it is the number of
    tokens equal to lemmatizer.lemmatize(list_vocab[i]).
  """
  # Count every token once up front, so each vocabulary lookup is a dict
  # access instead of two scans over the token list.
  token_counts = Counter(get_list_tokens(string))
  vector_text = np.zeros(len(list_vocab))
  for i, word in enumerate(list_vocab):
    if word in token_counts:
      vector_text[i] = token_counts[lemmatizer.lemmatize(word)]
  return vector_text

In [0]:
def get_negative_vector_text(string):
  """Return the count vector of `string` over the module-level `neg_vocabulary`.

  Thin wrapper around get_vector_text so the counting logic lives in one
  place. `neg_vocabulary` must exist when this function is called.
  """
  return get_vector_text(neg_vocabulary, string)

def get_positive_vector_text(string):
  """Return the count vector of `string` over the module-level `pos_vocabulary`.

  Thin wrapper around get_vector_text so the counting logic lives in one
  place. `pos_vocabulary` must exist when this function is called.
  """
  return get_vector_text(pos_vocabulary, string)

# Data Pre Processing

In [0]:
# Add the TextBlob polarity of each review as a column on every split
for split_frame in (df_train, df_test, df_val):
  split_frame['polarity'] = split_frame['review'].apply(polarity_text)

df_train.head(10)

In [0]:
# Add the TextBlob subjectivity of each review as a column on every split
for split_frame in (df_train, df_test, df_val):
  split_frame['subjectivity'] = split_frame['review'].apply(subjectivity_text)

In [0]:
# Preview the first ten training rows
df_train.head(10)

In [0]:
# Add the len_text feature of each review as a column on every split
for split_frame in (df_train, df_test, df_val):
  split_frame['length'] = split_frame['review'].apply(len_text)


In [0]:
# Preview the first ten training rows
df_train.head(10)

In [0]:
# Add the full_len_text feature (computed on the cleaned text) to every split
for split_frame in (df_train, df_test, df_val):
  split_frame['full_length'] = split_frame['review'].apply(full_len_text)

Check correlation of features

Define the custom transformer classes used with scikit-learn's FeatureUnion

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction import DictVectorizer
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns `data_dict[key]` for the `key` given at construction."""

    def __init__(self, key):
        # Stored under the same name as the constructor argument
        self.key = key

    def fit(self, x, y=None):
        # Nothing is learned from the data; fit only returns the transformer
        return self

    def transform(self, data_dict):
        # Plain indexing: `key` can be anything the input's __getitem__ accepts
        # (this notebook passes a column name and a list of column names)
        return data_dict[self.key]


class TextStatistics(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        # Nothing is learned from the data; fit only returns the transformer
        return self

    def transform(self, data):
        """Return one feature dict per row of `data`.

        `data` must provide the columns 'polarity', 'subjectivity', 'length'
        and 'full_length'. Each dict has the keys 'pos', 'sub', 'length' and
        'full_length'.
        """
        # Walk the four columns in parallel instead of using iterrows, which
        # builds a Series object for every row.
        rows = zip(data['polarity'], data['subjectivity'],
                   data['length'], data['full_length'])
        return [
            {'pos': polarity, 'sub': subjectivity, 'length': length, 'full_length': full_length}
            for polarity, subjectivity, length, full_length in rows
        ]

Create a pipeline

In [0]:
 from sklearn.pipeline import Pipeline

# Feature-extraction pipeline: two branches whose outputs are placed side by
# side by FeatureUnion.
pipeline = Pipeline([
    ('union', FeatureUnion(
        transformer_list=[

            # Branch 1: TF-IDF n-gram features from the review text
            ('review', Pipeline([
                ('selector', ItemSelector(key='review')),
                # use_idf / smooth_idf / sublinear_tf are boolean switches; real
                # booleans are passed because recent scikit-learn releases
                # reject integers for boolean parameters.
                ('tfidf', TfidfVectorizer( min_df =3, max_df=0.3, max_features=1000, 
                    strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
                    ngram_range=(1, 10), use_idf=True,smooth_idf=True,sublinear_tf=True,
                    stop_words = None, preprocessor=clean_text)),
            ])),

            # Branch 2: the four numeric per-review features
            ('statistics', Pipeline([
                ('selector', ItemSelector(key=['polarity', 'subjectivity', 'length','full_length'])),
                ('statistics', TextStatistics()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ])),

        ],

        # weight components in FeatureUnion
        transformer_weights={
            'review': 1,
            'statistics': 1,
        },
    ))
])

Select the feature columns and labels for each split, then fit the pipeline

In [0]:
# Columns fed to the pipeline; 'sentiment' is the target
feature_columns = ['review', 'polarity', 'subjectivity','length', 'full_length']

x_train, y_train = df_train[feature_columns], df_train['sentiment']
x_test, y_test = df_test[feature_columns], df_test['sentiment']
x_val, y_val = df_val[feature_columns], df_val['sentiment']

In [0]:
# Fit the feature pipeline on the training features only (no labels are passed)
pipeline.fit(x_train)

In [0]:
# Vocabulary: the 500 most frequent tokens of the training reviews
vocabulary = get_vocabulary(x_train['review'], 500)

In [0]:
# Pipeline features for each split, in the order train / test / validation
train_vector1, test_vector1, val_vector1 = (
    pipeline.transform(split_features)
    for split_features in (x_train, x_test, x_val)
)


In [0]:
# Vocabulary count vectors: one array per review, for each split
train_vector2 = [get_vector_text(vocabulary, review) for review in x_train.review]
test_vector2 = [get_vector_text(vocabulary, review) for review in x_test.review]
val_vector2 = [get_vector_text(vocabulary, review) for review in x_val.review]

In [0]:
from scipy.sparse import csr_matrix, hstack
def _stack_features(count_rows, pipeline_matrix):
  """Place the count vectors (converted to sparse) to the left of the pipeline features."""
  return hstack((csr_matrix(count_rows), pipeline_matrix))


train_vector = _stack_features(train_vector2, train_vector1)
test_vector = _stack_features(test_vector2, test_vector1)
val_vector = _stack_features(val_vector2, val_vector1)



Build the models

In [0]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB


# Seed shared by the randomised estimators so that re-running the notebook
# reproduces the same scores.
RANDOM_STATE = 42

# SVM classifier
svm_clf=SVC(gamma='auto') # Initialize the SVM model

# Logistic regression
lr_clf = LogisticRegression()

# Decision Tree Classifier
dt_clf = DecisionTreeClassifier(random_state=RANDOM_STATE)

# naive bayes
nb_clf = GaussianNB()

# random forest
rf_clf=RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)


In [0]:
import pandas as pd 
from pandas import DataFrame
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

lst = []          # cross-validation rows, one per model
list_result =[]   # validation-set rows, one per model
cols = ['Model', 'Fold','Accuracy']
cols1 = ['Model', 'Accuracy', 'Recall', 'Precision', 'F1 - Score']

clfs = [svm_clf, lr_clf, dt_clf, rf_clf]
cv = 5
# Keys name the averaging that is actually requested. In cross-validation,
# precision and recall are macro-averaged while F1 is the binary (positive
# class) score; the validation scores below are all binary.
scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_macro': 'recall_macro',
           'f1_score': 'f1'}

for clf in clfs:
    model_name = str(clf)
    scores = cross_validate(clf,train_vector, y_train, cv=cv, scoring=scoring )

    # Mean over folds, kept numeric so the results table can be sorted or
    # formatted; the order matches cols1.
    lst.append([model_name,
                scores['test_acc'].mean(),
                scores['test_rec_macro'].mean(),
                scores['test_prec_macro'].mean(),
                scores['test_f1_score'].mean()])

    # Predict validation set
    clf.fit(train_vector, y_train )
    y_pred = clf.predict(val_vector)
    list_result.append((model_name,
                        accuracy_score(y_val, y_pred),
                        recall_score(y_val, y_pred),
                        precision_score(y_val, y_pred),
                        f1_score(y_val, y_pred)))

    print('Model ' + model_name + ' Complete' )


In [0]:
# Tabulate the cross-validation rows collected in `lst`
print('Cross validation results')
dfResults1 = pd.DataFrame(lst, columns = cols1)
dfResults1


In [0]:
# Tabulate the validation-set rows collected in `list_result`
print('Predict Validation set - Scores')
dfResultsValPredict = pd.DataFrame(list_result, columns = cols1)
dfResultsValPredict

In [0]:
# Final model validation

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sn
import pandas as pd
import matplotlib.pyplot as plt

# Note: this reuses (and therefore resets) the `list_result` name from the
# model-comparison cell.
list_result =[]

lr_clf.fit(train_vector, y_train )
# Predictions and labels must come from the same split: the scores below are
# computed against y_test, so the model has to predict on test_vector.
y_pred = lr_clf.predict(test_vector)
list_result.append(("Logistic Regression",accuracy_score(y_test, y_pred)))

array = confusion_matrix(y_test, y_pred)

print(list_result)

print(y_pred)

df_cm = pd.DataFrame(array, range(2),
                  range(2))
sn.set(font_scale=1.4)#for label size
# fmt='d' prints the cell counts as integers; rows of a confusion matrix are
# the true labels and columns the predicted labels.
ax = sn.heatmap(df_cm, annot=True, fmt='d', annot_kws={"size": 16})# font size
ax.set(xlabel='Predicted label', ylabel='True label')

plt.show()