# **Introduction**
The competition dataset contains text from works of fiction written by spooky authors of the public domain: **Edgar Allan Poe**, **HP Lovecraft** and **Mary Shelley**. The data was prepared by chunking larger texts into sentences using CoreNLP's MaxEnt sentence tokenizer, so you may notice the odd non-sentence here and there. The objective is to accurately identify the author of the sentences in the test set.


---






# **Data Field**
**id** - a unique identifier for each sentence

**text** - some text written by one of the authors

**author** - the author of the sentence (EAP: Edgar Allan Poe, HPL: HP Lovecraft; MWS: Mary Wollstonecraft Shelley)



---
***This notebook is solely focused on model building using various machine learning algorithms. I would be focusing on getting the accuracy high .***


In [0]:
# Importing required libraries

import numpy as np
from numpy import dstack
import tensorflow as tf
import operator
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
import multiprocessing as mp
import string
import en_core_web_sm
import spacy
from random import randrange
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split,StratifiedKFold
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Input,Embedding,Dense,LSTM,GRU,Bidirectional,Dropout,SimpleRNN,GlobalAvgPool1D,GlobalMaxPool1D
from tensorflow.keras.layers import Conv1D,SpatialDropout1D,BatchNormalization,Lambda,Concatenate,concatenate,GlobalMaxPooling1D
from tensorflow.keras.callbacks import  EarlyStopping
from keras.utils import to_categorical

%matplotlib inline

In [0]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
warnings.filterwarnings('ignore')
nlp = en_core_web_sm.load()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [0]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19579 entries, 0 to 19578
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19579 non-null  object
 1   text    19579 non-null  object
 2   author  19579 non-null  object
dtypes: object(3)
memory usage: 459.0+ KB


In [0]:
# Removing id column
df.drop('id',axis=1,inplace=True)

## Removing outliers


In [0]:
df = df[df['text'].str.split().map(lambda x:len(x))<100]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26558 entries, 0 to 26680
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    26558 non-null  object
 1   author  26558 non-null  object
dtypes: object(2)
memory usage: 622.5+ KB


# **Data Cleaning**

In [0]:
class TextPreprocessing(BaseEstimator,TransformerMixin):
    def __init__(self,
                 n_jobs=1):    
      
     self.n_jobs = n_jobs
    """
        Text preprocessing transformer includes steps:
            1. Text normalization
            2. Punctuation removal
            3. Stop words removal
            4. Lemmatization
        
        n_jobs - parallel jobs to run
    """
    def fit(self, X, y=None):
        return self

    def transform(self, X, *_):
        X_copy = X.copy()
        partitions = 2
        cores = mp.cpu_count()
        if self.n_jobs <= -1:
          partitions = cores
        elif self.n_jobs <= 0:
          return X_copy.apply(self._preprocess_text)
        else:
          partitions = min(self.n_jobs, cores)
        cores = mp.cpu_count()
        data_split = np.array_split(X_copy, partitions)
        pool = mp.Pool(cores)
        data = pd.concat(pool.map(self._preprocess_part, data_split))
        pool.close()
        pool.join()
        return data

    def _preprocess_part(self, part):
        return part.apply(self._preprocess_text)

    def _preprocess_text(self, text):
        normalized_text = self._normalize(text)
        doc = nlp(normalized_text)
        removed_punct = self._remove_punct(doc)
        removed_stop_words = self._remove_stop_words(removed_punct)
        return self._lemmatize(removed_stop_words)

    def _normalize(self, text):
        # some issues in normalise package
        try:
            return ' '.join(normalise(text, verbose=False))
        except:
            return text
    def _remove_punct(self, doc):
        return [t for t in doc if t.text not in string.punctuation]

    def _remove_stop_words(self, doc):
        return [t for t in doc if not t.is_stop]

    def _lemmatize(self, doc):
        return ' '.join([t.lemma_ for t in doc])

In [0]:
#Converting the categorical column to variable

df['author'] = df['author'].map({'EAP':0,'HPL':1,'MWS':2})
df.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",0
1,It never once occurred to me that the fumbling...,1
2,"In his left hand was a gold snuff box, from wh...",0
3,How lovely is spring As we looked from Windsor...,2
4,"Finding nothing else, not even gold, the Super...",1


## Using countvectorizer to convert the sentence into column of words

In [0]:
cv = CountVectorizer()
cv_df = cv.fit_transform(df['text'])

tfidf = TfidfTransformer()
tfidf.fit(cv_df)
tfidf_trans = tfidf.transform(cv_df)

print('Shape of Sparse Matrix: ', cv_df.shape)
print('Amount of Non-Zero occurences: ', cv_df.nnz)
print('Shape of Tfidf Transformed matrix',tfidf_trans.shape)


Shape of Sparse Matrix:  (19579, 25068)
Amount of Non-Zero occurences:  429602
Shape of Tfidf Transformed matrix (19579, 25068)


In [0]:
# Splitting the model into train and text split
X_train,X_test,y_train,y_test = train_test_split(df['text'],df['author'],test_size = 0.3)

# **Baseline Model**



## **1) Random prediction**



In [0]:
def random_pred(X_test):

  unique = [1,2,3]
  predicted = list()
  for i in range(len(X_test)):
    index = randrange(len(unique))
    predicted.append(unique[index])
  return predicted

y_pred = random_pred(X_test)


In [0]:

print(confusion_matrix(y_test,y_pred))
print('\n')
print(classification_report(y_test,y_pred))

[[790 777 749]
 [557 574 534]
 [614 630 622]]


              precision    recall  f1-score   support

           1       0.40      0.34      0.37      2316
           2       0.29      0.34      0.31      1665
           3       0.33      0.33      0.33      1866

    accuracy                           0.34      5847
   macro avg       0.34      0.34      0.34      5847
weighted avg       0.35      0.34      0.34      5847



### **The baseline model got an accuracy of 33% , which is reasonable**
### **because we are randomly picking the classes,therefore 1/3 chances of getting it right** 

# **2) Logistic Regression**

In [0]:
Logistic = LogisticRegression()
pipeline = Pipeline([('text',TextPreprocessing()),
                     ('count' , CountVectorizer()),
                     ('tfidf' , TfidfTransformer()),
                     ('model' , LogisticRegression())])

In [0]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('text', TextPreprocessing(n_jobs=1)),
                ('count',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                           

In [0]:
pred = pipeline.predict(X_test)

In [0]:
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

[[1986  162  168]
 [ 312 1253  100]
 [ 327  114 1425]]


              precision    recall  f1-score   support

           1       0.76      0.86      0.80      2316
           2       0.82      0.75      0.78      1665
           3       0.84      0.76      0.80      1866

    accuracy                           0.80      5847
   macro avg       0.81      0.79      0.80      5847
weighted avg       0.80      0.80      0.80      5847



### **A simple Logistic Regression model produced about 80% accuracy even without hypertuning the parameters. It would be interesting to see whether it is possible to outperform this model.**


---

# **3) Support Vector Machines**

In [0]:
pipeline = Pipeline([('text',TextPreprocessing()),
                     ('count',CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('model',SVC())])

In [0]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('text', TextPreprocessing(n_jobs=1)),
                ('count',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                    

In [0]:
pred = pipeline.predict(X_test)

In [0]:
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

[[2011  133  172]
 [ 333 1235   97]
 [ 393  107 1366]]


              precision    recall  f1-score   support

           1       0.73      0.87      0.80      2316
           2       0.84      0.74      0.79      1665
           3       0.84      0.73      0.78      1866

    accuracy                           0.79      5847
   macro avg       0.80      0.78      0.79      5847
weighted avg       0.80      0.79      0.79      5847



## **Yikes! The model accuracy got reduced , all good though! Onto the next model!!**



---

# **4) Naive Bayes classifier**

A Naive Bayes text classifier is based on the Bayes's Theorem, which helps us compute the conditional probabilities of occurrence of two events based on the probabilities of occurrence of each individual event, encoding those probabilities is extremely useful.


In [0]:
pipeline = Pipeline([('text',TextPreprocessing()),
                     ('count',CountVectorizer()),
                     ('Tfidf',TfidfTransformer()),
                     ('model',MultinomialNB())])

In [0]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('text', TextPreprocessing(n_jobs=1)),
                ('count',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('Tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose

In [0]:
pred = pipeline.predict(X_test)

In [0]:
print(confusion_matrix(y_test,pred))
print('\n')
print(classification_report(y_test,pred))

[[2031   84  201]
 [ 362 1194  109]
 [ 312   53 1501]]


              precision    recall  f1-score   support

           1       0.75      0.88      0.81      2316
           2       0.90      0.72      0.80      1665
           3       0.83      0.80      0.82      1866

    accuracy                           0.81      5847
   macro avg       0.83      0.80      0.81      5847
weighted avg       0.82      0.81      0.81      5847



## **The accuracy increased, this shows that naive bayes is really good for text classification tasks. The next model will be hypertuned using randomizedsearchcv**


---

# **5) RandomForest Classifier with GridSearch**

In [0]:
pipeline_forest = Pipeline([('text',TextPreprocessing()),
                     ('count',CountVectorizer()),
                     ('tfidf',TfidfTransformer()),
                     ('model_forest', RandomForestClassifier(n_estimators=1500))])

params = {'model_forest__n_estimators':[75,100,350,500,1000,5000],'model_forest__min_samples_split': [2, 5, 10]}

In [0]:
grid_forest = RandomizedSearchCV(pipeline_forest,param_distributions=params,refit=True,verbose=3,n_iter=2)
grid_forest.fit(X_train,y_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] model_forest__n_estimators=500, model_forest__min_samples_split=5 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  model_forest__n_estimators=500, model_forest__min_samples_split=5, score=0.702, total= 3.6min
[CV] model_forest__n_estimators=500, model_forest__min_samples_split=5 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  3.6min remaining:    0.0s


[CV]  model_forest__n_estimators=500, model_forest__min_samples_split=5, score=0.693, total= 3.6min
[CV] model_forest__n_estimators=500, model_forest__min_samples_split=5 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  7.2min remaining:    0.0s


[CV]  model_forest__n_estimators=500, model_forest__min_samples_split=5, score=0.686, total= 3.6min
[CV] model_forest__n_estimators=500, model_forest__min_samples_split=5 
[CV]  model_forest__n_estimators=500, model_forest__min_samples_split=5, score=0.694, total= 3.6min
[CV] model_forest__n_estimators=500, model_forest__min_samples_split=5 
[CV]  model_forest__n_estimators=500, model_forest__min_samples_split=5, score=0.707, total= 3.7min
[CV] model_forest__n_estimators=350, model_forest__min_samples_split=2 
[CV]  model_forest__n_estimators=350, model_forest__min_samples_split=2, score=0.699, total= 3.7min
[CV] model_forest__n_estimators=350, model_forest__min_samples_split=2 
[CV]  model_forest__n_estimators=350, model_forest__min_samples_split=2, score=0.691, total= 3.7min
[CV] model_forest__n_estimators=350, model_forest__min_samples_split=2 
[CV]  model_forest__n_estimators=350, model_forest__min_samples_split=2, score=0.681, total= 3.7min
[CV] model_forest__n_estimators=350, mod

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 36.5min finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('text',
                                              TextPreprocessing(n_jobs=1)),
                                             ('count',
                                              CountVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.int64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=No

In [0]:
grid_forest.best_params_

{'model_forest__min_samples_split': 5, 'model_forest__n_estimators': 500}

In [0]:
y_pred_forest = grid_forest.predict(X_test)

In [0]:
print(confusion_matrix(y_test,y_pred_forest))
print('\n')
print(classification_report(y_test,y_pred_forest))

[[1698  260  358]
 [ 361 1111  193]
 [ 379  152 1335]]


              precision    recall  f1-score   support

           1       0.70      0.73      0.71      2316
           2       0.73      0.67      0.70      1665
           3       0.71      0.72      0.71      1866

    accuracy                           0.71      5847
   macro avg       0.71      0.71      0.71      5847
weighted avg       0.71      0.71      0.71      5847



## **Oops! The accuracy greatly decreased! Even with hyperparameter tuning.MultinomialNB() is still at the lead with 81% accuracy.Onto Neural Networks!!**


---
# **6) Neural Networks**


In [0]:
df_neural = df
df_neural.head()

Unnamed: 0,text,author
0,"This process, however, afforded me no means of...",0
1,It never once occurred to me that the fumbling...,1
2,"In his left hand was a gold snuff box, from wh...",0
3,How lovely is spring As we looked from Windsor...,2
4,"Finding nothing else, not even gold, the Super...",1


In [0]:
stop = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def convert_nltk_to_wordnet(text):
#To check if the given word is noun,or a verb or an adjective
  if text.startswith('J'):
    return wordnet.ADJ
  
  elif text.startswith('N'):
    return wordnet.NOUN

  elif text.startswith('V'):
    return wordnet.VERB
  
  elif text.startswith('R'):
    return wordnet.ADV
  
  else:
    return None 
 
def lemmatizes(sentence):
  tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
  wordnet_tagged = map(lambda x : (x[0] , convert_nltk_to_wordnet(x[1])) , tagged)
  lemmatized_sentence = []
  for word , tag in wordnet_tagged:
    if tag is None:
      lemmatized_sentence.append(word)
    else:
      lemmatized_sentence.append(lemmatizer.lemmatize(word,tag))
  return ' '.join(lemmatized_sentence)

def clean(text):

  text = re.sub('/.',' ',text)
  text = text.lower()
  text = re.sub("aren't", "are not",text)
  text = re.sub("can't","cannot",text)
  text = re.sub("don't","do not",text)
  text = re.sub("couldn't","could not",text)
  text = re.sub("doesn't","does not",text)
  text = re.sub("hadn't","had not",text)
  text = re.sub("wouldn't","would not",text)
  text = re.sub("he'll","he will",text)
  text = re.sub("what've","what have",text)
  text = re.sub("who'd","who would",text)
  text = re.sub("who'll","who will",text)
  text = re.sub("I'll","I will",text)
  text = re.sub("you'd","you would",text)
  text = re.sub("you'll","you will",text)
  text = re.sub("you're","you are",text)
  text = re.sub("you've","you have",text)
  text = re.sub("wasn't","was not",text)
  text = re.sub("that's","that is",text)
  text = re.sub("they've","they have",text)
  text = re.sub("they're","they are",text)
  text = re.sub("what's","what is",text)
  text = re.sub("what're","what are",text)
  text = re.sub("what'll","what will",text)
  text = re.sub("that's","that is",text)
  text = re.sub("there's","there is",text)
  text = re.sub("it's","it is",text)
  text = re.sub("it'll","it will",text)
  text = re.sub("could've","could have",text)
  text = re.sub("it'll","it will",text)
  text = re.sub("shouldn't","should not",text)
  text = re.sub("should've","should have",text)
  text = re.sub("shan't","shall not",text)
  text = re.sub("won't","will not",text)
  text = re.sub("we'd","we would",text)
  text = re.sub("weren't","were not",text)
  text = re.sub('[^A-Za-z/.\s]','',text)
  text = text.lower().split()
  text = [word for word in text if word not in stop]
  text = ' '.join(text)
  final_text = lemmatizes(text)
  return final_text

df_neural['text'] = df_neural['text'].apply(lambda x : clean(x))
y = to_categorical(df['author'])
df_neural.head()

Unnamed: 0,text,author
0,process however afford mean ascertain dimensio...,0
1,never occur fumble might mere mistake .,1
2,left hand gold snuff box caper hill cut manner...,0
3,lovely spring look windsor terrace sixteen fer...,2
4,find nothing else even gold superintendent aba...,1


In [0]:
def get_embedding(name,word_index,vocab_len,dim):
  embedding_index = {}
  f = open(name,encoding='utf-8')
  for line in f:
    values = line.split()
    word = values[0]
    coeffs = np.asarray(values[1:],dtype='float32')
    embedding_index[word] = coeffs
  f.close()
  embedding_matrix = np.zeros((vocab_len+1,dim))
  for word,index in word_index.items():
    if index > vocab_len:
      break
    else:
      embedding_vector = embedding_index.get(word)
      if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector
  return embedding_matrix,embedding_index
    


In [0]:
corpus = np.asarray(df_neural['text'])
tokenizer = Tokenizer(num_words=21000)
tokenizer.fit_on_texts(corpus)
sequences = tokenizer.texts_to_sequences(corpus)
data = pad_sequences(sequences=sequences,padding='pre')
vocab_len = len(tokenizer.word_index)+1
max_len = len(data[0])

In [0]:
# Importing pre-trained glove embeddings

!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-05-08 05:18:54--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-05-08 05:18:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-05-08 05:18:55--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [0]:
!unzip glove*.zip


Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
!ls
!pwd

drive		   glove.6B.200d.txt  glove.6B.50d.txt	sample_data
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip	train.csv
/content


In [0]:
word_index = tokenizer.word_index
embedding_matrix1,embedding_index1 = get_embedding('glove.6B.300d.txt',word_index,vocab_len,300)

In [0]:
#Importing pretrained fasttest embeddings
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip

--2020-05-08 05:26:39--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 2606:4700:10::6816:4a8e, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2020-05-08 05:27:40 (10.9 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]



In [0]:
!unzip wiki-news-300d-1M.vec*.zip

Archive:  wiki-news-300d-1M.vec.zip
  inflating: wiki-news-300d-1M.vec   


In [0]:
!ls
!pwd

drive		   glove.6B.50d.txt  wiki-news-300d-1M.vec
glove.6B.100d.txt  glove.6B.zip      wiki-news-300d-1M.vec.zip
glove.6B.200d.txt  sample_data
glove.6B.300d.txt  train.csv
/content


In [0]:
embedding_matrix2,embedding_index2 = get_embedding('wiki-news-300d-1M.vec',word_index,vocab_len,300)

In [0]:
# Checking if we have word embeddings for the words in our vocab
def check_coverage(vocab, embeddings_index):

  known_words = {}
  unknown_words = {}
  nb_known_words = 0
  nb_unknown_words = 0
  for word in vocab.keys():
    try:
        known_words[word] = embeddings_index[word]
        nb_known_words += vocab[word]
    except:
        unknown_words[word] = vocab[word]
        nb_unknown_words += vocab[word]
        pass
  print('Found embeddings for {:.3%} of vocab'.format(len(known_words) / len(vocab)))
  print('Found embeddings for  {:.3%} of all text'.format(nb_known_words / (nb_known_words + nb_unknown_words)))
  unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

  return unknown_words

In [0]:
print('Glove embeddings:\n')
Glove_embedding = check_coverage(word_index,embedding_index1)
print('\n')
print('Fasttext embeddings:\n')
Fasttext_embedding = check_coverage(word_index,embedding_index2)

Glove embeddings:

Found embeddings for 87.348% of vocab
Found embeddings for  82.289% of all text


Fasttext embeddings:

Found embeddings for 87.610% of vocab
Found embeddings for  82.676% of all text


In [0]:
#Let's see some of the unknown words of the text
Glove_embedding[:30]

[('brusquerie', 20245),
 ('tremulousness', 20238),
 ('aegidus', 20232),
 ('valentinianus', 20231),
 ('btenoir', 20227),
 ('junianus', 20226),
 ('littlewit', 20224),
 ('schweinkopf', 20219),
 ('apothegm', 20216),
 ('flatzplatz', 20215),
 ('literatim', 20211),
 ('odigies', 20209),
 ('despera', 20208),
 ('chinless', 20207),
 ('herbless', 20202),
 ('trink', 20201),
 ('deathful', 20199),
 ('contemns', 20181),
 ('servox', 20171),
 ('unpossessed', 20170),
 ('signalize', 20153),
 ('carvins', 20149),
 ('otaheit', 20146),
 ('miltonic', 20136),
 ('rayless', 20135),
 ('siroc', 20129),
 ('lascia', 20125),
 ('raggiar', 20124),
 ('lombra', 20121),
 ('othair', 20115)]

### As we can see , we found the embeddings of 85% of the words which is quite good.

In [0]:
embedding_matrix_weights = np.mean((embedding_matrix1,embedding_matrix2),axis=0)
np.shape(embedding_matrix_weights)

(20253, 300)

In [0]:
# It's important to split the data into training , test and validation set
X_train,X_test,y_train,y_test=train_test_split(data,y,test_size=0.2)




---

# **7) Deep Neural Networks**

In [0]:
def deep_1st():
  model_deep = Sequential()
  model_deep.add(Embedding(vocab_len+1,300,weights=[embedding_matrix_weights],trainable=True,input_length=max_len))
  model_deep.add(SpatialDropout1D(0.2))
  model_deep.add(Bidirectional(LSTM(128,input_shape=(64,1),return_sequences = True)))
  #model_rnn.add(Lambda(lambda x: tf.expand_dims(model_rnn.output, axis=-1)))
  model_deep.add(Bidirectional(LSTM(64,return_sequences=True)))
  model_deep.add(GlobalMaxPool1D())
  model_deep.add(Dense(128,activation='relu'))
  model_deep.add(Dropout(0.5))
  model_deep.add(BatchNormalization())
  model_deep.add(Dense(3,activation='softmax'))

  callbacks = EarlyStopping(monitor='val_loss',patience=5)

  model_deep.compile(optimizer = 'adam',loss='categorical_crossentropy',metrics=['accuracy'])
  return model_deep

In [0]:
kfold = StratifiedKFold(n_splits=5,shuffle=True)
i=1
score=[]
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print('{} of KFold {}'.format(i,kfold.n_splits))
    X_train1,X_test1 = X_train[train_index],X_train[test_index]
    y_train1,y_test1 = y_train[train_index],y_train[test_index]
    model_1 = deep_neural()
    history = model_1.fit(X_train1,y_train1,batch_size=512,epochs=8,validation_split=0.2)
    print('\n')
    acc = model_1.evaluate(X_test1,y_test1)
    print('Accuracy :  ',acc[1])
    score.append(acc[1])
    print('\n')
    i+=1

1 of KFold 5
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Accuracy :   0.8524705767631531


2 of KFold 5
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Accuracy :   0.8500823974609375


3 of KFold 5
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Accuracy :   0.839726984500885


4 of KFold 5
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Accuracy :   0.8498470187187195


5 of KFold 5
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


Accuracy :   0.8413744568824768




In [0]:
pred = model_1.predict(X_test)

print(classification_report(y_test.argmax(1),pred.argmax(1)))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85      2116
           1       0.85      0.87      0.86      1535
           2       0.88      0.82      0.85      1661

    accuracy                           0.85      5312
   macro avg       0.85      0.85      0.85      5312
weighted avg       0.85      0.85      0.85      5312



 **By stacking LSTM models , it got an accuracy of 85%!! Already outperforming the MultinomialNB()** 

---









---
### Since the training data set is small , it is preferred to put trainable = False, Let's see what happens if trainable=True


In [0]:
def model_true():

  inp = Input(shape=(max_len,))
  x = Embedding(vocab_len+1, 300, weights=[embedding_matrix_weights], trainable=True)(inp)
  x = SpatialDropout1D(0.3)(x)
  x1 = Bidirectional(LSTM(256, return_sequences=True))(x)
  x2 = Bidirectional(GRU(128, return_sequences=True))(x1)
  max_pool1 = GlobalMaxPool1D()(x1)
  max_pool2 = GlobalMaxPool1D()(x2)
  conc = concatenate([max_pool1, max_pool2])
  x = Dense(128,activation='relu')(conc)
  x = Dropout(0.5)(x)
  predictions = Dense(3, activation='softmax')(x)

  model = Model(inputs=inp, outputs=predictions)
  callbacks = EarlyStopping(monitor='val_loss',patience=3)

  from tensorflow.keras.optimizers import Adam,RMSprop
  adam = Adam()
  model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['accuracy'])
  return model

In [0]:
kfold=StratifiedKFold(n_splits=5,shuffle=True)
score=[]
i=1
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print('{} of KFold {}'.format(i,kfold.n_splits))
    X_train_main,X_val = X_train[train_index],X_train[test_index]
    y_train_main,y_val = y_train[train_index],y_train[test_index]
    model_2nd = model_true()
    history = model_2nd.fit(X_train_main,y_train_main,epochs=5,batch_size=128,validation_split=0.2)
    print('\n')
    acc = model_2nd.evaluate(X_val,y_val)
    score.append(acc[1])
    print('Accuracy:  ',acc[1])
    i+=1


1 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy:   0.8404706120491028
2 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy:   0.8474935293197632
3 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy:   0.8592609763145447
4 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy:   0.8592609763145447
5 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Accuracy:   0.8505530953407288


In [0]:
print('Accuracy :  ',np.mean(score))

Accuracy :   0.8514078378677368


In [0]:
pred_2 = model_2nd.predict(X_test)

print(classification_report(y_test.argmax(1),pred_2.argmax(1)))

              precision    recall  f1-score   support

           0       0.87      0.82      0.85      2116
           1       0.84      0.87      0.85      1535
           2       0.84      0.87      0.85      1661

    accuracy                           0.85      5312
   macro avg       0.85      0.85      0.85      5312
weighted avg       0.85      0.85      0.85      5312



### **Nice! The new complex performs the same as our previous model with an accuracy of 85, and it seems that the recall of some of the classes are better too!!! Let's see if any models can outperform this.**

In [0]:
#Instead of concatenating the embeddings through axis=0 , lets try concatenating them through axis =1

embedding_axis1 = np.concatenate((embedding_matrix1,embedding_matrix2),axis=1)

In [0]:
def model_3():

  sequence_input = Input(shape=(max_len,))
  embedding_layer = Embedding(vocab_len+1,600,weights = [embedding_axis1],trainable = True,input_length=max_len)
  x = embedding_layer(sequence_input)
  x = SpatialDropout1D(0.2)(x)
  x = Bidirectional(LSTM(128,return_sequences=True))(x)
  x = Conv1D(64,kernel_size=2,padding='valid',kernel_initializer="he_uniform")(x)
  avg_pool = GlobalAvgPool1D()(x)
  max_pool = GlobalMaxPool1D()(x)
  x = concatenate([avg_pool,max_pool])
  x = Dense(128,activation='relu')(x)
  x = Dropout(0.5)(x)
  pred = Dense(3,activation='softmax')(x)

  model_3 = Model(sequence_input, pred)
  model_3.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['accuracy'])
  return model_3


#callbacks = EarlyStopping(monitor='val_loss',patience=3)


In [0]:
kfold=StratifiedKFold(n_splits=5,shuffle=True)
i=1
score=[]
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print('{} of KFold {}'.format(i,kfold.n_splits))
    X_train_main,X_val = X_train[train_index],X_train[test_index]
    y_train_main,y_val = y_train[train_index],y_train[test_index]
    model_3rd = model_3()
    history = model_3rd.fit(X_train_main,y_train_main,epochs=5,batch_size=512,validation_split=0.2)
    print('\n')
    acc = model_3rd.evaluate(X_val,y_val)
    score.append(acc[1])
    print('\n')
    print('Accuracy:  ',acc[1])
    i+=1

1 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8425882458686829
2 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8472581505775452
3 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8406683802604675
4 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8583195805549622
5 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8272534608840942


In [0]:
print('Accuracy:  ',np.mean(score))

Accuracy:   0.8432175636291503


In [0]:
pred_3 = model_3rd.predict(X_test)

print(classification_report(y_test.argmax(1),pred_3.argmax(1)))

              precision    recall  f1-score   support

           0       0.75      0.93      0.83      2116
           1       0.92      0.77      0.84      1535
           2       0.90      0.76      0.82      1661

    accuracy                           0.83      5312
   macro avg       0.86      0.82      0.83      5312
weighted avg       0.85      0.83      0.83      5312



### **Yikes ! The accuracy decreased to 83%.!**









In [0]:
def model_4():

  input_sequence = Input(shape=(max_len,))
  embedding_layer = Embedding(vocab_len+1,600,weights=[embedding_axis1],trainable=True)
  x = embedding_layer(input_sequence)
  x = SpatialDropout1D(0.2)(x)
  x1 = Bidirectional(GRU(128,return_sequences=True))(x)
  x2 = Bidirectional(GRU(64,return_sequences=True))(x1)
  max_pool1 = GlobalMaxPool1D()(x1)
  max_pool2 = GlobalMaxPool1D()(x2)
  conc = concatenate([max_pool1, max_pool2])
  x = Dense(128,activation='relu')(conc)
  x = BatchNormalization()(x)
  pred = Dense(3,activation='softmax')(x)

  model_4 = Model(input_sequence,pred)
  model_4.compile(optimizer = 'adam',loss='categorical_crossentropy',metrics=['accuracy'])
  return model_4



In [0]:
kfold=StratifiedKFold(n_splits=5,shuffle=True)
i=1
score=[]
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print('{} of KFold {}'.format(i,kfold.n_splits))
    X_train_main,X_val = X_train[train_index],X_train[test_index]
    y_train_main,y_val = y_train[train_index],y_train[test_index]
    model_4th = model_4()
    history = model_4th.fit(X_train_main,y_train_main,epochs=5,batch_size=512,validation_split=0.2)
    print('\n')
    acc = model_4th.evaluate(X_val,y_val)
    score.append(acc[1])
    print('\n')
    print('Accuracy:  ',acc[1])
    i+=1

1 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8303529620170593
2 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8088961839675903
3 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8493763208389282
4 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8559660911560059
5 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8117204308509827


In [0]:
print('Accuracy:  ',np.mean(score))

Accuracy:   0.8312623977661133


In [0]:
pred_4 = model_4th.predict(X_test)

print(classification_report(y_test.argmax(1),pred_4.argmax(1)))

              precision    recall  f1-score   support

           0       0.71      0.96      0.82      2116
           1       0.94      0.74      0.83      1535
           2       0.93      0.70      0.80      1661

    accuracy                           0.82      5312
   macro avg       0.86      0.80      0.82      5312
weighted avg       0.85      0.82      0.82      5312



### **The accuracy decreased . Let's try one more model!**

In [0]:
def model_5():

  inp = Input(shape=(max_len,))
  embedding_layer = Embedding(vocab_len+1,600,weights=[embedding_axis1],trainable=True)
  x = embedding_layer(inp)
  x = SpatialDropout1D(0.2)(x)
  x1 = Bidirectional(LSTM(64,return_sequences=True))(x)
  x1 = Conv1D(64 ,kernel_size=3,padding='same',activation='linear')(x1)
  x1 = BatchNormalization()(x1)
  x1 = Conv1D(64,kernel_size=3,padding='same',activation='linear')(x1)
  x1 = BatchNormalization()(x1)
  x2 = Conv1D(64,kernel_size=1,padding='same',activation='linear')(x)
  xmain = concatenate([x1,x2])
  xmain1 = Conv1D(64,kernel_size=3,padding='same',activation='linear')(xmain)
  xmain1 = BatchNormalization()(xmain1)
  xmain1 = Conv1D(64,kernel_size=3,padding='same',activation='linear')(xmain1)
  xmain1 = BatchNormalization()(xmain1)
  x = concatenate([xmain,xmain1])
  x = GlobalMaxPool1D()(x)
  x = Dense(182,activation='relu')(x)
  x = BatchNormalization()(x)
  x = Dropout(0.5)(x)
  x = Dense(3,activation = 'softmax')(x)

  model_last = Model(inp,x)
  model_last.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),loss='categorical_crossentropy',metrics=['accuracy'])
  return model_last

In [0]:
kfold=StratifiedKFold(n_splits=5,shuffle=True)
i=1
score=[]
for train_index , test_index in kfold.split(X_train,y_train.argmax(1)):
    print('{} of KFold {}'.format(i,kfold.n_splits))
    X_train_main,X_val = X_train[train_index],X_train[test_index]
    y_train_main,y_val = y_train[train_index],y_train[test_index]
    model_5th = model_5()
    history = model_5th.fit(X_train_main,y_train_main,epochs=5,batch_size=128,validation_split=0.2)
    print('\n')
    acc = model_5th.evaluate(X_val,y_val)
    score.append(acc[1])
    print('\n')
    print('Accuracy:  ',acc[1])
    i+=1

1 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8143529295921326
2 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8554953932762146
3 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8427865505218506
4 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8427865505218506
5 of KFold 5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5




Accuracy:   0.8394916653633118


In [0]:
print('Accuracy :  ' , np.mean(score))

Accuracy :   0.838982617855072


In [0]:
pred = model_5th.predict(X_test)

print(classification_report(y_test.argmax(1),pred.argmax(1)))

              precision    recall  f1-score   support

           0       0.79      0.89      0.84      2116
           1       0.92      0.79      0.85      1535
           2       0.86      0.83      0.84      1661

    accuracy                           0.84      5312
   macro avg       0.86      0.84      0.84      5312
weighted avg       0.85      0.84      0.84      5312



### **Nice , the accuracy again increased. Next we will do model ensembling!**



---

# **Model ensembling(Weighted Average)**

In [0]:
pred1 = model_1.predict(X_test)
pred2 = model_2nd.predict(X_test)
pred3 = model_3rd.predict(X_test)
pred4 = model_4th.predict(X_test)
pred5 = model_5th.predict(X_test)

final_pred = (pred1*0.3 + pred2*0.2 + pred3*0.2 + pred4*0.15 + pred5*0.15)

print(confusion_matrix(y_test.argmax(1),final_pred.argmax(1)))
print('\n')
print(classification_report(y_test.argmax(1),final_pred.argmax(1)))

[[1930   81  105]
 [ 177 1299   59]
 [ 230   53 1378]]


              precision    recall  f1-score   support

           0       0.83      0.91      0.87      2116
           1       0.91      0.85      0.88      1535
           2       0.89      0.83      0.86      1661

    accuracy                           0.87      5312
   macro avg       0.88      0.86      0.87      5312
weighted avg       0.87      0.87      0.87      5312



### **Wow! Our weighted average model produced an accuracy of 87%, which is our highest accuracy produced by any model!**



---

# **Model Ensembling(Stacking)**

In [0]:
def stacked_dataset(members, inputX):
	stackX = None
	for model in members:
		yhat = model.predict(inputX, verbose=0)
		# stack predictions into [rows, members, probabilities]
		if stackX is None:
			stackX = yhat
		else:
			stackX = dstack((stackX, yhat))
	# flatten predictions to [rows, members x probabilities]
	stackX = stackX.reshape((stackX.shape[0], stackX.shape[1]*stackX.shape[2]))
	return stackX
 
def fit_stacked_model(members, inputX, inputy):
	# create dataset using ensemble
	stackedX = stacked_dataset(members, inputX)
	model = LogisticRegression()
	model.fit(stackedX, inputy)
	return model
 
# make a prediction with the stacked model
def stacked_prediction(members, model, inputX):
	stackedX = stacked_dataset(members, inputX)
	yhat = model.predict(stackedX)
	return yhat


In [0]:
members = [model_1,model_2nd,model_3rd,model_4th,model_5th]
for models in members:
  _,acc = models.evaluate(X_test,y_test)
  print('Model Accuracy: ', acc)
 
model = fit_stacked_model(members, X_test, y_test.argmax(1))
yhat = stacked_prediction(members, model, X_test)

print(classification_report(y_test.argmax(1),yhat))


Model Accuracy:  0.8505271077156067
Model Accuracy:  0.8507153391838074
Model Accuracy:  0.8300075531005859
Model Accuracy:  0.8173945546150208
Model Accuracy:  0.8420557379722595
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      2116
           1       0.90      0.87      0.89      1535
           2       0.89      0.86      0.88      1661

    accuracy                           0.88      5312
   macro avg       0.88      0.88      0.88      5312
weighted avg       0.88      0.88      0.88      5312



### **Nice! By stacking up the models , it produced an accuracy of 88%,which is by far the highest!**