In [None]:
import re
import string

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import spacy
import torch
import torch.nn as nn
from gensim.models import Word2Vec, KeyedVectors
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.fasttext import FastText
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, make_scorer, f1_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from spacy.lang.en.stop_words import STOP_WORDS
from torch.autograd import gradcheck
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence
from torch.nn.utils.rnn import pad_sequence

pd.set_option('display.max_colwidth', None)

nlp = spacy.load('en_core_web_sm')


## Importing libraries

In [None]:
# Load the labelled training data from the working directory.
df_train = pd.read_csv('train.csv')

In [None]:
# Load the test data from the working directory.
df_test = pd.read_csv('test.csv')

In [None]:
# Keep only the raw text and the label; all other columns are dropped.
df_train = df_train.loc[:,['text','target']]

In [None]:
# Keep the row id alongside the raw text; all other columns are dropped.
df_test = df_test.loc[:,['id','text']]

In [None]:
df_train

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,"13,000 people receive #wildfires evacuation orders in California",1
4,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1
...,...,...
7608,Two giant cranes holding a bridge collapse into nearby homes http://t.co/STfMbbZFB5,1
7609,@aria_ahrary @TheTawniest The out of control wild fires in California even in the Northern part of the state. Very troubling.,1
7610,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. http://t.co/zDtoyd8EbJ,1
7611,Police investigating after an e-bike collided with a car in Little Portugal. E-bike rider suffered serious non-life threatening injuries.,1


In [None]:
df_test

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, stay safe everyone."
2,3,"there is a forest fire at spot pond, geese are fleeing across the street, I cannot save them all"
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTENERS XrWn
3259,10865,Storm in RI worse than last hurricane. My city&amp;3others hardest hit. My yard looks like it was bombed. Around 20000K still without power
3260,10868,Green Line derailment in Chicago http://t.co/UtbXLcBIuY
3261,10874,MEG issues Hazardous Weather Outlook (HWO) http://t.co/3X6RBQJHn3


## Data-preprocessing

In [None]:
def preprocess(text):
    """Normalise a raw tweet into a lower-cased, space-separated token string.

    Steps, in order: drop hashtags, replace @mentions / numbers / URLs with
    placeholder tokens, strip ASCII punctuation, replace emojis with a
    placeholder, lemmatize, drop non-printable characters, remove stop words
    and lower-case the result.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text. Because of the final lower-casing, the placeholders
        appear as ``user``, ``number``, ``url`` and ``emoji``.
    """

    #remove words associated with hash-tags
    text = re.sub(r'[#][\w]+','',text)

    #replace @mentions with a placeholder
    text = re.sub(r'[@][\w]+','USER',text)

    #replace numbers with a placeholder
    text = re.sub(r'[-+]?[.\d]*[\d]+[:,.\d]*','NUMBER',text)

    #replace urls with a placeholder
    text = re.sub(r'https?://\S+|www\.\S+','URL',text)

    #remove punctuations
    table = str.maketrans('','',string.punctuation)
    text = text.translate(table)

    #replace emojis with a placeholder
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

    text = emoji_pattern.sub(r'EMOJI', text)

    #Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]
    text = ' '.join(lemmas)

    #remove non-printable characters
    text = ''.join([char for char in text if char in string.printable])

    #remove stop-words
    # STOP_WORDS holds lower-case entries, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" or "Our" slip through.
    kept_tokens = [word for word in word_tokenize(text) if word.lower() not in STOP_WORDS]
    text = ' '.join(kept_tokens)

    #Lower-case the text
    text = text.lower()

    return text


In [None]:
# Clean every training tweet with preprocess() and store the result in a new column.
df_train['clean_text'] = df_train['text'].apply(preprocess)

In [None]:
# Apply the same cleaning to the test tweets.
df_test['clean_text'] = df_test['text'].apply(preprocess)

In [None]:
# From here on only the cleaned text and the label are used; the raw text column is dropped.
df_train = df_train.loc[:,['clean_text','target']]

In [None]:
# Keep the id and the cleaned text; the raw text column is dropped.
df_test = df_test.loc[:,['id','clean_text']]

In [None]:
df_train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

### Bag of Words (BoW)

In [None]:
# Unigram bag-of-words counts over the cleaned training text.
vectorizer = CountVectorizer(ngram_range=(1,1), stop_words='english') # to use bigrams ngram_range=(2,2)
Count_data = vectorizer.fit_transform(df_train['clean_text'])

#create dataframe
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary terms in column order.
bow_df_train = pd.DataFrame(Count_data.toarray(),columns=vectorizer.get_feature_names_out())

### TF-IDF Vectorizer

In [None]:
# Unigram TF-IDF weights over the cleaned training text.
# `training_features` (sparse) is what the first cross-validation cell consumes.
vectorizer = TfidfVectorizer(stop_words="english",ngram_range=(1, 1))
training_features = vectorizer.fit_transform(df_train['clean_text'])

#create dataframe
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary terms in column order.
tf_idf_df_train = pd.DataFrame(training_features.toarray(),columns=vectorizer.get_feature_names_out())

### word2vec

In [None]:
# Tokenise each cleaned tweet on whitespace, then train 100-d Word2Vec vectors
# on the result; min_count=1 keeps every token in the vocabulary.
corpus_list = [clean_tweet.split() for clean_tweet in df_train['clean_text']]
model = Word2Vec(sentences=corpus_list, vector_size=100, min_count=1)

In [None]:
model.wv.most_similar('forgive')

[('midst', 0.8053702116012573),
 ('savings', 0.8008486032485962),
 ('ronaldo', 0.8005456328392029),
 ('hieroglyphics', 0.7992174029350281),
 ('pundit', 0.7989677786827087),
 ('agency', 0.7988178730010986),
 ('charger', 0.797602653503418),
 ('holy', 0.7971658110618591),
 ('complex', 0.796215295791626),
 ('juror', 0.7961747050285339)]

### FastText

In [None]:
# FastText hyper-parameters
embedding_size, window_size = 300, 5
min_word, down_sampling = 5, 1e-2

# Skip-gram (sg=1) FastText model trained on the tokenised tweets.
model_3 = FastText(
    sentences=corpus_list,
    sg=1,
    vector_size=embedding_size,
    window=window_size,
    min_count=min_word,
    sample=down_sampling,
    workers=4,
)


In [None]:
model_3.wv.most_similar('forgive')

[('evil', 0.9993758797645569),
 ('clearly', 0.9991562962532043),
 ('mr', 0.999136209487915),
 ('adult', 0.999079704284668),
 ('original', 0.9989796280860901),
 ('lovely', 0.9989707469940186),
 ('source', 0.9989598989486694),
 ('spirit', 0.998907744884491),
 ('level', 0.9989072680473328),
 ('snap', 0.9988880157470703)]

### Doc2Vec

In [None]:
# Tag each tokenised tweet with its position in the corpus, then fit Doc2Vec.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(corpus_list)
]
model_4 = Doc2Vec(documents=documents, vector_size=5, window=2, min_count=1, workers=4)

In [None]:
model_4.wv.most_similar('chicken', topn=10)

[('timing', 0.9965299367904663),
 ('snowball', 0.9964863657951355),
 ('indistinguishable', 0.9960567355155945),
 ('fading', 0.9956676959991455),
 ('diverse', 0.9956137537956238),
 ('decrease', 0.9900070428848267),
 ('weird', 0.9879105687141418),
 ('rush', 0.9870281219482422),
 ('mall', 0.9869320392608643),
 ('building', 0.9866396188735962)]

In [None]:
class MeanEmbeddingVectorizer(object):
    """Turn tokenised documents into fixed-size vectors by averaging word embeddings.

    Parameters
    ----------
    word_model : object
        A trained embedding model exposing ``wv.vector_size``,
        ``wv.key_to_index`` and ``wv.get_vector(word)``.
    """

    def __init__(self, word_model):
        self.word_model = word_model
        self.vector_size = word_model.wv.vector_size

    def fit(self, X=None, y=None):
        """No-op fit; accepts (and ignores) ``X``/``y`` so ``fit(X, y)`` style calls work.

        Returns
        -------
        MeanEmbeddingVectorizer
            ``self``, to allow chaining.
        """
        return self

    def transform(self,docs):
        """Return one averaged embedding per document as a 2-D array."""
        return self.word_average_list(docs)

    def word_average(self,sentence):
        """Average the embeddings of the in-vocabulary tokens of ``sentence``.

        Tokens missing from ``wv.key_to_index`` are skipped. When no token is
        known (or the sentence is empty) a zero vector of length
        ``vector_size`` is returned.
        """
        word_vectors = self.word_model.wv
        known = [
            word_vectors.get_vector(word)
            for word in sentence
            if word in word_vectors.key_to_index
        ]
        if not known:
            return np.zeros(self.vector_size)
        return np.array(known).mean(axis = 0)

    def word_average_list(self,docs):
        """Stack ``word_average`` of every document into shape ``(len(docs), vector_size)``."""
        rows = [self.word_average(sentence) for sentence in docs]
        if not rows:
            # np.vstack raises on an empty list; return an empty matrix instead.
            return np.zeros((0, self.vector_size))
        return np.vstack(rows)


In [None]:
# Document features: mean of the Word2Vec vectors of each tweet's tokens.
mean_vec_tr = MeanEmbeddingVectorizer(word_model=model)
word_2_vec = mean_vec_tr.transform(docs=corpus_list)

In [None]:
word_2_vec.shape

(7613, 100)

In [None]:
# (display name, single-step pipeline) pairs to cross-validate.
est = [
    ('LogisticRegression', Pipeline([('LR', LogisticRegression())])),
    ('BernoulliNB', Pipeline([('BNB', BernoulliNB())])),
    ('MultinomialNB', Pipeline([('MNB', MultinomialNB())])),
    ('LinearSVC', Pipeline([('LNB', LinearSVC())])),
]

In [None]:
# Training: 7-fold cross-validation of every estimator on the sparse TF-IDF features.
model_scores = {}

p_scorer = make_scorer(precision_score)
r_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
a_scorer = make_scorer(accuracy_score)

# Evaluate all four metrics in a single cross-validation pass per estimator
# (one fit per fold) instead of refitting the model once per metric.
scoring = {'accuracy': a_scorer, 'f1_score': f1_scorer, 'precision': p_scorer, 'recall': r_scorer}
kfold = KFold(n_splits=7, shuffle=True, random_state=4)

for name, estimator in est:
    cv_results = cross_validate(estimator, training_features, df_train.target, cv=kfold, scoring=scoring)
    # cross_validate reports each metric under the key 'test_<metric name>'.
    model_scores[name] = {metric: cv_results['test_' + metric].mean() for metric in scoring}

In [None]:
# Print each estimator's name followed by its averaged metrics.
for estimator_name, metrics in model_scores.items():
    print('\n', estimator_name)
    print('\n', metrics)


 LogisticRegression

 {'accuracy': 0.7931171146011303, 'f1_score': 0.7265195937972196, 'precision': 0.8405469172478994, 'recall': 0.6401119882599652}

 BernoulliNB

 {'accuracy': 0.7929866576345349, 'f1_score': 0.7293029869585108, 'precision': 0.831042122427054, 'recall': 0.6503186301194921}

 MultinomialNB

 {'accuracy': 0.7908855757114253, 'f1_score': 0.7250354320315499, 'precision': 0.8329886484597812, 'recall': 0.6422677180266317}

 LinearSVC

 {'accuracy': 0.779194215634736, 'f1_score': 0.7247341480188895, 'precision': 0.7799570060420409, 'recall': 0.6771084948914046}


In [None]:
# Document features: mean of the FastText vectors of each tweet's tokens.
mean_vec_tr = MeanEmbeddingVectorizer(word_model=model_3)
fast_text = mean_vec_tr.transform(docs=corpus_list)

X, y = fast_text, df_train['target']

In [None]:
# (display name, single-step pipeline) pairs to cross-validate.
est = [
    ('LogisticRegression', Pipeline([('LR', LogisticRegression())])),
    ('BernoulliNB', Pipeline([('BNB', BernoulliNB())])),
    ('LinearSVC', Pipeline([('LNB', LinearSVC())])),
]

In [None]:
# Training: 7-fold cross-validation of every estimator on the feature matrix X.
model_scores = {}

p_scorer = make_scorer(precision_score)
r_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
a_scorer = make_scorer(accuracy_score)

# Evaluate all four metrics in a single cross-validation pass per estimator
# (one fit per fold) instead of refitting the model once per metric.
scoring = {'accuracy': a_scorer, 'f1_score': f1_scorer, 'precision': p_scorer, 'recall': r_scorer}
kfold = KFold(n_splits=7, shuffle=True, random_state=4)

for name, estimator in est:
    cv_results = cross_validate(estimator, X, df_train.target, cv=kfold, scoring=scoring)
    # cross_validate reports each metric under the key 'test_<metric name>'.
    model_scores[name] = {metric: cv_results['test_' + metric].mean() for metric in scoring}

In [None]:
# Print each estimator's name followed by its averaged metrics.
for estimator_name, metrics in model_scores.items():
    print('\n', estimator_name)
    print('\n', metrics)


 LogisticRegression

 {'accuracy': 0.7166710232851189, 'f1_score': 0.635810221194838, 'precision': 0.7103300923799487, 'recall': 0.5757528770867761}

 BernoulliNB

 {'accuracy': 0.6892178041869922, 'f1_score': 0.6538805831188377, 'precision': 0.6271570538567748, 'recall': 0.6836158273763593}

 LinearSVC

 {'accuracy': 0.7279683550052957, 'f1_score': 0.6470543562063579, 'precision': 0.7311756840329633, 'recall': 0.5806616079670547}


In [None]:
# Build document features from the Doc2Vec model by averaging the vectors in
# model_4.wv for each tweet's tokens.
# NOTE(review): MeanEmbeddingVectorizer only reads `word_model.wv`, so these are
# averaged word vectors, not Doc2Vec's per-document vectors — confirm that this
# is the intended "doc2vec" feature.
mean_vec_tr = MeanEmbeddingVectorizer(model_4)
doc_2_vec = mean_vec_tr.transform(corpus_list)

# Feature matrix and labels for the cross-validation cells that follow.
X = doc_2_vec
y = df_train['target']

In [None]:
# (display name, single-step pipeline) pairs to cross-validate.
est = [
    ('LogisticRegression', Pipeline([('LR', LogisticRegression())])),
    ('BernoulliNB', Pipeline([('BNB', BernoulliNB())])),
    ('LinearSVC', Pipeline([('LNB', LinearSVC())])),
]

In [None]:
# Training: 7-fold cross-validation of every estimator on the feature matrix X.
model_scores = {}

p_scorer = make_scorer(precision_score)
r_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
a_scorer = make_scorer(accuracy_score)

# Evaluate all four metrics in a single cross-validation pass per estimator
# (one fit per fold) instead of refitting the model once per metric.
scoring = {'accuracy': a_scorer, 'f1_score': f1_scorer, 'precision': p_scorer, 'recall': r_scorer}
kfold = KFold(n_splits=7, shuffle=True, random_state=4)

for name, estimator in est:
    cv_results = cross_validate(estimator, X, df_train.target, cv=kfold, scoring=scoring)
    # cross_validate reports each metric under the key 'test_<metric name>'.
    model_scores[name] = {metric: cv_results['test_' + metric].mean() for metric in scoring}



In [None]:
# Print each estimator's name followed by its averaged metrics.
for estimator_name, metrics in model_scores.items():
    print('\n', estimator_name)
    print('\n', metrics)


 LogisticRegression

 {'accuracy': 0.6315500268644716, 'f1_score': 0.4550204362474727, 'precision': 0.6241924468385536, 'recall': 0.3585259356986403}

 BernoulliNB

 {'accuracy': 0.5711273124705264, 'f1_score': 0.023418609263662158, 'precision': 0.5483080411651841, 'recall': 0.011996360490107652}

 LinearSVC

 {'accuracy': 0.6307617285644709, 'f1_score': 0.4447448403936193, 'precision': 0.6282664992526501, 'recall': 0.3447367298862553}


In [None]:
# (display name, single-step pipeline) pairs to cross-validate.
est = [
    ('LogisticRegression', Pipeline([('LR', LogisticRegression())])),
    ('BernoulliNB', Pipeline([('BNB', BernoulliNB())])),
    ('MultinomialNB', Pipeline([('MNB', MultinomialNB())])),
    ('LinearSVC', Pipeline([('LNB', LinearSVC())])),
]

In [None]:
# Training: 7-fold cross-validation of every estimator on the dense TF-IDF dataframe.
model_scores = {}

p_scorer = make_scorer(precision_score)
r_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
a_scorer = make_scorer(accuracy_score)

# Evaluate all four metrics in a single cross-validation pass per estimator
# (one fit per fold) instead of refitting the model once per metric.
scoring = {'accuracy': a_scorer, 'f1_score': f1_scorer, 'precision': p_scorer, 'recall': r_scorer}
kfold = KFold(n_splits=7, shuffle=True, random_state=4)

for name, estimator in est:
    cv_results = cross_validate(estimator, tf_idf_df_train, df_train.target, cv=kfold, scoring=scoring)
    # cross_validate reports each metric under the key 'test_<metric name>'.
    model_scores[name] = {metric: cv_results['test_' + metric].mean() for metric in scoring}

In [None]:
# Print each estimator's name followed by its averaged metrics.
for estimator_name, metrics in model_scores.items():
    print('\n', estimator_name)
    print('\n', metrics)


 LogisticRegression

 {'accuracy': 0.7931171146011303, 'f1_score': 0.7265195937972196, 'precision': 0.8405469172478994, 'recall': 0.6401119882599652}

 BernoulliNB

 {'accuracy': 0.7929866576345349, 'f1_score': 0.7293029869585108, 'precision': 0.831042122427054, 'recall': 0.6503186301194921}

 MultinomialNB

 {'accuracy': 0.7908855757114253, 'f1_score': 0.7250354320315499, 'precision': 0.8329886484597812, 'recall': 0.6422677180266317}

 LinearSVC

 {'accuracy': 0.779194215634736, 'f1_score': 0.7247341480188895, 'precision': 0.7799570060420409, 'recall': 0.6771084948914046}


In [None]:
# Feature matrix (averaged Word2Vec vectors) and labels for the next cross-validation run.
X, y = word_2_vec, df_train['target']

In [None]:
# (display name, single-step pipeline) pairs to cross-validate.
est = [
    ('LogisticRegression', Pipeline([('LR', LogisticRegression())])),
    ('BernoulliNB', Pipeline([('BNB', BernoulliNB())])),
    ('LinearSVC', Pipeline([('LNB', LinearSVC())])),
]

In [None]:
# Training: 7-fold cross-validation of every estimator on the feature matrix X.
model_scores = {}

p_scorer = make_scorer(precision_score)
r_scorer = make_scorer(recall_score)
f1_scorer = make_scorer(f1_score)
a_scorer = make_scorer(accuracy_score)

# Evaluate all four metrics in a single cross-validation pass per estimator
# (one fit per fold) instead of refitting the model once per metric.
scoring = {'accuracy': a_scorer, 'f1_score': f1_scorer, 'precision': p_scorer, 'recall': r_scorer}
kfold = KFold(n_splits=7, shuffle=True, random_state=4)

for name, estimator in est:
    cv_results = cross_validate(estimator, X, df_train.target, cv=kfold, scoring=scoring)
    # cross_validate reports each metric under the key 'test_<metric name>'.
    model_scores[name] = {metric: cv_results['test_' + metric].mean() for metric in scoring}

In [None]:
# Print each estimator's name followed by its averaged metrics.
for estimator_name, metrics in model_scores.items():
    print('\n', estimator_name)
    print('\n', metrics)


 LogisticRegression

 {'accuracy': 0.6238037820924138, 'f1_score': 0.36450915171024517, 'precision': 0.6651327549486655, 'recall': 0.2514706585128233}

 BernoulliNB

 {'accuracy': 0.5685031947461597, 'f1_score': 0.5658523186694605, 'precision': 0.4983113348605916, 'recall': 0.6549464198610193}

 LinearSVC

 {'accuracy': 0.6632127782115605, 'f1_score': 0.5205045876825314, 'precision': 0.6704973761862104, 'recall': 0.4254655655649334}


In [None]:
# Hold out 30% of the cleaned tweets for validation; the fixed seed makes the
# split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    df_train['clean_text'], df_train['target'], test_size=0.3, random_state=0)

In [None]:
# Renumber each split from 0 and discard the shuffled original index.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

X_valid = X_valid.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

In [None]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Dataset of (label, text) pairs read positionally from two pandas objects.

    ``X`` holds the texts and ``y`` the labels; both are indexed with ``.iloc``.
    """

    def __init__(self,X,y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self,idx):
        # Tensor indices are converted to plain Python values before .iloc lookup.
        position = idx.tolist() if torch.is_tensor(idx) else idx
        label_arr = np.array(self.y.iloc[position])
        text_arr = np.array(self.X.iloc[position])
        # The label comes first, then the text.
        return (label_arr, text_arr)

class TestDataset(Dataset):
    """Unlabelled dataset that yields one text per item, read with ``.iloc``."""

    def __init__(self,X):
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self,idx):
        # Tensor indices are converted to plain Python values before .iloc lookup.
        position = idx.tolist() if torch.is_tensor(idx) else idx
        # The item is the bare numpy-wrapped text, not a tuple.
        return np.array(self.X.iloc[position])

In [None]:
# Wrap the train/validation splits as (label, text) datasets and the cleaned
# test tweets as a text-only dataset.
train_dataset = CustomDataset(X_train,y_train)
valid_dataset = CustomDataset(X_valid,y_valid)
test_dataset = TestDataset(df_test['clean_text'])

In [None]:
from collections import Counter
from torchtext.vocab import Vocab

# Count whitespace-separated tokens over the training split only.
counter = Counter()
for _label, line in train_dataset:
    counter.update(str(line).split())

vocab = Vocab(counter)


def text_pipeline(x):
    """Map a text to the list of vocabulary indices of its whitespace tokens."""
    return [vocab[token] for token in str(x).split()]


def label_pipeline(x):
    """Convert a label to a plain Python int."""
    return int(x)

In [None]:
def collate_batch(batch):
    """Collate (label, text) samples into a label tensor and a list of index tensors.

    Returns
    -------
    tuple
        ``(labels, texts)`` where ``labels`` is an int64 tensor with one entry
        per sample and ``texts`` is a list of variable-length int64 tensors
        produced by ``text_pipeline``.
    """
    encoded = [
        (label_pipeline(label), torch.tensor(text_pipeline(text), dtype = torch.int64))
        for (label, text) in batch
    ]
    labels = torch.tensor([pair[0] for pair in encoded], dtype = torch.int64)
    texts = [pair[1] for pair in encoded]
    return labels, texts

In [None]:
def collate_test_batch(batch):
    """Collate unlabelled texts into a list of variable-length int64 index tensors."""
    return [
        torch.tensor(text_pipeline(text), dtype = torch.int64)
        for text in batch
    ]

In [None]:
batch_size = 32

# Training and validation batches are shuffled and collated with labels.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=collate_batch, num_workers=1)

valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True,
                          collate_fn=collate_batch, num_workers=1)

# Test batches keep their original order and carry no labels.
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         collate_fn=collate_test_batch, num_workers=1)

### RNN

In [None]:
class RNN(nn.Module):
    """Embedding + (optionally bidirectional, multi-layer) ReLU RNN + sigmoid head.

    Parameters
    ----------
    num_layers : int
        Number of stacked recurrent layers.
    num_classes : int
        Stored on the instance; the head always emits a single probability.
    input_size : int
        Embedding dimension fed to the RNN.
    hidden_size : int
        Hidden state size per direction.
    vocab : sized
        Only ``len(vocab)`` is used, to size the embedding table.
    bidirectional : bool
        Whether the RNN runs in both directions.
    dropout : float
        Inter-layer dropout; only passed to the RNN when ``num_layers > 1``.
    """

    def __init__(self, num_layers, num_classes, input_size, hidden_size,vocab, bidirectional,dropout):
        super(RNN,self).__init__()
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Number of directions: 1 (unidirectional) or 2 (bidirectional).
        self.bidirectional = 1 if bidirectional == False else 2
        self.embedding = nn.Embedding(len(vocab),input_size)
        nn.init.xavier_normal_(self.embedding.weight)
        # Inter-layer dropout has no effect on a single-layer RNN and only
        # triggers a UserWarning there, so it is disabled in that case.
        rnn_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.RNN(input_size, hidden_size,num_layers,nonlinearity = 'relu', batch_first=True, bias = True, bidirectional = bidirectional, dropout = rnn_dropout)
        self.linear = nn.Linear(hidden_size * self.bidirectional, 1)
        nn.init.xavier_normal_(self.linear.weight)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        """Run a batch of variable-length index tensors through the network.

        Parameters
        ----------
        x : list of 1-D int64 tensors
            One tensor of vocabulary indices per sample (any batch size).

        Returns
        -------
        tuple
            ``(output_padded, hidden, prob)``: the padded per-step outputs, the
            final hidden states and the sigmoid probabilities. ``prob`` has
            shape ``(1, batch, 1)`` for a single-layer unidirectional model and
            ``(batch, 1)`` otherwise.
        """
        lens = list(map(len, x))
        padded = pad_sequence(x, batch_first=True)
        embedded = self.embedding(padded)
        packed = pack_padded_sequence(input = embedded,lengths = lens, batch_first=True, enforce_sorted=False)
        # No explicit h0: the RNN defaults to a zero initial state sized for the
        # actual batch, so partial batches (size != 32) work too.
        packed_output, hidden = self.rnn(packed)
        output_padded, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
        if self.bidirectional == 2:
            # The last two entries are the final layer's forward and backward states.
            logits = self.linear(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
        elif self.num_layers == 1:
            # Kept as-is so the output shape stays (1, batch, 1) for existing callers.
            logits = self.linear(hidden)
        else:
            logits = self.linear(hidden[-1,:,:])
        prob = self.sigmoid(logits)

        return output_padded, hidden, prob

In [None]:
# Hyper-parameters shared by the three RNN variants.
num_layers_rnn_one_layer = 1
num_layers_rnn_two_layers = 2
num_classes = 2
input_size = 5   # embedding dimension
hidden_size = 3  # hidden state size per direction
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid output
dropout = 0.5
# rnn_1: one layer, unidirectional; rnn_2: two layers, unidirectional;
# rnn_3: one layer, bidirectional.
rnn_1 = RNN(num_layers=num_layers_rnn_one_layer, num_classes=num_classes, input_size=input_size, hidden_size=hidden_size,vocab=vocab,bidirectional = False, dropout=dropout)
rnn_2 = RNN(num_layers=num_layers_rnn_two_layers, num_classes=num_classes, input_size=input_size, hidden_size=hidden_size,vocab=vocab,bidirectional = False, dropout=dropout)
rnn_3 = RNN(num_layers=num_layers_rnn_one_layer, num_classes=num_classes, input_size=input_size, hidden_size=hidden_size,vocab=vocab,bidirectional = True, dropout=dropout)
epochs = 10
lr = 0.001
# NOTE(review): weight_decay is defined here but the Adam optimizers created in
# the RNN training cell are built with lr only — confirm whether it should be passed.
weight_decay=0.01

  "num_layers={}".format(dropout, num_layers))


In [None]:
def train_loop(model,criterion,optimizer,train_loader,valid_loader,num_layers,epochs,early_stopping=False,patience =1,scheduler=None):
    """Train ``model`` and evaluate it on ``valid_loader`` after every epoch.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(text)``; must return ``(output, hidden, prob)``.
    criterion : callable
        Loss applied to the flattened probabilities and float labels.
    optimizer : torch.optim.Optimizer
        Optimizer updating ``model``'s parameters.
    train_loader, valid_loader : sized iterables
        Yield ``(label, text)`` batches; ``len()`` gives the number of batches.
    num_layers : int
        Unused; kept for backward compatibility with existing callers.
    epochs : int
        Maximum number of epochs.
    early_stopping : bool, default False
        Stop once the validation loss has failed to improve ``patience`` times.
    patience : int, default 1
        Number of non-improving epochs tolerated before stopping.
    scheduler : optional
        Object with ``step(metric)``, stepped with the validation accuracy.
        When omitted, a module-level ``scheduler`` is used if one exists.

    Returns
    -------
    tuple of lists
        ``(train_losses, valid_losses)`` with one mean loss per completed epoch.
    """
    if scheduler is None:
        # Backward compatibility: earlier callers relied on a global `scheduler`.
        scheduler = globals().get('scheduler')

    train_losses= []
    valid_losses= []
    best_score=None
    counter_early_stop=0
    valid_loss_min= np.inf
    delta = 0

    for epoch in range(epochs):
        print('Epoch : ',epoch+1)

        # Training pass (dropout enabled).
        model.train()
        train_loss=0
        for label,text in train_loader:
            optimizer.zero_grad()
            # Call the model that was passed in rather than dispatching on globals.
            _, _, prob = model(text)
            prob = torch.flatten(prob)
            label = torch.as_tensor(label, dtype = torch.float32)
            loss=criterion(prob,label)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss=train_loss/len(train_loader)

        # Validation pass (dropout disabled, no gradients).
        model.eval()
        valid_loss=0
        correct=0
        total=0
        with torch.no_grad():
            for label,text in valid_loader:
                _, _, prob = model(text)
                prob = torch.flatten(prob)
                label = torch.as_tensor(label, dtype = torch.float32)
                valid_loss += criterion(prob,label).item()
                predicted = (prob > 0.5).to(torch.float32)
                total += label.size(0)
                correct += (predicted == label).sum().item()
        valid_loss=valid_loss/len(valid_loader)
        accuracy = 100 * correct / total
        print('Accuracy : ',accuracy)

        if scheduler is not None:
            scheduler.step(accuracy)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print(f'Epoch {epoch+1:<2d}/{epochs} --> Train Loss: {train_loss:.4f} |  Valid Loss: {valid_loss:.4f}')

        if early_stopping:
            # NOTE: the messages below say "Saving model..." but no checkpoint is written.
            if best_score is None or valid_loss <= best_score + delta:
                best_score = valid_loss
                print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model...')
                counter_early_stop=0
                valid_loss_min = valid_loss
            else:
                counter_early_stop += 1
                print(f'Early stoping counter: {counter_early_stop} out of {patience}')
                if counter_early_stop >= patience:
                    print('Early Stopping')
                    break

    return train_losses, valid_losses


In [None]:
from torch.optim.lr_scheduler import StepLR
from torch.optim.lr_scheduler import ReduceLROnPlateau

# (title, model, number of layers) for each RNN variant, trained in this order.
rnn_runs = [
    ('RNN with one layer', rnn_1, num_layers_rnn_one_layer),
    ('RNN with two layers', rnn_2, num_layers_rnn_two_layers),
    ('Bi-directional RNN with one layer', rnn_3, num_layers_rnn_one_layer),
]

for run_idx, (title, net, depth) in enumerate(rnn_runs):
    if run_idx > 0:
        print('\n')
    print(title)
    print('\n')
    # A fresh optimizer and scheduler per model; train_loop steps `scheduler`
    # with the validation accuracy, hence mode='max'.
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.4, patience=5, verbose=True)
    train_loop(model=net, criterion=criterion, optimizer=optimizer,
               train_loader=train_loader, valid_loader=valid_loader,
               num_layers=depth, epochs=epochs, early_stopping=True, patience=2)

RNN with one layer


Epoch :  1




Accuracy :  65.06129597197898
Epoch 1 /10 --> Train Loss: 0.6887 |  Valid Loss: 0.6801
Validation loss has decreased (inf --> 0.680091). Saving model...
Epoch :  2
Accuracy :  78.67775831873905
Epoch 2 /10 --> Train Loss: 0.6359 |  Valid Loss: 0.5833
Validation loss has decreased (0.680091 --> 0.583321). Saving model...
Epoch :  3
Accuracy :  79.2907180385289
Epoch 3 /10 --> Train Loss: 0.4986 |  Valid Loss: 0.5273
Validation loss has decreased (0.583321 --> 0.527343). Saving model...
Epoch :  4
Accuracy :  78.37127845884413
Epoch 4 /10 --> Train Loss: 0.4098 |  Valid Loss: 0.5270
Validation loss has decreased (0.527343 --> 0.527021). Saving model...
Epoch :  5
Accuracy :  78.19614711033275
Epoch 5 /10 --> Train Loss: 0.3469 |  Valid Loss: 0.5271
Early stoping counter: 1 out of 2
Epoch :  6
Accuracy :  76.40105078809107
Epoch 6 /10 --> Train Loss: 0.2970 |  Valid Loss: 0.6436
Early stoping counter: 2 out of 2
Early Stopping


RNN with two layers


Epoch :  1
Accuracy :  50.525394045534

### GRU

In [None]:
class GRU(nn.Module):
    """Embedding + (optionally bidirectional, multi-layer) GRU + sigmoid head.

    Parameters
    ----------
    num_layers : int
        Number of stacked recurrent layers.
    num_classes : int
        Stored on the instance; the head always emits a single probability.
    input_size : int
        Embedding dimension fed to the GRU.
    hidden_size : int
        Hidden state size per direction.
    vocab : sized
        Only ``len(vocab)`` is used, to size the embedding table.
    bidirectional : bool
        Whether the GRU runs in both directions.
    dropout : float
        Inter-layer dropout; only passed to the GRU when ``num_layers > 1``.
    """

    def __init__(self, num_layers, num_classes, input_size, hidden_size,vocab, bidirectional,dropout):
        super(GRU,self).__init__()
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Number of directions: 1 (unidirectional) or 2 (bidirectional).
        self.bidirectional = 1 if bidirectional == False else 2
        self.embedding = nn.Embedding(len(vocab),input_size)
        # xavier_uniform (no trailing underscore) is deprecated; use the in-place form.
        nn.init.xavier_uniform_(self.embedding.weight)
        # Inter-layer dropout has no effect on a single-layer GRU and only
        # triggers a UserWarning there, so it is disabled in that case.
        gru_dropout = dropout if num_layers > 1 else 0.0
        self.gru = nn.GRU(input_size, hidden_size,num_layers, batch_first=True, bias = True, bidirectional = bidirectional, dropout = gru_dropout)
        self.linear = nn.Linear(hidden_size * self.bidirectional, 1)
        nn.init.xavier_uniform_(self.linear.weight)
        self.linear.bias.data.fill_(1.01)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        """Run a batch of variable-length index tensors through the network.

        Parameters
        ----------
        x : list of 1-D int64 tensors
            One tensor of vocabulary indices per sample (any batch size).

        Returns
        -------
        tuple
            ``(output_padded, hidden, prob)``: the padded per-step outputs, the
            final hidden states and the sigmoid probabilities. ``prob`` has
            shape ``(1, batch, 1)`` for a single-layer unidirectional model and
            ``(batch, 1)`` otherwise.
        """
        lens = list(map(len, x))
        padded = pad_sequence(x, batch_first=True)
        embedded = self.embedding(padded)
        packed = pack_padded_sequence(input = embedded,lengths = lens, batch_first=True, enforce_sorted=False)
        # No explicit h0: the GRU defaults to a zero initial state sized for the
        # actual batch, so partial batches (size != 32) work too.
        packed_output, hidden = self.gru(packed)
        output_padded, output_lengths = pad_packed_sequence(packed_output, batch_first=True)
        if self.bidirectional == 2:
            # The last two entries are the final layer's forward and backward states.
            logits = self.linear(torch.cat((hidden[-2,:,:],hidden[-1,:,:]),dim=1))
        elif self.num_layers == 1:
            # Kept as-is so the output shape stays (1, batch, 1) for existing callers.
            logits = self.linear(hidden)
        else:
            logits = self.linear(hidden[-1,:,:])
        prob = self.sigmoid(logits)

        return output_padded, hidden, prob

In [None]:
# Hyper-parameters shared by the three GRU variants.
num_layers_gru_one_layer = 1
num_layers_gru_two_layers = 2
num_classes = 2
input_size = 5   # embedding dimension
hidden_size = 3  # hidden state size per direction
criterion = nn.BCELoss()  # binary cross-entropy on the sigmoid output
dropout = 0.5
# gru_1: one layer, unidirectional; gru_2: two layers, unidirectional;
# gru_3: one layer, bidirectional.
gru_1 = GRU(num_layers=num_layers_gru_one_layer, num_classes=num_classes, input_size=input_size, hidden_size=hidden_size,vocab=vocab,bidirectional = False, dropout=dropout)
gru_2 = GRU(num_layers=num_layers_gru_two_layers, num_classes=num_classes, input_size=input_size, hidden_size=hidden_size,vocab=vocab,bidirectional = False, dropout=dropout)
gru_3 = GRU(num_layers=num_layers_gru_one_layer, num_classes=num_classes, input_size=input_size, hidden_size=hidden_size,vocab=vocab,bidirectional = True, dropout=dropout)

epochs = 10
lr = 0.001
# NOTE(review): weight_decay is defined here; whether the GRU optimizers use it
# is not visible from this cell — confirm.
weight_decay=0.01

  # Remove the CWD from sys.path while we load stuff.


In [None]:
def train_loop(model,criterion,optimizer,train_loader,valid_loader, num_layers,epochs,early_stopping=False,patience =1,scheduler=None):
    """Train ``model`` and validate it after every epoch.

    Args:
        model: Module whose call returns ``(output, hidden, prob)``; only
            ``prob`` is used here.
        criterion: Loss called as ``criterion(prob, label)`` on flattened
            probabilities and float32 labels.
        optimizer: Optimizer holding the parameters of ``model``.
        train_loader: Iterable of ``(label, text)`` batches used for training.
        valid_loader: Iterable of ``(label, text)`` batches used for validation.
        num_layers: Unused; kept so existing calls keep working.
        epochs: Maximum number of epochs.
        early_stopping: Stop once the validation loss has failed to improve
            for ``patience`` epochs since the best one.
        patience: Number of non-improving epochs tolerated before stopping.
        scheduler: Optional scheduler stepped with the validation accuracy
            after every epoch. When omitted, a module-level variable named
            ``scheduler`` is used if one exists (previous behaviour).

    Returns:
        Tuple ``(train_losses, valid_losses)`` with one mean loss per completed
        epoch. The model is left in evaluation mode.
    """
    if scheduler is None:
        # Backward compatibility: the notebook keeps the scheduler in a global.
        scheduler = globals().get('scheduler')

    train_losses = []
    valid_losses = []
    best_score = None
    counter_early_stop = 0
    valid_loss_min = float('inf')
    delta = 0

    for epoch in range(epochs):
        # Enable dropout for the optimisation pass.
        model.train()
        train_loss = 0
        for label, text in train_loader:
            optimizer.zero_grad()
            # Call the model that was passed in instead of dispatching on globals.
            output, hidden, prob = model(text)
            prob = torch.flatten(prob)
            # as_tensor accepts lists and tensors alike without a copy warning.
            label = torch.as_tensor(label, dtype=torch.float32)
            loss = criterion(prob, label)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        valid_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for label, text in valid_loader:
                output, hidden, prob = model(text)
                prob = torch.flatten(prob)
                label = torch.as_tensor(label, dtype=torch.float32)
                valid_loss += criterion(prob, label).item()
                predicted = (prob > 0.5).to(torch.float32)
                total += label.size(0)
                correct += (predicted == label).sum().item()
        valid_loss = valid_loss / len(valid_loader)
        accuracy = 100 * correct / total
        print('Accuracy : ',accuracy)

        if scheduler is not None:
            scheduler.step(accuracy)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print(f'Epoch {epoch+1:<2d}/{epochs} --> Train Loss: {train_loss:.4f} |  Valid Loss: {valid_loss:.4f}')

        if early_stopping:
            # Higher score is better, so the loss is negated.
            score = -valid_loss
            if best_score is None or score >= best_score + delta:
                # NOTE: the message mentions saving, but no checkpoint is written here.
                print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model...')
                best_score = score
                counter_early_stop = 0
                valid_loss_min = valid_loss
            else:
                counter_early_stop += 1
                print(f'Early stoping counter: {counter_early_stop} out of {patience}')
                if counter_early_stop >= patience:
                    print('Early Stopping')
                    break

    return train_losses, valid_losses




In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# (title, model, number of stacked layers) for every GRU variant to train.
# The bidirectional model has a single layer, so it uses the one-layer constant.
gru_runs = [
    ('GRU with one layer', gru_1, num_layers_gru_one_layer),
    ('GRU with two layers', gru_2, num_layers_gru_two_layers),
    ('Bi-directional GRU with one layer', gru_3, num_layers_gru_one_layer),
]

for run_index, (run_title, run_model, run_layers) in enumerate(gru_runs):
    if run_index > 0:
        print('\n')
    print(run_title)
    print('\n')
    # A fresh optimizer and scheduler per model. `scheduler` must stay a
    # module-level name because train_loop reads it from the notebook globals.
    optimizer = torch.optim.Adam(run_model.parameters(), lr=lr)
    # mode='max' because train_loop steps the scheduler with validation accuracy.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.4, patience=5, verbose=True)
    train_loop(model=run_model,criterion=criterion,optimizer=optimizer,train_loader=train_loader,valid_loader=valid_loader,num_layers=run_layers,epochs=epochs,early_stopping=True,patience =2)

GRU with one layer






Accuracy :  57.31173380035026
Epoch 1 /10 --> Train Loss: 0.7179 |  Valid Loss: 0.6798
Validation loss has decreased (inf --> 0.679846). Saving model...
Accuracy :  69.30823117338004
Epoch 2 /10 --> Train Loss: 0.6525 |  Valid Loss: 0.6269
Validation loss has decreased (0.679846 --> 0.626869). Saving model...
Accuracy :  74.78108581436076
Epoch 3 /10 --> Train Loss: 0.5641 |  Valid Loss: 0.5503
Validation loss has decreased (0.626869 --> 0.550334). Saving model...
Accuracy :  78.15236427320491
Epoch 4 /10 --> Train Loss: 0.4228 |  Valid Loss: 0.4713
Validation loss has decreased (0.550334 --> 0.471320). Saving model...
Accuracy :  79.24693520140104
Epoch 5 /10 --> Train Loss: 0.2921 |  Valid Loss: 0.4633
Validation loss has decreased (0.471320 --> 0.463304). Saving model...
Accuracy :  78.98423817863397
Epoch 6 /10 --> Train Loss: 0.2195 |  Valid Loss: 0.4948
Early stoping counter: 1 out of 2
Accuracy :  78.32749562171628
Epoch 7 /10 --> Train Loss: 0.1738 |  Valid Loss: 0.5302
Early s

### LSTM

In [None]:
class LSTM(nn.Module):
    """Embedding -> (stacked, optionally bidirectional) LSTM -> linear -> sigmoid.

    The head produces a single probability per sample.
    """

    def __init__(self, num_layers, num_classes, input_size, hidden_size,vocab, bidirectional,dropout):
        """Build the layers.

        Args:
            num_layers: Number of stacked LSTM layers.
            num_classes: Stored on the instance; the head always has one output.
            input_size: Embedding dimension, which is also the LSTM input size.
            hidden_size: Size of the LSTM hidden state.
            vocab: Sized collection; ``len(vocab)`` is the number of embeddings.
            bidirectional: Whether the LSTM runs in both directions.
            dropout: Dropout between stacked LSTM layers.
        """
        super(LSTM,self).__init__()
        self.num_layers = num_layers
        self.num_classes = num_classes
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Stored as the number of directions (1 or 2), not as a bool.
        self.bidirectional = 1 if bidirectional == False else 2
        self.embedding = nn.Embedding(len(vocab),input_size)
        # In-place initialiser; the variant without the underscore is deprecated.
        nn.init.xavier_uniform_(self.embedding.weight)
        # nn.LSTM only applies dropout between stacked layers, so a single-layer
        # model gets 0 here to avoid the warning PyTorch emits otherwise.
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bias=True,
                            bidirectional=bidirectional, dropout=dropout if num_layers > 1 else 0.0)
        self.linear = nn.Linear(hidden_size * self.bidirectional, 1)
        nn.init.xavier_uniform_(self.linear.weight)
        self.linear.bias.data.fill_(1.01)
        self.sigmoid = nn.Sigmoid()

    def forward(self,x):
        """Classify a batch of variable-length token-id sequences.

        Args:
            x: Sequence (e.g. a list) of 1-D tensors of token ids, one tensor per
                sample. Samples may have different lengths.

        Returns:
            Tuple ``(output_padded, hidden, cell, prob)``: the padded
            per-timestep LSTM output, the final hidden and cell states returned
            by ``nn.LSTM``, and the sigmoid of the linear head applied to the
            final hidden state of the top layer.
        """
        lens = list(map(len, x))
        # Size the initial states from the actual batch. A hard-coded batch size
        # breaks on every batch of a different size (typically the last one).
        batch_size = len(lens)
        padded = pad_sequence(x, batch_first=True)
        output_embedding = self.embedding(padded)
        packed = pack_padded_sequence(input=output_embedding, lengths=lens, batch_first=True, enforce_sorted=False)
        state_shape = (self.num_layers * self.bidirectional, batch_size, self.hidden_size)
        # new_zeros keeps the states on the same device/dtype as the embeddings.
        h0 = output_embedding.new_zeros(state_shape)
        c0 = output_embedding.new_zeros(state_shape)
        output, (hidden, cell) = self.lstm(packed, (h0, c0))
        output_padded, output_lengths = pad_packed_sequence(output, batch_first=True)
        if self.bidirectional == 2:
            # The final states are stacked layer by layer, forward then backward, so
            # the last two entries belong to the top layer. For a single layer this
            # is the same as indices 0 and 1.
            logits = self.linear(torch.cat((hidden[-2], hidden[-1]), dim=1))
        else:
            # The single-layer branch keeps the leading layer axis, as before;
            # callers flatten the probabilities.
            logits = self.linear(hidden) if self.num_layers == 1 else self.linear(hidden[-1])
        prob = self.sigmoid(logits)

        return output_padded, hidden, cell, prob

In [None]:
# Layer counts used by the LSTM variants.
num_layers_lstm_one_layer, num_layers_lstm_two_layers = 1, 2

# Model hyper-parameters.
num_classes = 2
input_size = 5
hidden_size = 3
criterion = nn.BCELoss()
dropout = 0.5

# Keyword arguments that are identical for all three LSTM variants.
lstm_shared_kwargs = dict(
    num_classes=num_classes,
    input_size=input_size,
    hidden_size=hidden_size,
    vocab=vocab,
    dropout=dropout,
)
lstm_1 = LSTM(num_layers=num_layers_lstm_one_layer, bidirectional=False, **lstm_shared_kwargs)
lstm_2 = LSTM(num_layers=num_layers_lstm_two_layers, bidirectional=False, **lstm_shared_kwargs)
lstm_3 = LSTM(num_layers=num_layers_lstm_one_layer, bidirectional=True, **lstm_shared_kwargs)

# Optimisation settings.
epochs = 10
lr = 0.001
weight_decay = 0.01

  # Remove the CWD from sys.path while we load stuff.


In [None]:
# Test-set predictions (0/1 ints) written by train_loop; read by later cells.
a = []
def train_loop(model,criterion,optimizer,train_loader,valid_loader,test_loader,num_layers,epochs,early_stopping=False,patience =1,scheduler=None,collect_test_predictions=None):
    """Train ``model``, validate it after every epoch and optionally predict on the test set.

    Args:
        model: Module whose call returns ``(output, hidden, cell, prob)``; only
            ``prob`` is used here.
        criterion: Loss called as ``criterion(prob, label)`` on flattened
            probabilities and float32 labels.
        optimizer: Optimizer holding the parameters of ``model``.
        train_loader: Iterable of ``(label, text)`` batches used for training.
        valid_loader: Iterable of ``(label, text)`` batches used for validation.
        test_loader: Iterable of ``text`` batches used for test predictions.
        num_layers: Unused; kept so existing calls keep working.
        epochs: Maximum number of epochs.
        early_stopping: Stop once the validation loss has failed to improve
            for ``patience`` epochs since the best one.
        patience: Number of non-improving epochs tolerated before stopping.
        scheduler: Optional scheduler stepped with the validation accuracy
            after every epoch. When omitted, a module-level variable named
            ``scheduler`` is used if one exists (previous behaviour).
        collect_test_predictions: Whether to predict on ``test_loader`` after
            training and store the 0/1 predictions in the module-level list
            ``a``. ``None`` keeps the previous behaviour: predictions are
            collected only when ``model`` is the module-level ``lstm_3``.

    Returns:
        Tuple ``(train_losses, valid_losses)`` with one mean loss per completed
        epoch. The model is left in evaluation mode.
    """
    if scheduler is None:
        # Backward compatibility: the notebook keeps the scheduler in a global.
        scheduler = globals().get('scheduler')
    if collect_test_predictions is None:
        # Backward compatibility: only the bidirectional model filled `a`.
        collect_test_predictions = model is globals().get('lstm_3')

    train_losses = []
    valid_losses = []
    best_score = None
    counter_early_stop = 0
    valid_loss_min = float('inf')
    delta = 0

    for epoch in range(epochs):
        # Enable dropout for the optimisation pass.
        model.train()
        train_loss = 0
        for label, text in train_loader:
            optimizer.zero_grad()
            # Call the model that was passed in instead of dispatching on globals.
            output, hidden, cell, prob = model(text)
            prob = torch.flatten(prob)
            # as_tensor accepts lists and tensors alike without a copy warning.
            label = torch.as_tensor(label, dtype=torch.float32)
            loss = criterion(prob, label)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss = train_loss / len(train_loader)

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        valid_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for label, text in valid_loader:
                output, hidden, cell, prob = model(text)
                prob = torch.flatten(prob)
                label = torch.as_tensor(label, dtype=torch.float32)
                valid_loss += criterion(prob, label).item()
                predicted = (prob > 0.5).to(torch.float32)
                total += label.size(0)
                correct += (predicted == label).sum().item()
        valid_loss = valid_loss / len(valid_loader)
        accuracy = 100 * correct / total
        print('Accuracy : ',accuracy)

        if scheduler is not None:
            scheduler.step(accuracy)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)

        print(f'Epoch {epoch+1:<2d}/{epochs} --> Train Loss: {train_loss:.4f} |  Valid Loss: {valid_loss:.4f}')

        if early_stopping:
            # Higher score is better, so the loss is negated.
            score = -valid_loss
            if best_score is None or score >= best_score + delta:
                # NOTE: the message mentions saving, but no checkpoint is written here.
                print(f'Validation loss has decreased ({valid_loss_min:.6f} --> {valid_loss:.6f}). Saving model...')
                best_score = score
                counter_early_stop = 0
                valid_loss_min = valid_loss
            else:
                counter_early_stop += 1
                print(f'Early stoping counter: {counter_early_stop} out of {patience}')
                if counter_early_stop >= patience:
                    print('Early Stopping')
                    break

    if collect_test_predictions:
        predictions = []
        model.eval()
        # Inference only: no gradients needed and dropout must stay off.
        with torch.no_grad():
            for text in test_loader:
                output, hidden, cell, prob = model(text)
                predictions.extend((torch.flatten(prob) > 0.5).long().tolist())
        # Replace in place (instead of appending) so re-running the training
        # cell does not pile duplicate predictions onto the shared list.
        a[:] = predictions

    return train_losses, valid_losses


In [None]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

# (title, model, number of stacked layers) for every LSTM variant to train.
# Uses the LSTM layer-count constants; the bidirectional model has one layer.
lstm_runs = [
    ('LSTM with one layer', lstm_1, num_layers_lstm_one_layer),
    ('LSTM with two layers', lstm_2, num_layers_lstm_two_layers),
    ('Bi-directional LSTM with one layer', lstm_3, num_layers_lstm_one_layer),
]

for run_index, (run_title, run_model, run_layers) in enumerate(lstm_runs):
    if run_index > 0:
        print('\n')
    print(run_title)
    print('\n')
    # A fresh optimizer and scheduler per model. `scheduler` must stay a
    # module-level name because train_loop reads it from the notebook globals.
    optimizer = torch.optim.Adam(run_model.parameters(), lr=lr)
    # mode='max' because train_loop steps the scheduler with validation accuracy.
    scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.4, patience=5, verbose=True)
    train_loop(model=run_model,criterion=criterion,optimizer=optimizer,train_loader=train_loader,valid_loader=valid_loader,test_loader=test_loader,num_layers=run_layers,epochs=epochs,early_stopping=True,patience =2)

LSTM with one layer






Accuracy :  56.523642732049034
Epoch 1 /10 --> Train Loss: 0.7469 |  Valid Loss: 0.6860
Validation loss has decreased (inf --> 0.686047). Saving model...
Accuracy :  73.24868651488616
Epoch 2 /10 --> Train Loss: 0.6502 |  Valid Loss: 0.6118
Validation loss has decreased (0.686047 --> 0.611829). Saving model...
Accuracy :  77.27670753064798
Epoch 3 /10 --> Train Loss: 0.5172 |  Valid Loss: 0.5050
Validation loss has decreased (0.611829 --> 0.504964). Saving model...
Accuracy :  77.97723292469352
Epoch 4 /10 --> Train Loss: 0.3669 |  Valid Loss: 0.4755
Validation loss has decreased (0.504964 --> 0.475521). Saving model...
Accuracy :  77.84588441330999
Epoch 5 /10 --> Train Loss: 0.2653 |  Valid Loss: 0.4896
Early stoping counter: 1 out of 2
Accuracy :  78.32749562171628
Epoch 6 /10 --> Train Loss: 0.2007 |  Valid Loss: 0.5223
Early stoping counter: 2 out of 2
Early Stopping


LSTM with two layers


Accuracy :  53.02101576182137
Epoch 1 /10 --> Train Loss: 0.7553 |  Valid Loss: 0.6909
Val