### IMPORT MODULES

Make sure you run requirements.txt to import all the modules

In [71]:
import os
import sys
import pickle
from pathlib import Path
import pandas as pd
import json
import pickle
import re
from pathlib import Path
import nltk
from time import time
from emoji import demojize
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelBinarizer

### DATASET

Dataset class defines how the data is handled throught the model

In [72]:
class Dataset:
    def __init__(self, filename, label_col='label', text_col='text'):
        self.filename = filename
        self.label_col = label_col
        self.text_col = text_col

    def data(self):
        data = self.dataframe[[self.label_col, self.text_col]].copy()
        data.columns = ['label', 'text']
        return data

    def cleaned_data(self):
        data =  self.dataframe[[self.label_col, 'cleaned']]
        data.columns = ['label', 'text']
        return data

    def load(self):
        df = pd.read_csv(Path(self.filename).resolve())
        self.dataframe = df

    def preprocess_texts(self, quiet=False):
        self.dataframe['cleaned'] = preprocess(self.dataframe[self.text_col], quiet)

### DATA PREPROCESSING

1. Lowercasing the post content
2. Remove hyperlinks 
3. Converting emojis to text
4. Remove stopwords

In [73]:
def preprocess(texts, quiet=False):
    texts = texts.str.lower()
    texts = texts.str.replace(r"(http|@)\S+", "")
    texts = texts.apply(demojize)
    texts = texts.str.replace(r"::", ": :")
    texts = texts.str.replace(r"’", "'")
    texts = texts.str.replace(r"[^a-z\':_]", " ")
    pattern = re.compile(r"(.)\1{2,}", re.DOTALL)
    texts = texts.str.replace(pattern, r"\1")
    texts = texts.str.replace(r"(can't|cannot)", 'can not')
    texts = texts.str.replace(r"n't", ' not')
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.remove('not')
    stopwords.remove('nor')
    stopwords.remove('no')
    texts = texts.apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords]))
    print("Preprocessing done")
    return texts

### TRAIN FILE

The JSON formatted train file is converted to CSV format with the columns - id, label and text

In [74]:
def train_file(f):
    file = open(f)
    data = json.load(file)
    df = pd.DataFrame(columns=["id","label","text"])
    for key in data:
        emotions = data[key]["emotion"]
        for emo,value in emotions.items():
            if(value == True):
                df = df.append({"id":key,"label":emo,"text":data[key]["body"]}, ignore_index = True)
    df.to_csv("nlp_train.csv")
    
#Change the name of the JSON Train file accordingly
train_file("nlp_train.json")

In [75]:
#Project path is added to the ENV VARIABLE - PYTHONPATH
sys.path.append(Path(os.path.join(os.path.abspath(''), '../')).resolve().as_posix())
dataset_path = Path('nlp_train.csv').resolve()
dataset = Dataset(dataset_path)
dataset.load()
dataset.preprocess_texts()
cleaned_df = dataset.cleaned_data()
cleaned_df.head()

Preprocessing done


Unnamed: 0,label,text
0,anger,answering question criticism individual referr...
1,disgust,answering question criticism individual referr...
2,pessimism,answering question criticism individual referr...
3,anger,i'm going start today's discussion thread pers...
4,anticipation,i'm going start today's discussion thread pers...


### TOKENIZE

The input text words are converted into word vectors and saved to "tokenizer.pkl" 

In [76]:
num_words = cleaned_df['text'].str.split().str.len().sum()
tokenizer = Tokenizer(num_words=num_words, lower=True)
tokenizer.fit_on_texts(cleaned_df.text)
file_to_save = Path('tokenizer.pickle').resolve()
with file_to_save.open('wb') as file:
    pickle.dump(tokenizer, file)

### TRAIN, TEST AND VALIDATION DATA

The test data is also split here

In [84]:
train = pd.DataFrame(columns=['label', 'text'])
validation = pd.DataFrame(columns=['label', 'text'])
test = pd.DataFrame(columns=['label','test'])
for label in cleaned_df.label.unique():
    label_data = cleaned_df[cleaned_df.label == label]
    train_data, validation_data = train_test_split(label_data, test_size=0.2)
    train = pd.concat([train, train_data])
    validation = pd.concat([validation, validation_data])

test.to_csv("nlp_test.csv") #Test data created to test the data

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  import sys
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  if __name__ == '__main__':
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # Remove the CWD from sys.path while we load stuff.


### NEURAL MODEL  

A Stacked Bidirectional LSTM, CNN model is created

In [78]:
input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
num_classes = len(cleaned_df.label.unique())
embedding_dim = 500
input_length = 100
lstm_units = 128
lstm_dropout = 0.1
recurrent_dropout = 0.1
spatial_dropout=0.2
filters=64
kernel_size=3

input_layer = Input(shape=(input_length,))
output_layer = Embedding(
  input_dim=input_dim,
  output_dim=embedding_dim,
  input_shape=(input_length,)
)(input_layer)

output_layer = SpatialDropout1D(spatial_dropout)(output_layer)

output_layer = Bidirectional(
LSTM(lstm_units, return_sequences=True,
     dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)
)(output_layer)

output_layer = Bidirectional(
LSTM(lstm_units, return_sequences=True,
     dropout=lstm_dropout, recurrent_dropout=recurrent_dropout)
)(output_layer)

output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',
                    kernel_initializer='glorot_uniform')(output_layer)

avg_pool = GlobalAveragePooling1D()(output_layer)
max_pool = GlobalMaxPooling1D()(output_layer)
output_layer = concatenate([avg_pool, max_pool])

output_layer = Dense(num_classes, activation='softmax')(output_layer)

model = Model(input_layer, output_layer)

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding_6 (Embedding)         (None, 100, 500)     13523000    input_7[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_6 (SpatialDro (None, 100, 500)     0           embedding_6[0][0]                
__________________________________________________________________________________________________
bidirectional_10 (Bidirectional (None, 100, 256)     644096      spatial_dropout1d_6[0][0]        
____________________________________________________________________________________________

### ENCODING & TRAINING

In [79]:
train_sequences = [text.split() for text in train.text]
validation_sequences = [text.split() for text in validation.text]
list_tokenized_train = tokenizer.texts_to_sequences(train_sequences)
list_tokenized_validation = tokenizer.texts_to_sequences(validation_sequences)
x_train = pad_sequences(list_tokenized_train, maxlen=input_length)
x_validation = pad_sequences(list_tokenized_validation, maxlen=input_length)

encoder = LabelBinarizer()
encoder.fit(cleaned_df.label.unique())

encoder_path = Path('../', 'encoder.pickle')
with encoder_path.open('wb') as file:
    pickle.dump(encoder, file)

y_train = encoder.transform(train.label)
y_validation = encoder.transform(validation.label)

In [80]:
batch_size = 128
epochs = 25
model.fit(
    x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_validation, y_validation)
)

Train on 4080 samples, validate on 516 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x130b44b50>

In [81]:
model_file = Path('../model_weights.h5').resolve()
model.save_weights(model_file.as_posix())