In [28]:
# Importing required libraries

import numpy as np
import pandas as pd
from random import sample

import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import Bidirectional
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence, text

from sklearn.model_selection import train_test_split

import nltk
nltk.download('wordnet')
import nltk
from nltk.stem import WordNetLemmatizer 

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [29]:
data = pd.read_csv("../input/twitter-and-reddit-sentimental-analysis-dataset/Twitter_Data.csv", encoding = "utf-8")

In [30]:
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [31]:
data["category"].value_counts()

 1.0    72250
 0.0    55213
-1.0    35510
Name: category, dtype: int64

Due to imblance in class, the model will not rain properly. Hence as the lowest value is 35510, which large enough, we can apply undersampling
to make all the categories equal.

In [32]:
pos_data = data[data["category"] == 1]
neu_data = data[data["category"] == 0]
neg_data = data[data["category"] == -1]

In [33]:
pos_data = pos_data.sample(len(neg_data))
neu_data = neu_data.sample(len(neg_data))

In [34]:
new_data = pd.concat([pos_data, neu_data, neg_data], axis = 0)

In [35]:
new_data = new_data.sample(frac = 1)

In [36]:
new_data["Positive"]=0
new_data["Neutral"]=0
new_data["Negative"]=0

In [37]:
for i in range(len(new_data)):
    cat = data.iloc[i, 1]
    if cat < 0:
        data.loc[i, "Negative"] = 1
    if cat == 0:
        data.loc[i, "Neutral"] = 1
    if cat > 1:
        data.loc[i, "Positive"] = 1

data.drop("category", axis = 1, inplace = True)

In [38]:
len(new_data), len(data)

(106530, 162980)

In [39]:
new_data.drop("category", axis = 1, inplace = True)

In [40]:
X = new_data.loc[:,"clean_text"]
X = X.values
y = new_data.drop(["clean_text"], axis = 1)
y = y.values

In [41]:
len(X)

106530

In [42]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

for i in range(len(X)):
    sentence = str(X[i])
    words = nltk.word_tokenize(sentence)
    lemmatized_sentence = ' '.join([lemmatizer.lemmatize(w) for w in words])
    X[i] = lemmatized_sentence

In [43]:
# Max words in a sentence

max_length = new_data['clean_text'].apply(lambda x:len(str(x).split())).max()
max_length

52

In [44]:
# Tokenizing all words

tokenizer = text.Tokenizer()

tokenizer.fit_on_texts(list(X))

x_seq = tokenizer.texts_to_sequences(X)
x_pad = sequence.pad_sequences(x_seq, maxlen = max_length, padding ="post")

word_ids = tokenizer.word_index

In [45]:
len(x_pad), len(y)

(106530, 106530)

In [46]:
x_train, x_test, y_train, y_test = train_test_split(x_pad, y, 
                                                  random_state=42, 
                                                  test_size=0.2, shuffle=True)

In [47]:
# Using GloVe(length = 200, due to memory restrictions) dataset for word embedding

glove = open("../input/glove6b200d/glove.6B.200d.txt")

# creating an embedding matrix for the embedding layer

embd_arrays = {}

for line in glove:
    line = line.split(" ")
    word = line[0]
    word_emd = np.asarray([integer for integer in line[1:]])
    embd_arrays[word] = word_emd

glove.close()

In [48]:
# Preparing embedding matrix containing embedding vectors from glove

embed_matrix = np.zeros((len(word_ids) + 1, 200))
for word, i in list(word_ids.items()):
    embed_vector = embd_arrays.get(word)
    if embed_vector is not None:
        embed_matrix[i] = embed_vector

In [52]:
# Creating the model 

opt = tf.keras.optimizers.Adam(clipnorm=1.)

model_lstm = Sequential()
model_lstm.add(Embedding(len(word_ids) + 1,
                 200,
                 weights=[embed_matrix],
                 input_length=max_length,
                 trainable=True))


model_lstm.add((LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
model_lstm.add(tf.keras.layers.BatchNormalization())
model_lstm.add(Dense(128, activation = 'relu'))
model_lstm.add(Dense(64, activation = 'relu'))
model_lstm.add(Dense(3, activation='sigmoid'))
model_lstm.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=opt,metrics=['accuracy'])

model_lstm.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 52, 200)           16045600  
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                67840     
_________________________________________________________________
batch_normalization (BatchNo (None, 64)                256       
_________________________________________________________________
dense_12 (Dense)             (None, 128)               8320      
_________________________________________________________________
dense_13 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_14 (Dense)             (None, 3)                 195       
Total params: 16,130,467
Trainable params: 16,130,339
Non-trainable params: 128
________________________________________

In [53]:
model_lstm.fit(x_train, y_train, epochs=3, batch_size=64, validation_split = 0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f6fcdb22e50>

In [54]:
model_lstm.fit(x_train, y_train, epochs=3, batch_size=64, validation_split = 0.2)

Epoch 1/3
Epoch 2/3
Epoch 3/3

KeyboardInterrupt: 

In [55]:
model_lstm.evaluate(x_test, y_test)



[2.9614120045096115e-09, 0.9719327688217163]

In [56]:
from sklearn.metrics import precision_score

In [57]:
y_preds = model_lstm.predict(x_test)

In [58]:
precision_score(y_test, y_preds)

ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

In [51]:
model_lstm.save_weights("v1.h5")