In [None]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import xticks
from nltk.corpus import stopwords
import nltk
import re
from nltk.stem import WordNetLemmatizer
import string
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import defaultdict
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import ModelCheckpoint, EarlyStopping, Callback
import tensorflow as tf
from sklearn.metrics import f1_score
from wordcloud import WordCloud,STOPWORDS
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from keras.preprocessing.sequence import pad_sequences
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense,Flatten,Embedding,Activation,Dropout
from keras.layers import Conv1D,MaxPooling1D,GlobalMaxPooling1D,LSTM
from keras.layers import Bidirectional

In [None]:
from google.colab import files


In [None]:
from google.colab import drive


In [None]:
# Mount Google Drive at /content/gdrive, forcing a remount if it is already mounted.
drive.mount('/content/gdrive',force_remount=True)


In [None]:
# Project folder on the mounted Drive. Anchored to the '/content/gdrive' mount point so
# it keeps resolving even if the working directory is no longer /content.
path = '/content/gdrive/My Drive/grrr/'


In [None]:
# Load the tweets file from the project folder.
tweets_csv = path + "final_tweets.csv"
d0 = pd.read_csv(tweets_csv, encoding='utf-8')

In [None]:
# Peek at the first five rows.
d0.head(5)

In [None]:
# Give the tweet column the short name used by the rest of the notebook.
d0 = d0.rename(columns={'Cleaned Tweet 1': 'text'})

In [None]:
# Keep only the tweet text and its label, as an independent copy.
d0 = d0.loc[:, ['text', 'target']].copy()

In [None]:
# Display the text column.
d0['text']

In [None]:
from collections import Counter
def create_vocab(df):
    """Count how often each whitespace-separated token occurs in ``df["text"]``.

    Args:
        df: DataFrame with a ``text`` column whose values are strings.

    Returns:
        collections.Counter mapping each token to its number of occurrences
        across all rows.
    """
    vocab = Counter()
    # Iterate over the values themselves: looking rows up by label with
    # ``df.text[i]`` for i in range(n) raises KeyError whenever the index is not
    # exactly 0..n-1 (for example after rows were filtered or dropped).
    for text in df["text"]:
        vocab.update(text.split())
    return vocab

In [None]:
# Force the text column to str so .split() works on every row. Missing tweets are
# replaced with "" first; a plain astype(str) would turn NaN into the literal word "nan",
# which would then be counted as a vocabulary token.
d0["text"] = d0["text"].fillna("").astype(str)

In [None]:
# Show the dtype of each column.
d0.dtypes

In [None]:

# Build the token -> count vocabulary from the whole dataset.
vocab = create_vocab(d0)

# Number of distinct tokens in the vocabulary.
len(vocab)

In [None]:
# The 50 most frequent tokens in the vocabulary, with their counts.
vocab.most_common(50)

In [None]:
# Final vocabulary: keep only the words that occur at least `min_occur` times.
min_occur = 2
final_vocab = [word for word, count in vocab.items() if count >= min_occur]

In [None]:
# Number of words kept in the final vocabulary.
vocab_size = len(final_vocab)
vocab_size

In [None]:
# function to filter the dataset, keep only words which are present in the vocab
def filter(tweet):
    """Return ``tweet`` with every word that is not in ``final_vocab`` removed.

    Kept words stay in their original order and each one is followed by a single
    space, so a non-empty result ends with a trailing space.

    NOTE(review): ``final_vocab`` is a list, so each membership test is a linear scan.
    """
    return "".join(word + ' ' for word in tweet.split() if word in final_vocab)

In [None]:
# Drop out-of-vocabulary words from every tweet in the dataset.
d0['text'] = d0['text'].apply(filter)

In [None]:
# the different units into which you can break down text (words, characters, or n-grams) are called tokens, 
# and breaking text into such tokens is called tokenization, this can be achieved using Tokenizer in Keras

from keras.preprocessing.text import Tokenizer
# fit a tokenizer
def create_tokenizer(lines):
    """Create a Keras ``Tokenizer`` capped at ``vocab_size`` words and fit it on ``lines``.

    Args:
        lines: the texts used to build the tokenizer's word index.

    Returns:
        The fitted ``Tokenizer``.
    """
    fitted = Tokenizer(num_words=vocab_size)
    # Fitting builds the word index that later turns strings into lists of integer indices.
    fitted.fit_on_texts(lines)
    return fitted

In [None]:
# function to calculate f1 score for each epoch
import keras.backend as K
def get_f1(y_true, y_pred):
    """F1 score built from Keras backend ops (taken from old Keras source code).

    Both inputs are clipped to [0, 1] and rounded before counting, and
    ``K.epsilon()`` is added to every denominator to avoid division by zero.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        2 * precision * recall / (precision + recall + epsilon).
    """
    eps = K.epsilon()
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_pos = K.sum(K.round(K.clip(y_true, 0, 1)))
    flagged_pos = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = hits / (flagged_pos + eps)
    recall = hits / (actual_pos + eps)
    return 2*(precision*recall)/(precision+recall+eps)

In [None]:
from keras.layers import Embedding
from keras import preprocessing

# Number of words to consider as features.
max_features = vocab_size
# Cuts off the text after this number of words (among the max_features most common words).
maxlen = 100

# Fit the tokenizer on the filtered tweets, then encode each tweet as a list of word indices.
# NOTE(review): the tokenizer is fit on every row of d0, including the rows that
# train_test_split later assigns to the test set.
tokenizer = create_tokenizer(d0.text)
sequences = tokenizer.texts_to_sequences(d0.text)

# 2D integer tensor of shape (samples, maxlen); shorter sequences are padded with 0s.
train_text = pad_sequences(sequences, maxlen=maxlen)

In [None]:
# Hold out 20% of the padded sequences as the test set; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    train_text,
    d0['target'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline classifier, built as a single layer list:
#   Embedding -> activations of shape (samples, maxlen, 8); input_length is fixed so
#                the embedded inputs can be flattened afterwards
#   Flatten   -> 2D tensor of shape (samples, maxlen * 8)
#   Dense     -> one sigmoid unit for the binary classification
model = Sequential([
    Embedding(vocab_size, 8, input_length=maxlen),
    Flatten(),
    Dense(1, activation='sigmoid'),
])

# Compile with the custom F1 metric and print the layer summary.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=[get_f1])
model.summary()


In [None]:
# Training callbacks for the embedding model.
callbacks_list = [
    # Stop when the validation F1 stops improving. The validation key ('val_get_f1')
    # is monitored rather than the training one, and mode='max' is explicit because
    # F1 is a higher-is-better score whose direction Keras cannot infer from the
    # custom metric name.
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    # Keep only the weights of the epoch with the best validation loss.
    ModelCheckpoint(filepath=path+'./embd.h5', monitor='val_loss', save_best_only=True),
]

In [None]:
# Train the model; 20% of the training data is set aside for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [None]:
# check model performance
# The tracked metric is get_f1, so `acc` / `val_acc` hold per-epoch F1 scores
# (the variable names are kept for compatibility); the plot is labelled as F1.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.xlabel('Epoch')
plt.ylabel('F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
import tensorflow as tf

# Custom objects handed to the loader: the get_f1 metric the model was compiled with.
dependencies = dict(get_f1=get_f1)

# Load the checkpointed embedding model from disk.
embd_checkpoint = path + './embd.h5'
loaded_model_embd = tf.keras.models.load_model(embd_checkpoint, custom_objects=dependencies)

In [None]:
# Prediction on the test dataset.
# Sequential.predict_classes is no longer available in recent TensorFlow releases;
# thresholding the sigmoid output at 0.5 gives the same 0/1 labels.
y_pred = (loaded_model_embd.predict(X_test) > 0.5).astype("int32")

# Precision / recall / F1 per class on the held-out test set.
print(classification_report(y_test, y_pred))


In [None]:
# Simple RNN model

In [None]:
# Input dimension of the Embedding layer. Tied to the tokenizer's vocabulary size
# (max_features) instead of a hard-coded 100000: the tokenizer never emits an index
# >= vocab_size, so a larger table only adds unused rows, and a smaller one would
# put indices out of range.
max_words = max_features

In [None]:
from keras.layers import Embedding, SimpleRNN

# Recurrent classifier: 32-d embeddings -> SimpleRNN(32) -> single sigmoid unit.
model = Sequential([
    Embedding(max_words, 32),
    SimpleRNN(32),
    Dense(1, activation='sigmoid'),
])
model.summary()

In [None]:
# Training callbacks for the SimpleRNN model.
callbacks_list = [
    # Stop when the validation F1 stops improving. mode='max' is explicit because F1
    # is a higher-is-better score whose direction Keras cannot infer from the custom
    # metric name.
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    # Keep only the weights of the epoch with the best validation loss.
    ModelCheckpoint(filepath=path+'./SRNN.h5', monitor='val_loss', save_best_only=True),
]
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=[get_f1])



In [None]:
# Train the SimpleRNN model; 20% of the training data is set aside for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=100,
    validation_split=0.2,
    callbacks=callbacks_list,
)

In [None]:
# check model performance
# The tracked metric is get_f1, so `acc` / `val_acc` hold per-epoch F1 scores
# (the variable names are kept for compatibility); the plot is labelled as F1.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.xlabel('Epoch')
plt.ylabel('F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Custom objects handed to the loader: the get_f1 metric the model was compiled with.
dependencies = {
    'get_f1': get_f1
}


# Load the model from disk. This goes through tf.keras, like the other load cells:
# the visible cells only import keras submodules and `keras.backend as K`, so the
# bare name `keras` is not defined and `keras.models.load_model` raises NameError.
loaded_model_SRNN = tf.keras.models.load_model(path+'./SRNN.h5',custom_objects=dependencies)

In [None]:
# Prediction on the test dataset.
# Sequential.predict_classes is no longer available in recent TensorFlow releases;
# thresholding the sigmoid output at 0.5 gives the same 0/1 labels.
y_pred = (loaded_model_SRNN.predict(X_test) > 0.5).astype("int32")

# Precision / recall / F1 per class on the held-out test set.
print(classification_report(y_test, y_pred))

In [None]:
# GRU model

In [None]:
from keras.layers import GRU

# GRU classifier: 32-d embeddings -> GRU(32) -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_features, 32))
model.add(GRU(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=[get_f1])

callbacks_list = [
    # Stop when the validation F1 stops improving. mode='max' is explicit because F1
    # is a higher-is-better score whose direction Keras cannot infer from the custom
    # metric name.
    EarlyStopping(monitor='val_get_f1', mode='max', patience=1),
    # Keep only the weights of the epoch with the best validation loss.
    ModelCheckpoint(filepath=path+'./GRU.h5', monitor='val_loss', save_best_only=True),
]

# Train the model; 20% of the training data is set aside for validation.
history = model.fit(X_train, y_train,
                    epochs=100,
                    batch_size=128,
                    callbacks=callbacks_list,
                    validation_split=0.2)

In [None]:
# check model performance
# The tracked metric is get_f1, so `acc` / `val_acc` hold per-epoch F1 scores
# (the variable names are kept for compatibility); the plot is labelled as F1.
acc = history.history['get_f1']
val_acc = history.history['val_get_f1']

loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

plt.plot(epochs, acc, 'bo', label='Training F1')
plt.plot(epochs, val_acc, 'b', label='Validation F1')
plt.title('Training and validation F1 score')
plt.xlabel('Epoch')
plt.ylabel('F1 score')
plt.legend()

plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Custom objects handed to the loader: the get_f1 metric the model was compiled with.
dependencies = dict(get_f1=get_f1)

# Load the checkpointed GRU model from disk.
gru_checkpoint = path + './GRU.h5'
loaded_model_GRU = tf.keras.models.load_model(gru_checkpoint, custom_objects=dependencies)


In [None]:
# Prediction on the test dataset.
# Sequential.predict_classes is no longer available in recent TensorFlow releases;
# thresholding the sigmoid output at 0.5 gives the same 0/1 labels.
y_pred = (loaded_model_GRU.predict(X_test) > 0.5).astype("int32")

# Precision / recall / F1 per class on the held-out test set.
print(classification_report(y_test, y_pred))
