In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import pickle
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall
from tensorflow.keras.optimizers import SGD
import keras.backend as K
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('unrest_2.csv')
df

Unnamed: 0,clean_text,category
0,allnews2nite The international Society should...,1
1,rosslynpark have already been through a rigor...,1
2,Join the race to curb the silence on breast ca...,0
3,Family I need your attention please we are ...,0
4,PradeepgyawaliK communist ko suvkamana yeti s...,0
...,...,...
6182,I wanna throw my feelings in the trash,0
6183,Current love situation I strike to burn and ...,0
6184,Why are we here Intelligence community should...,1
6185,Business is time or nothing something that th...,0


In [3]:
df.isnull().sum()
df.dropna(axis=0, inplace=True)

In [4]:
df

Unnamed: 0,clean_text,category
0,allnews2nite The international Society should...,1
1,rosslynpark have already been through a rigor...,1
2,Join the race to curb the silence on breast ca...,0
3,Family I need your attention please we are ...,0
4,PradeepgyawaliK communist ko suvkamana yeti s...,0
...,...,...
6182,I wanna throw my feelings in the trash,0
6183,Current love situation I strike to burn and ...,0
6184,Why are we here Intelligence community should...,1
6185,Business is time or nothing something that th...,0


In [5]:
def tweet_to_words(tweet):
    text = tweet.lower()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    words = text.split()
    words = [w for w in words if w not in stopwords.words("english")]
    words = [PorterStemmer().stem(w) for w in words]
    return words

print("\nOriginal tweet ->", df['clean_text'][0])
print("\nProcessed tweet ->", tweet_to_words(df['clean_text'][0]))


Original tweet ->  allnews2nite The international Society should listen to the Libyan people voice by all means   listening and talking to the Armed groups looks like having dialogue with Mafias and gangs   those do not care about concept of building respected  civilized state 

Processed tweet -> ['allnews2nit', 'intern', 'societi', 'listen', 'libyan', 'peopl', 'voic', 'mean', 'listen', 'talk', 'arm', 'group', 'look', 'like', 'dialogu', 'mafia', 'gang', 'care', 'concept', 'build', 'respect', 'civil', 'state']


In [6]:
X = list(map(tweet_to_words, df['clean_text'][:10000]))

In [7]:
Y = df['category'][:10000]

In [8]:
y = pd.get_dummies(df['category'][:10000])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [9]:
vocabulary_size = 5000
count_vector = CountVectorizer(max_features=vocabulary_size,
                                preprocessor=lambda x: x,
                               tokenizer=lambda x: x) 
X_train = count_vector.fit_transform(X_train).toarray()
X_test = count_vector.transform(X_test).toarray()

In [10]:
max_words = 5000
max_len=50

def tokenize_pad_sequences(text):
    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    X = tokenizer.texts_to_sequences(text)
    X = pad_sequences(X, padding='post', maxlen=max_len)
    return X, tokenizer

print('Before Tokenization & Padding \n', df['clean_text'][0])
X, tokenizer = tokenize_pad_sequences(df['clean_text'])
print('After Tokenization & Padding \n', X[0])

Before Tokenization & Padding 
  allnews2nite The international Society should listen to the Libyan people voice by all means   listening and talking to the Armed groups looks like having dialogue with Mafias and gangs   those do not care about concept of building respected  civilized state 
After Tokenization & Padding 
 [4689    1  197  435   89 1190    3    1  806   38 1191   28   37  376
 2125    8  762    3    1   61  260  807   75  478  547   21 4690    8
 4691  127   53   25  339   58 2126    5  808 2127 4692  169    0    0
    0    0    0    0    0    0    0    0]


In [11]:
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [12]:
y = pd.get_dummies(df['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print('Train Set ->', X_train.shape, y_train.shape)
print('Test Set ->', X_test.shape, y_test.shape)

Train Set -> (4949, 50) (4949, 2)
Test Set -> (1238, 50) (1238, 2)


In [13]:
def f1_score(precision, recall):
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [14]:
vocab_size = 5000
embedding_size = 32
epochs=20
learning_rate = 0.1
decay_rate = learning_rate / epochs
momentum = 0.8

sgd = SGD(lr=learning_rate, momentum=momentum, decay=decay_rate, nesterov=False)
model= Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(tf.compat.v1.keras.layers.CuDNNLSTM(32)))
model.add(Dropout(0.2))
model.add(Dense(2, activation='softmax'))

  super(SGD, self).__init__(name, **kwargs)





In [18]:
print(model.summary())

model.compile(loss='binary_crossentropy', optimizer=sgd, 
               metrics=['accuracy', Precision(), Recall()])


history = model.fit(X_train, y_train,
                      validation_data=(X_test, y_test),
                      batch_size=64, epochs=30, verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            160000    
                                                                 
 conv1d (Conv1D)             (None, 50, 32)            3104      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 25, 32)           0         
 )                                                               
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16896     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 2)                 1

KeyboardInterrupt: 

In [16]:
loss, accuracy, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print('')
print('Accuracy  : {:.4f}'.format(accuracy))
print('Precision : {:.4f}'.format(precision))
print('Recall    : {:.4f}'.format(recall))
print('F1 Score  : {:.4f}'.format(f1_score(precision, recall)))


Accuracy  : 0.7561
Precision : 0.7561
Recall    : 0.7561
F1 Score  : 0.7561


In [17]:
model.save('model-v4.h5')