In [1]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at /content/gdrive.
# NOTE(review): the cells in this notebook read the CSV from '/content/...' and
# the model pickles from paths relative to the working directory, neither of
# which is under this mount point -- confirm whether the mount is still needed.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Dense
from tensorflow.keras.layers import SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.optimizers import Adam
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve, auc

# Fetch the NLTK resources used by remove_stopwords (stop-word list + Punkt tokenizer).
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt data from the 'punkt_tab' package instead
# of the pickled 'punkt' one; without it word_tokenize raises LookupError on a
# fresh kernel. Downloading both keeps the notebook runnable across versions.
nltk.download('punkt_tab')

# Lazily-built stop-word set shared by every remove_stopwords call.
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the English stop-word set (plus 'br'), building it only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Built on first use (not at definition time) so the NLTK corpus only
        # has to be available once the function is actually called.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english')) | {'br'}
    return _STOP_WORDS_CACHE


def remove_stopwords(text):
    """Drop English stop words (and the token 'br') from ``text``.

    The text is split with NLTK's ``word_tokenize``; a token is removed when
    its lower-cased form is in the stop-word set. Surviving tokens keep their
    original casing and are re-joined with single spaces.

    The stop-word set is cached because this function is applied once per
    DataFrame row; rebuilding the set from the corpus on every call is pure
    loop-invariant work.

    Args:
        text: The string to filter.

    Returns:
        The filtered tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    return ' '.join(
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )

def _clean_text(text):
    """Apply the regex clean-up steps to one (already stopword-filtered) string.

    The order and the patterns are kept exactly as originally written so the
    resulting vocabulary does not change. Known quirks, left in place on purpose:

    * the URL pattern's ``.*`` also removes everything after the URL on that line;
    * '@' is already replaced by the non-alphanumeric step, so the username
      pattern that follows can no longer match anything;
    * 'user' is removed as a plain substring, including inside longer words;
    * after the non-alphanumeric step only letters, digits, spaces and newlines
      remain, so the final punctuation pattern has nothing left to remove.

    NOTE(review): presumably the pickled models were trained on text cleaned
    this way -- confirm before "fixing" any of the quirks above.
    """
    text = re.sub(r'https?://.*[\r\n]*', ' ', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9 \n]', ' ', text)  # Remove non-alphanumeric characters
    text = re.sub(r'@[\w]*', '', text)  # Remove Twitter usernames
    text = re.sub(r'\d+', ' ', text)  # Remove digits
    text = re.sub(r'user', '', text)  # Remove the 'user' placeholder
    text = ' '.join(word for word in text.split() if len(word) > 1)  # Remove short words
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.lower()


def preprocess_text(data):
    """Return a copy of ``data`` whose 'text' column has been cleaned.

    Each value of ``data['text']`` is passed through ``remove_stopwords`` and
    then through the regex clean-up steps (URLs, non-alphanumerics, digits,
    the 'user' placeholder, one-character words, lower-casing).

    Unlike the earlier version, the input frame is NOT modified: the cleaned
    column is attached to a new DataFrame. This avoids SettingWithCopyWarning
    when ``data`` is a slice of another frame and makes the calling cell safe
    to reason about. The regex patterns are now raw strings, which removes the
    invalid-escape-sequence warnings without changing what they match.

    Args:
        data: DataFrame with a 'text' column of strings.

    Returns:
        A new DataFrame with the same index and columns and a cleaned 'text' column.
    """
    cleaned = data['text'].apply(remove_stopwords).apply(_clean_text)
    return data.assign(text=cleaned)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
#Loading data
data = pd.read_csv('/content/twitter_E6oV3lV.csv')
data.columns = ['id', 'label', 'text']

#Category balancing processing
class_0 = data[data['label'] == 0]
class_1 = data[data['label'] == 1]

majority_class = 0 if len(class_0) > len(class_1) else 1
# Downsample the majority class to the size of the *minority* class. The target
# size used to be hard-coded to len(class_1), which left the data imbalanced
# whenever class 1 was the larger class.
minority_size = min(len(class_0), len(class_1))

class_majority_downsampled = resample(class_0 if majority_class == 0 else class_1,
                                      replace=False,
                                      n_samples=minority_size,
                                      random_state=42)

# Row order is always [label-0 rows, label-1 rows] so the seeded split below
# stays reproducible.
balanced_data = pd.concat([class_majority_downsampled, class_1] if majority_class == 0 else [class_0, class_majority_downsampled])
assert balanced_data['label'].value_counts().nunique() == 1, "classes are not balanced"

# train_test_split keeps the column names, so they do not need to be reassigned.
train_data, test_data = train_test_split(balanced_data, test_size=0.2, random_state=42)


In [8]:
#Preprocessing: clean both splits with the shared text pipeline
train_data = preprocess_text(train_data)
test_data = preprocess_text(test_data)

VOCAB_SIZE = 10000      # num_words passed to the Keras Tokenizer
SEQUENCE_LENGTH = 100   # maxlen passed to pad_sequences

# The vocabulary is fitted on the training split only.
# NOTE(review): the tokenizer is refitted here rather than loaded; presumably
# the pickled models used later expect the word indices produced at training
# time -- confirm this refit reproduces them.
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_data['text'])

# Text -> integer id sequences, one list per row.
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['text']) for frame in (train_data, test_data)
)

# Pad/truncate every sequence to the same length.
X_train = pad_sequences(train_sequences, maxlen=SEQUENCE_LENGTH)
X_test = pad_sequences(test_sequences, maxlen=SEQUENCE_LENGTH)

# Targets as NumPy arrays.
y_train = train_data['label'].to_numpy()
y_test = test_data['label'].to_numpy()

train_data.head()

Unnamed: 0,id,label,text
5279,5280,0,sad world orlando tuerie terrorism usa
15985,15986,1,know ref malevote amp womenvote profiling ref ...
6017,6018,0,priority tou choro koi nd option tk ni banata ...
27314,27315,0,arr look taylor slide sister
3138,3139,1,might libtard libtard sjw liberal politics


In [10]:
class NLPModel:
    """Interface shared by the model wrappers evaluated in this notebook.

    Every method here is a placeholder that does nothing and returns None;
    the concrete wrappers provide the real behaviour.
    """

    def __init__(self):
        pass

    def import_weights(self, dataset):
        """Load the trained model for ``dataset`` (no-op in the base class)."""

    def test_model(self, input):
        """Predict labels for ``input`` (no-op in the base class)."""

    def display_stats(self, y_true, y_pred):
        """Report metrics for ``y_pred`` against ``y_true`` (no-op in the base class)."""


class _PickledBinaryClassifier(NLPModel):
    """Shared implementation for wrappers around a pickled binary classifier.

    The RNN, LSTM and CNNBiLSTM wrappers previously carried three identical
    copies of this code; they now differ only in ``weights_prefix``.
    """

    # Filename prefix of the pickle ("<prefix>_<dataset>.pkl"); set by subclasses.
    weights_prefix = None

    def __init__(self):
        super().__init__()
        self.model = None        # set by import_weights
        self.predictions = None  # set by test_model

    def import_weights(self, dataset):
        """Unpickle the model stored in ``<weights_prefix>_<dataset>.pkl``.

        The path is relative to the current working directory.

        Args:
            dataset: Dataset tag that forms the second part of the filename.

        Raises:
            OSError: If the file cannot be opened (e.g. FileNotFoundError).
        """
        # NOTE(security): pickle.load can execute arbitrary code while loading.
        # Only load files you produced yourself or otherwise fully trust.
        with open(f'{self.weights_prefix}_{dataset}.pkl', 'rb') as f:
            self.model = pickle.load(f)

    def test_model(self, input):
        """Predict with the loaded model and threshold the scores at 0.5.

        The result is also stored on ``self.predictions``.

        Args:
            input: Batch forwarded unchanged to ``self.model.predict``. (The
                name shadows the builtin but is kept for keyword callers.)

        Returns:
            Flattened int32 array: 1 where the predicted score is > 0.5, else 0.

        Raises:
            RuntimeError: If no model has been loaded yet.
        """
        if self.model is None:
            raise RuntimeError(
                f"{type(self).__name__}: no model loaded; call import_weights() first"
            )
        # assumes predict() returns one score per sample -- TODO confirm
        self.predictions = (self.model.predict(input) > 0.5).astype("int32").flatten()
        return self.predictions

    def display_stats(self, y_true, y_pred, detailed=False):
        """Print the F1 score and, optionally, a fuller report.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            detailed: When True, additionally plot the confusion matrix and
                print accuracy, precision, recall and F1 on one line. Defaults
                to False, which prints only the F1 line.
        """
        f1 = f1_score(y_true, y_pred)
        print(f"f1_score: {f1:.4f}")
        if not detailed:
            return

        accuracy = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)

        cm = confusion_matrix(y_true, y_pred)
        plt.figure(figsize=(4, 3))
        sns.heatmap(cm, annot=True, fmt='g', cmap='Blues', cbar=False)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        # Heatmap cells are centred on x.5, hence the half-integer tick positions.
        plt.xticks(ticks=[0.5, 1.5], labels=['Negative', 'Positive'])
        plt.yticks(ticks=[0.5, 1.5], labels=['Negative', 'Positive'], rotation=0)
        plt.show()

        print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


class RNN(_PickledBinaryClassifier):
    """Wrapper for the model pickled as ``RNN_<dataset>.pkl``."""

    weights_prefix = 'RNN'


# NOTE(review): this class name shadows the `LSTM` layer imported from
# tensorflow.keras.layers at the top of the notebook. After this cell runs,
# `LSTM` refers to the wrapper below, not the Keras layer. The name is kept
# because existing cells instantiate `LSTM()`.
class LSTM(_PickledBinaryClassifier):
    """Wrapper for the model pickled as ``LSTM_<dataset>.pkl``."""

    weights_prefix = 'LSTM'


class CNNBiLSTM(_PickledBinaryClassifier):
    """Wrapper for the model pickled as ``CNNBiLSTM_<dataset>.pkl``."""

    weights_prefix = 'CNNBiLSTM'


In [11]:
# One wrapper per architecture; the individual names are kept for later use.
rnn_model, lstm_model, cnnbilstm_model = RNN(), LSTM(), CNNBiLSTM()
wrappers = (rnn_model, lstm_model, cnnbilstm_model)

# Stage 1: load every model trained on the "tweet" dataset.
for wrapper in wrappers:
    wrapper.import_weights("tweet")

# Stage 2: predict on the held-out split, in the same model order.
rnn_predictions, lstm_predictions, cnnbilstm_predictions = [
    wrapper.test_model(X_test) for wrapper in wrappers
]

# Stage 3: report each model under its heading.
reports = (
    ("RNN Model:", rnn_model, rnn_predictions),
    ("LSTM Model:", lstm_model, lstm_predictions),
    ("CNNBiLSTM Model:", cnnbilstm_model, cnnbilstm_predictions),
)
for heading, wrapper, predicted in reports:
    print(heading)
    wrapper.display_stats(y_test, predicted)


RNN Model:
f1_score: 0.8069
LSTM Model:
f1_score: 0.8645
CNNBiLSTM Model:
f1_score: 0.8636
