In [1]:
import pickle
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [2]:
# Read the labelled dataset, then build a row-shuffled copy of it.
# The fixed seed keeps the shuffled order identical from run to run.
train_df = pd.read_csv(filepath_or_buffer='./data/final_dataset.csv')
train_df_shuffled = train_df.sample(
    frac=1,  # sample every row, so only the order changes
    random_state=42,
)
train_df_shuffled.head()

Unnamed: 0,content,label,Unnamed: 2,Unnamed: 3
10650,fucking beautiful shoes.,0,,
2041,fuck you change your name. Do you know who I ...,1,,
8668,Per my other tweets I've (hopefully temporari...,0,,
1114,"""If you not on twitter you're a fucking moron""...",1,,
13902,"Yep he left with supposed ""mercury poisoning""...",0,,


In [3]:
# Class balance: how many rows carry each value of the "label" column.
train_df["label"].value_counts()

0    12179
1     7822
Name: label, dtype: int64

In [4]:
# Split the shuffled rows into training and validation arrays.
# test_size=0.1 holds out 10% for validation; the seed fixes the split.
all_sentences = train_df_shuffled["content"].to_numpy()
all_labels = train_df_shuffled["label"].to_numpy()

train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    all_sentences,
    all_labels,
    test_size=0.1,
    random_state=42,
)

In [5]:
# Shared text-preprocessing constants; the embedding layer defined later in
# this notebook reads the same two values.
max_vocab_length = 10000  # passed to the vectorizer as max_tokens
max_length = 15           # passed to the vectorizer as output_sequence_length

# Maps raw strings to fixed-length integer sequences
# (see the sample call a few cells below for the resulting shape and dtype).
text_vectorizer = TextVectorization(
    max_tokens=max_vocab_length,
    output_sequence_length=max_length,
    output_mode="int",
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    ngrams=None,
)

In [6]:
# Adapt the vectorizer using the training sentences only; the validation
# sentences are not passed to adapt() here.
text_vectorizer.adapt(train_sentences)

In [7]:
# Sanity check: push one hand-written sentence through the adapted vectorizer.
sample_sentence = "There's a flood in my street!"
sample_batch = [sample_sentence]  # the layer is called on a one-element list
text_vectorizer(sample_batch)

<tf.Tensor: shape=(1, 15), dtype=int64, numpy=
array([[ 336,    6,    1,   14,   12, 1089,    0,    0,    0,    0,    0,
           0,    0,    0,    0]], dtype=int64)>

In [8]:
def calculate_results(y_true, y_pred):
    """Summarise classification quality for a set of predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, passed alongside ``y_true`` to the
            scikit-learn metric functions.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``. Accuracy is ``accuracy_score`` multiplied by 100; the other
        three are taken as returned by ``precision_recall_fscore_support``
        with ``average="weighted"`` and are not rescaled.
    """
    accuracy_pct = accuracy_score(y_true, y_pred) * 100
    # The fourth returned value (support) is not part of the summary.
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average="weighted"
    )
    return dict(
        accuracy=accuracy_pct,
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [9]:
# Embedding sized from the vectorizer constants: one 128-dimensional vector
# per token id.
embedding = layers.Embedding(
    input_dim=max_vocab_length,
    output_dim=128,
    embeddings_initializer="uniform",
    input_length=max_length,
    name="embedding",
)

# Functional-API graph:
# raw string -> token ids -> embeddings -> bidirectional LSTM -> sigmoid unit.
inputs = layers.Input(shape=(1,), dtype="string")
x = embedding(text_vectorizer(inputs))

bidirectional_lstm = layers.Bidirectional(layers.LSTM(64))
x = bidirectional_lstm(x)

output_layer = layers.Dense(1, activation="sigmoid")
outputs = output_layer(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs, name="model_4_Bidirectional")

In [10]:
# Binary cross-entropy matches the single sigmoid output unit of the model;
# the optimizer is Adam constructed with no arguments, and accuracy is tracked.
adam_optimizer = tf.keras.optimizers.Adam()
model.compile(
    optimizer=adam_optimizer,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [11]:
# Train for 15 epochs on the training split, evaluating on the held-out
# validation split; the returned history object is kept in model_history.
model_history = model.fit(
    x=train_sentences,
    y=train_labels,
    validation_data=(val_sentences, val_labels),
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [19]:
# Spot-check the trained model on a single hand-written input.
text = "fuck"

# predict() receives a one-element list, so the result holds a single row with
# one sigmoid output value for this text.
model_pred_probs = model.predict([text])
model_pred_probs



array([[0.94859457]], dtype=float32)

In [20]:
# Persist the trained model to the "Cyber_Disaster" path (the recorded output
# of this cell shows assets being written under Cyber_Disaster\assets).
model.save("Cyber_Disaster")



INFO:tensorflow:Assets written to: Cyber_Disaster\assets


INFO:tensorflow:Assets written to: Cyber_Disaster\assets


In [21]:
# Round-trip check: reload the saved model from disk and run the prediction
# for `text` again.
saved_model_path = "Cyber_Disaster"
loaded_model = tf.keras.models.load_model(saved_model_path)
loaded_model.predict([text])



array([[0.94859457]], dtype=float32)