In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from wordcloud import STOPWORDS

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
from scr.models.embeddings import read_glove_embeddings, create_embeddings_matrix, build_embeddings
from scr.models.models import build_NN, build_DistilBert, train_model
from scr.models.metrics import performance_history

In [None]:

def cm_plot(y_true, y_pred):
    """Draw two confusion matrices side by side for one set of predictions.

    The left panel shows raw counts; the right panel shows the same matrix
    normalised over the true labels (``normalize="true"``), formatted with two
    decimals and with its colour scale pinned to the [0, 1] range.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, forwarded to
        ``ConfusionMatrixDisplay.from_predictions``.
    y_pred : array-like
        Predicted labels, forwarded to
        ``ConfusionMatrixDisplay.from_predictions``.

    Returns
    -------
    None
        Nothing is returned and nothing is written to disk; saving or showing
        the figure is left to the caller.
    """
    figure, (counts_ax, proportions_ax) = plt.subplots(
        1, 2, sharey=True, figsize=(5, 3)
    )

    # Both panels are drawn without a colour bar.
    shared_kwargs = {"colorbar": False}

    ConfusionMatrixDisplay.from_predictions(
        y_true, y_pred, ax=counts_ax, **shared_kwargs
    )
    ConfusionMatrixDisplay.from_predictions(
        y_true,
        y_pred,
        ax=proportions_ax,
        normalize="true",
        values_format=".2f",
        im_kw={"vmin": 0, "vmax": 1},
        **shared_kwargs,
    )

    counts_ax.set_title("Counts")
    proportions_ax.set_title("Proportions")
    # The y axis is shared, so the label on the left panel is enough.
    proportions_ax.set_ylabel(None)
    figure.suptitle("Confusion Matrix")

In [None]:
# Load the raw and the cleaned training sets; only the cleaned one feeds the models.
raw = pd.read_csv("./data/raw/train.csv")

clean = pd.read_csv("./data/clean/train.csv")
# Only `text` and `target` are read below, so rows are dropped on those two
# columns alone instead of on a NaN in any other column of the file.
clean = clean.dropna(subset=["text", "target"])

X = clean["text"]
y = clean["target"].values
# To train on the uncleaned tweets instead, take `text` / `target` from `raw`.

# Sequence length used when texts are padded.
MAX_LEN = 32

# NOTE(review): the vocabulary is fitted on every text, i.e. before the
# train/validation split below — confirm that this is intended.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Hold out 20% of the rows for validation; the fixed seed keeps the split stable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Padded inputs are not built here: the training cell hands the text series
# straight to `train_model` / `model.predict`. They were previously built with
# `preprocessing(X_train, tokenizer, MAX_LEN)` and
# `preprocessing(X_val, tokenizer, MAX_LEN)`.

In [None]:
# Fitted models, keyed by the name under which each one is registered.
trained_model = dict()

DistilBert

In [None]:
# Size of the GloVe vectors. It selects the embeddings file and is passed to
# `create_embeddings_matrix`, so it is defined once to keep the two in sync.
EMBEDDING_DIM = 100
GLOVE_PATH = f"./models/embeddings/glove.6B/glove.6B.{EMBEDDING_DIM}d.txt"

glove_embeddings_dict = read_glove_embeddings(GLOVE_PATH)

# NOTE(review): the third argument is presumably the vector dimensionality —
# confirm against `create_embeddings_matrix`.
embeddings_matrix = create_embeddings_matrix(
    tokenizer, glove_embeddings_dict, EMBEDDING_DIM
)

# NOTE(review): the second argument was the literal 32, the same value as
# MAX_LEN; presumably it is the input sequence length — confirm against
# `build_embeddings`.
embeddings_layer = build_embeddings(embeddings_matrix, MAX_LEN)

In [None]:
# Train one model per recurrent flavour, register it in `trained_model`, and
# report its training history, classification report and confusion matrix.
# The loop used to end in an unconditional `break`, so only the first flavour
# was ever trained; it now runs for every entry so that the model comparison
# further down has all of them available.
for flavour in ["LSTM", "GRU", "RNN"]:
    print(f"====== {flavour} ======")

    # NOTE(review): the same `embeddings_layer` object is handed to every
    # flavour — confirm that sharing it between models is intended.
    model = build_NN(embeddings_layer, tokenizer, flavour=flavour)
    model, history = train_model(model, X_train, y_train, X_val, y_val, epoch=100)
    trained_model[flavour] = model

    performance_history(history, name=flavour, directory_path="./figures/")
    plt.show()

    # Turn the per-class scores into hard labels by taking the best class.
    y_pred = np.argmax(model.predict(X_val), axis=1)
    print(classification_report(y_val, y_pred))

    # `cm_plot` leaves its figure current, so `savefig` writes that figure.
    cm_plot(y_val, y_pred)
    plt.savefig(f"./figures/{flavour}.confusion_matrix.png")
    plt.show()

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt


# Compare every trained model on the validation split:
# ROC curves on the left, the matching AUC values as bars on the right.

model_auc = []

fig, ax = plt.subplots(1, 2, figsize=(6, 3), width_ratios=(3, 1))

for name, model in trained_model.items():
    # Predict on `X_val`, exactly as the training cell does for these models.
    # The earlier `X_val_pad` input is never defined in this notebook (its
    # assignment is commented out), so using it raised a NameError.
    y_pred_proba = model.predict(X_val)

    # Column 1 is used as the positive-class score.
    positive_scores = y_pred_proba[:, 1]

    fpr, tpr, _ = roc_curve(y_val, positive_scores)
    # Score the AUC on the same continuous scores as the curve. Feeding it the
    # arg-maxed hard labels would measure a single operating point and would
    # not match the area under the plotted curve.
    auc = roc_auc_score(y_val, positive_scores)
    model_auc.append(auc)
    ax[0].plot(fpr, tpr, label=f"{name}, AUC = {auc:.2f}")


ax[0].legend(loc='lower right')
ax[0].set_title("Receiver Operating Characteristic (ROC) Curve", size=10)
ax[0].set_xlabel("FPR")
ax[0].set_ylabel("TPR")

# Plot ROC AUC scores (reversed so the first model is drawn at the top).
ax[1].barh(list(trained_model.keys())[::-1], model_auc[::-1])
ax[1].set_xlim(0.5, 1)
ax[1].set_ylabel("Model")
ax[1].set_xlabel("AUC")
ax[1].set_title("ROC Area Under\nCurve (AUC)", size=10)

plt.tight_layout()
plt.savefig("./figures/Model.competition.png")
plt.show()


In [16]:
import pandas as pd

# Read the test split from ./data/clean and keep only the non-missing texts.
df = pd.read_csv("./data/clean/test.csv")
X = df.loc[:, "text"].dropna()

In [9]:
from scr.models.preprocessing import preprocessing

In [3]:
# `pickle` is not imported anywhere else in the notebook, so import it here;
# without it this cell fails with a NameError on a fresh kernel.
import pickle

# NOTE(security): unpickling can execute arbitrary code contained in the file.
# Only load a tokenizer file that comes from a trusted source.
preprocessing_path = "./models/tokenizer.pkl"
with open(preprocessing_path, "rb") as file:
    tokenizer = pickle.load(file)

In [17]:
# Preprocess the whole test series in one call and keep the result. The
# previous loop wrapped every string in its own one-element Series and threw
# each output away. 32 is the same sequence length as MAX_LEN in the training
# part of the notebook.
# NOTE(review): assumes `preprocessing` accepts a full Series, as in the
# commented-out `preprocessing(X_train, tokenizer, MAX_LEN)` call — confirm.
X_test_pad = preprocessing(X, tokenizer, 32)

In [20]:
# Show the test texts that remain after dropping missing values.
X

0                             happened terrible car crash
1       heard earthquake different cities stay safe ev...
2       forest fire spot pond geese fleeing across str...
3                   apocalypse lighting spokane wildfires
4                     typhoon soudelor kills china taiwan
                              ...                        
3258    earthquake safety los angeles safety fasteners...
3259    storm ri worse last hurricane others hardest h...
3260                        green line derailment chicago
3261             meg issues hazardous weather outlook hwo
3262    city calgary activated municipal emergency pla...
Name: text, Length: 3259, dtype: object