In [1]:
# %pip install nltk
# %pip install pandas
# %pip install scikit-learn
# %pip install matplotlib
# %pip install gradio
# %pip install transformers
# %pip install tensorflow
# %pip install keras
# %pip install Keras-Preprocessing
# %pip install torch
# %pip install datasets
# %pip install evaluate
# %pip install numpy
# %pip install accelerate
# %pip install emoji==0.6.0
# %pip install torch torchvision torchaudio


In [2]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gradio as gr

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation

from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import torch
import evaluate
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import BertweetTokenizer
from transformers import AlbertTokenizer, AlbertModel
from transformers import AutoModel
from transformers import AutoConfig
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from torch.utils.data import TensorDataset
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split

import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import keras
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session
from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

from typing import Iterable
from gradio.themes.base import Base
from gradio.themes.utils import colors, fonts, sizes


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to /Users/guna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/guna/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/guna/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/guna/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
input_df = pd.read_csv('dataset/train.csv')

In [4]:
input_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


## Data Cleanup

In [5]:
%%capture
def data_cleanup(train_df):
    train_df['text'] = train_df['text'].str.lower()
    train_df['text'] = train_df['text'].str.strip()
    train_df['text'] = train_df['text'].replace(to_replace ='http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='\?*', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='(RT|rt)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='@[a-z,_]*', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([0-9]*:[0-9]*)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([0-9]*\.[0-9]*)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='(utc|gmt)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='_[\S]', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='&amp;?', value = 'and', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='&lt;', value = '<', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='&gt;', value = '>', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='[ ]{2, }', value = ' ', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([^\w\d ]+)', value = '', regex = True)
    return train_df['text']

In [6]:
%%capture
input_df['text'] = data_cleanup(input_df)

In [7]:
input_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this eahquake may ...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1
7609,10870,,,the out of control wild fires in california ...,1
7610,10871,,,m 5km s of volcano hawaii,1
7611,10872,,,police investigating after an ebike collided w...,1


## Train and Test Split

In [8]:
#Training and Test Dataset split
tweet_texts = input_df['text']
class_labels = input_df['target']
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [9]:
#Concat tweets and labels series from the split into dataframe
train_cols = [pd.Series(train_tweets, name='text'), pd.Series(train_labels, name='labels')]
train_df = pd.concat(train_cols, axis = 1)
test_cols = [pd.Series(test_tweets, name='text'), pd.Series(test_labels, name='labels')]
test_df = pd.concat(test_cols,axis = 1)

## Helper Functions

In [10]:
#Text Preprocessor
def preprocessing(text):
   word_lemma = []
   tweet_tokenize = TweetTokenizer()
   tokens = tweet_tokenize.tokenize((text).lower())
   tokens = [w for w in tokens if w not in punctuation and not w.isdigit() and not len(w) < 3]
   stop_words = stopwords.words ('english')
   tweet_without_stopwords = [t for t in tokens if t not in stop_words]
   text = " ".join (tweet_without_stopwords)
   word_lemma = [WordNetLemmatizer().lemmatize(t) for t in tweet_tokenize.tokenize(text)]
   pp_text = " ".join (word_lemma)
   return pp_text

In [11]:
def get_performance_score(actual_label : list, predicted_label : list):
    '''Function to calculate the performance metric using sklearn.
    
    Parameters
    ----------
    actual_label : list
      Actual(Ground Truth) class label from the dataset.
    predicted_label : pd.DataFrame
      Class label predicted by the model
    
    Return
    ------
    f1_score : float
    accuracy : float
    precision : float
    recall : float
    AUROC : float
    '''
    precision = metrics.precision_score(actual_label, predicted_label, pos_label=1)
    recall = metrics.recall_score(actual_label, predicted_label,pos_label=1)
    AUROC = metrics.roc_auc_score(actual_label, predicted_label)
    accuracy = metrics.accuracy_score(actual_label, predicted_label)
    f1_score = metrics.f1_score(actual_label, predicted_label,pos_label=1)
    confusion_mat = metrics.confusion_matrix(actual_label, predicted_label)
    metrics_list = [accuracy, precision, recall, AUROC, f1_score]
    metrics_list = pd.DataFrame(metrics_list).T
    metrics_df = metrics_list.rename(columns={0:'Accuracy',1:'Precision',2:'Recall',3:'AUROC', 4:'F1'})
    return metrics_df, confusion_mat

In [12]:
def plot_confusion_matrix(confusion_mat, model_name):
    _, cm_ax = plt.subplots(facecolor='#212936')
    cm_plot = ConfusionMatrixDisplay(confusion_matrix=confusion_mat,display_labels=['Not Disaster','Disaster'])
    title = model_name + " Confusion Matrix"
    cm_plot.plot(cmap=plt.cm.Greens, ax=cm_ax)
    plt.title(title)
    plt.show()


In [13]:
def consolidate_perf_score(models_list, perf_score_list):
  '''Function to consolidate the performance metrics of all the models(KNeighborsClassifier, RandomForestClassifier, LogisticRegression, MLPClassifier) 
  and return a pd.DataFrame.

    Parameters
    ----------
    models_list : list
      List of models.
    perf_score_list : list
      List of performance metrics data frame from various models.
      
    Return
    ------
    consolidated_metrics_df : pd.DataFrame
    '''
  
  consolidated_perf_score_df = pd.concat(perf_score_list)
  consolidated_perf_score_df = consolidated_perf_score_df.rename(columns={0:'Accuracy',1:'Precision',2:'Recall',3:'AUROC', 4:'F1'})
  consolidated_perf_score_df.insert(0,'Model',models_list)
  return consolidated_perf_score_df

In [14]:
def plot_bar_plots(perf_results_df, themes = ['light', 'dark']):
    model_dict = {'NB':'Naive Bayes', 'LR':'Logistic Regression','SVC':'SVM','KNC':'K-Nearest Neighbor',
                'CNN':'CNN','RNN':'RNN',
                'BERTweet':'BERTweet','RoBERTa':'RoBERTa'}
    for theme in themes:
        for shortName, longName in model_dict.items():
            if theme == 'light':
                plt.rcParams['text.color'] = 'black'
                plt.rcParams['axes.labelcolor'] = 'black'
                plt.rcParams['xtick.color'] = 'black'
                plt.rcParams['ytick.color'] = 'black'
                plt.rcParams['axes.edgecolor'] = 'black'
                _, bar_ax = plt.subplots(facecolor='#FFFFFF')
                bar_plot = perf_results_df[shortName].plot(figsize=(8,4), title=f"{longName} Performance Metrics", kind='bar', ax = bar_ax)
                bar_plot.set_facecolor('#FFFFFF')
            elif theme == 'dark':
                plt.rcParams['text.color'] = 'white'
                plt.rcParams['axes.labelcolor'] = 'white'
                plt.rcParams['xtick.color'] = 'white'
                plt.rcParams['ytick.color'] = 'white'
                plt.rcParams['axes.edgecolor'] = '#ffffff'
                _, bar_ax = plt.subplots(facecolor='#212936')
                bar_plot = perf_results_df[shortName].plot(figsize=(8,4), title=f"{longName} Performance Metrics", kind='bar', ax=bar_ax)
                bar_plot.set_facecolor('#212936')
            bar_labels = bar_plot.bar(perf_results_df.index, perf_results_df[shortName], color = 'g', width=0.5)
            bar_plot.bar_label(bar_labels, label_type='edge')
            bar_plot.set_ylim([0, 1])
            plt.savefig(f'results/images/performance/{longName}_{theme}.png', bbox_inches='tight')
            plt.close()

In [15]:
def plot_confusion_matrix(confusion_mat, theme, model):
    if theme == 'dark':
        plt.rcParams['text.color'] = 'white'
        plt.rcParams['axes.labelcolor'] = 'white'
        plt.rcParams['xtick.color'] = 'white'
        plt.rcParams['ytick.color'] = 'white'
        cm_fig, cm_ax = plt.subplots(facecolor='#212936', figsize=(6,4))
    elif theme == 'light':
        plt.rcParams['text.color'] = 'black'
        plt.rcParams['axes.labelcolor'] = 'black'
        plt.rcParams['xtick.color'] = 'black'
        plt.rcParams['ytick.color'] = 'black'
        cm_fig, cm_ax = plt.subplots(facecolor='#FFFFFF', figsize=(6,4))
    cm_plot = ConfusionMatrixDisplay(confusion_matrix=confusion_mat,display_labels=['Not Disaster','Disaster'])
    title = model + " Confusion Matrix"
    cm_plot.plot(cmap=plt.cm.Greens, ax=cm_ax)
    plt.title(title)
    cm_fig.savefig(f'results/images/confusion/{model}_{theme}.png', bbox_inches='tight')
    plt.close()

In [16]:
def plot_bar_all(perf_results_df, theme):
    if theme == 'light':
        plt.rcParams['text.color'] = 'black'
        plt.rcParams['axes.labelcolor'] = 'black'
        plt.rcParams['xtick.color'] = 'black'
        plt.rcParams['ytick.color'] = 'black'
        plt.rcParams['axes.edgecolor'] = 'black'
        plt.rcParams['legend.facecolor'] = '#ffffff'
        _, bar_ax = plt.subplots(facecolor='#FFFFFF')
        bar_plot = perf_results_df.plot(figsize=(20,4), title=f"Performance Metrics", kind='bar', ax = bar_ax)
        bar_plot.set_facecolor('#FFFFFF')
    elif theme == 'dark':
        plt.rcParams['text.color'] = 'white'
        plt.rcParams['axes.labelcolor'] = 'white'
        plt.rcParams['xtick.color'] = 'white'
        plt.rcParams['ytick.color'] = 'white'
        plt.rcParams['axes.edgecolor'] = '#ffffff'
        plt.rcParams['legend.facecolor'] = '#212936'
        _, bar_ax = plt.subplots(facecolor='#212936')
        bar_plot = perf_results_df.plot(figsize=(20,4), title=f"Performance Metrics", kind='bar', ax=bar_ax, cmap=plt.cm.YlGn)
        bar_plot.set_facecolor('#212936')
    bar_ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    bar_plot.set_ylim([0, 1])
    plt.savefig(f'results/images/performance/all_{theme}.png', bbox_inches='tight')
    plt.close()

## Traditional Classifiers

In [17]:
vectorizer = CountVectorizer(preprocessor=preprocessing,ngram_range = (1,2))
vectors_train = vectorizer.fit_transform(train_df['text'])
vectors_test = vectorizer.transform(test_df['text'])
train_labels = train_df['labels']
test_labels = test_df['labels']

In [18]:
def cv_grid_search(model, param_grid, cv, scoring, train_tweet, train_label):
  '''Function to perform grid search.

    Parameters
    ----------
    model : Object
    norm : str
    param_grid : list
    cv : int
    scoring : str
    
    Return
    ------
    grid_search : Object
    '''
  grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=True, error_score = True)
  grid_search.fit(train_tweet, train_label)
  return grid_search

In [19]:
def initialize_trad_models():
    '''Function to initialize the traditional models from sklearn.

    Return
    ------
    nb : Object
    knc : Object
    lr : Object
    svc : Object
    '''
    nb = MultinomialNB()
    knc = KNeighborsClassifier()
    lr = LogisticRegression()
    svc = SVC()
    return nb, knc,lr, svc

In [20]:
nb, knc, lr, svc = initialize_trad_models()

In [21]:
%%capture
param_grid = {
      'alpha': (0.01,0.2,0.4,1.0),
      'fit_prior': (True,False)}
nb_gs = cv_grid_search(nb, param_grid, 10, 'f1', vectors_train, train_labels)

In [22]:
param_grid = [{'n_neighbors': [3, 5, 10, 12],
               'p': [1, 2],
               'weights':['uniform', 'distance'],
               'algorithm': ['auto', 'brute']}]
knc_gs = cv_grid_search(knc, param_grid, 10, 'f1', vectors_train, train_labels)

In [23]:
param_grid = [
    {'penalty': ['l2'],
     'tol': [1e-3, 1e-4],
     'solver':['lbfgs', 'liblinear'],
     'max_iter': [1000, 5000, 10000],
     'random_state': [42]}
  ]
lr_gs = cv_grid_search(lr, param_grid, 10, 'f1', vectors_train, train_labels)

In [24]:
param_grid = [
    {'kernel': ['poly', 'sigmoid', 'rbf'],
     'gamma' : ['scale', 'auto'],
     'random_state': [42]}
  ]
svc_gs = cv_grid_search(svc, param_grid, 10, 'f1', vectors_train, train_labels)

In [25]:
best_nb, best_knc, best_lr, best_svc = initialize_trad_models()

In [26]:
# Set the best parameter values for each model from grid search for the hyperparameters 
best_nb.set_params(**nb_gs.best_params_)
best_knc.set_params(**knc_gs.best_params_)
best_lr.set_params(**lr_gs.best_params_)
best_svc.set_params(**svc_gs.best_params_)

#Train the each model with the best parameters
best_nb.fit(vectors_train,train_labels)
best_knc.fit(vectors_train,train_labels)
best_lr.fit(vectors_train,train_labels)
best_svc.fit(vectors_train,train_labels)

#Predict the labels on test dataset using the trained models
nb_predict = best_nb.predict(vectors_test)
knc_predict = best_knc.predict(vectors_test)
lr_predict = best_lr.predict(vectors_test)
svc_predict = best_svc.predict(vectors_test)
nb_predicted_labels = np.array(nb_predict, dtype = int)
knc_predicted_labels = np.array(knc_predict, dtype = int)
lr_predictted_labels = np.array(lr_predict, dtype = int)
svc_predicted_labels = np.array(svc_predict, dtype = int)
actual_labels = np.array(test_labels, dtype = int)

#Calculate the performance metrics based on the predicted labels and actual labels in test dataset
nb_perf_scores, nb_cm = get_performance_score(nb_predicted_labels, actual_labels)
knc_perf_scores, knc_cm = get_performance_score(knc_predicted_labels, actual_labels)
lr_perf_scores, lr_cm = get_performance_score(lr_predictted_labels, actual_labels)
svc_perf_scores, svc_cm = get_performance_score(svc_predicted_labels, actual_labels)


In [27]:
confusion_mats = {
    'Naive Bayes': nb_cm,
    'K-Nearest Neighbor': knc_cm,
    'Logistic Regression': lr_cm,
    'SVM': svc_cm
}

# Preprocessing for NN

In [28]:
tweet_texts = input_df['text']
class_labels = input_df['target']
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)
train_tweets, validation_tweets, train_labels, validation_labels = train_test_split(train_tweets, train_labels, test_size = 0.2, random_state = 42, stratify=train_labels)

In [29]:
train_tweets=train_tweets.to_frame()
validation_tweets = validation_tweets.to_frame()
test_tweets = test_tweets.to_frame()

In [30]:
train_tweets['processed_text'] = train_tweets['text'].apply(preprocessing)
validation_tweets['processed_text'] = validation_tweets['text'].apply(preprocessing)
test_tweets['processed_text'] = test_tweets['text'].apply(preprocessing)

In [31]:
tokenizer = Tokenizer(num_words=18000)
tokenizer.fit_on_texts(train_tweets['processed_text'])

X_train = tokenizer.texts_to_sequences(train_tweets['processed_text'])
X_val = tokenizer.texts_to_sequences(validation_tweets['processed_text'])
X_test = tokenizer.texts_to_sequences(test_tweets['processed_text'])

vocab_size = len(tokenizer.word_index) + 1

In [32]:
maxlen = 128

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_val = pad_sequences(X_val, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [33]:
clear_session()

## CNN

In [34]:
embedding_dim = 50
inputs = keras.Input(shape=(None,), dtype="int64")
cnn_layer = layers.Embedding(20000, embedding_dim)(inputs)
cnn_layer = layers.Dropout(0.5)(cnn_layer)


cnn_layer = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(cnn_layer)
# cnn_layer = layers.Conv1D(128, 7, padding="valid", activation="softmax", strides=3)(cnn_layer)
# cnn_layer = layers.Conv1D(128, 7, padding="valid", activation="softmax", strides=3)(cnn_layer)
cnn_layer = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(cnn_layer)
cnn_layer = layers.GlobalMaxPooling1D()(cnn_layer)


cnn_layer = layers.Dense(128, activation="relu")(cnn_layer)
cnn_layer = layers.Dropout(0.5)(cnn_layer)

predictions = layers.Dense(1, activation="sigmoid", name="predictions")(cnn_layer)

cnn_model = keras.Model(inputs, predictions)

cnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
cnn_model.summary()

In [35]:
# cnn_model.fit(X_train, train_labels,
#                     epochs=2,
#                     verbose=False,
#                     validation_data=(X_val, validation_labels),
#                     batch_size=10)

In [36]:
# cnn_model.save('model/cnn/cnn_model.keras', include_optimizer=True)

In [37]:
cnn_model = keras.models.load_model('model/cnn/cnn_model.keras')
cnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [38]:
# pred_test = cnn_model.predict(X_test)
# pred_test = np.where(pred_test > 0.7, 1, 0)
# pred_test = pred_test.flatten()
# metrics_df_cnn, confusion_mat_cnn = get_performance_score(test_labels, pred_test)

## RNN

In [39]:
embedding_dim = 50

inputs = keras.Input(shape=(None,), dtype="int64")
rnn_layer = layers.Embedding(input_dim=vocab_size, 
                           output_dim=embedding_dim, 
                           input_length=maxlen)(inputs)
rnn_layer = layers.Dropout(0.5)(rnn_layer)

# model = Sequential()
rnn_layer = layers.GlobalMaxPool1D()(rnn_layer)
# model.add(layers.Flatten())
rnn_layer = layers.Dense(10, activation='relu')(rnn_layer)
rnn_layer = layers.Dense(10, activation='softmax')(rnn_layer)
rnn_layer = layers.Dense(10, activation='tanh')(rnn_layer)
rnn_layer = layers.Dense(10, activation='gelu')(rnn_layer)
rnn_layer = layers.Dense(10, activation='relu')(rnn_layer)
predictions = layers.Dense(1, activation='sigmoid')(rnn_layer)
rnn_model = keras.Model(inputs, predictions)
rnn_model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
rnn_model.summary()



In [40]:
# rnn_model.fit(X_train, train_labels,
#                     epochs=2,
#                     verbose=False,
#                     validation_data=(X_val, validation_labels),
#                     batch_size=10)

In [41]:
# rnn_model.save("model/rnn/rnn_model.keras", include_optimizer=True)

In [42]:
rnn_model = keras.models.load_model('model/rnn/rnn_model.keras')
rnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

  saveable.load_own_variables(weights_store.get(inner_path))


In [43]:
# pred_test = rnn_model.predict(X_test)
# pred_test = np.where(pred_test > 0.7, 1, 0)
# pred_test = pred_test.flatten()
# metrics_df_rnn, confusion_mat_rnn = get_performance_score(test_labels, pred_test)

In [44]:
# consolidate_perf_score(['RNN', 'CNN'],[metrics_df_rnn, metrics_df_cnn])

## BERTweet

In [45]:
for index, row in input_df.iterrows():
        text = row['text']
        pp_text = preprocessing(text)
        input_df.at[index, 'text'] = pp_text

In [46]:
#Training and Test Dataset split
tweet_texts = input_df['text']
class_labels = input_df['target']
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [47]:
##raining and Dev Dataset split
tweet_texts = train_tweets
class_labels = train_labels
train_tweets, dev_tweets, train_labels, dev_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [48]:
#Concat tweets and labels series from the split into dataframe
train_cols = [pd.Series(train_tweets, name='text'), pd.Series(train_labels, name='labels')]
train_df = pd.concat(train_cols, axis = 1)
dev_cols = [pd.Series(dev_tweets, name='text'), pd.Series(dev_labels, name='labels')]
dev_df = pd.concat(dev_cols, axis = 1)
test_cols = [pd.Series(test_tweets, name='text'), pd.Series(test_labels, name='labels')]
test_df = pd.concat(test_cols,axis = 1)

In [49]:
#Define constants for BERTweet model
model_name = "vinai/bertweet-base"
# model_name = "model/bertweet/v1"
max_length = 32
trucate = True
padding='max_length'
batch_size = 32
id2text = {0: "not_disaster", 1: "disaster"}
text2id = {"not_disaster": 0, "disaster": 1}

In [50]:
#Intialize tokenizer, data_collector and classifier for BERTweet
tokenizer = BertweetTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2text, label2id=text2id)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
def preprocessor(input):
     token_ids_dict = tokenizer.encode_plus(input['text'], add_special_tokens = True, padding=padding, max_length=max_length, truncation=trucate,return_attention_mask = True)
     token_ids_dict['label'] = input['labels']
     return token_ids_dict

In [52]:
#Convert the input text into token_ids, attention_mask and token_type_ids dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)
train_map = train_dataset.map(preprocessor)
dev_map = eval_dataset.map(preprocessor)
test_map = test_dataset.map(preprocessor)

Map: 100%|██████████| 4872/4872 [00:00<00:00, 7842.38 examples/s]
Map: 100%|██████████| 1218/1218 [00:00<00:00, 6073.91 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 6200.26 examples/s]


In [53]:
def calculate_score(labels):
    f1 = evaluate.load("accuracy")
    predicted, actual = labels
    predicted = np.argmax(predicted, axis=1)
    return f1.compute(predictions=predicted, references=actual)

In [56]:
#Intialize Trainer and Training Arguments for finetuning BERTweet
training_args = TrainingArguments(
    output_dir="trainer_cache",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = 'accuracy',
    greater_is_better=True,
    num_train_epochs=8,
    learning_rate = 1e-5,
    adam_epsilon = 1e-5,
    weight_decay = 1e-5,
    adafactor = False,

)

bt_trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_map,
    eval_dataset=dev_map,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_score,
)

In [57]:
#Finetune BERTweet
bt_trainer.train()

                                                  
 12%|█▎        | 609/4872 [05:18<10:46,  6.59it/s]

{'loss': 0.3811, 'grad_norm': 9.013161659240723, 'learning_rate': 8.973727422003284e-06, 'epoch': 0.82}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                               

 12%|█▎        | 609/4872 [05:40<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.5155385732650757, 'eval_accuracy': 0.825944170771757, 'eval_runtime': 4.7974, 'eval_samples_per_second': 253.886, 'eval_steps_per_second': 31.892, 'epoch': 1.0}


                                                  
 12%|█▎        | 609/4872 [06:42<10:46,  6.59it/s] 

{'loss': 0.3995, 'grad_norm': 46.756500244140625, 'learning_rate': 7.947454844006569e-06, 'epoch': 1.64}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                                

 12%|█▎        | 609/4872 [07:20<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.4666399657726288, 'eval_accuracy': 0.8292282430213465, 'eval_runtime': 4.6475, 'eval_samples_per_second': 262.077, 'eval_steps_per_second': 32.921, 'epoch': 2.0}


                                                  
 12%|█▎        | 609/4872 [08:05<10:46,  6.59it/s] 

{'loss': 0.3633, 'grad_norm': 4.07218074798584, 'learning_rate': 6.9211822660098524e-06, 'epoch': 2.46}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                                

 12%|█▎        | 609/4872 [08:59<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.5602150559425354, 'eval_accuracy': 0.8333333333333334, 'eval_runtime': 4.6927, 'eval_samples_per_second': 259.553, 'eval_steps_per_second': 32.604, 'epoch': 3.0}


                                                  
 12%|█▎        | 609/4872 [09:29<10:46,  6.59it/s] 

{'loss': 0.3362, 'grad_norm': 56.42657470703125, 'learning_rate': 5.894909688013136e-06, 'epoch': 3.28}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                                

 12%|█▎        | 609/4872 [10:40<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.7704036831855774, 'eval_accuracy': 0.8045977011494253, 'eval_runtime': 4.6564, 'eval_samples_per_second': 261.577, 'eval_steps_per_second': 32.858, 'epoch': 4.0}


                                                  
 12%|█▎        | 609/4872 [10:53<10:46,  6.59it/s] 

{'loss': 0.3002, 'grad_norm': 0.20008423924446106, 'learning_rate': 4.868637110016421e-06, 'epoch': 4.11}


                                                  
 12%|█▎        | 609/4872 [12:09<10:46,  6.59it/s] 

{'loss': 0.2908, 'grad_norm': 0.23964877426624298, 'learning_rate': 3.842364532019705e-06, 'epoch': 4.93}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                                

 12%|█▎        | 609/4872 [12:21<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.83719801902771, 'eval_accuracy': 0.8037766830870279, 'eval_runtime': 4.6289, 'eval_samples_per_second': 263.131, 'eval_steps_per_second': 33.053, 'epoch': 5.0}


                                                  
 12%|█▎        | 609/4872 [13:33<10:46,  6.59it/s] 

{'loss': 0.2338, 'grad_norm': 18.457305908203125, 'learning_rate': 2.8160919540229887e-06, 'epoch': 5.75}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                                

 12%|█▎        | 609/4872 [14:01<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.8814917802810669, 'eval_accuracy': 0.7980295566502463, 'eval_runtime': 4.8396, 'eval_samples_per_second': 251.673, 'eval_steps_per_second': 31.614, 'epoch': 6.0}


                                                  
 12%|█▎        | 609/4872 [14:57<10:46,  6.59it/s] 

{'loss': 0.2261, 'grad_norm': 0.3064159154891968, 'learning_rate': 1.7898193760262728e-06, 'epoch': 6.57}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                                

 12%|█▎        | 609/4872 [15:41<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.9191906452178955, 'eval_accuracy': 0.7980295566502463, 'eval_runtime': 4.6142, 'eval_samples_per_second': 263.97, 'eval_steps_per_second': 33.159, 'epoch': 7.0}


                                                  
 12%|█▎        | 609/4872 [16:21<10:46,  6.59it/s] 

{'loss': 0.1923, 'grad_norm': 0.14674822986125946, 'learning_rate': 7.635467980295568e-07, 'epoch': 7.39}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  
[A                                                

 12%|█▎        | 609/4872 [17:22<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.8857409954071045, 'eval_accuracy': 0.8119868637110016, 'eval_runtime': 4.6332, 'eval_samples_per_second': 262.883, 'eval_steps_per_second': 33.022, 'epoch': 8.0}


                                                  
100%|██████████| 4872/4872 [13:23<00:00,  6.06it/s]

{'train_runtime': 803.9221, 'train_samples_per_second': 48.482, 'train_steps_per_second': 6.06, 'train_loss': 0.29546034394813875, 'epoch': 8.0}





TrainOutput(global_step=4872, training_loss=0.29546034394813875, metrics={'train_runtime': 803.9221, 'train_samples_per_second': 48.482, 'train_steps_per_second': 6.06, 'total_flos': 640938530856960.0, 'train_loss': 0.29546034394813875, 'epoch': 8.0})

In [58]:
actual_label = test_df['labels']
predictions_prob = bt_trainer.predict(test_map)
predictions =  predictions_prob.predictions
predictions = np.argmax(predictions,axis=1)
predicted_lables = np.array(predictions, dtype = int)
actual_labels = np.array(actual_label, dtype = int)
bt_metrics_df, bt_confusion_mat = get_performance_score(actual_labels, predicted_lables)

100%|██████████| 191/191 [00:06<00:00, 31.22it/s]


In [59]:
confusion_mats['BERTweet'] = bt_confusion_mat

In [60]:
bt_trainer.save_model(output_dir = 'model/bertweet/v2/')

## RoBERTa

In [61]:
model_name = "FacebookAI/xlm-roberta-base"
# model_name = 'model/roberta/v1'
max_length = 32
trucate = True
padding='max_length'
id2text = {0: "not_disaster", 1: "disaster"}
text2id = {"not_disaster": 0, "disaster": 1}

In [62]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2text, label2id=text2id)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [63]:
def preprocessor(input):
     token_ids_dict = tokenizer.encode_plus(input['text'], add_special_tokens = True, padding=padding, max_length=max_length, truncation=trucate,return_attention_mask = True)
     token_ids_dict['label'] = input['labels']
     return token_ids_dict

In [64]:
#Convert the input text into token_ids, attention_mask and token_type_ids dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)
train_map = train_dataset.map(preprocessor)
dev_map = eval_dataset.map(preprocessor)
test_map = test_dataset.map(preprocessor)



Map: 100%|██████████| 4872/4872 [00:00<00:00, 7575.79 examples/s]
Map: 100%|██████████| 1218/1218 [00:00<00:00, 9968.74 examples/s] 
Map: 100%|██████████| 1523/1523 [00:00<00:00, 9820.85 examples/s]


In [65]:
training_args = TrainingArguments(
    output_dir="trainer_cache",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = 'accuracy',
    greater_is_better=True,
    num_train_epochs=8,
    learning_rate = 1e-5,
    adam_epsilon = 1e-5,
    weight_decay = 1e-5,
    adafactor = False,
    use_mps_device=False

)

rb_trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_map,
    eval_dataset=dev_map,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_score,
)

In [66]:
rb_trainer.train()

                                                  
 12%|█▎        | 609/4872 [19:37<10:46,  6.59it/s]

{'loss': 0.5741, 'grad_norm': 9.048018455505371, 'learning_rate': 8.973727422003284e-06, 'epoch': 0.82}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [20:07<10:46,  6.59it/s]
[A
[A

{'eval_loss': 0.5399500727653503, 'eval_accuracy': 0.7430213464696224, 'eval_runtime': 4.7979, 'eval_samples_per_second': 253.864, 'eval_steps_per_second': 31.889, 'epoch': 1.0}


                                                  
 12%|█▎        | 609/4872 [21:42<10:46,  6.59it/s] 

{'loss': 0.5034, 'grad_norm': 29.211271286010742, 'learning_rate': 7.947454844006569e-06, 'epoch': 1.64}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [22:38<10:46,  6.59it/s] 
[A
[A

{'eval_loss': 0.573973536491394, 'eval_accuracy': 0.777504105090312, 'eval_runtime': 4.6747, 'eval_samples_per_second': 260.549, 'eval_steps_per_second': 32.729, 'epoch': 2.0}


                                                  
 12%|█▎        | 609/4872 [23:48<10:46,  6.59it/s] 

{'loss': 0.437, 'grad_norm': 11.050125122070312, 'learning_rate': 6.9211822660098524e-06, 'epoch': 2.46}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [25:08<10:46,  6.59it/s] 
[A
[A

{'eval_loss': 0.46417587995529175, 'eval_accuracy': 0.8284072249589491, 'eval_runtime': 4.9044, 'eval_samples_per_second': 248.348, 'eval_steps_per_second': 31.196, 'epoch': 3.0}


                                                  
 12%|█▎        | 609/4872 [25:53<10:46,  6.59it/s] 

{'loss': 0.4239, 'grad_norm': 90.51426696777344, 'learning_rate': 5.894909688013136e-06, 'epoch': 3.28}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [27:39<10:46,  6.59it/s] 
[A
[A

{'eval_loss': 0.6456968188285828, 'eval_accuracy': 0.7832512315270936, 'eval_runtime': 4.7505, 'eval_samples_per_second': 256.394, 'eval_steps_per_second': 32.207, 'epoch': 4.0}


                                                  
 12%|█▎        | 609/4872 [27:59<10:46,  6.59it/s] 

{'loss': 0.4107, 'grad_norm': 5.348498821258545, 'learning_rate': 4.868637110016421e-06, 'epoch': 4.11}


                                                  
 12%|█▎        | 609/4872 [29:55<10:46,  6.59it/s] 

{'loss': 0.4021, 'grad_norm': 17.72875213623047, 'learning_rate': 3.842364532019705e-06, 'epoch': 4.93}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [30:10<10:46,  6.59it/s] 
[A
[A

{'eval_loss': 0.5385130643844604, 'eval_accuracy': 0.819376026272578, 'eval_runtime': 4.5661, 'eval_samples_per_second': 266.751, 'eval_steps_per_second': 33.508, 'epoch': 5.0}


                                                  
 12%|█▎        | 609/4872 [32:01<10:46,  6.59it/s] 

{'loss': 0.3724, 'grad_norm': 78.7192611694336, 'learning_rate': 2.8160919540229887e-06, 'epoch': 5.75}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [32:42<10:46,  6.59it/s] 
[A
[A

{'eval_loss': 0.7128706574440002, 'eval_accuracy': 0.8111658456486043, 'eval_runtime': 4.7692, 'eval_samples_per_second': 255.388, 'eval_steps_per_second': 32.081, 'epoch': 6.0}


                                                  
 12%|█▎        | 609/4872 [34:08<10:46,  6.59it/s] 

{'loss': 0.3671, 'grad_norm': 69.55010986328125, 'learning_rate': 1.7898193760262728e-06, 'epoch': 6.57}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [35:14<10:46,  6.59it/s] 
[A
[A

{'eval_loss': 0.6669760346412659, 'eval_accuracy': 0.8185550082101807, 'eval_runtime': 4.7752, 'eval_samples_per_second': 255.065, 'eval_steps_per_second': 32.04, 'epoch': 7.0}


                                                  
 12%|█▎        | 609/4872 [36:14<10:46,  6.59it/s] 

{'loss': 0.3341, 'grad_norm': 72.45380401611328, 'learning_rate': 7.635467980295568e-07, 'epoch': 7.39}



[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
                                                  

[A[A                                           
 12%|█▎        | 609/4872 [37:45<10:46,  6.59it/s] 
[A
[A

{'eval_loss': 0.679907500743866, 'eval_accuracy': 0.8226600985221675, 'eval_runtime': 4.7157, 'eval_samples_per_second': 258.289, 'eval_steps_per_second': 32.445, 'epoch': 8.0}


                                                  
100%|██████████| 4872/4872 [20:14<00:00,  4.01it/s]

{'train_runtime': 1214.8889, 'train_samples_per_second': 32.082, 'train_steps_per_second': 4.01, 'train_loss': 0.4203346734759451, 'epoch': 8.0}





TrainOutput(global_step=4872, training_loss=0.4203346734759451, metrics={'train_runtime': 1214.8889, 'train_samples_per_second': 32.082, 'train_steps_per_second': 4.01, 'total_flos': 640938530856960.0, 'train_loss': 0.4203346734759451, 'epoch': 8.0})

In [67]:
actual_label = test_df['labels']
predictions_prob = rb_trainer.predict(test_map)
predictions =  predictions_prob.predictions
predictions = np.argmax(predictions,axis=1)
predicted_lables = np.array(predictions, dtype = int)
actual_labels = np.array(actual_label, dtype = int)
rb_metrics_df, rb_confusion_mat = get_performance_score(actual_labels, predicted_lables)

100%|██████████| 191/191 [00:05<00:00, 33.92it/s]


In [68]:
rb_trainer.save_model(output_dir = 'model/roberta/v2/')

In [69]:
confusion_mats['RoBERTa'] = rb_confusion_mat

In [70]:
# consolidate_perf_score(['NB', 'KNN', 'LR', 'SVM', 'CNN', 'RNN', 'BERTweet', 'RoBERTa'], [nb_perf_scores, knc_perf_scores, lr_perf_scores, svc_perf_scores, metrics_df_cnn, metrics_df_rnn, bt_metrics_df, rb_metrics_df])
consolidate_perf_score(['NB', 'KNN', 'LR', 'SVM', 'BERTweet', 'RoBERTa'], [nb_perf_scores, knc_perf_scores, lr_perf_scores, svc_perf_scores, bt_metrics_df, rb_metrics_df])


Unnamed: 0,Model,F1,Accuracy,Precision,Recall,AUROC
0,NB,0.7664,0.808273,0.732416,0.803691,0.807455
0,KNN,0.621535,0.533815,0.891437,0.477087,0.620603
0,LR,0.773289,0.819435,0.717125,0.838998,0.823545
0,SVM,0.761578,0.817466,0.678899,0.867188,0.829736
0,BERTweet,0.804416,0.837163,0.830619,0.779817,0.830069
0,RoBERTa,0.796923,0.826658,0.801858,0.792049,0.822377


In [71]:
def plot_confusion_mats(confusion_mats, themes = ['light', 'dark']):
    for theme in themes:
        for name, confusion_mat in confusion_mats.items():
            plot_confusion_matrix(confusion_mat, theme, name)

## GUI

In [72]:
#Intialize Trainer and Training Arguments for finetuning BERTweet
training_args = TrainingArguments(
    output_dir="trainer_cache",
    overwrite_output_dir=True,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = 'f1',
    greater_is_better=True,
    num_train_epochs=1,
    learning_rate = 1e-5,
    adam_epsilon = 1e-5,
    weight_decay = 1e-5,
    adafactor = False,

)

bt_trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_map,
    eval_dataset=dev_map,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_score,
)

In [73]:
def prod_preprocessor(input):
     token_ids_dict = tokenizer.encode_plus(input['text'], add_special_tokens = True, padding=padding, max_length=max_length, truncation=trucate,return_attention_mask = True)
     return token_ids_dict

In [74]:
# perf_results_df = pd.read_csv('results/csv/performance_results.csv')
# perf_results_df = perf_results_df.T
# perf_results_df.columns=['NB', 'KNC', 'LR', 'SVC', 'BERTweet', 'RoBERTa', 'RNN', 'CNN']
# perf_results_df=perf_results_df[1:]
# perf_results_df = perf_results_df.astype('float32')
# perf_results_df = perf_results_df.round(2)
# perf_results_df

In [75]:
# plot_confusion_mats(confusion_mats, themes=['dark'])
# plot_bar_plots(perf_results_df, themes=['dark'])
# plot_bar_all(perf_results_df, themes=['dark'])

In [76]:
def prod_file_predict(path):
    prod_input_df = pd.read_csv(path)
    prod_input_display = pd.DataFrame()
    prod_input_display['Tweet Text'] = prod_input_df['text']
    prod_input_df['text'] = data_cleanup(prod_input_df)
    prod_input_df['text'] = prod_input_df['text'].apply(preprocessing)
    prod_test_dataset = Dataset.from_pandas(prod_input_df)
    prod_test_dataset_map = prod_test_dataset.map(prod_preprocessor)
    prod_predictions_prob = bt_trainer.predict(prod_test_dataset_map)
    prod_predictions =  prod_predictions_prob.predictions
    prod_predictions = np.argmax(prod_predictions,axis=1)
    prod_predicted_lables = np.array(prod_predictions, dtype = int)
    prod_output_df = pd.DataFrame(data=prod_input_df['text'], columns=['text'])
    prod_output_df['label'] = pd.Series(prod_predicted_lables)
    output_map = {
        0: 'Not Disaster',
        1: 'Disaster'
    }
    prod_output_df['label'] = prod_output_df['label'].map(output_map)
    prod_input_display['Class Label'] = prod_output_df['label']
    return prod_input_display
    

In [77]:
def prod_text_predict(text):
    text_dict = {}
    text_dict['text'] = text
    prod_input_df = pd.DataFrame(text_dict.values(), columns=['text'])
    prod_input_df['text'] = data_cleanup(prod_input_df)
    prod_input_df['text'] = prod_input_df['text'].apply(preprocessing)
    prod_test_dataset = Dataset.from_pandas(prod_input_df)
    prod_test_dataset_map = prod_test_dataset.map(prod_preprocessor)
    prod_predictions_prob = bt_trainer.predict(prod_test_dataset_map)
    prod_predictions =  prod_predictions_prob.predictions
    prod_predictions = np.argmax(prod_predictions,axis=1)
    prod_predicted_lables = np.array(prod_predictions, dtype = int)
    prod_output_df = pd.DataFrame(data=prod_input_df['text'], columns=['text'])
    prod_output_df['label'] = pd.Series(prod_predicted_lables)
    output_map = {
        0: 'Not Disaster',
        1: 'Disaster'
    }
    prod_output_df['label'] = prod_output_df['label'].map(output_map)
    return prod_output_df

In [78]:
def get_file_name(metrics_input,model_input):
    theme = 'dark'
    metric_dict = {'Performance Metrics':'./results/images/performance/', 
               'Confusion Matrix':'./results/images/confusion/'}
    model_dict = {'NB':'Naive Bayes', 'LR':'Logistic Regression','SVM':'SVM','KNN':'K-Nearest Neighbor',
              'CNN':'CNN','RNN':'RNN',
              'BERTweet':'BERTweet','RoBERTa':'RoBERTa'}
    base_dir = metric_dict[metrics_input]
    filename = model_dict[model_input]
    file_path = base_dir+filename+'_'+theme+'.png'
    return file_path



In [79]:
class customtheme(Base):
    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.green,
        secondary_hue: colors.Color | str = colors.green,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_sm,
        radius_size: sizes.Size | str = sizes.radius_lg,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Quicksand"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        super().set(
            block_title_text_weight="300",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_shadow="*shadow_drop_lg",
            button_large_padding="12px",
        )

In [80]:
def text_predict(input):
    output = prod_text_predict(input)
    if output.iloc[0]['label'] == "Disaster":
        output_statement = "This text is related to disaster"
    if output.iloc[0]['label'] == "Not Disaster":
        output_statement = "This text is not related to disaster"
    return output_statement

def file_predict(input, progress=gr.Progress(track_tqdm=True)):
    output = prod_file_predict(input)
    return output

def api_monitor(input):
    output = "This feature is yet to be implemented"
    return output

def model_performance(metrics_input,model_input):
    file_path = get_file_name(metrics_input,model_input)
    output_plot = gr.Image(file_path, height = 600, width = 600)
    return output_plot

def clear(text_input, text_output):
    text_input = None
    text_output = None
    return text_input, text_output

customtheme_obj = customtheme()
with gr.Blocks(theme=customtheme_obj) as gui_demo:
    gr.Markdown("Automated Classification of Disaster-Related Tweets")
    with gr.Tab("Text"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label= "Input Tweet")
                with gr.Row():
                    text_button = gr.Button("Predict", variant='primary')
                    text_clear_button = gr.ClearButton([text_input])
            with gr.Column():
                text_output = gr.Textbox(label = "Prediction")
        
    with gr.Tab("File"):
        # with gr.Row():
        file_input = gr.File(label= "Input File (as .csv)")
        default_display_df = pd.DataFrame(index = range(1), columns=['Tweet Text', 'Class Label'])
        file_output = gr.DataFrame(value = default_display_df, label = "Prediction")
        file_button = gr.Button("Predict", variant='primary')
    with gr.Tab("API"):
        with gr.Row():
            with gr.Column():
                api_input = gr.Textbox(label= "API Endpoint")
                with gr.Row():
                    api_button = gr.Button("Monitor", variant='primary')
                    api_clear_button = gr.ClearButton([api_input])
            with gr.Column():
                api_output = gr.Textbox(label = "Disaster Related Tweets")
    with gr.Tab("Model Performance"):
        with gr.Row():
            with gr.Column():
                metric_choices = ['Performance Metrics', 'Confusion Matrix']
                metrics_input = gr.Radio(choices = metric_choices, 
                                        value = 'Performance Metrics', 
                                        type = 'value',
                                        show_label = True, 
                                        interactive = True,
                                        label= "Select metric")
                model_choices = ['NB', 'LR', 'SVM', 'KNN', 'CNN', 'RNN', 'BERTweet', 'RoBERTa', 'All' ]
                model_input = gr.Radio(choices = model_choices, 
                                        value = 'NB', 
                                        type = 'value',
                                        show_label = True, 
                                        interactive = True,
                                        label= "Select model")
                model_perf_button = gr.Button("View Results", variant='primary')
            with gr.Column():
                model_output = gr.Image()
       

    text_button.click(text_predict, inputs=text_input, outputs=text_output)
    file_button.click(file_predict, inputs=file_input, outputs=file_output)
    api_button.click(api_monitor, inputs=api_input, outputs=api_output)
    model_perf_button.click(model_performance, inputs=[metrics_input,model_input] , outputs=model_output)

gui_demo.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [82]:
gui_demo.close()  

Closing server running on port: 7860
