In [1]:
# %pip install nltk
# %pip install pandas
# %pip install scikit-learn
# %pip install matplotlib
# %pip install gradio


In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from string import punctuation
from nltk import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\viswa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\viswa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\viswa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\viswa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
input_df = pd.read_csv('dataset/train.csv')

In [4]:
input_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1


In [5]:
# %%capture
def data_cleanup(train_df):
    train_df['text'] = train_df['text'].str.lower()
    train_df['text'] = train_df['text'].str.strip()
    train_df['text'] = train_df['text'].replace(to_replace ='http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='\?*', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='(RT|rt)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='@[a-z,_]*', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([0-9]*:[0-9]*)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([0-9]*\.[0-9]*)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='(utc|gmt)', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='_[\S]', value = '', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='&amp;?', value = 'and', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='&lt;', value = '<', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='&gt;', value = '>', regex = False)
    train_df['text'] = train_df['text'].replace(to_replace ='[ ]{2, }', value = ' ', regex = True)
    train_df['text'] = train_df['text'].replace(to_replace ='([^\w\d ]+)', value = '', regex = True)
    return train_df['text']

In [6]:
# %%capture
# train_df = pd.read_csv('dataset/train.csv')
input_df['text'] = data_cleanup(input_df)

In [7]:
input_df

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this eahquake may ...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,13000 people receive wildfires evacuation orde...,1
4,7,,,just got sent this photo from ruby alaska as s...,1
...,...,...,...,...,...
7608,10869,,,two giant cranes holding a bridge collapse int...,1
7609,10870,,,the out of control wild fires in california ...,1
7610,10871,,,m 5km s of volcano hawaii,1
7611,10872,,,police investigating after an ebike collided w...,1


In [8]:
#Training and Test Dataset split
tweet_texts = input_df['text']
class_labels = input_df['target']
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [9]:
#Concat tweets and labels series from the split into dataframe
train_cols = [pd.Series(train_tweets, name='text'), pd.Series(train_labels, name='labels')]
train_df = pd.concat(train_cols, axis = 1)
test_cols = [pd.Series(test_tweets, name='text'), pd.Series(test_labels, name='labels')]
test_df = pd.concat(test_cols,axis = 1)

In [10]:
#Text Preprocessor
def preprocessing(text):
   word_lemma = []
   tweet_tokenize = TweetTokenizer()
   tokens = tweet_tokenize.tokenize((text).lower())
   tokens = [w for w in tokens if w not in punctuation and not w.isdigit() and not len(w) < 3]
   stop_words = stopwords.words ('english')
   tweet_without_stopwords = [t for t in tokens if t not in stop_words]
   text = " ".join (tweet_without_stopwords)
   word_lemma = [WordNetLemmatizer().lemmatize(t) for t in tweet_tokenize.tokenize(text)]
   pp_text = " ".join (word_lemma)
   return pp_text

In [11]:
def get_performance_score(actual_label : list, predicted_label : list):
    '''Function to calculate the performance metric using sklearn.
    
    Parameters
    ----------
    actual_label : list
      Actual(Ground Truth) class label from the dataset.
    predicted_label : pd.DataFrame
      Class label predicted by the model
    
    Return
    ------
    f1_score : float
    accuracy : float
    precision : float
    recall : float
    AUROC : float
    '''
    precision = metrics.precision_score(actual_label, predicted_label, pos_label=1)
    recall = metrics.recall_score(actual_label, predicted_label,pos_label=1)
    AUROC = metrics.roc_auc_score(actual_label, predicted_label)
    accuracy = metrics.accuracy_score(actual_label, predicted_label)
    f1_score = metrics.f1_score(actual_label, predicted_label,pos_label=1)
    confusion_mat = metrics.confusion_matrix(actual_label, predicted_label)
    metrics_list = [f1_score, accuracy, precision, recall, AUROC]
    metrics_list = pd.DataFrame(metrics_list).T
    metrics_df = metrics_list.rename(columns={0:'F1',1:'Accuracy',2:'Precision',3:'Recall',4:'AUROC'})
    return metrics_df, confusion_mat

In [12]:
def plot_confusion_matrix(confusion_mat, model_name):
    _, cm_ax = plt.subplots(facecolor='#212936')
    cm_plot = ConfusionMatrixDisplay(confusion_matrix=confusion_mat,display_labels=['Not Disaster','Disaster'])
    title = model_name + " Confusion Matrix"
    cm_plot.plot(cmap=plt.cm.Greens, ax=cm_ax)
    plt.title(title)
    plt.show()


In [13]:
def plot_confusion_matrix(confusion_mat, theme, model):
    if theme == 'dark':
        plt.rcParams['text.color'] = 'white'
        plt.rcParams['axes.labelcolor'] = 'white'
        plt.rcParams['xtick.color'] = 'white'
        plt.rcParams['ytick.color'] = 'white'
        cm_fig, cm_ax = plt.subplots(facecolor='#212936', figsize=(6,4))
    elif theme == 'light':
        plt.rcParams['text.color'] = 'black'
        plt.rcParams['axes.labelcolor'] = 'black'
        plt.rcParams['xtick.color'] = 'black'
        plt.rcParams['ytick.color'] = 'black'
        cm_fig, cm_ax = plt.subplots(facecolor='#FFFFFF', figsize=(6,4))
    cm_plot = ConfusionMatrixDisplay(confusion_matrix=confusion_mat,display_labels=['Not Disaster','Disaster'])
    title = model + " Confusion Matrix"
    cm_plot.plot(cmap=plt.cm.Greens, ax=cm_ax)
    plt.title(title)
    cm_fig.savefig(f'results/images/confusion/{model}_{theme}.png', bbox_inches='tight')
    # plt.show()
    plt.close()

## Variable definitions
 - train_tweets - Preprocessed tweets for training
 - test_tweets - Preprocessed tweets for testing
 - train_labels - class label for training tweets
 - test_labels - class label for test tweets

## Baseline
1. Implement traditional model(MultinomialNB, LogisticRegression, SVC, KNeighborsClassifier) from sklearn
2. Train and test the default model without tuning hyperparameter values
3. Use grid search(GridSearchCV) from sklearn to identify best values for hyperparameters
4. Train the model with best hypermeter values and test it on test set(test_tweets)

## Traditional Classifiers

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [15]:
vectorizer = CountVectorizer(preprocessor=preprocessing,ngram_range = (1,2))
vectors_train = vectorizer.fit_transform(train_df['text'])
vectors_test = vectorizer.transform(test_df['text'])
train_labels = train_df['labels']
test_labels = test_df['labels']

In [16]:
def cv_grid_search(model, param_grid, cv, scoring, train_tweet, train_label):
  '''Function to perform grid search.

    Parameters
    ----------
    model : Object
    norm : str
    param_grid : list
    cv : int
    scoring : str
    
    Return
    ------
    grid_search : Object
    '''
  grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1, return_train_score=True, error_score = True)
  grid_search.fit(train_tweet, train_label)
  return grid_search

In [17]:
def consolidate_perf_score(models_list, perf_score_list):
  '''Function to consolidate the performance metrics of all the models(KNeighborsClassifier, RandomForestClassifier, LogisticRegression, MLPClassifier) 
  and return a pd.DataFrame.

    Parameters
    ----------
    models_list : list
      List of models.
    perf_score_list : list
      List of performance metrics data frame from various models.
      
    Return
    ------
    consolidated_metrics_df : pd.DataFrame
    '''
  
  consolidated_perf_score_df = pd.concat(perf_score_list)
  consolidated_perf_score_df = consolidated_perf_score_df.rename(columns={0:'F1',1:'Accuracy',2:'Precision',3:'Recall'})
  consolidated_perf_score_df.insert(0,'Model',models_list)
  return consolidated_perf_score_df

In [18]:
def initialize_trad_models():
    '''Function to initialize the traditional models from sklearn.

    Return
    ------
    nb : Object
    knc : Object
    lr : Object
    svc : Object
    '''
    nb = MultinomialNB()
    knc = KNeighborsClassifier()
    lr = LogisticRegression()
    svc = SVC()
    return nb, knc,lr, svc

In [19]:
nb, knc, lr, svc = initialize_trad_models()

In [20]:
%%capture
param_grid = {
      'alpha': (0.01,0.2,0.4,1.0),
      'fit_prior': (True,False)}
nb_gs = cv_grid_search(nb, param_grid, 10, 'f1', vectors_train, train_labels)

In [21]:
param_grid = [{'n_neighbors': [3, 5, 10, 12],
               'p': [1, 2],
               'weights':['uniform', 'distance'],
               'algorithm': ['auto', 'brute']}]
knc_gs = cv_grid_search(knc, param_grid, 10, 'f1', vectors_train, train_labels)

In [22]:
param_grid = [
    {'penalty': ['l2'],
     'tol': [1e-3, 1e-4],
     'solver':['lbfgs', 'liblinear'],
     'max_iter': [1000, 5000, 10000],
     'random_state': [42]}
  ]
lr_gs = cv_grid_search(lr, param_grid, 10, 'f1', vectors_train, train_labels)

In [23]:
param_grid = [
    {'kernel': ['poly', 'sigmoid', 'rbf'],
     'gamma' : ['scale', 'auto'],
     'random_state': [42]}
  ]
svc_gs = cv_grid_search(svc, param_grid, 10, 'f1', vectors_train, train_labels)

In [24]:
best_nb, best_knc, best_lr, best_svc = initialize_trad_models()

In [25]:
# Set the best parameter values for each model from grid search for the hyperparameters 
best_nb.set_params(**nb_gs.best_params_)
best_knc.set_params(**knc_gs.best_params_)
best_lr.set_params(**lr_gs.best_params_)
best_svc.set_params(**svc_gs.best_params_)

#Train the each model with the best parameters
best_nb.fit(vectors_train,train_labels)
best_knc.fit(vectors_train,train_labels)
best_lr.fit(vectors_train,train_labels)
best_svc.fit(vectors_train,train_labels)

#Predict the labels on test dataset using the trained models
nb_predict = best_nb.predict(vectors_test)
knc_predict = best_knc.predict(vectors_test)
lr_predict = best_lr.predict(vectors_test)
svc_predict = best_svc.predict(vectors_test)
nb_predicted_labels = np.array(nb_predict, dtype = int)
knc_predicted_labels = np.array(knc_predict, dtype = int)
lr_predictted_labels = np.array(lr_predict, dtype = int)
svc_predicted_labels = np.array(svc_predict, dtype = int)
actual_labels = np.array(test_labels, dtype = int)

#Calculate the performance metrics based on the predicted labels and actual labels in test dataset
nb_perf_scores, nb_cm = get_performance_score(nb_predicted_labels, actual_labels)
knc_perf_scores, knc_cm = get_performance_score(knc_predicted_labels, actual_labels)
lr_perf_scores, lr_cm = get_performance_score(lr_predictted_labels, actual_labels)
svc_perf_scores, svc_cm = get_performance_score(svc_predicted_labels, actual_labels)


In [26]:
confusion_mats = {
    'Naive Bayes': nb_cm,
    'K-Nearest Neighbor': knc_cm,
    'Logistic Regression': lr_cm,
    'SVM': svc_cm
}

In [27]:
# consolidate_perf_score(['nb', 'knc', 'lr', 'svc'], [nb_perf_scores, knc_perf_scores, lr_perf_scores, svc_perf_scores])

## BERTweet

In [28]:
# !pip install transformers
# !pip install torch
# !pip install datasets
# !pip install evaluate
# !pip install numpy
# !pip install accelerate
# !pip install emoji==0.6.0
# !pip install torch torchvision torchaudio

In [29]:
from transformers import BertForSequenceClassification, AutoModelForSequenceClassification
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import BertweetTokenizer
from transformers import AlbertTokenizer, AlbertModel
from transformers import AutoModel
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import TrainingArguments, Trainer
import torch
from torch.utils.data import TensorDataset
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from transformers import AutoConfig
from sklearn import metrics

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
for index, row in input_df.iterrows():
        text = row['text']
        pp_text = preprocessing(text)
        input_df.at[index, 'text'] = pp_text

In [31]:
#Training and Test Dataset split
tweet_texts = input_df['text']
class_labels = input_df['target']
train_tweets, test_tweets, train_labels, test_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [32]:
##raining and Dev Dataset split
tweet_texts = train_tweets
class_labels = train_labels
train_tweets, dev_tweets, train_labels, dev_labels = train_test_split(tweet_texts,class_labels,test_size=0.2, random_state=42, stratify=class_labels)

In [33]:
#Concat tweets and labels series from the split into dataframe
train_cols = [pd.Series(train_tweets, name='text'), pd.Series(train_labels, name='labels')]
train_df = pd.concat(train_cols, axis = 1)
dev_cols = [pd.Series(dev_tweets, name='text'), pd.Series(dev_labels, name='labels')]
dev_df = pd.concat(dev_cols, axis = 1)
test_cols = [pd.Series(test_tweets, name='text'), pd.Series(test_labels, name='labels')]
test_df = pd.concat(test_cols,axis = 1)

In [34]:
#Define constants for BERTweet model
model_name = "model/bertweet/v1"
max_length = 32
trucate = True
padding='max_length'
batch_size = 32
# mps_device = torch.device("mps")
# cuda_device = torch.device("cuda")
id2text = {0: "not_disaster", 1: "disaster"}
text2id = {"not_disaster": 0, "disaster": 1}

In [35]:
#Intialize tokenizer, data_collector and classifier for BERTweet
tokenizer = BertweetTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2text, label2id=text2id)
# classifier = classifier.to(cuda_device)

In [36]:
def preprocessor(input):
     token_ids_dict = tokenizer.encode_plus(input['text'], add_special_tokens = True, padding=padding, max_length=max_length, truncation=trucate,return_attention_mask = True)
     token_ids_dict['label'] = input['labels']
     return token_ids_dict

In [37]:
#Convert the input text into token_ids, attention_mask and token_type_ids dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)
train_map = train_dataset.map(preprocessor)
dev_map = eval_dataset.map(preprocessor)
test_map = test_dataset.map(preprocessor)

Map: 100%|██████████| 4872/4872 [00:01<00:00, 3712.26 examples/s]
Map: 100%|██████████| 1218/1218 [00:00<00:00, 3333.59 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 3459.16 examples/s]


In [38]:
def calculate_f1(labels):
    f1 = evaluate.load("f1")
    predicted, actual = labels
    predicted = np.argmax(predicted, axis=1)
    return f1.compute(predictions=predicted, references=actual)

In [39]:
#Intialize Trainer and Training Arguments for finetuning BERTweet
training_args = TrainingArguments(
    output_dir="trainer_cache",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = 'f1',
    greater_is_better=True,
    num_train_epochs=8,
    learning_rate = 1e-5,
    adam_epsilon = 1e-5,
    weight_decay = 1e-5,
    adafactor = False,
    # use_mps_device=False

)

bt_trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_map,
    eval_dataset=dev_map,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_f1,
)

In [40]:
#Finetune BERTweet
# bt_trainer.train()

In [41]:
actual_label = test_df['labels']
predictions_prob = bt_trainer.predict(test_map)
predictions =  predictions_prob.predictions
predictions = np.argmax(predictions,axis=1)
predicted_lables = np.array(predictions, dtype = int)
actual_labels = np.array(actual_label, dtype = int)
bt_metrics_df, bt_confusion_mat = get_performance_score(actual_labels, predicted_lables)

100%|██████████| 191/191 [00:06<00:00, 29.98it/s]


In [42]:
bt_metrics_df, bt_confusion_mat

(         F1  Accuracy  Precision    Recall     AUROC
 0  0.808411  0.838477    0.82381  0.793578  0.832922,
 array([[758, 111],
        [135, 519]], dtype=int64))

In [43]:
confusion_mats['BERTweet'] = bt_confusion_mat

In [44]:
# plot_confusion_matrix(bt_confusion_mat, 'light', 'BERTweet')

In [45]:
# bt_trainer.save_model(output_dir = 'model/bertweet/v1/')

## RoBERTa

In [46]:
# model_name = "FacebookAI/xlm-roberta-base"
model_name = 'model/roberta/v1'
max_length = 32
trucate = True
padding='max_length'
id2text = {0: "not_disaster", 1: "disaster"}
text2id = {"not_disaster": 0, "disaster": 1}
# classifier = classifier.to(mps_device)

In [47]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2text, label2id=text2id)

In [48]:
def preprocessor(input):
     token_ids_dict = tokenizer.encode_plus(input['text'], add_special_tokens = True, padding=padding, max_length=max_length, truncation=trucate,return_attention_mask = True)
     token_ids_dict['label'] = input['labels']
     return token_ids_dict

In [49]:
#Convert the input text into token_ids, attention_mask and token_type_ids dataset
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(dev_df)
test_dataset = Dataset.from_pandas(test_df)
train_map = train_dataset.map(preprocessor)
dev_map = eval_dataset.map(preprocessor)
test_map = test_dataset.map(preprocessor)

Map: 100%|██████████| 4872/4872 [00:01<00:00, 4843.01 examples/s]
Map: 100%|██████████| 1218/1218 [00:00<00:00, 5021.17 examples/s]
Map: 100%|██████████| 1523/1523 [00:00<00:00, 5059.08 examples/s]


In [50]:
training_args = TrainingArguments(
    output_dir="trainer_cache",
    overwrite_output_dir=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model = 'f1',
    greater_is_better=True,
    num_train_epochs=8,
    learning_rate = 1e-5,
    adam_epsilon = 1e-5,
    weight_decay = 1e-5,
    adafactor = False,
    use_mps_device=False

)

rb_trainer = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_map,
    eval_dataset=dev_map,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=calculate_f1,
)

In [51]:
# rb_trainer.train()

In [52]:
actual_label = test_df['labels']
predictions_prob = rb_trainer.predict(test_map)
predictions =  predictions_prob.predictions
predictions = np.argmax(predictions,axis=1)
predicted_lables = np.array(predictions, dtype = int)
actual_labels = np.array(actual_label, dtype = int)
rb_metrics_df, rb_confusion_mat = get_performance_score(actual_labels, predicted_lables)

100%|██████████| 191/191 [00:06<00:00, 30.38it/s]


In [53]:
# rb_trainer.save_model(output_dir = 'model/roberta/v1/')

In [54]:
rb_metrics_df, rb_confusion_mat

(         F1  Accuracy  Precision    Recall     AUROC
 0  0.789713  0.817466   0.781437  0.798165  0.815078,
 array([[723, 146],
        [132, 522]], dtype=int64))

In [55]:
confusion_mats['RoBERTa'] = rb_confusion_mat

In [56]:
# plot_confusion_matrix(rb_confusion_mat, 'dark', 'RoBERTa')

In [57]:
consolidate_perf_score(['NB', 'KNN', 'LR', 'SVM', 'BERTweet', 'RoBERTa'], [nb_perf_scores, knc_perf_scores, lr_perf_scores, svc_perf_scores, bt_metrics_df, rb_metrics_df])

Unnamed: 0,Model,F1,Accuracy,Precision,Recall,AUROC
0,NB,0.7664,0.808273,0.732416,0.803691,0.807455
0,KNN,0.621535,0.533815,0.891437,0.477087,0.620603
0,LR,0.773289,0.819435,0.717125,0.838998,0.823545
0,SVM,0.761578,0.817466,0.678899,0.867188,0.829736
0,BERTweet,0.808411,0.838477,0.82381,0.793578,0.832922
0,RoBERTa,0.789713,0.817466,0.781437,0.798165,0.815078


In [58]:
# plot_confusion_matrix(lr_cm, 'light', 'Logistic Regression')

In [59]:
# Plot confusion matrices for all the models except CNN and RNN
def plot_confusion_mats(confusion_mats):
    themes = ['light', 'dark']
    for theme in themes:
        for name, confusion_mat in confusion_mats.items():
            plot_confusion_matrix(confusion_mat, theme, name)

In [60]:
plot_confusion_mats(confusion_mats)

## GUI

# File input

In [61]:
def prod_preprocessor(input):
     token_ids_dict = tokenizer.encode_plus(input['text'], add_special_tokens = True, padding=padding, max_length=max_length, truncation=trucate,return_attention_mask = True)
     return token_ids_dict

In [97]:
perf_results_df = pd.read_csv('results/csv/performance_results.csv')
perf_results_df = perf_results_df.T
perf_results_df.columns=['NB', 'KNC', 'LR', 'SVC', 'BERTweet', 'RoBERTa', 'RNN', 'CNN']
perf_results_df=perf_results_df[1:]
perf_results_df = perf_results_df.astype('float32')
perf_results_df = perf_results_df.round(2)
# perf_results_df.insert(0, 'Metrics', perf_results_df.index)
# perf_results_df.index = list(range(0, len(perf_results_df)))
perf_results_df

Unnamed: 0,NB,KNC,LR,SVC,BERTweet,RoBERTa,RNN,CNN
Accuracy,0.81,0.53,0.82,0.82,0.84,0.82,0.81,0.8
Precision,0.73,0.89,0.72,0.68,0.82,0.78,0.84,0.85
Recall,0.8,0.48,0.84,0.87,0.79,0.8,0.69,0.66
F1,0.77,0.62,0.77,0.76,0.81,0.79,0.76,0.74
AUROC,0.81,0.62,0.82,0.83,0.83,0.82,0.8,0.79


In [121]:
def plot_bar_plots(perf_results_df):
    model_dict = {'NB':'Naive Bayes', 'LR':'Logistic Regression','SVC':'SVM','KNC':'K-Nearest Neighbor',
                'CNN':'CNN','RNN':'RNN',
                'BERTweet':'BERTweet','RoBERTa':'RoBERTa'}
    themes = ['light', 'dark']
    for theme in themes:
        for shortName, longName in model_dict.items():
            if theme == 'light':
                plt.rcParams['text.color'] = 'black'
                plt.rcParams['axes.labelcolor'] = 'black'
                plt.rcParams['xtick.color'] = 'black'
                plt.rcParams['ytick.color'] = 'black'
                plt.rcParams['axes.edgecolor'] = 'black'
                _, bar_ax = plt.subplots(facecolor='#FFFFFF')
                bar_plot = perf_results_df[shortName].plot(figsize=(8,4), title=f"{longName} Performance Metrics", kind='bar', ax = bar_ax)
                bar_plot.set_facecolor('#FFFFFF')
            elif theme == 'dark':
                plt.rcParams['text.color'] = 'white'
                plt.rcParams['axes.labelcolor'] = 'white'
                plt.rcParams['xtick.color'] = 'white'
                plt.rcParams['ytick.color'] = 'white'
                plt.rcParams['axes.edgecolor'] = '#ffffff'
                _, bar_ax = plt.subplots(facecolor='#212936')
                bar_plot = perf_results_df[shortName].plot(figsize=(8,4), title=f"{longName} Performance Metrics", kind='bar', ax=bar_ax)
                bar_plot.set_facecolor('#212936')
                bar_plot.spines
            bar_labels = bar_plot.bar(perf_results_df.index, perf_results_df[shortName], color = 'g', width=0.5)
            bar_plot.bar_label(bar_labels, label_type='edge')
            bar_plot.set_ylim([0, 1])
            plt.savefig(f'results/images/performance/{longName}_{theme}.png', bbox_inches='tight')
            plt.close()

In [122]:
plot_bar_plots(perf_results_df)

In [64]:
def prod_file_predict(path):
    prod_input_df = pd.read_csv(path)
    prod_input_df['text'] = data_cleanup(prod_input_df)
    prod_input_df['text'] = prod_input_df['text'].apply(preprocessing)
    prod_test_dataset = Dataset.from_pandas(prod_input_df)
    prod_test_dataset_map = prod_test_dataset.map(prod_preprocessor)
    prod_predictions_prob = bt_trainer.predict(prod_test_dataset_map)
    prod_predictions =  prod_predictions_prob.predictions
    prod_predictions = np.argmax(prod_predictions,axis=1)
    prod_predicted_lables = np.array(prod_predictions, dtype = int)
    prod_output_df = pd.DataFrame(data=prod_input_df['text'], columns=['text'])
    prod_output_df['label'] = pd.Series(prod_predicted_lables)
    output_map = {
        0: 'Not Disaster',
        1: 'Disaster'
    }
    prod_output_df['label'] = prod_output_df['label'].map(output_map)
    return prod_output_df
    

In [65]:
def prod_text_predict(text):
    text_dict = {}
    text_dict['text'] = text
    prod_input_df = pd.DataFrame(text_dict.values(), columns=['text'])
    prod_input_df['text'] = data_cleanup(prod_input_df)
    prod_input_df['text'] = prod_input_df['text'].apply(preprocessing)
    prod_test_dataset = Dataset.from_pandas(prod_input_df)
    prod_test_dataset_map = prod_test_dataset.map(prod_preprocessor)
    prod_predictions_prob = bt_trainer.predict(prod_test_dataset_map)
    prod_predictions =  prod_predictions_prob.predictions
    prod_predictions = np.argmax(prod_predictions,axis=1)
    prod_predicted_lables = np.array(prod_predictions, dtype = int)
    prod_output_df = pd.DataFrame(data=prod_input_df['text'], columns=['text'])
    prod_output_df['label'] = pd.Series(prod_predicted_lables)
    output_map = {
        0: 'Not Disaster',
        1: 'Disaster'
    }
    prod_output_df['label'] = prod_output_df['label'].map(output_map)
    return prod_output_df

In [66]:
def get_file_name(metrics_input,model_input):
    theme = 'dark'
    metric_dict = {'Performance Metrics':'./results/images/bar/', 
               'Confusion Matrix':'./results/images/confusion/'}
    model_dict = {'NB':'Naive Bayes', 'LR':'Logistic Regression','SVM':'SVM','KNN':'K-Nearest Neighbor',
              'CNN':'CNN','RNN':'RNN',
              'BERTweet':'BERTweet','RoBERTa':'RoBERTa'}
    base_dir = metric_dict[metrics_input]
    filename = model_dict[model_input]
    file_path = base_dir+filename+'_'+theme+'.png'
    return file_path



In [None]:
import gradio as gr

def text_predict(input):
    output = prod_text_predict(input)
    return output.iloc[0]['label']

def file_predict(input):
    output = prod_file_predict(input)
    return output

def model_performance(metrics_input,model_input):
    file_path = get_file_name(metrics_input,model_input)
    output_plot = gr.Image(file_path, height = 600, width = 600)
    return output_plot

with gr.Blocks() as gui_demo:
    gr.Markdown("Automated Classification of Disaster-Related Tweets")
    with gr.Tab("Text"):
        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label= "Input Tweet")
                text_button = gr.Button("Predict")
            with gr.Column():
                text_output = gr.Textbox(label = "Prediction")
        
    with gr.Tab("File"):
        # with gr.Row():
        file_input = gr.File(label= "Input File (as .csv)")
        file_output = gr.DataFrame(label = "Prediction")
        file_button = gr.Button("Predict")
    with gr.Tab("API"):
        with gr.Row():
            api_input = gr.Textbox(label= "API Endpoint")
        api_button = gr.Button("Predict")
    with gr.Tab("Model Performance"):
        with gr.Row():
            with gr.Column():
                metric_choices = ['Performance Metrics', 'Confusion Matrix']
                metrics_input = gr.Radio(choices = metric_choices, 
                                        value = 'Performance Metrics', 
                                        type = 'value',
                                        show_label = True, 
                                        interactive = True,
                                        label= "Select metric")
                model_choices = ['NB', 'LR', 'SVM', 'KNN', 'CNN', 'RNN', 'BERTweet', 'RoBERTa', 'All' ]
                model_input = gr.Radio(choices = model_choices, 
                                        value = 'NB', 
                                        type = 'value',
                                        show_label = True, 
                                        interactive = True,
                                        label= "Select model")
                model_perf_button = gr.Button("View Results")
            with gr.Column():
                model_output = gr.Image()
        

    text_button.click(text_predict, inputs=text_input, outputs=text_output)
    file_button.click(file_predict, inputs=file_input, outputs=file_output)
    model_perf_button.click(model_performance, inputs=[metrics_input,model_input] , outputs=model_output)

# gui_demo.launch()


Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




Map: 100%|██████████| 1/1 [00:00<00:00, 105.17 examples/s]
100%|██████████| 1/1 [00:00<00:00, 914.79it/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 331.41 examples/s]
100%|██████████| 1/1 [00:00<00:00, 998.64it/s]
Map: 100%|██████████| 3263/3263 [00:00<00:00, 4242.71 examples/s]
Traceback (most recent call last):
  File "c:\Users\viswa\miniconda3\envs\mlproject\Lib\site-packages\gradio\queueing.py", line 527, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\viswa\miniconda3\envs\mlproject\Lib\site-packages\gradio\route_utils.py", line 261, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\viswa\miniconda3\envs\mlproject\Lib\site-packages\gradio\blocks.py", line 1788, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\viswa\miniconda3\envs\mlproject\Lib\site-pa

In [72]:
gui_demo.close()  

Closing server running on port: 7860
