In [1]:
!pip install simpletransformers==0.51
!pip install tqdm==4.41.1
!pip install transformers -U
!pip install unidecode datefinder dateparser selenium 

Collecting simpletransformers==0.51
[?25l  Downloading https://files.pythonhosted.org/packages/b8/dc/f140c68aee992ad594375e30b86c385dd6da46e14f108cbb6247ece79e7a/simpletransformers-0.51.0-py3-none-any.whl (224kB)
[K     |████████████████████████████████| 225kB 15.0MB/s 
[?25hCollecting tensorboardx
[?25l  Downloading https://files.pythonhosted.org/packages/af/0c/4f41bcd45db376e6fe5c619c01100e9b7531c55791b7244815bac6eac32c/tensorboardX-2.1-py2.py3-none-any.whl (308kB)
[K     |████████████████████████████████| 317kB 46.7MB/s 
Collecting wandb
[?25l  Downloading https://files.pythonhosted.org/packages/b3/fe/9698a355bd53757f00d535df18deb951c089188a9a9baaadc73ddfcbe043/wandb-0.10.15-py2.py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 53.6MB/s 
Collecting transformers>=4.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/88/b1/41130a228dd656a1a31ba281598a968320283f48d42782845f6ba567f00b/transformers-4.2.2-py3-none-any.whl (1.8MB)
[K     |████████

Collecting tqdm==4.41.1
[?25l  Downloading https://files.pythonhosted.org/packages/72/c9/7fc20feac72e79032a7c8138fd0d395dc6d8812b5b9edf53c3afd0b31017/tqdm-4.41.1-py2.py3-none-any.whl (56kB)
[K     |█████▊                          | 10kB 23.2MB/s eta 0:00:01[K     |███████████▌                    | 20kB 30.1MB/s eta 0:00:01[K     |█████████████████▎              | 30kB 22.2MB/s eta 0:00:01[K     |███████████████████████         | 40kB 20.5MB/s eta 0:00:01[K     |████████████████████████████▉   | 51kB 21.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 7.3MB/s 
[31mERROR: transformers 4.2.2 has requirement tokenizers==0.9.4, but you'll have tokenizers 0.10.0 which is incompatible.[0m
[31mERROR: simpletransformers 0.51.0 has requirement tqdm>=4.47.0, but you'll have tqdm 4.41.1 which is incompatible.[0m
[?25hInstalling collected packages: tqdm
  Found existing installation: tqdm 4.56.0
    Uninstalling tqdm-4.56.0:
      Successfully uninstalled tqdm-4.56.

### Imports

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, ParameterGrid
import nltk
from nltk.tokenize import word_tokenize
from sklearn.utils import shuffle
from sklearn import decomposition
import pandas as pd
from copy import deepcopy
import os
import re
import numpy as np
import matplotlib.pyplot as plt
import json
import itertools
import random
import warnings
import string
warnings.filterwarnings("ignore", category=UserWarning)

In [5]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

### Constants

In [6]:
root = '/gdrive/MyDrive/Bangladesh Flood Research'
global_shuffle_seed = 4
global_debug=True
global_override=True

### Drive imports

In [7]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [8]:
functions_file = root+'/NLP_flood_Bangladesh/Colabs/functions.py'
all_papers_file = root+'/NLP_flood_Bangladesh/Colabs/all_papers.py'
!cp "$functions_file" .
!cp "$all_papers_file" .

In [9]:
from functions import load_data_tagtog, query_dataframe, load_data, get_new_predicted_data

### Get drive data

In [10]:
def make_data_ratio(df_data, test_size=None, train_size=None, shuffle_seed=4, debug=False, 
                    save_folder=None, load_folder=None, override=False, file_prefix=''):
    save_file, load_file=None, None
    if save_folder: save_file = os.path.join(save_folder,file_prefix+'data.json')
    if load_folder: load_file = os.path.join(load_folder,file_prefix+'data.json')
    
    if not override and load_file and os.path.isfile(load_file):
        if debug: print('loaded',load_file)
        js = json.load(open(load_file))
        train_df = pd.DataFrame(js['train'])
        test_df = pd.DataFrame(js['test'])
        return {'train':train_df, 'test':test_df}
    
    train_df, test_df = train_test_split(df_data, test_size=test_size, train_size=train_size, random_state=shuffle_seed, stratify=df_data['is_flood'])
    train_df = train_df[train_df['is_flood'].notna()]
    test_df = test_df[test_df['is_flood'].notna()]

    if debug: print('Data Loaded')

    if save_file:
        train_json = train_df.to_json(orient='records')
        test_json = test_df.to_json(orient='records')
        json.dump({'train':json.loads(train_json), 'test':json.loads(test_json)}, open(save_file,'w'), indent=2)
    return {'train':train_df, 'test':test_df}

In [11]:
nlp_data_path = os.path.join(root, 'data/nlp_data')
classifier_data_path = os.path.join(nlp_data_path, 'classifier')

In [12]:
data_split_folder = os.path.join(classifier_data_path, 'data_splits')
result, clf_result = {}, {}
save_data_folder = data_split_folder
load_data_folder = data_split_folder
train_size = 500
test_size = 880
debug=global_debug or False
override=False
df_data=None
data_split = make_data_ratio(df_data, train_size=train_size, test_size=test_size, save_folder=save_data_folder, load_folder=load_data_folder, 
                               debug=debug, shuffle_seed=global_shuffle_seed, override=override)
df_data = pd.concat([data_split['train'], data_split['test']])

loaded /gdrive/MyDrive/Bangladesh Flood Research/data/nlp_data/classifier/data_splits/data.json


In [13]:
print('Train:',len(data_split['train']), '\t\tTest:',len(data_split['test']))
print('Train is_flood:',len(data_split['train'].loc[data_split['train']['is_flood']==True]), \
'\tTrain not is_flood:',len(data_split['train'].loc[data_split['train']['is_flood']==False]))
print('Test is_flood:',len(data_split['test'].loc[data_split['test']['is_flood']==True]), \
'\tTest not is_flood:',len(data_split['test'].loc[data_split['test']['is_flood']==False]))

Train: 1103 		Test: 277
Train is_flood: 530 	Train not is_flood: 573
Test is_flood: 133 	Test not is_flood: 144


In [14]:
total_len = len(data_split['train'])+len(data_split['test'])

In [15]:
def remove():
  !rm -r cache_dir outputs runs
remove()

rm: cannot remove 'cache_dir': No such file or directory
rm: cannot remove 'outputs': No such file or directory
rm: cannot remove 'runs': No such file or directory


### Classifier

#### Train Size Loop

In [None]:
overall_result = []
for train_size in [10,20,50,100,200,500,1000]:
    test_size = 270
    result, clf_result = {}, {}
    debug=True
    override=True
    data_split = make_data_ratio(df_data, test_size=test_size, train_size=train_size,
                               debug=debug, shuffle_seed=global_shuffle_seed, override=override)
    print(len(data_split['train']), len(data_split['test']))
    actual = [i if i==True else 0 for i in data_split['test']['is_flood']]
    train_df = data_split['train'][['text', 'is_flood']]
    train_df.columns = ['text', 'labels']
    test_df = data_split['test'][['text', 'is_flood']]
    test_df.columns = ['text', 'labels']

    model_args = ClassificationArgs(
        num_train_epochs=10,
        max_seq_length=512,
        overwrite_output_dir=True
    )
    model = ClassificationModel(
        "bert", "bert-base-uncased", args=model_args
    )
    model.train_model(train_df)
    # print([i for i in list(test_df['text']) if len(i)<10])
    # break

    predict, raw_outputs = model.predict(list(test_df['text']))
    key = 'BERT-512'
    clf_acc = accuracy_score(actual, predict)
    pre, rec, fsc, sup = precision_recall_fscore_support(actual, predict, average='binary')
    d = { 'key':key, 'train_size':train_size, 'test_size':test_size, 'accuracy':clf_acc, 'precision':pre, 
        'recall':rec, 'f1':fsc,
    }
    overall_result.append(d)
    print(d)
    remove()

Data Loaded
10 270


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=2.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=2.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 10, 'test_size': 270, 'accuracy': 0.6148148148148148, 'precision': 0.6382978723404256, 'recall': 0.46153846153846156, 'f1': 0.5357142857142858}
Data Loaded
20 270


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=3.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=3.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 20, 'test_size': 270, 'accuracy': 0.8333333333333334, 'precision': 0.8455284552845529, 'recall': 0.8, 'f1': 0.8221343873517788}
Data Loaded
50 270


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=7.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=7.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 50, 'test_size': 270, 'accuracy': 0.9, 'precision': 0.8705035971223022, 'recall': 0.9307692307692308, 'f1': 0.899628252788104}
Data Loaded
100 270


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=13.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=13.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 100, 'test_size': 270, 'accuracy': 0.9148148148148149, 'precision': 0.8741258741258742, 'recall': 0.9615384615384616, 'f1': 0.9157509157509157}
Data Loaded
200 270


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=25.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=25.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 200, 'test_size': 270, 'accuracy': 0.9074074074074074, 'precision': 0.8888888888888888, 'recall': 0.9230769230769231, 'f1': 0.9056603773584906}
Data Loaded
500 270


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=63.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 500, 'test_size': 270, 'accuracy': 0.9222222222222223, 'precision': 0.9097744360902256, 'recall': 0.9307692307692308, 'f1': 0.920152091254753}
Data Loaded
1000 270


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=125.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=125.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=270.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 1000, 'test_size': 270, 'accuracy': 0.9259259259259259, 'precision': 0.9044117647058824, 'recall': 0.9461538461538461, 'f1': 0.9248120300751879}


ValueError: ignored

#### Cross Validation loop

In [None]:
from sklearn.model_selection import KFold 
overall_result = []
kf = KFold(n_splits=5, random_state=32, shuffle=True)
data = df_data[['text', 'is_flood']]
data.columns = ['text', 'labels']
pres, recs, f1s, accs = [], [], [], []
for train_index, test_index in kf.split(data):
  # splitting Dataframe (dataset not included)
    train_df = data.iloc[train_index]
    test_df = data.iloc[test_index]
    model_args = ClassificationArgs(
        num_train_epochs=10,
        max_seq_length=512,
        overwrite_output_dir=True
    )
    model = ClassificationModel(
        "bert", "bert-base-uncased", args=model_args
    )
    model.train_model(train_df)
    predict, raw_outputs = model.predict(list(test_df['text']))
    actual = [i if i==True else 0 for i in test_df['labels']]
    clf_acc = accuracy_score(actual, predict)
    pre, rec, fsc, sup = precision_recall_fscore_support(actual, predict, average='binary')
    accs.append(clf_acc)
    pres.append(pre)
    recs.append(rec)
    f1s.append(fsc)
    remove()
key = 'BERT-512'
d = { 'key':key, 'accuracy':accs, 'precision':pres, 'recall':recs, 'f1':f1s}
print(d)
remove()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1104.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=138.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=276.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1104.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=138.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=276.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1104.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=138.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=276.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1104.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=138.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=276.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=1104.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=138.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=138.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=276.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=35.0), HTML(value='')))


{'key': 'BERT-512', 'accuracy': [0.9456521739130435, 0.9166666666666666, 0.9347826086956522, 0.927536231884058, 0.9202898550724637], 'precision': [0.9389312977099237, 0.8506493506493507, 0.9060402684563759, 0.9121621621621622, 0.9090909090909091], 'recall': [0.9461538461538461, 1.0, 0.9712230215827338, 0.9507042253521126, 0.9090909090909091], 'f1': [0.9425287356321839, 0.9192982456140351, 0.9375, 0.9310344827586207, 0.9090909090909091]}
rm: cannot remove 'cache_dir': No such file or directory
rm: cannot remove 'outputs': No such file or directory
rm: cannot remove 'runs': No such file or directory


In [None]:
result = {'key': 'BERT-512', 
          'accuracy': [0.9456521739130435, 0.9166666666666666, 0.9347826086956522, 0.927536231884058, 0.9202898550724637], 
          'precision': [0.9389312977099237, 0.8506493506493507, 0.9060402684563759, 0.9121621621621622, 0.9090909090909091], 
          'recall': [0.9461538461538461, 1.0, 0.9712230215827338, 0.9507042253521126, 0.9090909090909091], 
          'f1': [0.9425287356321839, 0.9192982456140351, 0.9375, 0.9310344827586207, 0.9090909090909091]}

In [None]:
import numpy as np
result['mean_accuracy'] = np.mean(result['accuracy'])
result['mean_precision'] = np.mean(result['precision'])
result['mean_recall'] = np.mean(result['recall'])
result['mean_f1'] = np.mean(result['f1'])

{'key': 'BERT-512', 'accuracy': [0.9456521739130435, 0.9166666666666666, 0.9347826086956522, 0.927536231884058, 0.9202898550724637], 'precision': [0.9389312977099237, 0.8506493506493507, 0.9060402684563759, 0.9121621621621622, 0.9090909090909091], 'recall': [0.9461538461538461, 1.0, 0.9712230215827338, 0.9507042253521126, 0.9090909090909091], 'f1': [0.9425287356321839, 0.9192982456140351, 0.9375, 0.9310344827586207, 0.9090909090909091], 'mean_accuracy': 0.9289855072463767, 'mean_precision': 0.9033747976137443, 'mean_recall': 0.9554344004359203, 'mean_f1': 0.9278904746191496}


### Epoch Loops

In [None]:
overall_result = []
for epochs in [2,5,10,20]:
    train_size=500
    test_size = len(df_data) - train_size
    result, clf_result = {}, {}
    debug=True
    override=True
    data_split = make_data_ratio(df_data, test_size=test_size, train_size=train_size,
                               debug=debug, shuffle_seed=global_shuffle_seed, override=override)
    print(len(data_split['train']), len(data_split['test']))
    actual = [i if i==True else 0 for i in data_split['test']['is_flood']]
    train_df = data_split['train'][['text', 'is_flood']]
    train_df.columns = ['text', 'labels']
    test_df = data_split['test'][['text', 'is_flood']]
    test_df.columns = ['text', 'labels']

    model_args = ClassificationArgs(
        num_train_epochs=epochs,
        max_seq_length=512,
        overwrite_output_dir=True
    )
    model = ClassificationModel(
        "bert", "bert-base-uncased", args=model_args
    )
    model.train_model(train_df)
    # print([i for i in list(test_df['text']) if len(i)<10])
    # break

    predict, raw_outputs = model.predict(list(test_df['text']))
    key = 'BERT-512'
    clf_acc = accuracy_score(actual, predict)
    pre, rec, fsc, sup = precision_recall_fscore_support(actual, predict, average='binary')
    d = { 'key':key, 'train_size':train_size, 'test_size':test_size, 'accuracy':clf_acc, 'precision':pre, 
        'recall':rec, 'f1':fsc,
    }
    overall_result.append(d)
    print(d)
    remove()

Data Loaded
500 880


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=2.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 2', max=63.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 2', max=63.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 500, 'test_size': 880, 'accuracy': 0.8818181818181818, 'precision': 0.841541755888651, 'recall': 0.9290780141843972, 'f1': 0.8831460674157304}
Data Loaded
500 880


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 5', max=63.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 5', max=63.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 5', max=63.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 5', max=63.0, style=ProgressStyle(desc…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 5', max=63.0, style=ProgressStyle(desc…





HBox(children=(FloatProgress(value=0.0, max=880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 500, 'test_size': 880, 'accuracy': 0.9375, 'precision': 0.9200913242009132, 'recall': 0.9527186761229315, 'f1': 0.9361207897793262}
Data Loaded
500 880


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=63.0, style=ProgressStyle(des…





HBox(children=(FloatProgress(value=0.0, max=880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 500, 'test_size': 880, 'accuracy': 0.9375, 'precision': 0.923963133640553, 'recall': 0.9479905437352246, 'f1': 0.9358226371061844}
Data Loaded
500 880


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=20.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 20', max=63.0, style=ProgressStyle(des…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 10 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 11 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 12 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 13 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 14 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 15 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 16 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 17 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 18 of 20', max=63.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 19 of 20', max=63.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=110.0), HTML(value='')))


{'key': 'BERT-512', 'train_size': 500, 'test_size': 880, 'accuracy': 0.9272727272727272, 'precision': 0.8945054945054945, 'recall': 0.9621749408983451, 'f1': 0.9271070615034169}


### Single Run

In [22]:
train_size = len(df_data)-2
test_size = 2
result, clf_result = {}, {}
debug=True
data_split = make_data_ratio(df_data, test_size=test_size, train_size=train_size,
                            debug=True, shuffle_seed=global_shuffle_seed, override=True)
print(len(data_split['train']), len(data_split['test']))
actual = [i if i==True else 0 for i in data_split['test']['is_flood']]
train_df = data_split['train'][['text', 'is_flood']]
train_df.columns = ['text', 'labels']
test_df = data_split['test'][['text', 'is_flood']]
test_df.columns = ['text', 'labels']
train_df = pd.concat([train_df, test_df])

model_args = ClassificationArgs(
    num_train_epochs=10,
    max_seq_length=512,
    overwrite_output_dir=True
)
model = ClassificationModel(
    "bert", "bert-base-uncased", args=model_args
)
model.train_model(train_df)

Data Loaded
1378 2


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, max=1380.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=173.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=173.0, style=ProgressStyle(de…





(1730, 0.11757897193517705)

In [23]:
import pandas as pd

In [24]:
new_data_path = root+'/NLP_flood_Bangladesh/Colabs/new_data.json'
new_data = pd.read_json(new_data_path)

In [25]:
predictions, raw_outputs = model.predict(new_data['text'])

HBox(children=(FloatProgress(value=0.0, max=36123.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4516.0), HTML(value='')))




In [26]:
pred = [False if i==0 else True for i in predictions]
new_data['is_flood'] = pred
jdata = json.loads(new_data.to_json(orient='records'))
json.dump(jdata,open('new_data.json','w'))
new_data[new_data['is_flood']==True]

Unnamed: 0,datePublished,text,org_text,id,newspaper,is_flood
32,2014-04-20 00:00:00,body recovered around saturday said tongi poli...,body recovered around saturday said tongi poli...,1c5b1d9c-b5b0-4cae-b10c-b2073b0534c3,bdnews,True
76,2014-12-27 00:00:00,awards introduced london based curry life maga...,awards introduced london based curry life maga...,3878bfd3-4f0c-4e38-8bd1-3c20524734e0,bdnews,True
100,2015-07-01 00:00:00,residents dakkhinkhan kashaibarhi gawair molla...,residents dakkhinkhan kashaibarhi gawair molla...,592f0bf5-de02-4bab-925c-b9bd8bbc1efa,bdnews,True
103,2015-08-11 00:00:00,least million people affected flooding across ...,least million people affected flooding across ...,845c5050-ca2e-433f-a71f-03079f6a41ba,bdnews,True
106,2015-11-01 00:00:00,four member appellate division bench chief jus...,four member appellate division bench chief jus...,e45f05bf-51e6-4469-845d-4005e2a80b17,bdnews,True
...,...,...,...,...,...,...
34615,2019-04-07 10:06:58,erosion caused meghna river taken serious turn...,erosion caused meghna river taken serious turn...,b0c65117-21ba-44c4-a06e-edbd3a43900c,theIndependent,True
34871,2014-11-29 00:00:00,rangpur cultivation flood tolerant variety ric...,rangpur cultivation flood tolerant variety ric...,3db972ee-76f0-45d4-bf26-a8f85a3b7722,theNewNation,True
34878,2017-12-11 00:00:00,sarwaruddin aslam halda river sweat water natu...,sarwaruddin aslam halda river sweat water natu...,7030cb41-c278-452f-ab8d-9f0919529f92,theNewNation,True
34916,2017-08-31 00:00:00,gaibandha disaster risk management phase proje...,gaibandha disaster risk management phase proje...,41a0db1b-d3f2-45b9-b078-22f001fc61f3,theNewNation,True


In [27]:
jdata = json.loads(new_data[new_data['is_flood']==True].to_json(orient='records'))
json.dump(jdata,open('predicted_isflood.json','w'))
jdata = json.loads(new_data[new_data['is_flood']==False].to_json(orient='records'))
json.dump(jdata,open('predicted_not_isflood.json','w'))

In [None]:
predictions, raw_outputs = model.predict(test_df['text'])

  0%|          | 0/277 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

In [None]:
print(accuracy_score(test_df['labels'], predictions))
precision_recall_fscore_support(test_df['labels'], predictions, average='binary')

0.9314079422382672


(0.9191176470588235, 0.9398496240601504, 0.929368029739777, None)