## Datapipeline

In [1]:
import sys, os
sys.path.insert(0, '..')

import pandas as pd
ROOT = ""

#### Loading splits from directory

In [2]:
from google.colab import drive
import pandas as pd
drive.mount('drive')
ROOT = "/content/drive/My Drive/iri_bengali_data"


Mounted at drive


In [3]:
K_FOLD = True
USE_TRANSLATED = False

if K_FOLD:
    df = pd.read_csv(os.path.join(ROOT, "fiveFoldStratifiedSamplingWithTranslation.csv"), index_col=0)
    X_trains, X_tests, y_trains, y_tests = [[df[col][df[f"split_{i}"] == cat] for i in range(5)]
            for col in ('content' if not USE_TRANSLATED else 'translated', 'is_flood') 
            for cat in ('test', 'train')
           ]
    print(f"{len(y_trains)} folds\n{len(y_trains[0])} training examples\n{len(y_tests[0])} test examples")
else:
    df = pd.read_csv(os.path.join(ROOT, "training_splits.csv"), index_col=0)
    train_dt = df[df['type'] == 'train']
    test_dt = df[df['type'] == 'test']
    valid_dt = df[df['type'] == 'valid']
    X_train, y_train = train_dt['content'], train_dt['is_flood']
    X_test, y_test = test_dt['content'], test_dt['is_flood']
    X_valid, y_valid = valid_dt['content'], valid_dt['is_flood']
    print(f"{len(X_train)} training examples\n{len(X_valid)} validation examples\n{len(X_test)} test examples")

5 folds
804 training examples
202 test examples


# Models

In [17]:
performance_summaries = {}

## Testing Pipeline for sklearn models

- Feature Extractor
- Model
- Metrics

In [39]:
# Feature Extractors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Combine 
from sklearn.model_selection import ParameterGrid

import numpy as np

def sklearn_run_pipeline(X_train, X_valid, y_train, y_valid, vectorizer, model):
    model, vectorizer = model(), vectorizer()
    vectorizer.fit(np.hstack([X_train, X_valid]))
    model.fit(vectorizer.transform(X_train), y_train)
    return model, vectorizer

def sklearn_run_pipeline_series(X_train, X_valid, y_train, y_valid, vectorizers, models, metrics):
    res = {modelname: {} for modelname in models.keys()}
    
    vectorizers = {k: v() for k, v in vectorizers.items()}
    train_with_valid = np.hstack([X_train, X_valid])
    for vect in vectorizers.values():
        vect.fit(train_with_valid)
    
    X_vects_train = {name: vect.transform(X_train) for name, vect in vectorizers.items()}
    X_vects_valid = {name: vect.transform(X_valid) for name, vect in vectorizers.items()}
    
    for model_name, model_class in models.items():
        model = model_class()
        table = res[model_name]
        for vect_name, X_vect_train in X_vects_train.items():
            model.fit(X_vect_train, y_train)
            y_pred = model.predict(X_vects_valid[vect_name])
            res[model_name][vect_name] = {name: metric(y_valid, y_pred) for name, metric in metrics.items()}
    
    return res

In [40]:
feature_extractors = {
    'CountVect': CountVectorizer,
    'CountVect-2gram': lambda: CountVectorizer(ngram_range = (1, 2)),
    'CountVect-min_df-max_df': lambda: CountVectorizer(min_df = 0.05, max_df = 0.95),
    'CountVect-2gram-min_df-max_df': lambda: CountVectorizer(min_df = 0.05, max_df = 0.95, ngram_range = (1, 2)),
    'TFIDF': TfidfVectorizer, 
    'TFIDF-2gram': lambda: TfidfVectorizer(ngram_range = (1, 2)),
    'TFIDF-min_df-max_df': lambda: TfidfVectorizer(min_df = 0.05, max_df = 0.95),
    'TFIDF-2gram-min_df-max_df': lambda: TfidfVectorizer(min_df = 0.05, max_df = 0.95, ngram_range = (1, 2))
}

models = {
    'RandomForest': lambda: RandomForestClassifier(class_weight = 'balanced'),
    'LinearSVC': lambda: LinearSVC(class_weight = 'balanced'),
    'LogRegL1': lambda: LogisticRegression(penalty = 'l1', 
                                   class_weight = 'balanced', 
                                   solver = 'liblinear',
                                   max_iter = 1000
                                  ),
    'LogRegL2': lambda: LogisticRegression(penalty = 'l2', 
                                   class_weight = 'balanced', 
                                   solver = 'liblinear',
                                   max_iter = 1000
                                  )
}

metrics = {"Accuracy": accuracy_score, 'Precision': precision_score, 'Recall': recall_score, 'f1': f1_score}

In [41]:
FOLD_INDEX = None
if K_FOLD and FOLD_INDEX is None:
    k = len(X_trains)
    results = {}
    for X_train, X_test, y_train, y_test in zip(X_trains, X_tests, y_trains, y_tests):
        fold_res = sklearn_run_pipeline_series(X_train, X_test, y_train, y_test, feature_extractors, models, metrics)

        for model, m_res in fold_res.items():
            for vect, v_res in m_res.items():
                for metric, value in v_res.items():
                    if model not in results:
                        results[model] = {}
                    if vect not in results[model]:
                        results[model][vect] = {}
                    if metric not in results[model][vect]:
                        results[model][vect][metric] = 0
                    results[model][vect][metric] += value/k
elif K_FOLD:
    results = sklearn_run_pipeline_series(X_trains[FOLD_INDEX], X_tests[FOLD_INDEX], y_trains[FOLD_INDEX], y_tests[FOLD_INDEX], feature_extractors, models, metrics)
else:
    results = sklearn_run_pipeline_series(X_train, X_valid, y_train, y_valid, feature_extractors, models, metrics)



In [42]:
import pandas as pd
for model, model_results in results.items():
    print(model)
    print(pd.DataFrame(model_results).T.to_markdown(), '\n')

RandomForest
|                               |   Accuracy |   Precision |   Recall |       f1 |
|:------------------------------|-----------:|------------:|---------:|---------:|
| CountVect                     |   0.855805 |    0.869091 | 0.681698 | 0.762828 |
| CountVect-2gram               |   0.866755 |    0.888574 | 0.69913  | 0.78207  |
| CountVect-min_df-max_df       |   0.862775 |    0.881357 | 0.693251 | 0.77457  |
| CountVect-2gram-min_df-max_df |   0.863775 |    0.866086 | 0.713623 | 0.781779 |
| TFIDF                         |   0.85881  |    0.873924 | 0.687495 | 0.767317 |
| TFIDF-2gram                   |   0.861775 |    0.888719 | 0.681698 | 0.770467 |
| TFIDF-min_df-max_df           |   0.85482  |    0.860304 | 0.687536 | 0.763255 |
| TFIDF-2gram-min_df-max_df     |   0.873721 |    0.87308  | 0.739627 | 0.799897 | 

LinearSVC
|                               |   Accuracy |   Precision |   Recall |       f1 |
|:------------------------------|-----------:|------------:|--

In [43]:
best_models = [(model_name, 
                  max((vect_name for vect_name in vect_res.keys()), key = lambda v: vect_res[v]['Accuracy'])) 
                for model_name, vect_res in results.items()
              ]

def generate_model_summary_for_sklearn_models(model_name, 
                                              vect_name, 
                                              X_trains = X_trains, 
                                              y_trains = y_trains, 
                                              X_tests = X_tests,
                                              y_tests = y_tests):

    summary = {}
    for i, datasets in enumerate(zip(X_trains, y_trains, X_tests, y_tests)):
        X_train, y_train, X_test, y_test = datasets
        model, vect = sklearn_run_pipeline(X_train, X_test, y_train, y_test, feature_extractors[vect_name], models[model_name])
        pred = model.predict(vect.transform(X_test))
        summary[f"fold_{i}"] = {metric_name: metric(pred, y_test) for metric_name, metric in metrics.items()}
        summary[f"fold_{i}"]["model"] = (model, vect)
    return summary

In [44]:
sklearn_summary = {m_v: generate_model_summary_for_sklearn_models(*m_v) for m_v in best_models}

In [None]:
sklearn_summary

{('LinearSVC',
  'TFIDF-2gram'): {'fold_0': {'Accuracy': 0.9356435643564357,
   'Precision': 0.8857142857142857,
   'Recall': 0.9253731343283582,
   'f1': 0.9051094890510949,
   'model': (LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
              intercept_scaling=1, loss='squared_hinge', max_iter=1000,
              multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
              verbose=0),
    TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                    dtype=<class 'numpy.float64'>, encoding='utf-8',
                    input='content', lowercase=True, max_df=1.0, max_features=None,
                    min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                    smooth_idf=True, stop_words=None, strip_accents=None,
                    sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                    tokenizer=None, use_idf=True, vocabulary=None))}, 'fold_1': {'Accuracy': 0.895522388059

## Neural Network Models

In [4]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BengaliNewsDataset(Dataset):
    def __init__(self, X, y = None):
        if y is None:
            self.X, self.y = X['content'], X['is_flood']
        else:
            self.X, self.y = X, y
    
    def __getitem__(self, i):
        return self.X.iloc[i], self.y.iloc[i]
    
    def __len__(self):
        return len(self.y)

device

device(type='cpu')

In [5]:
SAMPLE_LOAD = False
FOLD_INDEX = 1
if SAMPLE_LOAD:
    X_sample = X_train.sample(10)
    y_sample = y_train[X_sample.index]
    X_sample_v = X_valid.sample(10)
    y_sample_v = y_valid[X_sample_v.index]
    train_ds = BengaliNewsDataset(X_sample, y_sample)
    valid_ds = BengaliNewsDataset(X_sample_v, y_sample_v)
elif not K_FOLD:
    train_ds = BengaliNewsDataset(X_train, y_train)
    valid_ds = BengaliNewsDataset(X_valid, y_valid)
elif FOLD_INDEX is None:
    train_ds = [BengaliNewsDataset(X_trains[i], y_trains[i]) for i in range(len(X_trains))] 
    valid_ds = [BengaliNewsDataset(X_tests[i], y_tests[i]) for i in range(len(X_trains))] 
else:
    train_ds = BengaliNewsDataset(X_trains[FOLD_INDEX], y_trains[FOLD_INDEX])
    valid_ds = BengaliNewsDataset(X_tests[FOLD_INDEX], y_tests[FOLD_INDEX])

In [None]:
del df

In [None]:
next(iter(train_ds))

('চরাঞ্চলে সাধারণত আষাঢ়–ভাদ্র মাসের মাঝামাঝি পর্যন্ত বন্যা থাকে। সেই বন্যা মোকাবিলা ও ফসলের আবাদ নিয়ে আলাদা প্রস্তুতি থাকে। তবে এবার এই দুই মাসে কুষ্টিয়ার দৌলতপুর উপজেলার রামকৃষ্ণপুর ও চিলমারী ইউনিয়নের চরাঞ্চলগুলোতে বন্যা দেখা দেয়নি। তবে হঠাৎ করে এই আশ্বিনে বন্যা দেখা দিয়েছে।\xa0\nকয়েক দিনের ব্যবধানে হুহু করে পদ্মার পানি বেড়ে যাওয়ায় পানি উপচে চরাঞ্চল প্লাবিত হয়। এতে ওই দুই ইউনিয়নের ৩৫ গ্রামের ৫০ হাজার বাসিন্দা পানিবন্দী হয়ে পড়েছে। প্রতিদিন গড়ে ৫ সেন্টিমিটার করে পানির উচ্চতা বাড়ছে। এতে চরম দুর্ভোগে আছে ওই বাসিন্দারা। মাঠের ফসল ডুবে নষ্ট হয়ে গেছে। এখনো সরকারি বা বেসরকারিভাবে ওই সব অঞ্চলে কোনো সাহায্য–সহযোগিতা মেলেনি।\xa0\nপানি উন্নয়ন বোর্ড (পাউবো) বলছে, গত সাত দিনে পানির উচ্চতা সবচেয়ে বেশি বেড়েছে। পদ্মার বিপৎসীমা নির্ধারণ আছে ১৪ দশমিক ২৫ সেন্টিমিটার। সেখানে গতকাল শুক্রবার পানির উচ্চতা ছিল ১৩ দশমিক ৯০ সেন্টিমিটার, যা সর্তকবার্তা।\xa0\nকুষ্টিয়া পাউবোর নির্বাহী প্রকৌশলী পীযূষ কৃষ্ণ কুন্ডু প্রথম আলোকে বলেন, হঠাৎ পানি বাড়ছে। এ সময়ে এর আগে এভাবে পানি বাড়েনি। আরও দু–এক দিন পানি বাড়ার পর কমতে পা

### Google Translate + BERT

In [27]:
!pip install transformers &> /dev/null
!pip install simpletransformers &> /dev/null
!pip install tqdm>=4.55.0 &> /dev/null

In [28]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

#### Training

In [None]:
assert(USE_TRANSLATED)

FOLD_INDEX = 1

def google_bert_runpipeline(X_train, X_test, y_train, y_test, fold):
  train_df = pd.concat([X_train, y_train], axis=1).rename(columns={"translated": "text", 'is_flood': "labels"})
  test_df = pd.concat([X_test, y_test], axis=1).rename(columns={"translated": "text", 'is_flood': "labels"})
  model_args = ClassificationArgs(
      num_train_epochs=10,
      max_seq_length=512,
      overwrite_output_dir=True,
      no_save=True
  )
  model = ClassificationModel(
      "bert", "bert-base-uncased", args=model_args
  )
  model.train_model(train_df)

  save_file = os.path.join(ROOT, f"eng_translated_bert_fold_{fold}.ckpt")
  torch.save(model.model.state_dict(), save_file)
  
  predict, _ = model.predict(list(test_df['text']))
  
  actual = 1 * test_df['labels']
  acc = accuracy_score(actual, predict)
  pre, rec, fsc, _ = precision_recall_fscore_support(actual, predict, average='binary')
  return {"accuracy": acc, "precision": pre, "recall": rec, 'fscore': fsc, 'model': save_file}

performance_summaries['google_translate_with_bert'] = {f"fold_{i}": google_bert_runpipeline(*p, fold = i) for i, p in enumerate(zip(X_trains, X_tests, y_trains, y_tests))}


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=101.0, style=ProgressStyle(de…

  model.parameters(), args.max_grad_norm





HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=101.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=202.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=101.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=101.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 8 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 9 of 10', max=101.0, style=ProgressStyle(de…





HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 2 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 3 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 4 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 5 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 6 of 10', max=101.0, style=ProgressStyle(de…




HBox(children=(FloatProgress(value=0.0, description='Running Epoch 7 of 10', max=101.0, style=ProgressStyle(de…

#### Evaluation

In [32]:
model_args = ClassificationArgs(max_seq_length=512)
model = ClassificationModel("bert", "bert-base-uncased", use_cuda = False)

i = 1
model.model.load_state_dict(torch.load(os.path.join(ROOT, f"eng_translated_bert_fold_{i}.ckpt"), map_location=device))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [33]:
predict, _ = model.predict(list(X_tests[i]))

HBox(children=(FloatProgress(value=0.0, max=201.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




In [38]:
pred_df = (pd.Series(predict) == 1)
fp = (X_tests[i][(pred_df == True) & (y_tests[i] == False)])
fn = (X_tests[i][(pred_df == False) & (y_tests[i] == True)])

display_basic_evaluation("Google-Translated Text with BERT", fp , fn)

----------------------------------------------------------------------------------------------------
Model: Google-Translated Text with BERT
----------------------------------------------------------------------------------------------------
|            |   FP Mean |   FN Mean |   Positives Mean |   Negatives Mean |      Mean |
|:-----------|----------:|----------:|-----------------:|-----------------:|----------:|
| ক্ষতিগ্রস্ত   |       nan |  1.23529  |         1.74566  |        0.733333  | 1.08151   |
| ঘূর্ণিঝড়     |       nan |  1.17647  |         0.375723 |        0.412121  | 0.399602  |
| নষ্ট        |       nan |  0.705882 |         0.384393 |        0.469697  | 0.440358  |
| ক্ষয়ক্ষতি    |       nan |  0.117647 |         0.066474 |        0.0469697 | 0.0536779 |
| বন্যা       |       nan |  6.88235  |         8.89595  |        2.24394   | 4.53181   |
| ক্ষয়        |       nan |  0.117647 |         0.066474 |        0.0636364 | 0.0646123 |
| নিমজ্জিত    |       nan |  0.17

Unnamed: 0,FP Mean,FN Mean,Positives Mean,Negatives Mean,Mean
ক্ষতিগ্রস্ত,,1.235294,1.745665,0.733333,1.081511
ঘূর্ণিঝড়,,1.176471,0.375723,0.412121,0.399602
নষ্ট,,0.705882,0.384393,0.469697,0.440358
ক্ষয়ক্ষতি,,0.117647,0.066474,0.04697,0.053678
বন্যা,,6.882353,8.895954,2.243939,4.531809
ক্ষয়,,0.117647,0.066474,0.063636,0.064612
নিমজ্জিত,,0.176471,0.216763,0.043939,0.10338
পানিবন্দি,,0.823529,1.387283,0.039394,0.502982
পানিবন্দী,,0.235294,0.791908,0.033333,0.294235
পাহাড়ি ঢলে,,1.058824,0.950867,0.148485,0.424453


### LSTM CNN Model

In [6]:
!pip install bnlp_toolkit &> /dev/null
!pip install gensim==4.0.1 &> /dev/null
!pip install transformers &> /dev/null

assert(not USE_TRANSLATED)

In [7]:
MAX_SIZE = 256

#### Fast-Text Word Embeddings

In [9]:
import pandas as pd, numpy as np
from gensim.models import FastText
from basicBanglaTools import *

def tokenize_and_clean(text, stopwords = stopwords, num_token = True):
    tokens = [tk for tk in tokenizer.tokenize(text) if tk not in stopwords and all(c in bengaliTextChars for c in tk)]
    if num_token == True:
        for i, tk in enumerate(tokens):
            if len(tk) > 1 and all(c in digits for c in tk):
                tokens[i] = '<NUM>'
    return tokens

EMBEDDING_DIM = 100
fast_text_model = FastText.load(os.path.join(ROOT, "FastTextModel/fasttextBengali_5_count_16_window.model"))

punkt not found. downloading...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#### Neural Net Model



In [10]:
def collate_batch(batch):
    texts, labels = zip(*batch)
    text_enc = torch.tensor([np.vstack([fast_text_model.wv[tokenized[:MAX_SIZE]], np.zeros([max(MAX_SIZE - len(tokenized), 0), fast_text_model.vector_size])])
                             for text in texts 
                             for tokenized in (tokenize_and_clean(text),)
                             ])
    labels_enc = torch.tensor([1 if label else 0 for label in labels])
    
    return text_enc.to(device, dtype=torch.float32), labels_enc.to(device,  dtype=torch.float)

train_loader = DataLoader(train_ds, batch_size = 32, shuffle=True, collate_fn = collate_batch)
valid_loader = DataLoader(valid_ds, batch_size = 16, shuffle=True, collate_fn = collate_batch)

In [11]:
shape = next(iter(train_loader))[0].shape
assert(tuple(shape) == (32, MAX_SIZE, EMBEDDING_DIM))

In [12]:
class LSTM_CNN_Classifier(nn.Module):
  def __init__(self):
    super(LSTM_CNN_Classifier, self).__init__()

    self.conv = nn.Sequential(nn.Conv1d(EMBEDDING_DIM, 32, kernel_size=3, padding='same'),
                              nn.ELU(),
                              nn.Conv1d(32, 32, kernel_size=3, padding='same'),
                              nn.ReLU(),
                              nn.MaxPool1d(4)
                              )
    self.lstm = nn.LSTM(32, batch_first = True, hidden_size = 8, bidirectional = True, dropout=0.3)

    self.classifier = nn.Sequential(
        nn.Dropout(0.2),
        nn.Linear(1024, 128),
        nn.Dropout(0.2),
        nn.Linear(128, 1)
    )

  def forward(self, X):
    # Convolution expects Batch, Embedding Dim, Seq Length
    X = X.transpose(1, 2).contiguous()
    conv_out = self.conv(X)
    #print(f"Conv out shape: {conv_out.shape}")
    conv_out = conv_out.transpose(1, 2).contiguous()
    lstm_out = self.lstm(conv_out)[0]
    lstm_out = torch.flatten(lstm_out, start_dim=1)
    #print(f"LSTM out shape: {lstm_out.shape}")
    class_out = self.classifier(lstm_out)
    return class_out 

In [14]:
#lstm_model = LSTM_CNN_Classifier().to(device)
lstm_model = torch.load("/content/drive/My Drive/iri_bengali_data/lstm_model_epoch_40.ckpt", map_location=device)

In [None]:
from transformers import get_scheduler, AdamW

optimizer = AdamW(lstm_model.parameters())

num_epochs = 50
save_every = 10
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

criterion = nn.BCEWithLogitsLoss()

lstm_model.train()
loss_hist, accuracy_hist = [], []
for epoch in range(num_epochs):
  if epoch != 1  and (epoch + 1) % save_every == 0:
        torch.save(lstm_model, f"/content/drive/My Drive/iri_bengali_data/lstm_model_epoch_{epoch}_fold_{FOLD_INDEX}.ckpt")
  
  with tqdm(train_loader, unit = "batch") as tepoch:
    n, accuracy = 0, 0
    t_pred, f_pred = 0, 0 

    for X_v, y_v in valid_loader:
      n += 1
      with torch.no_grad():
        output = lstm_model(X_v)
        #pred = output.argmax(dim = 1, keepdim=True).squeeze()
        pred = 1*(output > 0).squeeze()
        accuracy += (pred == y_v).sum().item()/len(y_v)
        t_pred += pred.sum().item()
        f_pred += (1-pred).sum().item()
    accuracy /= n
    
    for data, target in tepoch:
      data, target = data.to(device), target.to(device)

      optimizer.zero_grad()
      logits_pred = lstm_model(data)
      loss = criterion(logits_pred, target.unsqueeze(1))
      loss.backward()
      optimizer.step()
        
      lr_scheduler.step()
      progress_bar.update(1)
      tepoch.set_postfix(loss = loss.item(), accuracy = 100. * accuracy, true_prediction = t_pred, false_prediction = f_pred)
      loss_hist.append(loss.item()), accuracy_hist.append(accuracy)
    

HBox(children=(FloatProgress(value=0.0, max=1300.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=26.0), HTML(value='')))




In [None]:
X, y = next(iter(train_loader))
criterion = nn.BCEWithLogitsLoss()
pred = lstm_model(X)
criterion(pred, y.to(device, dtype=torch.float).unsqueeze(1))

tensor(6.1646e-05, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [15]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
results = {}

test_ds = BengaliNewsDataset(X_tests[FOLD_INDEX], y_tests[FOLD_INDEX])
test_loader = DataLoader(test_ds, batch_size = len(test_ds), shuffle=False, collate_fn = collate_batch)
for X_v, y_v in test_loader:
      with torch.no_grad():
        output = lstm_model(X_v)
        pred = 1*(output > 0).squeeze()
        accuracy = (pred == y_v).sum().item()/len(y_v)
        pred = pred.detach().cpu()
        target = y_v.detach().cpu()
        results["accuracy"] = accuracy_score(target, pred)
        results["precision"], results["recall"], results["fscore"], _ =  precision_recall_fscore_support(target, pred, average='binary')

In [18]:
epoch = 40
results["model"] = f"/content/drive/My Drive/iri_bengali_data/lstm_model_epoch_{epoch}.ckpt"
performance_summaries["LSTM"]= {f"fold_{FOLD_INDEX}":  results}
performance_summaries["LSTM"]

{'fold_1': {'accuracy': 0.8855721393034826,
  'fscore': 0.8345323741007195,
  'model': '/content/drive/My Drive/iri_bengali_data/lstm_model_epoch_40.ckpt',
  'precision': 0.8285714285714286,
  'recall': 0.8405797101449275}}

In [26]:
pred_df = (pd.Series(pred) == 1)
fp = (X_tests[FOLD_INDEX][(pred_df == True) & (y_tests[FOLD_INDEX] == False)])
fn = (X_tests[FOLD_INDEX][(pred_df == False) & (y_tests[FOLD_INDEX] == True)])

display_basic_evaluation("CNN-LSTM", fp , fn)

----------------------------------------------------------------------------------------------------
Model: CNN-LSTM
----------------------------------------------------------------------------------------------------
|            |   FP Mean |   FN Mean |   Positives Mean |   Negatives Mean |      Mean |
|:-----------|----------:|----------:|-----------------:|-----------------:|----------:|
| ক্ষতিগ্রস্ত   |  1        |       1.1 |         1.74566  |        0.733333  | 1.08151   |
| ঘূর্ণিঝড়     |  0.428571 |       2   |         0.375723 |        0.412121  | 0.399602  |
| নষ্ট        |  0.714286 |       0.8 |         0.384393 |        0.469697  | 0.440358  |
| ক্ষয়ক্ষতি    |  0        |       0.2 |         0.066474 |        0.0469697 | 0.0536779 |
| বন্যা       |  1.57143  |       7.9 |         8.89595  |        2.24394   | 4.53181   |
| ক্ষয়        |  0        |       0.2 |         0.066474 |        0.0636364 | 0.0646123 |
| নিমজ্জিত    |  0        |       0.1 |         0.216763 

Unnamed: 0,FP Mean,FN Mean,Positives Mean,Negatives Mean,Mean
ক্ষতিগ্রস্ত,1.0,1.1,1.745665,0.733333,1.081511
ঘূর্ণিঝড়,0.428571,2.0,0.375723,0.412121,0.399602
নষ্ট,0.714286,0.8,0.384393,0.469697,0.440358
ক্ষয়ক্ষতি,0.0,0.2,0.066474,0.04697,0.053678
বন্যা,1.571429,7.9,8.895954,2.243939,4.531809
ক্ষয়,0.0,0.2,0.066474,0.063636,0.064612
নিমজ্জিত,0.0,0.1,0.216763,0.043939,0.10338
পানিবন্দি,0.0,0.8,1.387283,0.039394,0.502982
পানিবন্দী,0.0,0.0,0.791908,0.033333,0.294235
পাহাড়ি ঢলে,0.0,0.7,0.950867,0.148485,0.424453


### Multi-lingual Bert Uncased

In [41]:
!pip install transformers &> /dev/null
!pip install simpletransformers &> /dev/null

#### Simpletransformers

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

assert(not USE_TRANSLATED)
FOLD_INDEX = 1

def multi_lingual_bert_runpipeline(X_train, X_test, y_train, y_test, fold):
  train_df = pd.concat([X_train, y_train], axis=1).rename(columns={"content": "text", 'is_flood': "labels"})
  test_df = pd.concat([X_test, y_test], axis=1).rename(columns={"content": "text", 'is_flood': "labels"})
  
  model_args = ClassificationArgs(
        num_train_epochs=10,
        max_seq_length=512,
        no_save = True,
        overwrite_output_dir=True
    )
  model = ClassificationModel(
      "bert", "bert-base-multilingual-uncased", 
       args=model_args,
       use_cuda=False
    )
  model.train_model(train_df)

  save_file = os.path.join(ROOT, f"multilingual_bert_fold_{fold}.ckpt")
  torch.save(model.model.state_dict(), save_file)

  predict, _ = model.predict(list(test_df['text']))
  
  actual = 1 * test_df['labels']
  acc = accuracy_score(actual, predict)
  pre, rec, fsc, _ = precision_recall_fscore_support(actual, predict, average='binary')
  return {"accuracy": acc, "precision": pre, "recall": rec, 'fscore': fsc, 'model': save_file}

performance_summaries['multi-lingual_bert'] = {f"fold_{i}": multi_lingual_bert_runpipeline(*p, fold = i) for i, p in enumerate(zip(X_trains, X_tests, y_trains, y_tests)) if i == 1}

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=672271273.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=871891.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1715180.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, max=805.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 10', max=101.0, style=ProgressStyle(de…

In [None]:
summary

{'accuracy': 0.8308457711442786,
 'fscore': 0.75,
 'precision': 0.7611940298507462,
 'recall': 0.7391304347826086}

#### Custom-built

In [None]:
from transformers import BertTokenizer, BertModel

assert(not USE_TRANSLATE)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained("bert-base-multilingual-uncased")

MAX_SEQ_LEN = 512
def collate_batch(batch):
    text, labels = zip(*batch)
    text_enc = tokenizer(text, padding="max_length", truncation=True, return_tensors = "pt", max_length=MAX_SEQ_LEN)
    labels_enc = torch.tensor([1 if label else 0 for label in labels])
    
    return text_enc.to(device), labels_enc.to(device)

train_loader = DataLoader(train_ds, batch_size = 12, shuffle=True, collate_fn = collate_batch)
valid_loader = DataLoader(valid_ds, batch_size = 16, shuffle=True, collate_fn = collate_batch)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
next(iter(train_loader))[0]['input_ids'].shape

torch.Size([12, 512])

In [None]:
class BERT_Based_Classifier(nn.Module):
    def __init__(self, bert_model):
        super(BERT_Based_Classifier, self).__init__()
        
        self.bert = bert_model
        self.pooler_size = bert_model.pooler.dense.out_features
        
        self.classifier = nn.Sequential(nn.Dropout(p = 0.1, inplace = False),
                                        nn.Linear(in_features=self.pooler_size, out_features=256, bias=True),
                                        nn.Sigmoid(),
                                        nn.Linear(in_features=256, out_features=1, bias=True)
                                       )
        
    def forward(self, **params):
        output = self.bert(**params)['pooler_output']
        logits = self.classifier(output)
        return logits
    
    def train(self):
        self.bert.train()


In [None]:
classification_model = BERT_Based_Classifier(model).to(device)
#classification_model = torch.load("/content/drive/My Drive/iri_bengali_data/classification_model_epoch_30.ckpt")

In [None]:
from transformers import get_scheduler, AdamW

optimizer = AdamW(model.parameters())

num_epochs = 50
save_every = 10
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [None]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

criterion = torch.nn.CrossEntropyLoss()
classification_model.train()
loss_hist, accuracy_hist = [], []
for epoch in range(num_epochs):
  if epoch != 0  and epoch % save_every == 0:
        torch.save(classification_model, f"/content/drive/My Drive/iri_bengali_data/classification_model_epoch_{epoch}.ckpt")
  
  with tqdm(train_loader, unit = "batch") as tepoch:
    n, accuracy = 0, 0
    t_pred, f_pred = 0, 0 
    for X_v, y_v in valid_loader:
      n += 1
      with torch.no_grad():
        output = classification_model(**X_v)
        pred = output.argmax(dim = 1, keepdim=True).squeeze()
        accuracy += (pred == y_v).sum().item()/len(y_v)
        t_pred += pred.sum().item()
        f_pred += len(y_v) - t_pred
    accuracy /= n
    
    for data, target in tepoch:
      data, target = data.to(device), target.to(device)

      optimizer.zero_grad()
      logits_pred = classification_model(**data)
      loss = criterion(logits_pred, target)
      print(logits_pred, target, logits_pred.argmax(dim = 1, keepdim=True).squeeze())
      loss.backward()
      optimizer.step()
        
      lr_scheduler.step()
      progress_bar.update(1)
      tepoch.set_postfix(loss = loss.item(), accuracy = 100. * accuracy, true_prediction = t_pred, false_prediction = f_pred)
      loss_hist.append(loss.item()), accuracy_hist.append(accuracy)
    

HBox(children=(FloatProgress(value=0.0, max=2700.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=54.0), HTML(value='')))

tensor([[-0.0870, -0.8362],
        [-0.1384, -0.8405],
        [-0.1059, -0.8653],
        [-0.0857, -0.8154],
        [-0.0404, -0.8701],
        [-0.0795, -0.8216],
        [-0.0910, -0.8493],
        [-0.0852, -0.8726],
        [-0.1391, -0.9164],
        [-0.1044, -0.8214],
        [-0.1127, -0.8013],
        [-0.0832, -0.8427]], device='cuda:0', grad_fn=<AddmmBackward>) tensor([0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0], device='cuda:0') tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], device='cuda:0')



KeyboardInterrupt: ignored

In [None]:
#valid_ds = BengaliNewsDataset(X_valid, y_valid)
#valid_loader = DataLoader(valid_ds, batch_size = len(valid_ds), shuffle=True, collate_fn = collate_batch)

n = 0
for X_v, y_v in train_loader:
    with torch.no_grad():
      output = classification_model(**X_v)
      pred = output.argmax(dim = 1, keepdim=True).squeeze()
      accuracy += (pred == y_v).sum().item()/len(y_v)
      n += 1
accuracy /= n

print(f"Validation Accuracy: {accuracy}")

Validation Accuracy: 0.6867283950617284


### Bangla BERT

In [None]:
from transformers import BertForMaskedLM, BertTokenizer

model = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
tokenizer = BertTokenizer.from_pretrained("sagorsarker/bangla-bert-base")

def collate_batch(batch):
    text, labels = zip(*batch)
    text_enc = tokenizer(text, return_tensors = "pt", truncation=True, padding=True, max_length=512)
    labels_enc = torch.tensor([1 if label else 0 for label in labels])
    
    return text_enc.to(device), labels_enc.to(device)

train_loader = DataLoader(train_ds, batch_size = 8, shuffle=True, collate_fn = collate_batch)
valid_loader = DataLoader(valid_ds, batch_size = 16, shuffle=True, collate_fn = collate_batch)

Some weights of the model checkpoint at sagorsarker/bangla-bert-base were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
class BERT_Based_Classifier(nn.Module):
    def __init__(self, bert_model):
        super(BERT_Based_Classifier, self).__init__()
        
        self.bert = bert_model
        self.pooler_size = bert_model.pooler.dense.out_features
        
        self.classifier = nn.Sequential(nn.Dropout(p = 0.1, inplace = False),
                                        nn.Linear(in_features=self.pooler_size, out_features=256, bias=True),
                                        nn.Sigmoid(),
                                        nn.Linear(in_features=256, out_features=2, bias=True)
                                       )
        
    def forward(self, **params):
        output = self.bert(**params)['pooler_output']
        logits = self.classifier(output)
        return logits
    
    def train(self):
        self.bert.train()

bangla_based_bert = BERT_Based_Classifier(model).to(device)

AttributeError: ignored

In [None]:
model = model.to(device);

In [None]:
X, y = next(iter(train_loader))
sample_out = model(**X)

In [None]:
model.bert

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(102025, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [None]:
sample_out[0][:, :, 0:1]

tensor([[[-7.2617],
         [-4.4485],
         [-6.0201],
         ...,
         [-6.3354],
         [-6.3734],
         [-7.9369]],

        [[-7.2810],
         [-5.9241],
         [-5.6631],
         ...,
         [-6.7784],
         [-5.9308],
         [-7.6452]],

        [[-8.1417],
         [-4.4144],
         [-4.0560],
         ...,
         [-7.2066],
         [-4.7772],
         [-9.0846]],

        ...,

        [[-8.4719],
         [-7.1970],
         [-8.6366],
         ...,
         [-6.8117],
         [-7.0659],
         [-8.4203]],

        [[-7.7649],
         [-6.8578],
         [-7.7224],
         ...,
         [-7.3801],
         [-5.9231],
         [-8.5084]],

        [[-8.3196],
         [-7.4330],
         [-3.1532],
         ...,
         [-6.8732],
         [-6.3989],
         [-8.4618]]], device='cuda:0', grad_fn=<SliceBackward>)

# Results Analysis

Deeper Look at the results from the models

### Keyword Counting

In [19]:
!pip install bnlp_toolkit &> /dev/null

In [20]:
from basicBanglaTools import *

damaged_keywords = {
        "ক্ষয়ক্ষতি": "damage",
        "ক্ষয়": "damage",
        "ক্ষতিগ্রস্ত": "damaged",
        "নষ্ট": "damaged"
    }

waterlogged_keywords = {
    "পানিবন্দী": "waterlogged",
    "নিমজ্জিত": "submerged",
    "পানিবন্দি": "waterlogged"
}

flood_keywords = {
    "বন্যা": "flood",
    "পাহাড়ি ঢলে": "steam coming down the hill"
}

cyclone_keywords = {
    "ঘূর্ণিঝড়": "cyclone"
}

def df_keyword_count(df, keywords, per_entry = False):
    if not isinstance(keywords, dict):
        keywords = {kw: None for kw in keywords}
        
    keyword_counts = df.apply(lambda x: keywordCount(x.content, keywords), axis = 1)
    if per_entry:
        return keyword_counts
    counts = {keyword: len(df[keyword_counts.apply(lambda x: x[keyword] > 0)]) for keyword in keywords}
    counts['any'] = len(df[keyword_counts.apply(lambda x: any(x[keyword] > 0 for keyword in keywords))])
    return counts

def count_table(df):
  return pd.DataFrame(list(df_keyword_count(df, flood_keywords.keys() | cyclone_keywords.keys() | damaged_keywords.keys() | waterlogged_keywords.keys(), per_entry=True)))

In [21]:
#df = pd.concat(data, axis=1, keys=[s.name for s in data])
flood_keywords.keys() | cyclone_keywords.keys() | damaged_keywords.keys() | waterlogged_keywords.keys()

kw_mean = count_table(df).mean()
positive_kw_mean = count_table(df[df['is_flood'] == True]).mean()
negative_kw_mean = count_table(df[df['is_flood'] == False]).mean()

keyword_averages = pd.concat([count_table(df[df['is_flood'] == True]).mean(), count_table(df[df['is_flood'] == False]).mean()], axis=1, keys = ["Is Flood Average", "Not Flood Average"])

### False Positive and Negative Samples

In [22]:
def sample_model_evaluation_sklearn(model, X_test, y_test, model_type = "sklearn"):
    if model_type == "sklearn":
        model, vect = model
        pred = model.predict(vect.transform(X_test))
    elif model_type == "net":
        model, collate_fn = model
        dataset = BengaliNewsDataset(X_test, y_test)
        dataloader = DataLoader(train_ds, batch_size = len(dataset), shuffle=False, collate_fn = collate_fn)
        for X, y in dataloader:
          pred = 1 * (model(X) > 0)
    
    false_positives = (X_test[(pred == True) & (y_test == False)])
    false_negatives = (X_test[(pred == False) & (y_test == True)])
    return false_positives, false_negatives #false_positives.sample(min(10, len(false_positives))), false_negatives.sample(min(10, len(false_negatives)))

In [23]:
def display_basic_evaluation(name, false_positives, false_negatives):
    print("-"* 100)
    print(f"Model: {name}")
    print("-"* 100)
    

    fp, fn = false_positives, false_negatives
    fp_cnts = count_table(df.iloc[fp.index]).mean()
    fn_cnts = count_table(df.iloc[fn.index]).mean()
    kw_summary = pd.concat([fp_cnts, fn_cnts, positive_kw_mean, negative_kw_mean, kw_mean], axis = 1, keys = ["FP Mean", 
                                                                      "FN Mean", 
                                                                      "Positives Mean", 
                                                                      "Negatives Mean", 
                                                                      "Mean"])
    print(kw_summary.to_markdown())
    print("-----False Postives-----")
    print("Total: {}".format(len(fp)))

    for i, text in enumerate(list(df.iloc[fp.index]['translated'].sample(min(10, len(fp))) )):
        print(f"{i})", text[:1000])
        print()
    print("-----False Negatives-----")
    print("Total: {}".format(len(fn)))
    for i, text in enumerate(list(df.iloc[fn.index]['translated'].sample(min(10, len(fn))) )):
        print(f"{i})", text[:1000])
        print()
    return kw_summary

In [24]:
def show_skmodel(name = ('LinearSVC', 'TFIDF-2gram'), fold = 0):
    sk_model = sklearn_summary[name][f'fold_{fold}']['model']
    fp, fn = sample_model_evaluation_sklearn(sk_model, X_tests[0], y_tests[0])
    display_basic_evaluation(name[0], fp, fn)

In [93]:
for model in sklearn_summary.keys():
  show_skmodel(model)
  print()

----------------------------------------------------------------------------------------------------
Model: RandomForest
----------------------------------------------------------------------------------------------------
|            |   FP Mean |   FN Mean |   Positives Mean |   Negatives Mean |      Mean |
|:-----------|----------:|----------:|-----------------:|-----------------:|----------:|
| নষ্ট        |  0        | 0.230769  |         0.384393 |        0.469697  | 0.440358  |
| পানিবন্দী   |  0        | 0         |         0.791908 |        0.0333333 | 0.294235  |
| ঘূর্ণিঝড়     |  0        | 0.384615  |         0.375723 |        0.412121  | 0.399602  |
| ক্ষয়        |  0        | 0.230769  |         0.066474 |        0.0636364 | 0.0646123 |
| নিমজ্জিত    |  0        | 0.0769231 |         0.216763 |        0.0439394 | 0.10338   |
| পানিবন্দি   |  0.166667 | 0         |         1.38728  |        0.0393939 | 0.502982  |
| পাহাড়ি ঢলে |  0.5      | 0.307692  |         0.950867 |

In [40]:
performance_summaries['LSTM']

{'fold_1': {'accuracy': 0.8855721393034826,
  'fscore': 0.8345323741007195,
  'model': '/content/drive/My Drive/iri_bengali_data/lstm_model_epoch_40.ckpt',
  'precision': 0.8285714285714286,
  'recall': 0.8405797101449275}}