## Datapipeline

In [237]:
import sys
sys.path.insert(0, '..')

from data_management.load_tagtog import load_from_annotations_folder

In [238]:
df = load_from_annotations_folder("../tagtog/datasets/annotations_large_mixed_batch/", csv_folder="../data_management/")
X, y = df['content'], df['is_flood']

In [239]:
from sklearn.model_selection import train_test_split

X_train_valid, X_test, y_train_valid, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train_valid, y_train_valid, test_size = 0.20, random_state = 42)

print(f"{len(X_train)} training examples\n{len(X_valid)} validation examples\n{len(X_test)} test examples")

643 training examples
161 validation examples
202 test examples


#### Storing splits in dataframe

In [253]:
import pandas as pd

train_dt=pd.DataFrame([X_train, y_train]).T
valid_dt=pd.DataFrame([X_valid, y_valid]).T
test_dt=pd.DataFrame([X_test, y_test]).T
train_dt['type'] = 'train'
valid_dt['type'] = 'valid'
test_dt['type'] = 'test'
splits_df = pd.concat([train_dt, valid_dt, test_dt])

In [254]:
splits_df.to_csv("training_splits.csv")

#### Loading splits from google drive

In [255]:
from google.colab import drive
import pandas as pd
drive.mount('drive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
df = pd.read_csv("/content/drive/My Drive/iri_bengali_data/training_splits.csv", index_col=0)
train_dt = df[df['type'] == 'train']
test_dt = df[df['type'] == 'test']
valid_dt = df[df['type'] == 'valid']
X_train, y_train = train_dt['content'], train_dt['is_flood']
X_test, y_test = test_dt['content'], test_dt['is_flood']
X_valid, y_valid = valid_dt['content'], valid_dt['is_flood']
print(f"{len(X_train)} training examples\n{len(X_valid)} validation examples\n{len(X_test)} test examples")

## Testing Pipeline for sklearn models

- Feature Extractor
- Model
- Metrics

In [7]:
# Feature Extractors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Models
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Combine 
from sklearn.model_selection import ParameterGrid

import numpy as np

def sklearn_run_pipeline(X_train, X_valid, y_train, y_valid, vectorizer, model, metric):
    vectorizer.fit(np.hstack([X_train, X_valid]))
    model.fit(vectorizer.transform(X_train), y_train)
    return metric(model.predict(vectorizer.transform(X_valid)), y_valid)

def sklearn_run_pipeline_series(X_train, X_valid, y_train, y_valid, vectorizers, models, metrics):
    res = {modelname: {} for modelname in models.keys()}
    
    train_with_valid = np.hstack([X_train, X_valid])
    for vect in vectorizers.values():
        vect.fit(train_with_valid)
    
    X_vects_train = {name: vect.transform(X_train) for name, vect in vectorizers.items()}
    X_vects_valid = {name: vect.transform(X_valid) for name, vect in vectorizers.items()}
    
    for model_name, model in models.items():
        table = res[model_name]
        for vect_name, X_vect_train in X_vects_train.items():
            model.fit(X_vect_train, y_train)
            y_pred = model.predict(X_vects_valid[vect_name])
            res[model_name][vect_name] = {name: metric(y_valid, y_pred) for name, metric in metrics.items()}
    
    return res

In [8]:
feature_extractors = {
    'CountVect': CountVectorizer(), 
    'CountVect-2gram': CountVectorizer(ngram_range = (1, 2)),
    'CountVect-min_df-max_df': CountVectorizer(min_df = 0.05, max_df = 0.95),
    'CountVect-2gram-min_df-max_df': CountVectorizer(min_df = 0.05, max_df = 0.95, ngram_range = (1, 2)),
    'TFIDF': TfidfVectorizer(), 
    'TFIDF-2gram': TfidfVectorizer(ngram_range = (1, 2)),
    'TFIDF-min_df-max_df': TfidfVectorizer(min_df = 0.05, max_df = 0.95),
    'TFIDF-2gram-min_df-max_df': TfidfVectorizer(min_df = 0.05, max_df = 0.95, ngram_range = (1, 2))
}

models = {
    'RandomForest': RandomForestClassifier(class_weight = 'balanced'),
    'LinearSVC': LinearSVC(class_weight = 'balanced'),
    'LogRegL1': LogisticRegression(penalty = 'l1', 
                                   class_weight = 'balanced', 
                                   solver = 'liblinear',
                                   max_iter = 1000
                                  ),
    'LogRegL2': LogisticRegression(penalty = 'l2', 
                                   class_weight = 'balanced', 
                                   solver = 'liblinear',
                                   max_iter = 1000
                                  )
}

metrics = {"Accuracy": accuracy_score, 'Precision': precision_score, 'Recall': recall_score, 'f1': f1_score}

In [9]:
results = sklearn_run_pipeline_series(X_train, X_valid, y_train, y_valid, feature_extractors, models, metrics)

In [10]:
import pandas as pd
for model, model_results in results.items():
    print(model)
    print(pd.DataFrame(model_results).T.to_markdown(), '\n')

RandomForest
|                               |   Accuracy |   Precision |   Recall |       f1 |
|:------------------------------|-----------:|------------:|---------:|---------:|
| CountVect                     |   0.84472  |    0.906977 | 0.65     | 0.757282 |
| CountVect-2gram               |   0.84472  |    0.926829 | 0.633333 | 0.752475 |
| CountVect-min_df-max_df       |   0.857143 |    0.95122  | 0.65     | 0.772277 |
| CountVect-2gram-min_df-max_df |   0.832298 |    0.902439 | 0.616667 | 0.732673 |
| TFIDF                         |   0.850932 |    0.973684 | 0.616667 | 0.755102 |
| TFIDF-2gram                   |   0.832298 |    0.971429 | 0.566667 | 0.715789 |
| TFIDF-min_df-max_df           |   0.857143 |    0.95122  | 0.65     | 0.772277 |
| TFIDF-2gram-min_df-max_df     |   0.857143 |    0.95122  | 0.65     | 0.772277 | 

LinearSVC
|                               |   Accuracy |   Precision |   Recall |       f1 |
|:------------------------------|-----------:|------------:|--

## BERT Model with Translation

### Translate Dataset, Kinda Hard :( Translate broke

In [94]:
from basicBanglaTools import translate

## Bert Based Models

### Multi-lingual Bert Uncased

In [66]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = BertModel.from_pretrained("bert-base-multilingual-uncased", num_labels = 2)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [230]:
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

class BengaliNewsDataset(Dataset):
    def __init__(self, X, y = None):
        if y is None:
            self.X, self.y = X['content'], X['is_flood']
        else:
            self.X, self.y = X, y
    
    def __getitem__(self, i):
        return self.X.iloc[i], self.y.iloc[i]
    
    def __len__(self):
        return len(self.y)


def collate_batch(batch):
    text, labels = zip(*batch)
    text_enc = tokenizer(text, padding="max_length", truncation=True, return_tensors = "pt")
    labels_enc = torch.tensor([1 if label else 0 for label in labels])
    
    return text_enc, labels_enc


In [231]:
class BERT_Based_Classifier(nn.Module):
    def __init__(self, bert_model):
        super(BERT_Based_Classifier, self).__init__()
        
        self.bert = bert_model
        self.pooler_size = bert_model.pooler.dense.out_features
        
        self.classifier = nn.Sequential(nn.Dropout(p = 0.1, inplace = False),
                                        nn.Linear(in_features=self.pooler_size, out_features=2, bias=True)
                                       )
        
    def forward(self, **params):
        output = self.bert(**params)['pooler_output']
        logits = self.classifier(output)
        return logits
    
    def train(self):
        self.bert.train()


In [232]:
classification_model = BERT_Based_Classifier(model)

In [88]:
X_sample = X.sample(10)
y_sample = y[X_sample.index]


ds = BengaliNewsDataset(X_sample, y_sample)
dataloader = DataLoader(ds, batch_size = 2, shuffle=True, collate_fn = collate_batch)

In [94]:
next(iter(dataloader))[0]['input_ids'].shape

torch.Size([2, 512])

In [196]:
from transformers import get_scheduler, AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 2
num_training_steps = num_epochs * len(dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

In [229]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

criterion = torch.nn.CrossEntropyLoss()
classification_model.train()
for epoch in range(num_epochs):
    for X, y in dataloader:
        
        optimizer.zero_grad()
        logits_pred = classification_model(**X)
        loss = criterion(logits_pred, y)
        loss.backward()
        optimizer.step()
        
        lr_scheduler.step()
        progress_bar.update(1)

  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
        outputs = self.bert(**params)
        
        hidden_states = outputs[1]
        
        pooled_output = torch.cat(tuple(hidden_states[-self.use_hidden:]), dim=-1)
        print(pooled_output.shape)
        return pooled_output
        
#        pooled_output = pooled_output[:, 0, :]
        #pooled_output = self.dropout(pooled_output)
        # classifier of course has to be 4 * hidden_dim, because we concat 4 layers
#        logits = self.classifier(pooled_output)

In [165]:
outputs = model(**batch)

In [185]:
outputs['pooler_output']

torch.Size([2, 768])

In [190]:
classification_model.classifier(outputs['pooler_output'])

tensor([[ 0.1368, -0.0041],
        [ 0.1194,  0.0491]], grad_fn=<AddmmBackward>)

In [202]:
#loss = nn.BCEWithLogitsLoss()
inp = torch.randn((10, 3), requires_grad=True)
targ = torch.empty(3).random_(2)
inp.shape, targ.shape

(torch.Size([10, 3]), torch.Size([3]))

### Bangla Bert

In [None]:
from transformers import BertForMaskedLM, BertTokenizer, pipeline

model = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
tokenizer = BertTokenizer.from_pretrained("sagorsarker/bangla-bert-base")
nlp = pipeline('fill-mask', model=model, tokenizer=tokenizer)
for pred in nlp(f"আমি বাংলায় {nlp.tokenizer.mask_token} গাই।"):
    print(pred)