# Importing Important Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import torch.optim as optim

from sklearn.metrics import f1_score
from tqdm import tqdm

# Install Cleaned Dataset

In [2]:
df = pd.read_parquet('data/data_clean.pqt')
df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo_clean,amount,category_description
0,0,acc_0,KROGER,20.98,GROCERIES
1,0,acc_0,CASH APP * FREE CA,200.0,GENERAL_MERCHANDISE
7,0,acc_0,AMAZON * AMZN WA,33.2,GENERAL_MERCHANDISE
9,0,acc_0,AMAZON,42.79,GENERAL_MERCHANDISE
10,0,acc_0,KROGER,36.55,GROCERIES


# Preprocess Data for Baseline Text Categorization

In [3]:
data = df[['category_description', 'memo_clean']]
data = data.rename(columns = {'category_description':'category', 'memo_clean':'memo'})
data.head()

Unnamed: 0,category,memo
0,GROCERIES,KROGER
1,GENERAL_MERCHANDISE,CASH APP * FREE CA
7,GENERAL_MERCHANDISE,AMAZON * AMZN WA
9,GENERAL_MERCHANDISE,AMAZON
10,GROCERIES,KROGER


In [4]:
categories = data['category'].unique()
categories.sort()
cat_dict = dict(zip(categories, np.arange(len(categories))))
cat_dict

{'EDUCATION': 0,
 'FOOD_AND_BEVERAGES': 1,
 'GENERAL_MERCHANDISE': 2,
 'GROCERIES': 3,
 'MORTGAGE': 4,
 'OVERDRAFT': 5,
 'PETS': 6,
 'RENT': 7,
 'TRAVEL': 8}

In [5]:
data['category_label'] = data['category'].map(cat_dict)
data.head()

Unnamed: 0,category,memo,category_label
0,GROCERIES,KROGER,3
1,GENERAL_MERCHANDISE,CASH APP * FREE CA,2
7,GENERAL_MERCHANDISE,AMAZON * AMZN WA,2
9,GENERAL_MERCHANDISE,AMAZON,2
10,GROCERIES,KROGER,3


In [6]:
data_balanced = data.groupby('category').apply(lambda x: x.sample(1000, replace=True)).reset_index(drop=True)
data_balanced.head()

Unnamed: 0,category,memo,category_label
0,EDUCATION,MLT SCHOOL,0
1,EDUCATION,CREATIVE LEARNING P,0
2,EDUCATION,POS W UNIVERSITY MESA AZ,0
3,EDUCATION,UGA GA CTR RESTAURAQPS,0
4,EDUCATION,PURCHASE AUTHORIZED ON SCHOOL TOOLS EAST FAYET...,0


In [16]:
X_train, X_test, y_train, y_test = train_test_split(data_balanced[['memo']], data_balanced['category_label'], test_size=0.2, random_state=707)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=707)

# Preparing BERT Dataset

In [17]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BERT_MODEL = "bert-base-uncased"

In [18]:
class BertDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_length):
        super(BertDataset, self).__init__()
        self.X = X
        self.tokenizer = tokenizer
        self.y = y
        self.max_length = max_length

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        text = self.X.iloc[index,0]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length'
        )

        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(self.y.iloc[index], dtype=torch.long)
        }

In [19]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

dataset_train = BertDataset(X_train, y_train, tokenizer, max_length=20)
dataloader_train = DataLoader(dataset=dataset_train,batch_size=32)

dataset_valid = BertDataset(X_valid, y_valid, tokenizer, max_length=20)
dataloader_valid = DataLoader(dataset=dataset_valid,batch_size=32)

# BERT Baseline Model

## Define Model

In [81]:
bert_base = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=len(cat_dict),
    output_attentions=False,
    output_hidden_states=False
)
loss_fn = nn.BCEWithLogitsLoss()
epochs = 3

#Initialize Optimizer
optimizer= optim.Adam(bert_base.parameters(),lr= 1e-5,eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Performance Metrics

In [82]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in cat_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

## Training Model

In [83]:
def evaluate(dataloader_val, model):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [84]:
def train(dataloader_train, dataloader_validation, scheduler, model, epochs):
    for epoch in range(epochs):
        
        model.train()
        
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()
            
            # print(batch.keys())
            # batch = tuple(b.to(device) for b in batch)
            
            inputs = {
                'input_ids':    batch['ids'],
                'attention_mask': batch['mask'],
                'labels':         batch['target'],
            }       

            outputs = model(**inputs)
            
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            
            progress_bar.set_description(f'Epoch={epoch}/{epochs}')
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

            
            
        torch.save(model.state_dict(), f'models/finetuned_BERT_epoch_{epoch}.model')
            
        tqdm.write(f'\nEpoch {epoch}')
        
        loss_train_avg = loss_train_total/len(dataloader_train)            
        tqdm.write(f'Training loss: {loss_train_avg}')
        
        val_loss, predictions, true_vals = evaluate(dataloader_validation)
        val_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        
    return model

In [85]:
bert_base = train(dataloader_train, dataloader_valid, scheduler, bert_base, epochs)

                                                                                 

RuntimeError: Parent directory data_volume does not exist.