# Importing Important Packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
import torch.optim as optim

from sklearn.metrics import f1_score
from tqdm import tqdm

In [2]:
DATA_PATH = '../../data'
MODEL_PATH = '../../src/models'

# Install Cleaned Dataset

In [3]:
df = pd.read_parquet(f'{DATA_PATH}/processed/data_clean.pqt')
df.head()

Unnamed: 0,prism_consumer_id,prism_account_id,memo,amount,posted_date,category,year,month,day
2,0,acc_0,TST CASA DEL RIO EXP FAIRLAWN OH,18.42,2022-09-26,FOOD_AND_BEVERAGES,2022,9,26
4,0,acc_0,BUFFALO WILD WINGS,26.47,2022-09-12,FOOD_AND_BEVERAGES,2022,9,12
6,0,acc_0,OCULUS CA,11.73,2022-04-18,GENERAL_MERCHANDISE,2022,4,18
7,0,acc_0,LOS GIRASOLES STOW OH,30.04,2022-03-09,FOOD_AND_BEVERAGES,2022,3,9
8,0,acc_0,BUZZIS LAUNDRY OH,4.16,2022-03-29,GENERAL_MERCHANDISE,2022,3,29


# Preprocess Data for Baseline Text Categorization

In [6]:
data = df[['category', 'memo']]
# data = data.rename(columns = {'category_description':'category', 'memo_clean':'memo'})
data.head()

Unnamed: 0,category,memo
2,FOOD_AND_BEVERAGES,TST CASA DEL RIO EXP FAIRLAWN OH
4,FOOD_AND_BEVERAGES,BUFFALO WILD WINGS
6,GENERAL_MERCHANDISE,OCULUS CA
7,FOOD_AND_BEVERAGES,LOS GIRASOLES STOW OH
8,GENERAL_MERCHANDISE,BUZZIS LAUNDRY OH


In [7]:
categories = data['category'].unique()
categories.sort()
cat_dict = dict(zip(categories, np.arange(len(categories))))
cat_dict

{'EDUCATION': 0,
 'FOOD_AND_BEVERAGES': 1,
 'GENERAL_MERCHANDISE': 2,
 'GROCERIES': 3,
 'MORTGAGE': 4,
 'OVERDRAFT': 5,
 'PETS': 6,
 'RENT': 7,
 'TRAVEL': 8}

In [8]:
data['category_label'] = data['category'].map(cat_dict)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['category_label'] = data['category'].map(cat_dict)


Unnamed: 0,category,memo,category_label
2,FOOD_AND_BEVERAGES,TST CASA DEL RIO EXP FAIRLAWN OH,1
4,FOOD_AND_BEVERAGES,BUFFALO WILD WINGS,1
6,GENERAL_MERCHANDISE,OCULUS CA,2
7,FOOD_AND_BEVERAGES,LOS GIRASOLES STOW OH,1
8,GENERAL_MERCHANDISE,BUZZIS LAUNDRY OH,2


In [9]:
data_balanced = data.groupby('category').apply(lambda x: x.sample(1000, replace=True)).reset_index(drop=True)
data_balanced.head()

Unnamed: 0,category,memo,category_label
0,EDUCATION,EDUCATIONAL EPAYMENT MORENO MONICA,0
1,EDUCATION,POS DEBIT VISA CHECK CARD TEACHERSPAYTEACHER,0
2,EDUCATION,CHECKCARD ICP * GOLDFISH SWIM SCHOO CO,0
3,EDUCATION,PURCHASE AUTHORIZED ON MSB DENVER PUBLIC CO CARD,0
4,EDUCATION,OREGON SCHOOL F,0


In [10]:
X_train, X_test, y_train, y_test = train_test_split(data_balanced[['memo']], data_balanced['category_label'], test_size=0.2, random_state=707)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=707)

# Preparing BERT Dataset

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BERT_MODEL = "bert-base-uncased"

In [12]:
class BertDataset(Dataset):
    def __init__(self, X, y, tokenizer, max_length):
        super(BertDataset, self).__init__()
        self.X = X
        self.tokenizer = tokenizer
        self.y = y
        self.max_length = max_length

    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        text = self.X.iloc[index,0]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
            truncation=True,
            padding='max_length'
        )

        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(self.y.iloc[index], dtype=torch.long)
        }

In [13]:
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

dataset_train = BertDataset(X_train, y_train, tokenizer, max_length=20)
dataloader_train = DataLoader(dataset=dataset_train,batch_size=32)

dataset_valid = BertDataset(X_valid, y_valid, tokenizer, max_length=20)
dataloader_valid = DataLoader(dataset=dataset_valid,batch_size=32)

# BERT Baseline Model

## Define Model

In [14]:
bert_base = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    num_labels=len(cat_dict),
    output_attentions=False,
    output_hidden_states=False
)
loss_fn = nn.BCEWithLogitsLoss()
epochs = 3

#Initialize Optimizer
optimizer= optim.Adam(bert_base.parameters(),lr= 1e-5,eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer, 
    num_warmup_steps=0,
    num_training_steps=len(dataloader_train)*epochs
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Performance Metrics

In [23]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in cat_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

## Training Model

In [24]:
def evaluate(dataloader_val, model):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(batch[b].to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[3],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [25]:
def train(dataloader_train, dataloader_validation, scheduler, model, epochs):
    for epoch in range(epochs):
        
        model.train()
        
        loss_train_total = 0

        progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
        for batch in progress_bar:

            model.zero_grad()
            
            # print(batch.keys())
            batch = tuple(batch[b].to(device) for b in batch)
            
            inputs = {
                'input_ids':    batch[0],
                'attention_mask': batch[1],
                'labels':         batch[3],
            }       

            outputs = model(**inputs)
            
            loss = outputs[0]
            loss_train_total += loss.item()
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()
            
            progress_bar.set_description(f'Epoch={epoch}/{epochs}')
            progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

            
            
        torch.save(model.state_dict(), f'{MODEL_PATH}/BERT_Baseline_epoch_{epoch}.model')
            
        tqdm.write(f'\nEpoch {epoch}')
        
        loss_train_avg = loss_train_total/len(dataloader_train)            
        tqdm.write(f'Training loss: {loss_train_avg}')
        
        val_loss, predictions, true_vals = evaluate(dataloader_validation, model)
        val_f1 = f1_score_func(predictions, true_vals)
        tqdm.write(f'Validation loss: {val_loss}')
        tqdm.write(f'F1 Score (Weighted): {val_f1}')
        
    return model

In [26]:
bert_base = train(dataloader_train, dataloader_valid, scheduler, bert_base, epochs)

                                                                                 


Epoch 0
Training loss: 1.641977694299486
Validation loss: 0.9374385529094272
F1 Score (Weighted): 0.7873632723016488


                                                                                 


Epoch 1
Training loss: 0.7395104580455356
Validation loss: 0.537149167060852
F1 Score (Weighted): 0.8661026731058772


                                                                                 


Epoch 2
Training loss: 0.5071926216284434
Validation loss: 0.4719823853837119
F1 Score (Weighted): 0.8783513233172373


## Testing BERT Baseline Model

In [27]:
dataset_test = BertDataset(X_test, y_test, tokenizer, max_length=20)
dataloader_test = DataLoader(dataset=dataset_test,batch_size=32)

In [29]:
loss_val_avg, predictions, true_vals = evaluate(dataloader_test, bert_base)

In [41]:
prediction_cats = np.array([np.argmax(pred) for pred in predictions])

In [44]:
acc = sum(prediction_cats == true_vals) / true_vals.shape[0]
print(f"Overall accuracy of BERT Baseline with {epochs} Epochs: {acc}")

Overall accuracy of BERT Baseline with 3 Epochs: 0.8816666666666667
