In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

In [None]:
DRIVE_BASE = "/content/drive/MyDrive/Colab Notebooks/uah-ra/"

In [None]:
KW_PATH = os.path.join(DRIVE_BASE, "data/keywords.txt")
DATA_PATH = os.path.join(DRIVE_BASE, "data/data.csv")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
def load_keywords(path):
    """
    Load the hierarchical keyword list from a text file.

    The file content is treated as comma-separated tags, where every tag is a
    ``>``-delimited path (e.g. ``"Earth Science > Atmosphere"``). Each path
    component is stripped of surrounding whitespace and lower-cased.

    Args:
        path: Location of the keyword file.

    Returns:
        list[list[str]]: One list of path components per non-blank tag, in
        file order. An empty file yields an empty list.
    """
    with open(path, encoding="utf-8") as f:
        text = f.read().strip()

    keywords = []
    for tag in text.split(","):
        # ``str.split`` never returns an empty list, so a length check on the
        # split result cannot filter anything. Blank tags (empty file,
        # trailing or doubled commas) are dropped here instead; otherwise
        # they would come back as ``[""]``.
        if not tag.strip():
            continue
        keywords.append([part.strip().lower() for part in tag.split(">")])
    return keywords

In [None]:
KEYWORDS = load_keywords(KW_PATH)
len(KEYWORDS), len(set([kw for kws in KEYWORDS for kw in kws]))

In [None]:
KEYWORDS

# Tag Analysis

In [None]:
!pip install loguru

In [None]:
from loguru import logger

In [None]:
from collections import Counter, defaultdict

In [None]:
def get_counts(keywords, level=0):
    """
    Tally how often each keyword occurs at one depth of the hierarchy.

    Every entry of ``keywords`` is a sequence of path components. The
    component at index ``level`` is counted; when ``level`` is at or beyond
    the end of an entry, that entry's last component is counted instead.

    Args:
        keywords: Iterable of keyword paths (sequences of strings).
        level: Index of the component to count (negative values index from
            the end, as usual for Python sequences).

    Returns:
        collections.Counter: Component -> number of occurrences.
    """
    # min() clamps an over-long level to the final component of each path.
    return Counter(path[min(level, len(path) - 1)] for path in keywords)

In [None]:
def analyze_kws(keywords, topn=10):
    """
    Plot the most frequent keywords at several hierarchy levels.

    For each of the levels 0, 1, 2, 3 and -1 (last component) the keyword
    counts are logged at debug level and the ``topn`` most common keywords are
    drawn as a horizontal bar plot in a figure of their own.

    Args:
        keywords: Keyword paths as accepted by ``get_counts``.
        topn: Number of most frequent keywords to show per level.
    """
    for level in [0, 1, 2, 3, -1]:
        # Count the keywords that were passed in, not the module-level
        # KEYWORDS, so the function works for any keyword list.
        counts = get_counts(keywords, level=level)
        logger.debug(f"[Level={level}, NKWs={len(counts)}] : {counts.most_common(10)}")
        df = pd.DataFrame(counts.most_common(topn), columns=["kw", "frequency"])

        # Open the figure at the start of each iteration; opening the next
        # one after plotting leaves a blank figure behind the last level.
        plt.figure(figsize=(15, 8))
        sns.barplot(
            x="frequency", y="kw",
            data=df,
            linewidth=2.5,
            facecolor=(1, 1, 1, 0),
            errcolor=".2",
            edgecolor=".2"
        )
        plt.title(f"Level={level}, topn={topn}")

In [None]:
", ".join(list(get_counts(KEYWORDS, level=1).keys()))

In [None]:
analyze_kws(KEYWORDS, topn=20)

# Data Analysis

In [None]:
def parse_kws(kw_str, level=2):
    """
    Extract the unique labels at one hierarchy level from a keyword string.

    ``kw_str`` is a comma-separated list of ``>``-delimited keyword paths.
    Each path component is stripped and lower-cased, and the component at
    index ``level`` is taken as the label (the last component when the path
    is shorter than ``level + 1``).

    Args:
        kw_str: Raw keyword string, e.g. ``"A > B, C > D"``.
        level: Index of the path component to use as the label.

    Returns:
        list[str]: De-duplicated labels in order of first appearance. Blank
        tags (empty string, trailing or doubled commas) contribute nothing.
    """
    # A dict is used as an ordered set: going through ``set`` makes the label
    # order depend on string hashing, which varies between interpreter runs.
    labels = {}
    for tag in kw_str.split(","):
        if not tag.strip():
            # Skip blank tags so they do not turn into an "" label.
            continue
        path = [part.strip().lower() for part in tag.split(">")]
        labels[path[level if level < len(path) else len(path) - 1]] = None
    return list(labels)

def load_data(path, level=0):
    """
    Read the dataset CSV and derive the columns used by the rest of the notebook.

    The CSV must provide a ``desc`` and a ``keywords`` column. Three columns
    are written on the returned frame:

    * ``desc``    - the description with surrounding whitespace removed
    * ``labels``  - list of labels from ``parse_kws(keywords, level)``
    * ``textlen`` - length of ``desc`` in characters

    Args:
        path: Location of the CSV file.
        level: Keyword hierarchy level forwarded to ``parse_kws``.

    Returns:
        pandas.DataFrame: The loaded frame with the derived columns.
    """
    logger.info(f"Loading data from {path}. [KW Level={level}]")
    df = pd.read_csv(path)

    # read_csv represents empty cells as NaN (a float). Normalise to "" before
    # the string operations so missing descriptions/keywords do not raise.
    df["desc"] = df["desc"].fillna("").astype(str).str.strip()
    keywords = df["keywords"].fillna("").astype(str)
    df["labels"] = keywords.apply(lambda x: parse_kws(x, level))
    df["textlen"] = df["desc"].str.len()
    return df

In [None]:
DATA = load_data(DATA_PATH, level=1)

In [None]:
DATA.shape

In [None]:
DATA.head(10)

In [None]:
def analyze_labels(df):
    """
    Log how many distinct labels occur in the ``labels`` column of ``df``.

    Args:
        df: Frame whose ``labels`` column holds one iterable of labels per row.
    """
    # Reading the column does not modify the frame, so no copy is needed.
    distinct = {label for row_labels in df["labels"] for label in row_labels}
    logger.info(f"{len(distinct)} unique labels")

In [None]:
analyze_labels(DATA)

In [None]:
# idx = 2
# _data.iloc[2].keywords_processed

In [None]:
_data = DATA.copy()
_data = _data[_data["textlen"]>0]

In [None]:
_data.shape

In [None]:
# BERT can only process 512 tokens at once.
# NOTE: "textlen" is the character length of the description (len() of the
# string), not a token count, so the two ratios below (share of rows with at
# most 512 / 1024 characters) are only a rough proxy for what fits the model.
len(_data[_data["textlen"] <= 512]) / len(_data), len(_data[_data["textlen"] <= 1024]) / len(_data)

In [None]:
plt.figure(figsize=(20, 15))
sns.histplot(data=_data, x="textlen", bins=100).set(xlim=(0, 3000))

# Baseline Model

# Encode Labels

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Keep only rows whose description is at most 500 characters long.
# Filter first and copy afterwards: the result is then an independent frame,
# so adding columns to it later does not write into a slice of DATA (which
# can trigger pandas' SettingWithCopyWarning).
DATA_TO_USE = DATA[DATA["textlen"] <= 500].copy()

In [None]:
DATA_TO_USE.shape

In [None]:
DATA_TO_USE.head()

In [None]:
analyze_labels(DATA_TO_USE)

In [None]:
LE = MultiLabelBinarizer()
LABELS_ENCODED = LE.fit_transform(DATA_TO_USE["labels"])

In [None]:
LABELS_ENCODED.shape

In [None]:
LE.classes_

In [None]:
LE.inverse_transform(LABELS_ENCODED[0].reshape(1,-1))

In [None]:
DATA_TO_USE["labels_encoded"] = list(LABELS_ENCODED)

In [None]:
DATA_TO_USE.head()

# Split Dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
 # Hold out 10% of the rows as the test set (fixed seed so the split is repeatable).
 X_train, X_test, Y_train, Y_test = train_test_split(DATA_TO_USE["desc"].to_numpy(), LABELS_ENCODED, test_size=0.1, random_state=42)

 # Split a further 10% off the remaining training rows to use for validation.
 X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.1, random_state=42)

In [None]:
X_train.shape, X_val.shape, X_test.shape

In [None]:
Y_train.shape, Y_val.shape, Y_test.shape

In [None]:
X_test

# Create Dataset

In [None]:
! pip install pytorch_lightning

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset

In [None]:
import pytorch_lightning as pl

In [None]:
class TagDataset(Dataset):
    """
    Map-style dataset pairing each text with its multi-hot label vector.

    Args:
        texts: Indexable collection of input strings.
        tags: Indexable collection of label vectors, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded sequence is padded / truncated to.
    """

    def __init__(self, texts, tags, tokenizer, max_len=512):
        self.texts = texts
        self.labels = tags
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples (one per text)."""
        return len(self.texts)

    def __getitem__(self, item_idx):
        """
        Tokenize sample ``item_idx`` and return it as a dict of tensors.

        Returns:
            dict with keys ``input_ids``, ``attention_mask`` (both flattened
            to 1-D) and ``label`` (float tensor built from the label vector).
        """
        encoded = self.tokenizer.encode_plus(
            self.texts[item_idx],
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors="pt",
        )
        target = torch.tensor(self.labels[item_idx], dtype=torch.float)

        # Flatten so each field is a 1-D tensor regardless of the shape the
        # tokenizer returned.
        return {
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "label": target,
        }

In [None]:
class TagDataModule(pl.LightningDataModule):
    """
    Lightning data module bundling the train / validation / test splits.

    Args:
        x_train, y_train: Training texts and their label vectors.
        x_val, y_val: Validation texts and label vectors.
        x_test, y_test: Test texts and label vectors.
        tokenizer: Tokenizer handed to every ``TagDataset``.
        batch_size: Batch size used by all three dataloaders.
        max_token_len: Sequence length every text is padded / truncated to.
    """

    def __init__(self, x_train, y_train, x_val, y_val, x_test, y_test, tokenizer, batch_size=16, max_token_len=512):
        super().__init__()
        self.train_text = x_train
        self.train_label = y_train
        self.val_text = x_val
        self.val_label = y_val
        self.test_text = x_test
        self.test_label = y_test
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        """
        Build the three ``TagDataset`` objects.

        ``stage`` is accepted (and ignored) because the Lightning Trainer
        passes it when it invokes this hook itself; without the parameter
        that call raises ``TypeError``. Calling ``setup()`` with no argument
        keeps working.
        """
        self.train_dataset = TagDataset(texts=self.train_text, tags=self.train_label, tokenizer=self.tokenizer, max_len=self.max_token_len)
        self.val_dataset = TagDataset(texts=self.val_text, tags=self.val_label, tokenizer=self.tokenizer, max_len=self.max_token_len)
        self.test_dataset = TagDataset(texts=self.test_text, tags=self.test_label, tokenizer=self.tokenizer, max_len=self.max_token_len)

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        """Sequential loader over the validation split."""
        # Honour the configured batch size instead of a hard-coded 16, so a
        # batch size chosen to fit memory applies to evaluation as well.
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        """Sequential loader over the test split."""
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# Transformers

In [None]:
!pip install transformers

In [None]:
from transformers import AutoTokenizer, AutoModel

In [None]:
TOKENIZER = AutoTokenizer.from_pretrained("bert-base-uncased")
# BASE_MODEL = AutoModel.from_pretrained("bert-base-uncased")
BASE_MODEL = None

In [None]:
# Initialize the parameters that will be used for training
# EPOCHS feeds both pl.Trainer(max_epochs=...) and TagClassifier(n_epochs=...).
EPOCHS = 10
# BATCH_SIZE is passed to TagDataModule and used to derive steps_per_epoch.
BATCH_SIZE = 4
# MAX_LEN is the padded / truncated sequence length used when tokenizing.
MAX_LEN = 512
# NOTE(review): TagClassifier's own default is lr=1e-5, two orders of magnitude
# below this value - confirm that 1e-3 is really intended for fine-tuning.
LR = 1e-03

In [None]:
TAG_DATA_MODULE = TagDataModule(
    X_train, Y_train,
    X_val, Y_val,
    X_test, Y_test,
    TOKENIZER,
    BATCH_SIZE,
    MAX_LEN
)
TAG_DATA_MODULE.setup()

# Model

In [None]:
from pytorch_lightning.callbacks import ModelCheckpoint

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
class TagClassifier(pl.LightningModule):
    """
    Multi-label tag classifier: a pretrained encoder followed by a linear head.

    The head maps the encoder's ``pooler_output`` to ``n_classes`` logits and
    is trained with ``BCEWithLogitsLoss`` (one independent sigmoid per tag).

    Args:
        base_model: Encoder to fine-tune. When ``None``, ``bert-base-uncased``
            is loaded via ``AutoModel.from_pretrained``.
        n_classes: Number of output tags.
        steps_per_epoch: Optimizer steps per epoch; needed to build the
            warm-up schedule. When ``None`` (or 0) no scheduler is used.
        n_epochs: Number of training epochs the schedule spans.
        lr: Peak learning rate.
    """

    def __init__(self, base_model=None, n_classes=10, steps_per_epoch=None, n_epochs=5, lr=1e-5):
        super().__init__()

        # Compare against None explicitly instead of relying on the
        # truthiness of a module object.
        if base_model is None:
            base_model = AutoModel.from_pretrained("bert-base-uncased", return_dict=True)
        self.model = base_model
        self.classifier = torch.nn.Linear(self.model.config.hidden_size, n_classes)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = torch.nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attn_mask):
        """Return the raw (pre-sigmoid) logits for a batch."""
        output = self.model(input_ids=input_ids, attention_mask=attn_mask)
        return self.classifier(output.pooler_output)

    def _shared_step(self, batch, log_name):
        """Run the forward pass, compute the loss and log it under ``log_name``."""
        labels = batch['label']
        outputs = self(batch['input_ids'], batch['attention_mask'])
        loss = self.criterion(outputs, labels)
        self.log(log_name, loss, prog_bar=True, logger=True)
        return loss, outputs, labels

    def training_step(self, batch, batch_idx):
        """Compute and log ``train_loss``; return loss, predictions and labels."""
        loss, outputs, labels = self._shared_step(batch, 'train_loss')
        # Only "loss" needs the autograd graph; detach the predictions so the
        # returned dict does not keep each step's graph alive.
        return {"loss": loss, "predictions": outputs.detach(), "labels": labels}

    def validation_step(self, batch, batch_idx):
        """Compute and log ``val_loss``."""
        loss, _, _ = self._shared_step(batch, 'val_loss')
        return loss

    def test_step(self, batch, batch_idx):
        """Compute and log ``test_loss``."""
        loss, _, _ = self._shared_step(batch, 'test_loss')
        return loss

    def configure_optimizers(self):
        """
        Build AdamW plus a linear warm-up / linear decay schedule.

        Returns:
            The optimizer alone when ``steps_per_epoch`` is not set, otherwise
            ``([optimizer], [scheduler_config])`` with the scheduler advanced
            once per optimizer step.
        """
        optimizer = AdamW(self.parameters(), lr=self.lr)

        # Without a step count the schedule cannot be built; previously this
        # crashed on ``None // 3`` for the default constructor arguments.
        if not self.steps_per_epoch:
            return optimizer

        warmup_steps = self.steps_per_epoch // 3
        # The schedule's training-step count includes the warm-up steps.
        # Subtracting them drives the learning rate to zero before the last
        # epoch has finished.
        total_steps = self.steps_per_epoch * self.n_epochs

        scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

        # warmup_steps / total_steps are counted in optimizer steps, so the
        # scheduler must tick every step. Returning the bare scheduler makes
        # Lightning advance it once per epoch, which stretches the warm-up
        # over `warmup_steps` epochs. NOTE: with per-step scheduling the peak
        # `lr` is actually reached after warm-up - choose `lr` accordingly.
        return [optimizer], [{"scheduler": scheduler, "interval": "step"}]

In [None]:
# Ceiling division: the training DataLoader is built without drop_last, so a
# final partial batch still counts as one optimizer step.
steps_per_epoch = -(-len(X_train) // BATCH_SIZE)

# Size the classification head from the encoded label matrix instead of a
# hard-coded 22, so it always matches the labels the binarizer produced.
MODEL = TagClassifier(
    BASE_MODEL,
    n_classes=LABELS_ENCODED.shape[1],
    steps_per_epoch=steps_per_epoch,
    n_epochs=EPOCHS,
    lr=LR,
)

In [None]:
# # saves a file like: input/QTag-epoch=02-val_loss=0.32.ckpt
# checkpoint_callback = ModelCheckpoint(
#     monitor='val_loss',# monitored quantity
#     filename='QTag-{epoch:02d}-{val_loss:.2f}',
#     save_top_k=3, #  save the top 3 models
#     mode='min', # mode of the monitored quantity  for optimization
# )

In [None]:
# No callbacks are registered: the ModelCheckpoint setup in the previous cell is commented out.
# NOTE(review): `gpus=` and `progress_bar_refresh_rate=` are Trainer arguments of older
# pytorch_lightning releases; the install cell does not pin a version - confirm the
# installed release still accepts them.
trainer = pl.Trainer(max_epochs = EPOCHS , gpus = 1, callbacks=[], progress_bar_refresh_rate = 30)

In [None]:
trainer.fit(MODEL, TAG_DATA_MODULE)

In [None]:
!nvidia-smi

In [None]:
trainer.save_checkpoint("model-10.ckpt")

In [None]:
# -p: do not fail when the directory already exists (e.g. on a notebook re-run).
!mkdir -p "$DRIVE_BASE/checkpoints/"

In [None]:
! cp "/content/model-10.ckpt" "$DRIVE_BASE/checkpoints"

In [None]:
!ls "$DRIVE_BASE/checkpoints"

# Test

In [None]:
trainer.test(MODEL,datamodule=TAG_DATA_MODULE)

# Inference

In [None]:
MODEL.eval()

In [None]:
import pickle

In [None]:
with open("le.pkl", "wb") as f:
    pickle.dump(LE, f)

In [None]:
from torch.utils.data import TensorDataset, SequentialSampler

In [None]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
MODEL.to(DEVICE)

In [None]:
def inference(model, texts, tokenizer, batch_size=2, max_len=None):
    """
    Run the model on raw texts and return per-batch tag probabilities.

    Args:
        model: Callable module taking ``(input_ids, attention_mask)`` and
            returning logits.
        texts: A single string or a sequence of strings.
        tokenizer: Object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        batch_size: Number of texts per forward pass.
        max_len: Padded / truncated sequence length. Defaults to the
            notebook-level ``MAX_LEN`` when ``None``.

    Returns:
        list[numpy.ndarray]: One array of sigmoid probabilities per batch, in
        input order; each array has one row per text of that batch. An empty
        ``texts`` yields an empty list.
    """
    if isinstance(texts, str):
        texts = [texts]
    if len(texts) == 0:
        # torch.cat raises on an empty list; nothing to predict anyway.
        return []
    if max_len is None:
        max_len = MAX_LEN

    # Send the inputs to wherever the model's weights live; fall back to the
    # notebook-level DEVICE for objects that expose no parameters.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = DEVICE

    input_ids, attention_masks = [], []
    for text in texts:
        text_encoded = tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )
        input_ids.append(text_encoded["input_ids"])
        attention_masks.append(text_encoded["attention_mask"])

    pred_data = TensorDataset(torch.cat(input_ids, dim=0), torch.cat(attention_masks, dim=0))
    pred_dataloader = DataLoader(pred_data, sampler=SequentialSampler(pred_data), batch_size=batch_size)

    # Predict in eval mode so dropout is disabled and results are
    # deterministic; restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    pred_outs = []
    try:
        with torch.no_grad():
            for b_input_ids, b_attn_mask in pred_dataloader:
                logits = model(b_input_ids.to(device), b_attn_mask.to(device))
                # Independent sigmoid per tag, then back to CPU / numpy.
                pred_outs.append(torch.sigmoid(logits).cpu().numpy())
    finally:
        model.train(was_training)
    return pred_outs

In [None]:
_texts = X_test[:10]
_pred_outs = inference(MODEL, _texts, TOKENIZER)

In [None]:
_pred_outs

In [None]:
_texts

In [None]:
thresh = 0.3
# `inference` returns one array per batch (each holding several rows when
# batch_size > 1). Stack them into one row per text first; zipping the raw
# per-batch arrays against the texts pairs every text with a whole batch.
_probs = np.concatenate(_pred_outs, axis=0)
for _txt, _yt, _p in zip(_texts, Y_test, _probs):
    # Use the same >= comparison for the reported confidences and the
    # binarised prediction so the two always agree.
    _hits = _p >= thresh
    confs = _p[_hits]

    print(confs)
    pred_tag = LE.inverse_transform(np.array([_hits.astype(int)]))[0]
    gt_tag = LE.inverse_transform(np.array([_yt]))[0]
    print(_txt[:50], gt_tag, pred_tag)

# Custom Evaluation

In [None]:
def inference2(model, tokenizer, texts, gts, threshold=0.3, batch_size=1):
    """
    Predict tags for ``texts`` and pair them with the ground-truth tags.

    Args:
        model: Model forwarded to ``inference``.
        tokenizer: Tokenizer forwarded to ``inference``.
        texts: Sequence of input strings.
        gts: Ground-truth multi-hot label vectors aligned with ``texts``.
        threshold: A tag is predicted when its probability is >= this value.
        batch_size: Number of texts per forward pass.

    Returns:
        list[dict]: One ``{"gts", "preds", "text"}`` record per text, with the
        label vectors decoded back to tag names through the notebook-level
        ``LE`` binarizer.
    """
    batches = inference(model, texts, tokenizer, batch_size=batch_size)
    if len(batches) == 0:
        return []

    # Stack the per-batch outputs into one row per text so the pairing below
    # is correct for any batch size, not only batch_size=1.
    probs = np.concatenate([np.atleast_2d(b) for b in batches], axis=0)

    res = []
    for txt, gt, row in zip(texts, gts, probs):
        pred_vec = (row >= threshold).astype(int)
        pred_tags = LE.inverse_transform(np.array([pred_vec]))[0]
        gt_tags = LE.inverse_transform(np.array([gt]))[0]
        res.append({"gts": gt_tags, "preds": pred_tags, "text": txt})
    return res

In [None]:
def compute_jaccard(tokens1, tokens2):
    """
    Jaccard similarity |A n B| / |A u B| of two token collections.

    Args:
        tokens1: First collection of hashable items.
        tokens2: Second collection of hashable items.

    Returns:
        The ratio of shared to total distinct items, or ``0`` when either
        collection is empty (or otherwise falsy).
    """
    if tokens1 and tokens2:
        first, second = set(tokens1), set(tokens2)
        return len(first & second) / len(first | second)
    return 0

In [None]:
compute_jaccard([1, 2], [1, 2, 3])

In [None]:
import json

In [None]:
# -p: do not fail when the directory already exists (e.g. on a notebook re-run).
!mkdir -p "$DRIVE_BASE/outputs/"

In [None]:
def evaluate_jaccard(model, tokenizer, texts, gts, threshold=0.3, out_path="inference.json"):
    """
    Jaccard evaluation (similar to IoU) of predicted vs. ground-truth tags.

    Predictions are obtained through ``inference2``, dumped as JSON and then
    scored per sample with ``compute_jaccard``.

    Args:
        model: Model forwarded to ``inference2``.
        tokenizer: Tokenizer forwarded to ``inference2``.
        texts: Sequence of input strings.
        gts: Ground-truth multi-hot label vectors aligned with ``texts``.
        threshold: Probability threshold forwarded to ``inference2``.
        out_path: File the prediction records are written to. Pass ``None``
            to skip writing.

    Returns:
        list: One Jaccard score per sample, in input order.
    """
    predictions = inference2(model, tokenizer, texts, gts, threshold)
    if out_path:
        with open(out_path, "w") as f:
            json.dump(predictions, f)
    return [compute_jaccard(pmap["gts"], pmap["preds"]) for pmap in predictions]

In [None]:
_ = evaluate_jaccard(MODEL, TOKENIZER, X_test[:50], Y_test[:50], threshold=0.3)

In [None]:
_

In [None]:
!cp "inference.json" "$DRIVE_BASE/outputs/"

# Reference

- https://discuss.pytorch.org/t/using-bcewithlogisloss-for-multi-label-classification/67011/2
- https://medium.com/analytics-vidhya/finetune-distilbert-for-multi-label-text-classsification-task-994eb448f94c