# About the notebook


This notebook performs error analysis on the best model for DataSolve 2022 and tries to answer where and why it went wrong!

# Imports

In [None]:
import os
import gc
import warnings
from pathlib import Path

import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, f1_score, multilabel_confusion_matrix

# Raise pandas' display limits (maximum columns shown and output width).
for option_name, option_value in {"display.max_columns": 500, "display.width": 1000}.items():
    pd.set_option(option_name, option_value)

# Ignore all Python warnings for the rest of the session.
warnings.simplefilter(action="ignore")

# Download artifacts

First, download the out-of-fold prediction files (`oof_{fold}.pkl`) from the best model's artifacts logged on Weights & Biases.

In [None]:
# Local directory that holds the out-of-fold (OOF) prediction files of the best model.
BEST_MODEL_DIR = Path("../dbv3l")
BEST_MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Check for each fold's file instead of only for the directory: if an earlier run was
# interrupted after the directory had been created, the remaining files are still
# fetched on re-execution rather than silently skipped.
missing_folds = [fold for fold in range(5) if not (BEST_MODEL_DIR / f"oof_{fold}.pkl").exists()]
if missing_folds:
    api = wandb.Api()
    for fold in missing_folds:
        # Artifact version v{fold} is queried for that fold's `oof_{fold}.pkl` entry.
        artifact = api.artifact(f"gladiator/DataSolve-2022/dbv3l-15ep:v{fold}", type="model")
        artifact.get_path(f"oof_{fold}.pkl").download(BEST_MODEL_DIR)

# Combine all OOF dfs

In [None]:
train_df = pd.read_csv("../input/train_folds_5.csv")

# Every column other than the identifier, the two text fields and the fold assignment
# is treated as a label column; each label gets a matching "pred_" column name.
is_metadata = train_df.columns.isin(["id", "name", "document_text", "fold"])
LABEL_COLS = train_df.columns[~is_metadata].tolist()
PRED_COLS = [f"pred_{col}" for col in LABEL_COLS]
len(LABEL_COLS), len(PRED_COLS)

In [None]:
# Preview the first two rows of the training data.
train_df.head(2)

In [None]:
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

def post_process_logits(logits: np.ndarray, threshold=0.5):
    """Convert raw logits into hard 0/1 predictions.

    Args:
        logits: Raw model outputs; they are passed through ``sigmoid`` first.
        threshold: A sigmoid value at or above this number yields a prediction of 1.

    Returns:
        Float array of the same shape as the sigmoid output, holding 1.0 where the
        value is ``>= threshold`` and 0.0 everywhere else.
    """
    is_positive = np.asarray(sigmoid(logits)) >= threshold
    return is_positive.astype(float)

# Assemble one out-of-fold (OOF) frame: a row per id holding the ground-truth labels
# and the thresholded predictions, plus the name and document text from train_df.
fold_frames = []
for fold in range(5):
    # NOTE: unpickling can execute arbitrary code. Only load files from a trusted
    # source (here: the files downloaded from the project's W&B artifacts).
    fold_oof = pd.read_pickle(BEST_MODEL_DIR / f"oof_{fold}.pkl")
    fold_frame = pd.DataFrame()
    fold_frame["id"] = fold_oof["id"]
    fold_frame[LABEL_COLS] = fold_oof["labels"]
    fold_frame[PRED_COLS] = fold_oof["logits"]
    fold_frames.append(fold_frame)

oof_df = pd.concat(fold_frames).sort_values(by="id").reset_index(drop=True)
# Replace the raw logits by hard 0/1 predictions (sigmoid, then the default threshold).
oof_df[PRED_COLS] = post_process_logits(oof_df[PRED_COLS].to_numpy()).astype(int)

# Sort train_df by id as well so that rows of the two frames can be matched by
# position; the assertions verify that the ids really line up one-to-one.
train_df = train_df.sort_values(by="id")
assert train_df.shape[0] == oof_df.shape[0], "train and OOF frames differ in row count"
assert np.array_equal(
    oof_df["id"].to_numpy(), train_df["id"].to_numpy()
), "ids of train and OOF frames do not line up after sorting"

oof_df.insert(loc=1, column="name", value=train_df["name"].to_numpy())
oof_df.insert(loc=2, column="document_text", value=train_df["document_text"].to_numpy())

# Rearrange the columns so that each label sits next to its prediction, which makes
# the frame easier to inspect during analysis.
paired_cols = [name for col in LABEL_COLS for name in (col, f"pred_{col}")]
oof_df = oof_df[["id", "name", "document_text"] + paired_cols]
# Show only the first rows instead of dumping the whole frame.
oof_df.head()

In [None]:
# Display the combined out-of-fold frame.
oof_df

In [None]:
# Plain-text classification report computed from the label columns (ground truth)
# and the prediction columns of the out-of-fold frame.
text_report = classification_report(
    oof_df[LABEL_COLS],
    oof_df[PRED_COLS],
    target_names=LABEL_COLS,
    output_dict=False,
)
print(text_report)

In [None]:
# Same report as a dict so that it can be turned into a DataFrame.
report = classification_report(
    oof_df[LABEL_COLS],
    oof_df[PRED_COLS],
    output_dict=True,
    target_names=LABEL_COLS,
)
# After transposing there is one row per report entry. Keep only the leading per-label
# rows and drop the aggregate rows that follow them. The slice length is taken from
# LABEL_COLS rather than a hard-coded 50, so it stays correct if the label set changes.
report_df = pd.DataFrame(report).T.iloc[: len(LABEL_COLS)]

In [None]:
# Display the per-label report rows.
report_df

In [None]:
# Move the label names from the index into a regular "class" column. The guard keeps
# the cell re-runnable: once "class" exists, adding it a second time would raise.
if "class" not in report_df.columns:
    report_df = report_df.rename_axis("class").reset_index()
report_df = report_df[["class", "precision", "recall", "f1-score", "support"]]
report_df

# Log results as tables to W&B for interactive exploration

In [None]:
# Start a W&B run in the "error-analysis" group of the DataSolve-2022 project.
run = wandb.init(project="DataSolve-2022", group="error-analysis", name="classification-report")

In [None]:
# Wrap the OOF predictions and the classification report as W&B tables and log both
# to the run in a single call.
oof_table = wandb.Table(dataframe=oof_df)
classification_report_table = wandb.Table(dataframe=report_df)
logged_tables = {
    "oof_predictions": oof_table,
    "classification_report": classification_report_table,
}
run.log(logged_tables)

In [None]:
# Close the W&B run now that both tables have been logged.
run.finish()

In [None]:
# Mean of the "support" column, i.e. the average number of examples per class.
report_df["support"].mean()

In [None]:
# Smallest and largest per-class support.
report_df["support"].min(),report_df["support"].max() 

In [None]:
# Row(s) of the class with the smallest support. The value is computed instead of
# hard-coded (previously the literal 435.0, presumably copied from the min/max cell
# output) so the cell stays valid when the data changes.
report_df[report_df["support"] == report_df["support"].min()]

In [None]:
# Row(s) of the class with the largest support. The value is computed instead of
# hard-coded (previously the literal 1742.0, presumably copied from the min/max cell
# output) so the cell stays valid when the data changes.
report_df[report_df["support"] == report_df["support"].max()]

# Tokenization example

In [None]:
from transformers import AutoTokenizer
# Tokenizer of the microsoft/deberta-v3-large checkpoint.
# NOTE(review): presumably the backbone of the model analysed here — confirm.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

In [None]:
# Take the first row of the out-of-fold frame as a worked example.
sample = oof_df.iloc[0]
sample

In [None]:
# Build the example text by hand: cls token, name, sep token, document text, sep token.
sample_text = "".join(
    [
        tokenizer.cls_token,
        sample["name"],
        tokenizer.sep_token,
        sample["document_text"],
        tokenizer.sep_token,
    ]
)
sample_text

In [None]:
# Tokenize without letting the tokenizer add special tokens (the text already carries
# them), then print the resulting token strings.
sample_ids = tokenizer(sample_text, add_special_tokens=False)["input_ids"]
print(tokenizer.convert_ids_to_tokens(sample_ids))

In [None]:
# Print the input ids of the same example (no special tokens added by the tokenizer).
print(tokenizer(sample_text, add_special_tokens=False)["input_ids"])

In [None]:
from datasets import Dataset
def tokenize_func(example):
    """Count the tokens of ``example["text"]`` using the module-level ``tokenizer``.

    The tokenizer is called with ``add_special_tokens=False``, so only tokens present
    in the text itself are counted.

    Returns:
        dict with the single key ``"length"`` holding the number of input ids.
    """
    input_ids = tokenizer(example["text"], add_special_tokens=False)["input_ids"]
    return {"length": len(input_ids)}

# Compose the full text per row (cls token, name, sep token, document text, sep token),
# then compute its token length for every row via a datasets map.
cls_tok, sep_tok = tokenizer.cls_token, tokenizer.sep_token
train_df["text"] = cls_tok + train_df["name"] + sep_tok + train_df["document_text"] + sep_tok
ds = Dataset.from_pandas(train_df).map(tokenize_func)

In [None]:
# Back to pandas; the frame now also carries the "length" column added by the map.
df = ds.to_pandas()

In [None]:
# Average token length over all rows.
df.length.mean()

In [None]:
# Token-length statistics per label, computed over the rows where that label equals 1.
per_label_stats = {"mean": [], "max": [], "min": [], "total_samples": []}
for label in LABEL_COLS:
    positive_lengths = df.loc[df[label] == 1, "length"]
    # int() truncates the mean to a whole number of tokens.
    per_label_stats["mean"].append(int(positive_lengths.mean()))
    per_label_stats["max"].append(positive_lengths.max())
    per_label_stats["min"].append(positive_lengths.min())
    per_label_stats["total_samples"].append(len(positive_lengths))

len_df = pd.DataFrame()
len_df["label"] = LABEL_COLS
for stat_name, stat_values in per_label_stats.items():
    len_df[stat_name] = stat_values
len_df

In [None]:
# Average number of positive rows per label.
len_df["total_samples"].mean()