# About the notebook


This notebook performs error analysis on the best model for DataSolve 2022 and tries to answer where and why it went wrong!

# Imports

In [1]:
import os
import gc
import warnings
from pathlib import Path

import wandb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, f1_score, multilabel_confusion_matrix

# Raise pandas' column/width display limits (the frames shown later have 100+
# columns) and silence every Python warning for the rest of the notebook.
pd.options.display.max_columns = 500
pd.options.display.width = 1000
warnings.simplefilter("ignore")

# Download artifacts

First, download the best model's out-of-fold (OOF) prediction files from the artifacts logged on Weights & Biases.

In [2]:
# Local cache directory for the out-of-fold (OOF) prediction pickles.
BEST_MODEL_DIR = Path("../dbv3l")
BEST_MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Check each fold's file rather than the directory: an interrupted download
# leaves the directory in place, and a directory-level check would then skip
# the folds that never arrived.
missing_folds = [fold for fold in range(5) if not (BEST_MODEL_DIR / f"oof_{fold}.pkl").exists()]
if missing_folds:
    api = wandb.Api()
    for fold in missing_folds:
        # Artifact version v{fold} is queried for that fold's oof_{fold}.pkl.
        artifact = api.artifact(f"gladiator/DataSolve-2022/dbv3l-15ep:v{fold}", type="model")
        artifact.get_path(f"oof_{fold}.pkl").download(BEST_MODEL_DIR)

# Combine all OOF dfs

In [3]:
# Training data with fold assignments. Every column that is not one of the
# metadata columns below is treated as a label column.
train_df = pd.read_csv("../input/train_folds_5.csv")
non_label_cols = ("id", "name", "document_text", "fold")
LABEL_COLS = [name for name in train_df.columns if name not in non_label_cols]
PRED_COLS = ["pred_{}".format(name) for name in LABEL_COLS]
len(LABEL_COLS), len(PRED_COLS)

(50, 50)

In [4]:
# Show the first two rows of the training frame.
train_df.head(2)

Unnamed: 0,id,name,document_text,Accounting and Finance,Antitrust,Banking,Broker Dealer,Commodities Trading,Compliance Management,Consumer protection,Contract Provisions,Corporate Communications,Corporate Governance,Definitions,Delivery,Examinations,Exemptions,Fees and Charges,Financial Accounting,Financial Crime,Forms,Fraud,IT Risk,Information Filing,Insurance,Legal,Legal Proceedings,Licensing,Licensure and certification,Liquidity Risk,Listing,Market Abuse,Market Risk,Monetary and Economic Policy,Money Services,Money-Laundering and Terrorist Financing,Natural Disasters,Payments and Settlements,Powers and Duties,Quotation,Records Maintenance,Regulatory Actions,Regulatory Reporting,Required Disclosures,Research,Risk Management,Securities Clearing,Securities Issuing,Securities Management,Securities Sales,Securities Settlement,Trade Pricing,Trade Settlement,fold
0,4772,Consent Order in the Matter of Solium Financia...,"Solium Financial Services LLC (""SFS"") is a bro...",0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4774,Alberta Securities Commission Warns Investors ...,A new year brings new investment opportunities...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
def sigmoid(x):
    """Element-wise logistic function, 1 / (1 + e^-x).

    Works on scalars and on anything NumPy can negate and exponentiate
    (arrays, pandas objects); the result has the same shape as the input.
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)

def post_process_logits(logits: np.ndarray, threshold=0.5):
    """Convert raw logits into hard 0/1 predictions.

    Args:
        logits: Raw model outputs, one column per label.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        A float NumPy array of the same shape as ``logits`` holding 1.0 where
        ``sigmoid(logits) >= threshold`` and 0.0 elsewhere.
    """
    is_positive = sigmoid(logits) >= threshold
    return np.where(is_positive, 1.0, 0.0)

# Load each fold's out-of-fold (OOF) pickle into a frame holding the example
# id, the ground-truth labels and the raw logits (kept under PRED_COLS until
# they are thresholded below).
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only use
# it on pickles downloaded from the project's own W&B artifacts.
oof_dfs = []
for fold in range(5):
    tmp_df = pd.DataFrame()
    p = pd.read_pickle(BEST_MODEL_DIR/f'oof_{fold}.pkl')
    tmp_df['id'] = p['id']
    tmp_df[LABEL_COLS] = p['labels']
    tmp_df[PRED_COLS] = p['logits']
    oof_dfs.append(tmp_df)

oof_df = pd.concat(oof_dfs).sort_values(by='id').reset_index(drop=True)
# Replace the logits with hard 0/1 predictions. Pass a plain array, which is
# what post_process_logits is annotated to accept.
oof_df[PRED_COLS] = post_process_logits(oof_df[PRED_COLS].to_numpy()).astype(int)

# The name/text columns are attached by position, so both frames must contain
# the same ids in the same order, and ids must be unique for the sort order to
# be unambiguous.
train_df = train_df.sort_values(by="id")
assert oof_df["id"].is_unique, "duplicate ids in the OOF predictions"
assert train_df.shape[0] == oof_df.shape[0]
assert np.array_equal(oof_df["id"].to_numpy(), train_df["id"].to_numpy()), "OOF ids do not match train ids"

oof_df = oof_df.assign(
    name=train_df["name"].to_numpy(),
    document_text=train_df["document_text"].to_numpy(),
)
# rearrange columns so each label sits next to its prediction, which makes
# mismatches easier to spot during analysis
cols = [name for col in LABEL_COLS for name in (col, f"pred_{col}")]
oof_df = oof_df[["id", "name", "document_text"] + cols]
oof_df

Unnamed: 0,id,name,document_text,Accounting and Finance,pred_Accounting and Finance,Antitrust,pred_Antitrust,Banking,pred_Banking,Broker Dealer,pred_Broker Dealer,Commodities Trading,pred_Commodities Trading,Compliance Management,pred_Compliance Management,Consumer protection,pred_Consumer protection,Contract Provisions,pred_Contract Provisions,Corporate Communications,pred_Corporate Communications,Corporate Governance,pred_Corporate Governance,Definitions,pred_Definitions,Delivery,pred_Delivery,Examinations,pred_Examinations,Exemptions,pred_Exemptions,Fees and Charges,pred_Fees and Charges,Financial Accounting,pred_Financial Accounting,Financial Crime,pred_Financial Crime,Forms,pred_Forms,Fraud,pred_Fraud,IT Risk,pred_IT Risk,Information Filing,pred_Information Filing,Insurance,pred_Insurance,Legal,pred_Legal,Legal Proceedings,pred_Legal Proceedings,Licensing,pred_Licensing,Licensure and certification,pred_Licensure and certification,Liquidity Risk,pred_Liquidity Risk,Listing,pred_Listing,Market Abuse,pred_Market Abuse,Market Risk,pred_Market Risk,Monetary and Economic Policy,pred_Monetary and Economic Policy,Money Services,pred_Money Services,Money-Laundering and Terrorist Financing,pred_Money-Laundering and Terrorist Financing,Natural Disasters,pred_Natural Disasters,Payments and Settlements,pred_Payments and Settlements,Powers and Duties,pred_Powers and Duties,Quotation,pred_Quotation,Records Maintenance,pred_Records Maintenance,Regulatory Actions,pred_Regulatory Actions,Regulatory Reporting,pred_Regulatory Reporting,Required Disclosures,pred_Required Disclosures,Research,pred_Research,Risk Management,pred_Risk Management,Securities Clearing,pred_Securities Clearing,Securities Issuing,pred_Securities Issuing,Securities Management,pred_Securities Management,Securities Sales,pred_Securities Sales,Securities Settlement,pred_Securities Settlement,Trade Pricing,pred_Trade Pricing,Trade Settlement,pred_Trade Settlement
0,4772,Consent Order in the Matter of Solium Financia...,"Solium Financial Services LLC (""SFS"") is a bro...",0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4774,Alberta Securities Commission Warns Investors ...,A new year brings new investment opportunities...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4775,Exempt Market Dealer Agrees to Settlement,The Alberta Securities Commission (ASC) has co...,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,0,1,0
3,4776,Canadian Securities Regulators Announces Consu...,The Canadian Securities Administrators (CSA) p...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
4,4778,CSA Consultation Paper 51-405 Consideration of...,"On April 6, 2017, the Canadian Securities Admi...",0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9854,57201,What To Do If Youre Having Difficulty Repaying...,Having a good track record when it comes to me...,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9855,57207,The Bank of Russia Restricted Exchange Trading...,"The Bank of Russia has decided, starting from ...",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,0
9856,57234,Application of a Pecuniary Administrative Sanc...,Consob published application of a pecuniary ad...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9857,57240,Investor Consultation - Investor Service Hotli...,"On May 31, 2022, Shenzhen Stock Exchange publi...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [6]:
# Display the combined OOF frame again (pandas elides the middle rows).
oof_df

Unnamed: 0,id,name,document_text,Accounting and Finance,pred_Accounting and Finance,Antitrust,pred_Antitrust,Banking,pred_Banking,Broker Dealer,pred_Broker Dealer,Commodities Trading,pred_Commodities Trading,Compliance Management,pred_Compliance Management,Consumer protection,pred_Consumer protection,Contract Provisions,pred_Contract Provisions,Corporate Communications,pred_Corporate Communications,Corporate Governance,pred_Corporate Governance,Definitions,pred_Definitions,Delivery,pred_Delivery,Examinations,pred_Examinations,Exemptions,pred_Exemptions,Fees and Charges,pred_Fees and Charges,Financial Accounting,pred_Financial Accounting,Financial Crime,pred_Financial Crime,Forms,pred_Forms,Fraud,pred_Fraud,IT Risk,pred_IT Risk,Information Filing,pred_Information Filing,Insurance,pred_Insurance,Legal,pred_Legal,Legal Proceedings,pred_Legal Proceedings,Licensing,pred_Licensing,Licensure and certification,pred_Licensure and certification,Liquidity Risk,pred_Liquidity Risk,Listing,pred_Listing,Market Abuse,pred_Market Abuse,Market Risk,pred_Market Risk,Monetary and Economic Policy,pred_Monetary and Economic Policy,Money Services,pred_Money Services,Money-Laundering and Terrorist Financing,pred_Money-Laundering and Terrorist Financing,Natural Disasters,pred_Natural Disasters,Payments and Settlements,pred_Payments and Settlements,Powers and Duties,pred_Powers and Duties,Quotation,pred_Quotation,Records Maintenance,pred_Records Maintenance,Regulatory Actions,pred_Regulatory Actions,Regulatory Reporting,pred_Regulatory Reporting,Required Disclosures,pred_Required Disclosures,Research,pred_Research,Risk Management,pred_Risk Management,Securities Clearing,pred_Securities Clearing,Securities Issuing,pred_Securities Issuing,Securities Management,pred_Securities Management,Securities Sales,pred_Securities Sales,Securities Settlement,pred_Securities Settlement,Trade Pricing,pred_Trade Pricing,Trade Settlement,pred_Trade Settlement
0,4772,Consent Order in the Matter of Solium Financia...,"Solium Financial Services LLC (""SFS"") is a bro...",0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4774,Alberta Securities Commission Warns Investors ...,A new year brings new investment opportunities...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,4775,Exempt Market Dealer Agrees to Settlement,The Alberta Securities Commission (ASC) has co...,0,0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,1,0,0,1,0
3,4776,Canadian Securities Regulators Announces Consu...,The Canadian Securities Administrators (CSA) p...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
4,4778,CSA Consultation Paper 51-405 Consideration of...,"On April 6, 2017, the Canadian Securities Admi...",0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9854,57201,What To Do If Youre Having Difficulty Repaying...,Having a good track record when it comes to me...,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9855,57207,The Bank of Russia Restricted Exchange Trading...,"The Bank of Russia has decided, starting from ...",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,1,1,0,0,1,0
9856,57234,Application of a Pecuniary Administrative Sanc...,Consob published application of a pecuniary ad...,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9857,57240,Investor Consultation - Investor Service Hotli...,"On May 31, 2022, Shenzhen Stock Exchange publi...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0


In [7]:
# Per-label precision / recall / F1 / support on the OOF predictions, as text.
y_true, y_pred = oof_df[LABEL_COLS], oof_df[PRED_COLS]
print(classification_report(y_true, y_pred, target_names=LABEL_COLS, output_dict=False))

                                          precision    recall  f1-score   support

                  Accounting and Finance       0.94      0.83      0.88       935
                               Antitrust       0.93      0.83      0.88       880
                                 Banking       0.95      0.92      0.93      1078
                           Broker Dealer       0.91      0.89      0.90       670
                     Commodities Trading       0.91      0.95      0.93       682
                   Compliance Management       0.91      0.90      0.90      1391
                     Consumer protection       0.94      0.88      0.91       969
                     Contract Provisions       0.87      0.84      0.85      1153
                Corporate Communications       0.89      0.60      0.71       518
                    Corporate Governance       0.83      0.69      0.75       958
                             Definitions       0.83      0.83      0.83       570
               

In [8]:
# Same report as a dict, so it can be turned into a DataFrame.
report = classification_report(
        oof_df[LABEL_COLS],
        oof_df[PRED_COLS],
        output_dict=True,
        target_names=LABEL_COLS,
    )
# Keep only the per-label rows. Selecting by label name instead of slicing the
# first 50 rows stays correct if the number of labels changes; the trailing
# aggregate ("... avg") rows are dropped either way.
report_df = pd.DataFrame(report).T.loc[LABEL_COLS]

In [9]:
# Display the per-label metrics, indexed by label name.
report_df

Unnamed: 0,precision,recall,f1-score,support
Accounting and Finance,0.940462,0.827807,0.880546,935.0
Antitrust,0.933419,0.828409,0.877784,880.0
Banking,0.951784,0.915584,0.933333,1078.0
Broker Dealer,0.911315,0.889552,0.900302,670.0
Commodities Trading,0.907821,0.953079,0.9299,682.0
Compliance Management,0.906273,0.903666,0.904968,1391.0
Consumer protection,0.944568,0.879257,0.910743,969.0
Contract Provisions,0.86828,0.840416,0.854121,1153.0
Corporate Communications,0.887931,0.596525,0.713626,518.0
Corporate Governance,0.828499,0.685804,0.750428,958.0


In [10]:
# Turn the label index into a regular "class" column with a fresh 0..n-1 index.
# The guard keeps the cell safe to re-run: an unconditional in-place insert
# fails on a second execution because "class" already exists.
if "class" not in report_df.columns:
    report_df = report_df.rename_axis("class").reset_index()
report_df = report_df[["class", "precision", "recall", "f1-score", "support"]]
report_df

Unnamed: 0,class,precision,recall,f1-score,support
0,Accounting and Finance,0.940462,0.827807,0.880546,935.0
1,Antitrust,0.933419,0.828409,0.877784,880.0
2,Banking,0.951784,0.915584,0.933333,1078.0
3,Broker Dealer,0.911315,0.889552,0.900302,670.0
4,Commodities Trading,0.907821,0.953079,0.9299,682.0
5,Compliance Management,0.906273,0.903666,0.904968,1391.0
6,Consumer protection,0.944568,0.879257,0.910743,969.0
7,Contract Provisions,0.86828,0.840416,0.854121,1153.0
8,Corporate Communications,0.887931,0.596525,0.713626,518.0
9,Corporate Governance,0.828499,0.685804,0.750428,958.0


# Log results as tables to W&B for interactive exploration

In [11]:
# Start a W&B run in the "error-analysis" group of the DataSolve-2022 project.
run = wandb.init(project="DataSolve-2022", group="error-analysis", name="classification-report")

[34m[1mwandb[0m: Currently logged in as: [33mgladiator[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
# Wrap the OOF predictions and the classification report as W&B tables and
# log both in a single call.
oof_table = wandb.Table(dataframe=oof_df)
classification_report_table = wandb.Table(dataframe=report_df)

logged_tables = {
    "oof_predictions": oof_table,
    "classification_report": classification_report_table,
}
run.log(logged_tables)

In [13]:
# Close the W&B run now that both tables have been logged.
run.finish()

In [14]:
# Mean support, i.e. the average number of positive examples per class.
report_df["support"].mean()

942.04

In [16]:
# Smallest and largest support across the classes.
report_df["support"].min(),report_df["support"].max()

(435.0, 1742.0)

In [18]:
# Class(es) with the fewest positive examples. The minimum is computed here
# rather than pasted from an earlier output, so it cannot go stale.
report_df[report_df["support"] == report_df["support"].min()]

Unnamed: 0,class,precision,recall,f1-score,support
19,IT Risk,0.867769,0.724138,0.789474,435.0


In [19]:
# Class(es) with the most positive examples. The maximum is computed here
# rather than pasted from an earlier output, so it cannot go stale.
report_df[report_df["support"] == report_df["support"].max()]

Unnamed: 0,class,precision,recall,f1-score,support
12,Examinations,0.853865,0.811711,0.832254,1742.0


# Tokenization example

In [20]:
from transformers import AutoTokenizer
# Tokenizer of the microsoft/deberta-v3-large checkpoint
# (presumably the backbone behind the "dbv3l" model name -- TODO confirm).
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-large")

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [22]:
# Take the first OOF row as a worked example for the tokenization cells.
sample = oof_df.iloc[0]
sample

id                                                                          4772
name                           Consent Order in the Matter of Solium Financia...
document_text                  Solium Financial Services LLC ("SFS") is a bro...
Accounting and Finance                                                         0
pred_Accounting and Finance                                                    0
                                                     ...                        
pred_Securities Settlement                                                     0
Trade Pricing                                                                  0
pred_Trade Pricing                                                             0
Trade Settlement                                                               0
pred_Trade Settlement                                                          0
Name: 0, Length: 103, dtype: object

In [23]:
# Assemble the model input by hand: [CLS] title [SEP] body [SEP].
sample_parts = (
    tokenizer.cls_token,
    sample["name"],
    tokenizer.sep_token,
    sample["document_text"],
    tokenizer.sep_token,
)
sample_text = "".join(sample_parts)
sample_text

'[CLS]Consent Order in the Matter of Solium Financial Services LLC[SEP]Solium Financial Services LLC ("SFS") is a broker-dealer with a principal place of business at 50 Tice Boulevard, Suite A-18 Woodcliff Lake, New Jersey 07677, and is registered as a broker-dealer with the Alabama Securities Commission ("Commission"). During the period from at least January 2009 to June 6, 2019, SFS acted as broker-dealer in Alabama as the term broker-dealer is defined by Title 8, Chapter 6, 8-6-2 of the Act. Code of Alabama, 8-6-3(a) states that it is unlawful for a person to transact business in Alabama as a broker-dealer or agent unless such person is registered under the Act. By engaging in the conduct set forth above, SFS acted as an unregistered broker-dealer in Alabama in violation of 8-6-3(a) of the Act. This Order concludes the investigation by the Commission and any other action that the Commission could commence under applicable Alabama law as it relates to the substance of the Findings of

In [29]:
# Show the sub-word pieces of the sample. add_special_tokens=False because the
# special tokens are already part of sample_text.
sample_input_ids = tokenizer(sample_text, add_special_tokens=False)["input_ids"]
print(tokenizer.convert_ids_to_tokens(sample_input_ids))

['[CLS]', '▁Consent', '▁Order', '▁in', '▁the', '▁Matter', '▁of', '▁So', 'lium', '▁Financial', '▁Services', '▁LLC', '[SEP]', '▁So', 'lium', '▁Financial', '▁Services', '▁LLC', '▁(', '"', 'S', 'FS', '"', ')', '▁is', '▁a', '▁broker', '-', 'dealer', '▁with', '▁a', '▁principal', '▁place', '▁of', '▁business', '▁at', '▁50', '▁T', 'ice', '▁Boulevard', ',', '▁Suite', '▁A', '-', '18', '▁Wood', 'cliff', '▁Lake', ',', '▁New', '▁Jersey', '▁07', '677', ',', '▁and', '▁is', '▁registered', '▁as', '▁a', '▁broker', '-', 'dealer', '▁with', '▁the', '▁Alabama', '▁Securities', '▁Commission', '▁(', '"', 'Commission', '"', ')', '.', '▁During', '▁the', '▁period', '▁from', '▁at', '▁least', '▁January', '▁2009', '▁to', '▁June', '▁6', ',', '▁2019', ',', '▁SF', 'S', '▁acted', '▁as', '▁broker', '-', 'dealer', '▁in', '▁Alabama', '▁as', '▁the', '▁term', '▁broker', '-', 'dealer', '▁is', '▁defined', '▁by', '▁Title', '▁8', ',', '▁Chapter', '▁6', ',', '▁8', '-', '6', '-', '2', '▁of', '▁the', '▁Act', '.', '▁Code', '▁of', '▁A

In [31]:
# Same sample as vocabulary ids rather than token strings.
sample_encoding = tokenizer(sample_text, add_special_tokens=False)
print(sample_encoding["input_ids"])

[1, 36219, 4077, 267, 262, 14759, 265, 471, 60661, 3729, 1724, 3927, 2, 471, 60661, 3729, 1724, 3927, 287, 309, 430, 16480, 309, 285, 269, 266, 7347, 271, 58161, 275, 266, 4891, 470, 265, 460, 288, 960, 897, 10953, 14201, 261, 9500, 336, 271, 2048, 4059, 40099, 2202, 261, 485, 3744, 7844, 46692, 261, 263, 269, 2079, 283, 266, 7347, 271, 58161, 275, 262, 6002, 10207, 2653, 287, 309, 75573, 309, 285, 260, 1717, 262, 926, 292, 288, 668, 1278, 1812, 264, 1172, 525, 261, 1112, 261, 10000, 430, 8736, 283, 7347, 271, 58161, 267, 6002, 283, 262, 1384, 7347, 271, 58161, 269, 3034, 293, 7181, 578, 261, 4696, 525, 261, 578, 271, 765, 271, 445, 265, 262, 1878, 260, 3506, 265, 6002, 261, 578, 271, 765, 271, 508, 555, 452, 285, 1603, 272, 278, 269, 15082, 270, 266, 604, 264, 51494, 460, 267, 6002, 283, 266, 7347, 271, 58161, 289, 2645, 2336, 405, 604, 269, 2079, 494, 262, 1878, 260, 927, 4686, 267, 262, 3360, 487, 4243, 764, 261, 10000, 430, 8736, 283, 299, 46245, 7347, 271, 58161, 267, 6002, 267, 6

In [35]:
from datasets import Dataset
def tokenize_func(example):
    """Return the token count of ``example["text"]`` as ``{"length": n}``.

    Special tokens are not added by the tokenizer call itself
    (``add_special_tokens=False``).
    """
    input_ids = tokenizer(example["text"], add_special_tokens=False)["input_ids"]
    return {"length": len(input_ids)}

# Build the full model input ([CLS] title [SEP] body [SEP]) for every training
# row, then compute its token length with tokenize_func.
cls_tok, sep_tok = tokenizer.cls_token, tokenizer.sep_token
train_df["text"] = cls_tok + train_df["name"] + sep_tok + train_df["document_text"] + sep_tok
ds = Dataset.from_pandas(train_df).map(tokenize_func)

  0%|          | 0/9859 [00:00<?, ?ex/s]

In [37]:
# Back to pandas; the frame now carries the "length" column added by the map.
df = ds.to_pandas()

In [57]:
# Average token count per example (title + body + the three special tokens).
df.length.mean()

329.62896845521857

In [60]:
# Token-length statistics per label, computed over the examples that carry
# that label (label column == 1).
length_records = []
for col in LABEL_COLS:
    # Filter once per label and reuse it for every statistic.
    lengths = df.loc[df[col] == 1, "length"]
    length_records.append({
        "label": col,
        "mean": int(lengths.mean()),  # truncated to a whole number of tokens
        "max": lengths.max(),
        "min": lengths.min(),
        "total_samples": len(lengths),
    })

len_df = pd.DataFrame(length_records, columns=["label", "mean", "max", "min", "total_samples"])
len_df

Unnamed: 0,label,mean,max,min,total_samples
0,Accounting and Finance,371,1621,80,935
1,Antitrust,362,1288,80,880
2,Banking,303,1328,75,1078
3,Broker Dealer,510,1604,90,670
4,Commodities Trading,314,1698,84,682
5,Compliance Management,411,1621,86,1391
6,Consumer protection,362,1698,76,969
7,Contract Provisions,358,1698,86,1153
8,Corporate Communications,328,1379,90,518
9,Corporate Governance,424,1721,82,958


In [62]:
# Average number of positive examples per label.
len_df["total_samples"].mean()

942.04