In [1]:
# Run-level switches and experiment metadata; the cells below read these names.
DEBUG = True  # when True: adds a "debug" tag, caps vectorizer features at 10, trains on a 200-row sample
WANDB = False  # when True: the run is initialised, logged and finished through Weights & Biases
ENVIRON = "local"  # selects filesystem paths; handled values are "jarvislabs", "kaggle", "local"
EXP_NAME = "catboost-countvect"  # output sub-directory name and prefix of every saved file
NOTES = "5 fold CatBoost pipeline, count vectorizer"  # free-text description, passed to wandb.init
TAGS = ["catboost", "count_vectorizer", "5_fold_split"]  # W&B run tags
GROUP = "catboost"  # W&B run group

# Setup Environment

In [2]:
import pkgutil
from pathlib import Path

PROJECT = "DataSolve-2022"

if ENVIRON == "jarvislabs":
    ROOT_DIR = Path(f"/home/{PROJECT}")
    ARTIFACTS_DIR = Path("/home/artifacts") 
    SETUP_SCRIPT_PATH = Path("/home/setup.sh")

elif ENVIRON == "kaggle":
    ROOT_DIR = Path(f"/kaggle/working/{PROJECT}")
    ARTIFACTS_DIR = Path("/kaggle/working/artifacts")
    SETUP_SCRIPT_PATH = Path("/kaggle/input/datasolve-setup-script/setup.sh")

elif ENVIRON == "local":
    ROOT_DIR = Path(f"../{PROJECT}")
    ARTIFACTS_DIR = Path("../artifacts")
    SETUP_SCRIPT_PATH = Path("../setup.sh")

if not pkgutil.find_loader("omegaconf") and ENVIRON == "kaggle":    
    !bash {SETUP_SCRIPT_PATH} {ENVIRON} "true"

# load secret keys
%load_ext dotenv
if ENVIRON == "kaggle":
    %dotenv /kaggle/input/datasolve-setup-script/.env
else:
    %dotenv {ROOT_DIR}/.env

# Configuration

In [3]:
import os, gc
import math
gc.enable()

import shutil
import wandb
from wandb import AlertLevel
from omegaconf import OmegaConf

class Config:
    """Plain attribute container for the experiment settings.

    Every attribute whose name does not start with an underscore is copied,
    in declaration order, into ``config_dict`` and from there into the
    OmegaConf object ``cfg`` used by the rest of the notebook.
    """
    # general
    debug = DEBUG
    wandb = WANDB
    group = GROUP
    seed = 42
    train_csv = "train_folds_5.csv"  # file name looked up under ROOT_DIR/input
    fold = 0 # placeholder; the training loop overwrites cfg.fold on every fold
    
    # tracking
    exp_name = EXP_NAME
    notes = NOTES
    tags = TAGS
    upload_artifacts_to_wandb = True  # when W&B is enabled, upload OUTPUT_DIR as a model artifact
    
    # misc
    tfidf_max_features = None  # set to 10 in debug mode; not consumed by the active count-vectorizer pipeline
    cnt_vect_max_features = None  # passed as CountVectorizer(max_features=...); set to 10 in debug mode
    

# Gather the public (non-underscore) Config attributes, in declaration order,
# into a plain dict and wrap it in an OmegaConf config.
config_dict = {
    name: value
    for name, value in vars(Config).items()
    if not name.startswith('_')
}
cfg = OmegaConf.create(config_dict)

# Debug runs are tagged as such and cap both vectorizer feature limits at 10.
if cfg.debug:
    cfg.tags += ["debug"]
    cfg.tfidf_max_features = 10
    cfg.cnt_vect_max_features = 10

# Everything this experiment writes goes under ARTIFACTS_DIR/<exp_name>.
OUTPUT_DIR = ARTIFACTS_DIR / cfg.exp_name
print(f"Saving outputs to {OUTPUT_DIR}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"Experiment: {cfg.exp_name}, Desc: {cfg.notes}\n")
print(OmegaConf.to_yaml(cfg, resolve=True))

Saving outputs to /home/artifacts/catboost-countvect
Experiment: catboost-countvect, Desc: 5 fold CatBoost pipeline, count vecoterizer

debug: true
wandb: false
group: catboost
seed: 42
train_csv: train_folds_5.csv
fold: 0
exp_name: catboost-countvect
notes: 5 fold CatBoost pipeline, count vecoterizer
tags:
- catboost
- count_vectorizer
- 5_fold_split
- debug
upload_artifacts_to_wandb: true
tfidf_max_features: 10
cnt_vect_max_features: 10



# Imports

In [4]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_auc_score

# Init W&B run

In [5]:
# Start the W&B run and send a start-of-experiment alert (only when tracking is enabled).
if cfg.wandb:
    wandb.init(
        project="DataSolve-2022",
        group=cfg.group,
        name=cfg.exp_name,
        # hand W&B a plain list rather than an OmegaConf ListConfig
        tags=list(cfg.tags),
        notes=cfg.notes,
        # Log the config actually in effect: config_dict is a snapshot taken
        # before the debug overrides were applied to cfg, so it can be stale.
        config=OmegaConf.to_container(cfg, resolve=True),
        save_code=True,
    )
    wandb.alert(
        title=f"Experiment {wandb.run.name}",
        text=f"🚀 Starting experiment {wandb.run.name}, Description: {cfg.notes}",
        level=AlertLevel.INFO,
        wait_duration=0,
    )

# Helper functions

In [6]:
def clear_memory():
    """Run the garbage collector and, when CUDA is usable, release PyTorch's cached GPU memory.

    ``torch`` is imported lazily inside the function because none of the
    notebook's import cells bring it into scope; if PyTorch is not installed,
    only the garbage-collection step runs.
    """
    gc.collect()
    try:
        import torch
    except ImportError:
        return
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def delete_file(path: str):
    """Remove the file at ``path`` if it exists; do nothing when it is absent.

    Args:
        path: Filesystem path of the file to delete.
    """
    # os.exists does not exist; the existence check lives in os.path.
    if os.path.exists(path):
        os.remove(path)

def save_pickle(obj, filepath):
    """Pickle ``obj`` to ``filepath`` using the highest available protocol.

    The target file is opened in binary write mode, so an existing file is overwritten.
    """
    with open(filepath, mode='wb') as out_file:
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)
        
def post_process_logits(logits: np.ndarray, threshold=0.5):
    """Binarize scores against ``threshold`` and flatten the result.

    Args:
        logits: Array of scores; any shape.
        threshold: Scores greater than or equal to this value map to 1, all others to 0.

    Returns:
        1-D integer array of 0/1 values in row-major order of ``logits``.
    """
    at_or_above = logits >= threshold
    return at_or_above.astype(int).flatten()

# Read Data

In [7]:
# READ DATA
df = pd.read_csv(ROOT_DIR/'input'/cfg.train_csv)
test_df = pd.read_csv(ROOT_DIR/'input'/'test.csv')


def build_text(frame: pd.DataFrame) -> pd.Series:
    """Join ``name`` and ``document_text`` into one string per row.

    Missing values are treated as empty strings; otherwise a single NaN would
    make the whole concatenation NaN and the vectorizer could not tokenize it.
    """
    return frame['name'].fillna("") + ". " + frame['document_text'].fillna("")


df['text'] = build_text(df)
test_df['text'] = build_text(test_df)

if cfg.debug:
    # Debug runs train on at most 200 rows, sampled with the configured seed.
    df = df.sample(min(200, len(df)), random_state=cfg.seed).reset_index(drop=True)

# Every column that is not an identifier, raw text or the fold id is a label column.
NON_LABEL_COLS = {"id", "name", "document_text", "fold", "text"}
LABEL_COLS = [col for col in df.columns if col not in NON_LABEL_COLS]
print(len(LABEL_COLS))
df.head()

50


Unnamed: 0,id,name,document_text,Accounting and Finance,Antitrust,Banking,Broker Dealer,Commodities Trading,Compliance Management,Consumer protection,...,Risk Management,Securities Clearing,Securities Issuing,Securities Management,Securities Sales,Securities Settlement,Trade Pricing,Trade Settlement,fold,text
0,51862,Adjudication Order in Respect of Siva Kumar C ...,Securities and Exchange Board of India (‘SEBI’...,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,Adjudication Order in Respect of Siva Kumar C ...
1,5718,Abusive Financial Services: Consob Obscures 7 ...,Consob has ordered the closing of 7 new websit...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,Abusive Financial Services: Consob Obscures 7 ...
2,15395,ICO Consultation on the Draft Statutory Guidance,Ofcom published that they are running a consul...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,ICO Consultation on the Draft Statutory Guidan...
3,6593,Renewal Without Change of the Registration of ...,As part of its continuing effort to reduce pap...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,Renewal Without Change of the Registration of ...
4,51895,Financial Sanctions: Russia,The news release is issued to publicize the ad...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,Financial Sanctions: Russia. The news release ...


# Train

In [8]:
%%time
all_logits = []
all_ids = []
all_labels = []
all_test_logits = []

for fold in range(5):
    cfg.fold = fold
    
    train_df = df[df['fold'] != fold].reset_index(drop=True)
    valid_df = df[df['fold'] == fold].reset_index(drop=True)

    cnt = CountVectorizer(max_features = cfg.cnt_vect_max_features)
    # tfv = TfidfVectorizer(max_features = cfg.tfidf_max_features)
    cnt.fit(train_df['text'].to_numpy())
    # tfv.fit(train_df['text'].to_numpy())

    xtrain = cnt.transform(train_df['text'].to_numpy()).toarray()
    # xtrain2 = tfv.transform(train_df['text'].to_numpy()).toarray()
    # xtrain = np.concatenate([xtrain1, xtrain2], axis=1)
    print(f"Total features (cnt): {xtrain.shape[1]}")
    
    xvalid = cnt.transform(valid_df['text'].to_numpy()).toarray()
    # xvalid2 = tfv.transform(valid_df['text'].to_numpy()).toarray()
    # xvalid = np.concatenate([xvalid1, xvalid2], axis=1)
    
    ytrain = train_df[LABEL_COLS]
    yvalid = valid_df[LABEL_COLS]
    labels = yvalid.to_numpy().flatten()
    
    xtest = cnt.transform(test_df['text'].to_numpy()).toarray()
    # xtest2 = tfv.transform(test_df['text'].to_numpy()).toarray()
    # xtest = np.concatenate([xtest1, xtest2], axis=1)
    
    clf = CatBoostClassifier(
            loss_function='MultiLogloss',
            eval_metric='HammingLoss',
            iterations=500,
            class_names=LABEL_COLS,
            silent=False,
            thread_count=-1
    )
    clf.fit(xtrain, ytrain)
    
    val_logits = clf.predict_proba(xvalid)
    val_preds = post_process_logits(val_logits)
    print(f"fold-{cfg.fold} score: {roc_auc_score(labels, val_preds, average='macro')}")
    
    test_logits = clf.predict_proba(xtest)
    
    all_ids.append(valid_df['id'].to_numpy())
    all_labels.append(yvalid.to_numpy())
    all_logits.append(val_logits)
    all_test_logits.append(test_logits)
    
    clf.save_model(OUTPUT_DIR/f'model_{fold}.json')

# Save oof predictions
oof_logits = np.concatenate(all_logits)
oof_labels = np.concatenate(all_labels)
oof_ids = np.concatenate(all_ids)
oof_dict = {"id": oof_ids, "labels": oof_labels, "logits": oof_logits}
save_pickle(oof_dict, OUTPUT_DIR/f'{cfg.exp_name}_oof.pkl')

cv_score = roc_auc_score(oof_labels.flatten(), post_process_logits(oof_logits), average="macro")
print(f"CV Score: {cv_score:.6f}")

if cfg.wandb:
    wandb.log({"cv": cv_score})

Total features (cnt): 10
Learning rate set to 0.008966
0:	learn: 0.0893252	total: 161ms	remaining: 1m 20s
1:	learn: 0.0862577	total: 204ms	remaining: 50.7s
2:	learn: 0.0865031	total: 218ms	remaining: 36.1s
3:	learn: 0.0833129	total: 231ms	remaining: 28.7s
4:	learn: 0.0879755	total: 249ms	remaining: 24.6s
5:	learn: 0.0890798	total: 350ms	remaining: 28.8s
6:	learn: 0.0888344	total: 361ms	remaining: 25.4s
7:	learn: 0.0889571	total: 372ms	remaining: 22.9s
8:	learn: 0.0895706	total: 383ms	remaining: 20.9s
9:	learn: 0.0901840	total: 397ms	remaining: 19.5s
10:	learn: 0.0904294	total: 410ms	remaining: 18.2s
11:	learn: 0.0907975	total: 481ms	remaining: 19.6s
12:	learn: 0.0904294	total: 527ms	remaining: 19.7s
13:	learn: 0.0909202	total: 537ms	remaining: 18.6s
14:	learn: 0.0909202	total: 550ms	remaining: 17.8s
15:	learn: 0.0907975	total: 564ms	remaining: 17.1s
16:	learn: 0.0909202	total: 577ms	remaining: 16.4s
17:	learn: 0.0914110	total: 659ms	remaining: 17.6s
18:	learn: 0.0910429	total: 671ms	re

# Inference

In [9]:
# Preview the first rows of the test frame, including the derived 'text' column.
test_df.head()

Unnamed: 0,id,name,document_text,text
0,4771,Companies (Amendment) Regulations 2020,Accounting and Corporate Regulatory Authority ...,Companies (Amendment) Regulations 2020. Accoun...
1,4773,Notice of Intended Action Section 810-5-75-.31...,The Notice of intended action regarding sectio...,Notice of Intended Action Section 810-5-75-.31...
2,4787,Case and Desist Order in the Matter of Henry C...,"On January 9th, 2020, the Staff of the Arkansa...",Case and Desist Order in the Matter of Henry C...
3,4791,AUSTRAC Further Strengthens International Part...,"On January 10, 2020, AUSTRAC and Great Britain...",AUSTRAC Further Strengthens International Part...
4,4794,Sydney Man Accused of Travelling to Philippine...,AUSTRAC published that a 63-year-old Sydney ma...,Sydney Man Accused of Travelling to Philippine...


In [10]:
# Ensemble the folds: element-wise mean of the per-fold test probabilities,
# then binarize with the default threshold.
test_logits = np.stack(all_test_logits).mean(axis=0)
test_preds = post_process_logits(test_logits)

# Save test logits
test_dict = {"id": test_df['id'], "logits": test_logits}
test_logits_path = OUTPUT_DIR/f"{cfg.exp_name}_test_logits.pkl"
save_pickle(test_dict, test_logits_path)

# Create Submission 

In [11]:
# One submission row per (test id, label) pair, ids outer and labels inner -
# the same row-major order in which test_preds was flattened.
ids = [
    f"{id_}_{col}"
    for id_ in test_df['id'].to_numpy()
    for col in LABEL_COLS
]

sub_df = pd.DataFrame({'id': ids, 'predictions': test_preds})
sub_df.to_csv(OUTPUT_DIR/f"{cfg.exp_name}_sub.csv", index=False)
sub_df.head()

Unnamed: 0,id,predictions
0,4771_Accounting and Finance,0
1,4771_Antitrust,0
2,4771_Banking,0
3,4771_Broker Dealer,0
4,4771_Commodities Trading,0


# Upload artifacts to W&B

In [12]:
# save experiment config file
config_file_save_path = OUTPUT_DIR / f"{cfg.exp_name}_config.yaml"
# OmegaConf.save opens the target itself, so it gets the path directly rather
# than the name of a file handle that is already open for writing.
OmegaConf.save(config=cfg, f=config_file_save_path)
if cfg.wandb:
    # log artifacts to wandb
    if cfg.upload_artifacts_to_wandb:
        # upload everything written to OUTPUT_DIR as one model artifact
        model_artifact = wandb.Artifact(name=cfg.exp_name, type="model")
        model_artifact.add_dir(str(OUTPUT_DIR))
        wandb.log_artifact(model_artifact)

    wandb.alert(
        title=f"Experiment {cfg.exp_name}",
        text=f"🎉 Finished experiment {cfg.exp_name}",
        level=AlertLevel.INFO,
        wait_duration=0,
    )
    # copy the config into the run directory before the run is closed
    shutil.copyfile(config_file_save_path, os.path.join(wandb.run.dir, f"{cfg.exp_name}_config.yaml"))
    wandb.finish()