In [1]:
# Experiment-level switches; later cells read these names directly.
DEBUG = False  # copied into Config.debug; enables the debug tag, feature cap and data subsample
WANDB = True  # copied into Config.wandb; gates all Weights & Biases calls
ENVIRON = "kaggle"  # selects the filesystem layout; "kaggle" and "jarvislabs" are handled below
EXP_NAME = "xgb-countvect"  # used as the run name and as the output sub-directory name
NOTES = "5 fold XGBoost pipeline, used count vectorizer with no max features set"
TAGS = ["xgboost", "count_vectorizer", "5_fold_split"]
GROUP = "xgboost"  # passed to wandb.init as the run group

# Setup Environment

In [2]:
import importlib.util
import pkgutil
from pathlib import Path

PROJECT = "DataSolve-2022"

if ENVIRON == "jarvislabs":
    ROOT_DIR = Path(f"/home/{PROJECT}")
    ARTIFACTS_DIR = Path("/home/artifacts") 
    SETUP_SCRIPT_PATH = Path("/home/setup.sh")

elif ENVIRON == "kaggle":
    ROOT_DIR = Path(f"/kaggle/working/{PROJECT}")
    ARTIFACTS_DIR = Path("/kaggle/working/artifacts")
    SETUP_SCRIPT_PATH = Path("/kaggle/input/datasolve-setup-script/setup.sh")
    
if not pkgutil.find_loader("omegaconf") and ENVIRON == "kaggle":    
    !bash {SETUP_SCRIPT_PATH} {ENVIRON} "true"

# load secret keys
%load_ext dotenv
if ENVIRON == "kaggle":
    %dotenv /kaggle/input/datasolve-setup-script/.env
else:
    %dotenv {ROOT_DIR}/.env

Cloning into 'DataSolve-2022'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (22/22), done.[K
remote: Total 34 (delta 14), reused 30 (delta 10), pack-reused 0[K
Unpacking objects: 100% (34/34), 21.35 KiB | 1.64 MiB/s, done.
Collecting omegaconf
  Downloading omegaconf-2.2.3-py3-none-any.whl (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.3/79.3 kB[0m [31m896.8 kB/s[0m eta [36m0:00:00[0m
Collecting antlr4-python3-runtime==4.9.*
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.0/117.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l- \ done
[?25h  Created wheel for antlr4-python3-runtime: filename=ant

# Configuration

In [3]:
import os, gc
import math
gc.enable()

import shutil
import wandb
from wandb import AlertLevel
from omegaconf import OmegaConf

class Config:
    """Experiment settings.

    Only attributes whose names do not start with an underscore are copied
    into the OmegaConf config that is built right after this class.
    """
    # general
    debug = DEBUG
    wandb = WANDB
    group = GROUP
    seed = 42  # passed to XGBClassifier as random_state
    train_csv = "train_folds_5.csv"  # read from ROOT_DIR/input; must contain a `fold` column
    fold = 0 # will be overridden later
    
    # tracking
    exp_name = EXP_NAME
    notes = NOTES
    tags = TAGS
    upload_artifacts_to_wandb = True
    
    # misc
    # NOTE: despite the name, this value is passed to CountVectorizer(max_features=...)
    tfidf_max_features = None
    

# Collect the public (non-underscore) attributes of Config into a plain dict.
config_dict = {
    key: value
    for key, value in vars(Config).items()
    if not key.startswith('_')
}
cfg = OmegaConf.create(config_dict)

# Debug runs get an extra tag and a vectorizer capped at 10 features.
if cfg.debug:
    cfg.tags += ["debug"]
    cfg.tfidf_max_features = 10

# Each experiment writes into its own sub-directory of ARTIFACTS_DIR.
OUTPUT_DIR = Path(ARTIFACTS_DIR / cfg.exp_name)
print(f"Saving outputs to {OUTPUT_DIR}")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
print(f"Experiment: {cfg.exp_name}, Desc: {cfg.notes}\n")
print(OmegaConf.to_yaml(cfg, resolve=True))

Saving outputs to /kaggle/working/artifacts/xgb-countvect
Experiment: xgb-countvect, Desc: 5 fold XGBoost pipeline, used count vectorizer with no max features set

debug: false
wandb: true
group: xgboost
seed: 42
train_csv: train_folds_5.csv
fold: 0
exp_name: xgb-countvect
notes: 5 fold XGBoost pipeline, used count vectorizer with no max features set
tags:
- xgboost
- count_vectorizer
- 5_fold_split
upload_artifacts_to_wandb: true
tfidf_max_features: null



# Imports

In [4]:
import pickle
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import roc_auc_score

# Init W&B run

In [5]:
if cfg.wandb:
    # Log the resolved `cfg` rather than the raw `config_dict`: `cfg` carries
    # the debug-mode overrides (extra tag, max-features cap) applied after
    # `config_dict` was built, so the tracked config matches what actually runs.
    wandb.init(
        project="DataSolve-2022",
        group=cfg.group,
        name=cfg.exp_name,
        tags=list(cfg.tags),  # plain list instead of an OmegaConf ListConfig
        notes=cfg.notes,
        config=OmegaConf.to_container(cfg, resolve=True),
        save_code=True,
    )
    # Send a start notification for this run.
    wandb.alert(
        title=f"Experiment {wandb.run.name}",
        text=f"🚀 Starting experiment {wandb.run.name}, Description: {cfg.notes}",
        level=AlertLevel.INFO,
        wait_duration=0,
    )

[34m[1mwandb[0m: Currently logged in as: [33mgladiator[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Helper functions

In [6]:
def clear_memory():
    """Run the garbage collector and, if torch is loaded, free its CUDA cache.

    This notebook never imports torch, so the original unconditional
    ``torch.cuda.empty_cache()`` raised ``NameError``. torch is looked up in
    ``sys.modules`` instead of being imported: if nothing has loaded it yet
    there is no CUDA cache to release, and the heavy import is avoided.
    """
    import sys

    gc.collect()
    torch = sys.modules.get("torch")
    if torch is not None and torch.cuda.is_available():
        torch.cuda.empty_cache()

def delete_file(path: str):
    """Remove the file at ``path`` if it exists; do nothing otherwise.

    Args:
        path: Location of the file to delete.
    """
    # `os.exists` does not exist (AttributeError); the check lives in os.path.
    if os.path.exists(path):
        os.remove(path)

def save_pickle(obj, filepath):
    """Serialise ``obj`` to ``filepath`` using the highest pickle protocol.

    Args:
        obj: Any picklable object.
        filepath: Destination path; an existing file is overwritten.
    """
    with open(filepath, 'wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)
        
def post_process_logits(logits: np.ndarray, threshold=0.5):
    """Binarise scores against ``threshold`` and return them as a flat int array.

    Args:
        logits: Array of scores, any shape.
        threshold: Scores greater than or equal to this value map to 1,
            everything else to 0.

    Returns:
        1-D integer array of 0/1 values in row-major order of ``logits``.
    """
    hits = logits >= threshold
    binary = np.where(hits, 1, 0)
    return binary.reshape(-1).astype(int)

# Read Data

In [7]:
# READ DATA
df = pd.read_csv(ROOT_DIR/'input'/cfg.train_csv)
test_df = pd.read_csv(ROOT_DIR/'input'/'test.csv')

# Model input is the document name followed by the document body.
df['text'] = df['name'] + ". " + df['document_text']
test_df['text'] = test_df['name'] + ". " + test_df['document_text']
if cfg.debug:
    # Small subsample for quick debug runs. The size is capped at len(df) so a
    # tiny input does not make sample() raise, and the seed comes from cfg so
    # the whole notebook shares one source of randomness.
    df = df.sample(min(100, len(df)), random_state=cfg.seed).reset_index(drop=True)

# Every column that is not an identifier, raw text or the fold assignment is a label.
NON_LABEL_COLS = {"id", "name", "document_text", "fold", "text"}
LABEL_COLS = [col for col in df.columns if col not in NON_LABEL_COLS]
print(len(LABEL_COLS))
df.head()

50


Unnamed: 0,id,name,document_text,Accounting and Finance,Antitrust,Banking,Broker Dealer,Commodities Trading,Compliance Management,Consumer protection,...,Risk Management,Securities Clearing,Securities Issuing,Securities Management,Securities Sales,Securities Settlement,Trade Pricing,Trade Settlement,fold,text
0,4772,Consent Order in the Matter of Solium Financia...,"Solium Financial Services LLC (""SFS"") is a bro...",0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,Consent Order in the Matter of Solium Financia...
1,4774,Alberta Securities Commission Warns Investors ...,A new year brings new investment opportunities...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Alberta Securities Commission Warns Investors ...
2,4775,Exempt Market Dealer Agrees to Settlement,The Alberta Securities Commission (ASC) has co...,0,0,0,1,0,1,0,...,0,0,0,0,1,1,0,1,2,Exempt Market Dealer Agrees to Settlement. The...
3,4776,Canadian Securities Regulators Announces Consu...,The Canadian Securities Administrators (CSA) p...,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,4,Canadian Securities Regulators Announces Consu...
4,4778,CSA Consultation Paper 51-405 Consideration of...,"On April 6, 2017, the Canadian Securities Admi...",0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,2,CSA Consultation Paper 51-405 Consideration of...


# Train

In [8]:
%%time
# Cross-validated training: for each of the 5 folds, fit a CountVectorizer and an
# XGBoost classifier on the other folds, then predict on the held-out fold and on
# the test set. Per-fold outputs are accumulated for the OOF file and for test averaging.
all_logits = []
all_ids = []
all_labels = []
all_test_logits = []

for fold in range(5):
    cfg.fold = fold
    
    # Rows whose `fold` equals the current fold are held out for validation.
    train_df = df[df['fold'] != fold].reset_index(drop=True)
    valid_df = df[df['fold'] == fold].reset_index(drop=True)

    # The vocabulary is fitted on the training split only.
    tfv = CountVectorizer(max_features = cfg.tfidf_max_features)
    tfv.fit(train_df['text'].to_numpy())

    xtrain = tfv.transform(train_df['text'].to_numpy())
    xvalid = tfv.transform(valid_df['text'].to_numpy())
    ytrain = train_df[LABEL_COLS]
    yvalid = valid_df[LABEL_COLS]
    # Flattened so it lines up with the flattened output of post_process_logits.
    labels = yvalid.to_numpy().flatten()
    
    xtest = tfv.transform(test_df['text'].to_numpy())
    
    # One classifier is fitted on all label columns at once.
    clf = xgb.XGBClassifier(random_state=cfg.seed, tree_method="gpu_hist")
    clf.fit(xtrain, ytrain)
    
    # presumably predict_proba returns shape (n_samples, n_labels) here — TODO confirm
    # for the installed xgboost version.
    val_logits = clf.predict_proba(xvalid)
    val_preds = post_process_logits(val_logits)
    # NOTE(review): the score is computed on thresholded 0/1 predictions flattened across
    # all labels, not on the probabilities — confirm this matches the intended metric.
    print(f"fold-{cfg.fold} score: {roc_auc_score(labels, val_preds, average='macro')}")
    
    test_logits = clf.predict_proba(xtest)
    
    all_ids.append(valid_df['id'].to_numpy())
    all_labels.append(yvalid.to_numpy())
    all_logits.append(val_logits)
    all_test_logits.append(test_logits)
    
    clf.save_model(OUTPUT_DIR/f'model_{fold}.json')

# Save oof predictions
oof_logits = np.concatenate(all_logits)
oof_labels = np.concatenate(all_labels)
oof_ids = np.concatenate(all_ids)
oof_dict = {"id": oof_ids, "labels": oof_labels, "logits": oof_logits}
save_pickle(oof_dict, OUTPUT_DIR/f'{cfg.exp_name}_oof.pkl')

# Overall CV score over all out-of-fold rows, computed the same way as the per-fold score.
cv_score = roc_auc_score(oof_labels.flatten(), post_process_logits(oof_logits), average="macro")
print(f"CV Score: {cv_score:.6f}")

if cfg.wandb:
    wandb.log({"cv": cv_score})

fold-0 score: 0.8906116551576914
fold-1 score: 0.8933744476353208
fold-2 score: 0.891857228854602
fold-3 score: 0.8824038440941763
fold-4 score: 0.8857650924410466
CV Score: 0.888802
CPU times: user 1h 15min 7s, sys: 4.36 s, total: 1h 15min 12s
Wall time: 1h 15min 19s


# Inference

In [9]:
# Preview the test set, including the derived `text` column.
test_df.head()

Unnamed: 0,id,name,document_text,text
0,4771,Companies (Amendment) Regulations 2020,Accounting and Corporate Regulatory Authority ...,Companies (Amendment) Regulations 2020. Accoun...
1,4773,Notice of Intended Action Section 810-5-75-.31...,The Notice of intended action regarding sectio...,Notice of Intended Action Section 810-5-75-.31...
2,4787,Case and Desist Order in the Matter of Henry C...,"On January 9th, 2020, the Staff of the Arkansa...",Case and Desist Order in the Matter of Henry C...
3,4791,AUSTRAC Further Strengthens International Part...,"On January 10, 2020, AUSTRAC and Great Britain...",AUSTRAC Further Strengthens International Part...
4,4794,Sydney Man Accused of Travelling to Philippine...,AUSTRAC published that a 63-year-old Sydney ma...,Sydney Man Accused of Travelling to Philippine...


In [10]:
# Average the per-fold test predictions element-wise, then binarise them.
test_logits = np.stack(all_test_logits).mean(axis=0)
test_preds = post_process_logits(test_logits)

# Save test logits
test_dict = dict(id=test_df['id'], logits=test_logits)
save_pickle(test_dict, OUTPUT_DIR/f"{cfg.exp_name}_test_logits.pkl")

# Create Submission 

In [11]:
# One submission row per (document id, label) pair, document-major.
# assumes test_preds is flattened in the same document-major order — TODO confirm
ids = [
    f"{id_}_{col}"
    for id_ in test_df['id'].to_numpy()
    for col in LABEL_COLS
]

sub_df = pd.DataFrame({'id': ids, 'predictions': test_preds})
sub_df.to_csv(OUTPUT_DIR/f"{cfg.exp_name}_sub.csv", index=False)
sub_df.head()

Unnamed: 0,id,predictions
0,4771_Accounting and Finance,1
1,4771_Antitrust,0
2,4771_Banking,0
3,4771_Broker Dealer,0
4,4771_Commodities Trading,0


# Upload artifacts to W&B

In [12]:
# save experiment config file
config_file_save_path = OUTPUT_DIR / f"{cfg.exp_name}_config.yaml"
with open(config_file_save_path, "w") as fp:
    OmegaConf.save(config=cfg, f=fp.name)
if cfg.wandb:
    # log artifacts to wandb
    if cfg.upload_artifacts_to_wandb:
        model_artifact = wandb.Artifact(name=cfg.exp_name, type="model")
        model_artifact.add_dir(OUTPUT_DIR)
        wandb.log_artifact(model_artifact)

    wandb.alert(
        title=f"Experiment {cfg.exp_name}",
        text=f"🎉 Finished experiment {cfg.exp_name}",
        level=AlertLevel.INFO,
        wait_duration=0,
    )
    shutil.copyfile(config_file_save_path, os.path.join(wandb.run.dir, f"{cfg.exp_name}_config.yaml"))
    wandb.finish()

[34m[1mwandb[0m: Adding directory to artifact (/kaggle/working/artifacts/xgb-countvect)... Done. 0.4s


0,1
cv,▁

0,1
cv,0.8888
