# Set up

⚙️ Step 1: Set your notebook to GPU (in Colab: Runtime → Change runtime type → select a GPU)

The next two cells take ~2 min — start running them now while we talk! 👇👇

In [None]:
# Download data from google drive
import gdown
import os

# Google Drive file IDs, keyed by the dataset split each file holds.
file_ids = {
    'test_inputs': '1Gyv_ldUTi0Ymy6wVMfruAO0UraCQ70CR',
    'train': '11S5p0QgP1X9rOFiIjNSLydLenJwm7hle',
}

# Fetch each split into the working directory as crosstalk_<split>.parquet.
for name, file_id in file_ids.items():
    filename = f'crosstalk_{name}.parquet'
    if os.path.exists(filename):
        # Already present: skip the download so re-running this cell is cheap.
        continue
    gdown.download(id=file_id, output=filename, quiet=False)

Or, if you already have the files in your Google Drive, mount the drive instead:

In [None]:
# Colab-only alternative to the download cell: mount Google Drive at
# /content/drive/ so later cells can read the parquet files from
# '/content/drive/My Drive/'.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


# Load the train datasets

See the bonus content from the last notebook to get a peek under the hood of the data loaders

Or check it out in the files you downloaded to colab on the left 👈

In [None]:
import os

import numpy as np
import pandas as pd

from dataset import basic_dataloader

In [None]:
# The setup section offers two ways to get the data: a mounted Google Drive, or
# a gdown download into the working directory. Prefer the Drive copy when it
# exists (the original behavior) and fall back to the downloaded file, so the
# notebook runs whichever setup cell was used.
train_path = '/content/drive/My Drive/crosstalk_train.parquet'
if not os.path.exists(train_path):
    train_path = 'crosstalk_train.parquet'  # file name written by the download cell

# max_to_load=1000 presumably caps the number of rows read; the recorded run
# produced 1000 rows. Confirm in dataset.py before relying on it.
X_train, y_train = basic_dataloader(train_path, x_col="AVALON", y_col='DELLabel', max_to_load=1000) # fingerprints available: 'ATOMPAIR', 'MACCS', 'ECFP6', 'ECFP4', 'FCFP4', 'FCFP6', 'TOPTOR', 'RDK', 'AVALON'

Loading chunks:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Sanity check on the loaded features; the recorded run shows (1000, 2048).
X_train.shape

(1000, 2048)

In [None]:
# Labels should have one entry per feature row; the recorded run shows (1000,).
y_train.shape

(1000,)

# Let's train a CatBoost classifier and see how well it fits the training data

🐞 do you see a CUDA error? raise your hand now and brag about it

In [None]:
%%time
import catboost as cb
from eval import BinaryEvaluator
# CatBoost settings. Only random_strength departs from the library defaults
# (default is 1); the seed is fixed so runs are repeatable.
params = dict(
    random_strength=2,
    random_seed=1234,
    verbose=0,
    loss_function='Logloss',
    task_type='GPU',  # requires a GPU runtime
    devices='0',
)

# Fit on the training sample, then score those same rows (or validation).
model = cb.CatBoostClassifier(**params)
model.fit(X_train, y_train)
train_proba = model.predict_proba(X_train)
# Keep column 1 only — presumably the positive-class probability; confirm
# against the CatBoost predict_proba documentation.
yp = train_proba[:, 1]

CPU times: user 36.2 s, sys: 5.09 s, total: 41.3 s
Wall time: 38.1 s


In [None]:
# NOTE(review): the name `eval` shadows Python's built-in eval() for the rest of
# the session. Later cells call eval.CV_model(...), so renaming it needs a
# coordinated change across those cells.
# The evaluator receives X_train.toarray() — presumably X_train is a SciPy
# sparse matrix and this passes a dense copy; confirm in dataset.py.
eval = BinaryEvaluator(X_train.toarray(), y_train)
# Score the probabilities in `yp` (computed on the training rows) against y_train.
metric_dict = eval.compute_metrics(yt=y_train, yp=yp) # or validation

In [None]:
# One metric per row: the name padded to a 20-character column, then the value
# formatted to two decimals.
for metric_name, metric_value in metric_dict.items():
    row = f'{metric_name:20s}: {metric_value:.2f}'
    print(row)

# How well does it generalize though? Let's try 5-fold cross-validation

In [None]:
%%time
# Fresh, unfitted classifier built from the same `params` as the model above.
model_cv = cb.CatBoostClassifier(**params)
# CV_model is provided by the project's BinaryEvaluator; the next cell reads the
# 'mean' entry of its result. Fold count and splitting strategy are defined in
# eval.py, not here.
metric_dict_cv = eval.CV_model(model_cv)

In [None]:
# Print the 'mean' entry of the cross-validation result, one metric per row:
# the name padded to a 20-character column, then the value to two decimals.
cv_means = metric_dict_cv['mean']
for metric_name, metric_value in cv_means.items():
    row = f'{metric_name:20s}: {metric_value:.2f}'
    print(row)

# Submit predictions

Update the next cell with your team name

In [None]:
# Used as the submission file name: predictions are saved to '<team_name>.csv'.
team_name = 'demo'

In [None]:
%%time
# Use the Drive copy of the test inputs when Drive is mounted (the original
# behavior). Otherwise fall back to the file the download cell writes into the
# working directory, so the notebook runs with either setup path.
test_path = '/content/drive/My Drive/crosstalk_test_inputs.parquet'
if not os.path.exists(test_path):
    test_path = 'crosstalk_test_inputs.parquet'

# y_col=None and a single bound result: presumably the loader returns features
# only when no label column is requested — confirm in dataset.py.
X_test = basic_dataloader(test_path, x_col="AVALON", y_col=None, max_to_load=None, chunk_size=20000)

In [None]:
# Sanity check on the loaded test features before predicting.
X_test.shape

In [None]:
# Predict with the model fitted on the training sample and keep column 1 of the
# probability output.
# NOTE: this rebinds `yp`, replacing the training-set probabilities computed
# earlier. Re-run the training cell before re-running the training-set metrics.
yp = model.predict_proba(X_test)[:,1]

The next cells write the predictions to a CSV file. Upload this baseline to Kaggle and check out the leaderboard!

In [None]:
import pyarrow as pa
from pyarrow import parquet as pq

In [None]:
# Open the test-inputs parquet to read the row IDs for the submission.
# Prefer the Drive copy when Drive is mounted (the original behavior); otherwise
# fall back to the file the download cell writes into the working directory.
test_inputs_path = '/content/drive/My Drive/crosstalk_test_inputs.parquet'
if not os.path.exists(test_inputs_path):
    test_inputs_path = 'crosstalk_test_inputs.parquet'
pf = pq.ParquetFile(test_inputs_path)

In [None]:
# Read only the ID column from the test file and convert it to a DataFrame.
id_table = pf.read(columns=['RandomID'])
preds = id_table.to_pandas()

# Attach the predicted probabilities. This assumes `yp` is in the same row order
# as the parquet file — TODO confirm that the data loader preserves order.
preds['DELLabel'] = yp
display(preds)

In [None]:
# Write the submission to '<team_name>.csv' in the working directory, without
# the DataFrame index column.
preds.to_csv(f'{team_name}.csv', index=False)

# Let's compare it against some sklearn baselines

⚠️ these next cells are slow to run! Start them now and come back in 5 minutes

In [None]:
%%time
from eval import get_baseline_models

# Rebuild the evaluator directly on X_train (no .toarray() here, unlike the
# evaluator used for the CatBoost metrics).
eval = BinaryEvaluator(X_train, y_train)
baselines = get_baseline_models()

# Cross-validate each baseline and store its result under the baseline's key.
baselines_res = {}
for baseline_name in baselines:
    baseline_model = baselines[baseline_name]
    baselines_res[baseline_name] = eval.CV_model(baseline_model)

In [None]:
# Add the CatBoost cross-validation result next to the sklearn baselines.
baselines_res['catboost'] = metric_dict_cv

# One row per model, one column per metric, taken from each result's 'mean'
# entry and rounded to two decimals.
mean_by_model = {model_name: result['mean'] for model_name, result in baselines_res.items()}
pd.DataFrame(mean_by_model).T.round(2)