In [1]:
# link: https://www.kaggle.com/optimo/tabnetmultitaskclassifier

In [2]:
from pytorch_tabnet.multitask import TabNetMultiTaskClassifier

import torch
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, log_loss

import pandas as pd
import numpy as np
np.random.seed(0)

from tqdm.notebook import tqdm

import os

from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
dataset_name = "lish-moa"
train = pd.read_csv("../data/train_features.csv")
train_targets = pd.read_csv('../data/train_targets_scored.csv')
train_targets.drop(columns=["sig_id"], inplace=True)
test = pd.read_csv('../data/test_features.csv')

In [4]:
np.random.seed(42)
if "Set" not in train.columns:
    train["Set"] = np.random.choice(["train", "valid"], p =[.8, .2], size=(train.shape[0],))

train_indices = train[train.Set=="train"].index
valid_indices = train[train.Set=="valid"].index

In [6]:
nunique = train.nunique()
types = train.dtypes

print(nunique, types)

categorical_columns = []
categorical_dims =  {}
for col in tqdm(train.columns):
    if types[col] == 'object' or nunique[col] < 200:
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        try:
            test[col] = test[col].fillna("VV_likely")
            test[col] = l_enc.transform(test[col].values)
        except:
            print(f"Column {col} does not exist in test set")
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        training_mean = train.loc[train_indices, col].mean()
        train.fillna(training_mean, inplace=True)
        test.fillna(training_mean, inplace=True)
        
print(categorical_columns, categorical_dims)

sig_id     23814
cp_type        2
cp_time        3
cp_dose        2
g-0        14367
           ...  
c-96       14493
c-97       14757
c-98       14812
c-99       14622
Set            2
Length: 877, dtype: int64 sig_id      object
cp_type     object
cp_time      int64
cp_dose     object
g-0        float64
            ...   
c-96       float64
c-97       float64
c-98       float64
c-99       float64
Set         object
Length: 877, dtype: object


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=877.0), HTML(value='')))

sig_id 23814
Column sig_id does not exist in test set
cp_type 2
cp_time 3
cp_dose 2
Set 2
Column Set does not exist in test set

['sig_id', 'cp_type', 'cp_time', 'cp_dose', 'Set'] {'sig_id': 23814, 'cp_type': 2, 'cp_time': 3, 'cp_dose': 2, 'Set': 2}


In [7]:
unused_feat = ['Set', 'sig_id'] # Let's not use splitting sets and sig_id

features = [ col for col in train.columns if col not in unused_feat] 

cat_idxs = [ i for i, f in enumerate(features) if f in categorical_columns]

cat_dims = [ categorical_dims[f] for i, f in enumerate(features) if f in categorical_columns]

In [8]:
unused_feat

['Set', 'sig_id']

In [11]:
cat_dims

[2, 3, 2]