In [None]:
# Check which GPU (if any) this runtime exposes by shelling out to `nvidia-smi`.
# NOTE(review): presumably only meaningful on a GPU runtime such as Colab; the
# local Windows path used further down suggests this cell may be skipped locally.
!nvidia-smi

In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install lightautoml

In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# lightautoml
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML
from lightautoml.tasks import Task
from lightautoml.dataset.roles import NumericRole

import catboost
import lightgbm
import xgboost

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

from joblib import load, dump

In [3]:
# Working directory: the folder holding train.csv, test.csv and sample_submission.csv.
# Every relative path read or written by the cells below is resolved against it.
#
# Google Colab alternative (mount Google Drive and work from there):
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS Jun"
#
# Set the TPS_DATA_DIR environment variable to run on another machine without
# editing the notebook; the original local folder remains the default.
path = os.environ.get(
    'TPS_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Jun 2021',
)
os.chdir(path)

In [4]:
# Read the competition train / test tables from the working directory.
df_train, df_test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [5]:
# Encode the target labels as integers and keep them in a one-column int64 frame.
f = LabelEncoder().fit(df_train['target'])
y = pd.DataFrame(f.transform(df_train['target'])).astype('int64')

# Feature matrix: every column except the row id and the label, cast to int64.
col = [name for name in df_train.columns if name not in ('id', 'target')]
x = df_train[col].astype('int64')

In [38]:
# Run-wide settings.
# N_FOLDS and TEST_SIZE are not referenced by any other cell visible in this notebook.
PROJECT_NAME = 'lightautoml-20210608'        # prefix of the first-step prediction files
TARGET_NAME = 'target'                       # label column in the training table
RANDOM_STATE = 20210608
N_THREADS = 4
N_FOLDS = 5
TEST_SIZE = 0.2
TIMEOUT = 8 * 60 * 60                        # in seconds
NUM_CLASS = df_train['target'].nunique()     # number of distinct target labels

In [7]:
# ---- First step: configure the base models ----
# Multiclass task using cross-entropy as both the loss and the metric.
task = Task('multiclass', loss='crossentropy', metric='crossentropy')

# Column roles: which column is the label and which columns are dropped.
roles = {'target': TARGET_NAME, 'drop': ['id']}

automl = TabularAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={
        # A single level with five algorithms. The prediction column names built
        # in a later cell assume the outputs come back in this order — TODO confirm.
        'use_algos': [['linear_l2', 'lgb', 'lgb_tuned', 'cb', 'cb_tuned']],
        # Presumably returns every model's predictions instead of a single blend;
        # verify against the LightAutoML documentation.
        'return_all_predictions': True,
        'weighted_blender_max_nonzero_coef': 0.0,
    },
    tuning_params={'max_tuning_time': 60 * 60},
    reader_params={'n_jobs': N_THREADS, 'random_state': RANDOM_STATE},
    # config_path = f'{PROJECT_NAME}.yml',
)

In [None]:
# Train the first step on the full training table and collect out-of-fold predictions.
# `df_train` is the frame loaded above; the previous name `train_data` is not defined
# anywhere in this notebook and raised NameError on a fresh kernel.
oof_pred = automl.fit_predict(df_train, roles = roles)
print('oof_pred:\n{}\nShape = {}'.format(oof_pred[:10], oof_pred.shape))

In [39]:
# Names for the first-step prediction columns: one block of NUM_CLASS columns per
# model prefix, numbered from 1, in the order the prefixes are listed here.
# The trailing underscore in 'PREDS_CB_' is kept as-is: these names are written to
# and read back from the first-step CSV files.
columns = [
    prefix + str(class_no)
    for prefix in ('PREDS_Linear', 'PREDS_LGB', 'PREDS_LGB_tuned', 'PREDS_CB_', 'PREDS_CB_tuned')
    for class_no in range(1, NUM_CLASS + 1)
]

In [None]:
# Assemble the first-step out-of-fold table: id | per-model predictions | target.
OOF_pred = pd.concat(
    [
        df_train[['id']],
        pd.DataFrame(oof_pred.data, columns=columns),
        df_train[[TARGET_NAME]],
    ],
    axis=1,
)

# Write it to disk; the "load first step files" cell reads this file back.
OOF_pred.to_csv(f'{PROJECT_NAME}_1st_oof.csv', index=False)

In [None]:
# Apply the fitted first-step models to the test table.
test_pred = automl.predict(df_test)

# Test-side table: id | per-model predictions, written next to the OOF file.
TEST_pred = pd.concat(
    [df_test[['id']], pd.DataFrame(test_pred.data, columns=columns)],
    axis=1,
)
TEST_pred.to_csv(f'{PROJECT_NAME}_1st_test.csv', index=False)

In [30]:
# Reload the saved first-step predictions (out-of-fold for train, plain for test).
OOF_pred, TEST_pred = (
    pd.read_csv(f'{PROJECT_NAME}_1st_oof.csv'),
    pd.read_csv(f'{PROJECT_NAME}_1st_test.csv'),
)

In [41]:
# Combine the raw data with the first-step predictions (adds one column per entry
# in `columns` to both df_train and df_test).
# Column assignment aligns on the index, so a row-count mismatch would silently
# introduce NaNs; fail loudly instead.
assert len(OOF_pred) == len(df_train), 'OOF predictions do not match the train rows'
assert len(TEST_pred) == len(df_test), 'test predictions do not match the test rows'

# The loop variable is `pred_col`, not `col`, so the feature-name list `col`
# created in the target-labelling cell is not overwritten.
for pred_col in columns:
    df_train[pred_col] = OOF_pred[pred_col]
    df_test[pred_col] = TEST_pred[pred_col]

In [None]:
# ---- Second step: configure the stacker ----
# It is fitted on df_train, which by now holds the raw features plus the
# first-step prediction columns.
roles = {
    'target': TARGET_NAME,
    'drop': ['id'],
    # Declare the first-step prediction columns as float32 numeric features
    # flagged with prob=True.
    NumericRole(np.float32, prob=True): columns,
}

automl = TabularUtilizedAutoML(
    task=task,
    timeout=TIMEOUT,
    cpu_limit=N_THREADS,
    general_params={'use_algos': [['lgb_tuned', 'cb_tuned']]},
    tuning_params={'max_tuning_time': 60 * 60},
    reader_params={'n_jobs': N_THREADS},
    # configs_list = ['../input/lightautoml-configs/conf_1_sel_type_1.yml'],
    max_runs_per_config=1,
)

In [None]:
# Fit the second step on the augmented training table and show a preview of the
# out-of-fold predictions together with their shape.
oof_pred = automl.fit_predict(df_train, roles=roles)
print(
    'oof_pred:\n{}\nShape = {}'.format(
        oof_pred[:10],
        oof_pred.shape,
    )
)

In [None]:
# Predict for test data and check score
test_pred = automl.predict(df_test)
print('Prediction for test data:\n{}\nShape = {}'.format(test_pred[:10], test_pred.shape))

print('Check scores...')
# Score the second-step out-of-fold predictions against the training labels.
# `df_train` replaces `train_data`, which is not defined anywhere in this notebook
# and raised NameError on a fresh kernel.
# NOTE(review): without an explicit `labels=` argument, log_loss takes the class
# order from the sorted labels in y_true; this assumes the columns of oof_pred.data
# follow that same order — TODO confirm against LightAutoML's class mapping.
print('OOF score: {}'.format(log_loss(df_train[TARGET_NAME].values, oof_pred.data)))

In [None]:
# Build the submission: keep the first column of the sample file and overwrite the
# remaining columns with the second-step test predictions.
sub = pd.read_csv('sample_submission.csv')
# NOTE(review): assumes the columns of test_pred.data are in the same class order as
# the sample-submission columns — TODO confirm.
sub[sub.columns[1:]] = test_pred.data
# The output name is derived from PROJECT_NAME; the previously used RESULTS_PATH is
# not defined anywhere in this notebook and raised NameError.
sub.to_csv(f'{PROJECT_NAME}_submission.csv', index=False)