In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install mljar-supervised



In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# mljar
from supervised.automl import AutoML

import catboost
import lightgbm
import xgboost

from joblib import load, dump

In [3]:
# Working directory for the notebook: train.csv, test.csv and
# sample_submission.csv are read from it with relative paths, and the AutoML
# results directory and the submission file are written into it.
#
# Colab alternative (mount Google Drive and point `path` at it):
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS May"
#
# Set the TPS_DATA_DIR environment variable to run on another machine without
# editing this cell; when it is unset the original local folder is used.
path = os.environ.get(
    'TPS_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Apr 2021',
)
os.chdir(path)

In [4]:
# Read the train and test tables from the current working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# Count the missing values in each training column (sum down the rows).
df_train.isnull().sum(axis=0)

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age             3292
SibSp              0
Parch              0
Ticket          4623
Fare             134
Cabin          67866
Embarked         250
dtype: int64

In [6]:
# Preview the first five training rows.
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [7]:
# 1. Missing values of Age are replaced by the sentinel 999
vari = 'Age'
print(f"Type {vari} is {df_train[vari].dtype}!!!")
# print(df_train[vari].describe())
print(f"Missing ratio for train is {df_train[vari].isnull().sum() / len(df_train):.3f}")
print(f"Missing ratio for test is {df_test[vari].isnull().sum() / len(df_test):.3f}")
for d in [df_train, df_test]:
    # Assign the filled column back to the frame. Calling
    # d[vari].fillna(..., inplace=True) is chained assignment: it acts on the
    # Series returned by d[vari], which is not guaranteed to write through to d
    # (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
    d[vari] = d[vari].fillna(999)

# Sanity check: should print 0 once the fill has been applied.
print(df_train[vari].isnull().sum())

Type Age is float64!!!
Missing ratio for train is 0.033
Missing ratio for test is 0.035
0


In [8]:
# 2. Missing values of Ticket are replaced by the string 'missing'
vari = 'Ticket'
print(f"Type {vari} is {df_train[vari].dtype}!!!")
# print(df_train[vari].describe())
print(f"Missing ratio for train is {df_train[vari].isnull().sum() / len(df_train):.3f}")
print(f"Missing ratio for test is {df_test[vari].isnull().sum() / len(df_test):.3f}")
for d in [df_train, df_test]:
    # Assign the filled column back to the frame. Calling
    # d[vari].fillna(..., inplace=True) is chained assignment: it acts on the
    # Series returned by d[vari], which is not guaranteed to write through to d
    # (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
    d[vari] = d[vari].fillna('missing')

# Sanity check: should print 0 once the fill has been applied.
print(df_train[vari].isnull().sum())

Type Ticket is object!!!
Missing ratio for train is 0.046
Missing ratio for test is 0.052
0


In [9]:
# 3. Missing values of Fare are replaced by the sentinel 9999
vari = 'Fare'
print(f"Type {vari} is {df_train[vari].dtype}!!!")
# print(df_train[vari].describe())
print(f"Missing ratio for train is {df_train[vari].isnull().sum() / len(df_train):.3f}")
print(f"Missing ratio for test is {df_test[vari].isnull().sum() / len(df_test):.3f}")
for d in [df_train, df_test]:
    # Assign the filled column back to the frame. Calling
    # d[vari].fillna(..., inplace=True) is chained assignment: it acts on the
    # Series returned by d[vari], which is not guaranteed to write through to d
    # (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
    d[vari] = d[vari].fillna(9999)

# Sanity check: should print 0 once the fill has been applied.
print(df_train[vari].isnull().sum())

Type Fare is float64!!!
Missing ratio for train is 0.001
Missing ratio for test is 0.001
0


In [10]:
# 4. Missing values of Cabin are replaced by the string 'missing'
vari = 'Cabin'
print(f"Type {vari} is {df_train[vari].dtype}!!!")
# print(df_train[vari].describe())
print(f"Missing ratio for train is {df_train[vari].isnull().sum() / len(df_train):.3f}")
print(f"Missing ratio for test is {df_test[vari].isnull().sum() / len(df_test):.3f}")
for d in [df_train, df_test]:
    # Assign the filled column back to the frame. Calling
    # d[vari].fillna(..., inplace=True) is chained assignment: it acts on the
    # Series returned by d[vari], which is not guaranteed to write through to d
    # (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
    d[vari] = d[vari].fillna('missing')

# Sanity check: should print 0 once the fill has been applied.
print(df_train[vari].isnull().sum())

# or drop the feature directly
# for d in [df_train, df_test]:
#     d.drop(vari, axis=1, inplace=True)

Type Cabin is object!!!
Missing ratio for train is 0.679
Missing ratio for test is 0.708
0


In [11]:
# 5. Missing values of Embarked are replaced by the string 'missing'
vari = 'Embarked'
print(f"Type {vari} is {df_train[vari].dtype}!!!")
# print(df_train[vari].describe())
print(f"Missing ratio for train is {df_train[vari].isnull().sum() / len(df_train):.3f}")
print(f"Missing ratio for test is {df_test[vari].isnull().sum() / len(df_test):.3f}")
for d in [df_train, df_test]:
    # Assign the filled column back to the frame. Calling
    # d[vari].fillna(..., inplace=True) is chained assignment: it acts on the
    # Series returned by d[vari], which is not guaranteed to write through to d
    # (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
    d[vari] = d[vari].fillna('missing')

# Sanity check: should print 0 once the fill has been applied.
print(df_train[vari].isnull().sum())

Type Embarked is object!!!
Missing ratio for train is 0.003
Missing ratio for test is 0.003
0


In [12]:
# Target: the 'Survived' column as a one-column int64 DataFrame.
y = df_train[['Survived']].astype('int64')

# Features: every training column except the row identifier and the target.
col = [name for name in df_train.columns if name not in ('PassengerId', 'Survived')]
x = df_train.loc[:, col]

In [13]:
# Configure the mljar-supervised AutoML search; nothing is trained in this cell.
RESULTS_PATH = 'mljar-20210531'  # AutoML results directory; also the stem of the submission file name
SEED = 20210531                  # shared by the CV split and AutoML's random_state

# Shuffled, stratified 5-fold cross-validation.
cv = dict(
    validation_type="kfold",
    k_folds=5,
    shuffle=True,
    stratify=True,
    random_seed=SEED,
)

# NOTE: the training log recorded in this notebook reports "Linear algorithm was
# disabled", so 'Linear' below did not take part in that run.
automl = AutoML(
    results_path=RESULTS_PATH,
    mode="Optuna",                    # or 'Explain', 'Perform', 'Compete'
    ml_task='binary_classification',  # or 'auto', 'regression'
    algorithms=[
        'Baseline',
        'Linear',
        'Decision Tree',
        'Random Forest',
        'Extra Trees',
        'LightGBM',
        'Xgboost',
        'CatBoost',
        'Neural Network',
        'Nearest Neighbors',
    ],
    train_ensemble=True,
    stack_models=True,
    eval_metric='accuracy',
    validation_strategy=cv,
    golden_features=True,
    boost_on_errors=True,
    optuna_time_budget=60 * 60,       # seconds of Optuna tuning per algorithm
    total_time_limit=8 * 60 * 60,     # seconds for ML model training
    optuna_verbose=False,
    n_jobs=-1,
    random_state=SEED,
)

In [14]:
# training model
# Runs the AutoML search on the training features `x` and target `y`.
# The log recorded below estimates 61200 seconds in total (32400 s of Optuna
# tuning plus 28800 s of model training), i.e. about 17 hours, so do not
# re-run this cell casually.
automl.fit(x, y)

Linear algorithm was disabled.
AutoML directory: mljar-20210531
Expected computing time:
Total training time: Optuna + ML training = 61200 seconds
Total Optuna time: len(algorithms) * optuna_time_budget = 32400 seconds
Total ML model training time: 28800 seconds
The task is binary_classification with evaluation metric accuracy
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble availabe models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'golden_features', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step simple_algorithms will try to check up to 2 models
1_Baseline: trained.
2_DecisionTree: trained.
* Step default_algorithms will try to check up to 7 models
3_Optuna_LightGBM accuracy 0.77676 trained in 127.05 seconds
4_Optuna_Xgboost accuracy 0.77689 trained in 202.33 seconds
There was an error durin

AutoML(algorithms=['Baseline', 'Linear', 'Decision Tree', 'Random Forest',
                   'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost',
                   'Neural Network', 'Nearest Neighbors'],
       boost_on_errors=True, eval_metric='accuracy', golden_features=True,
       ml_task='binary_classification', mode='Optuna', optuna_time_budget=3600,
       optuna_verbose=False, random_state=20210531,
       results_path='mljar-20210531', stack_models=True, total_time_limit=28800,
       validation_strategy={'k_folds': 5, 'random_seed': 20210531,
                            'shuffle': True, 'stratify': True,
                            'validation_type': 'kfold'})

In [15]:
# load well-trained model
# Rebinds `automl` to a fresh AutoML object that is given only the results
# directory of the training run (no training settings are passed).
# NOTE(review): presumably the saved models are read back from RESULTS_PATH
# when predict() is called -- confirm against the mljar-supervised docs.
automl = AutoML(results_path=RESULTS_PATH)

In [16]:
# Inference features: every test column except the row identifier.
x_test = df_test.drop(columns=['PassengerId'])
# Predict, then reshape the predictions into a single column of shape (n_rows, 1).
result = automl.predict(x_test).reshape(-1, 1)

In [31]:
# Build the submission from the sample file: keep its first column and
# overwrite every remaining column with the predictions.
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')
prediction_columns = sub.columns[1:]
sub[prediction_columns] = result
# Written to the working directory as "<RESULTS_PATH>.csv", without the index.
sub.to_csv(f'{RESULTS_PATH}.csv', index=False)