In [None]:
# install packages
!pip install catboost
!pip install xgboost
!pip install lightgbm
!pip install mljar-supervised



In [2]:
# import packages
import os
import numpy as np
import pandas as pd

# mljar
from supervised.automl import AutoML

import catboost
import lightgbm
import xgboost

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from joblib import load, dump

In [3]:
# # connect with Google Cloud
# from google.colab import drive
# drive.mount('/content/drive')
# path = "/content/drive/My Drive/colab/TPS May"

# Working directory: train.csv, test.csv and sample_submission.csv are read from
# here with relative paths, and the AutoML results / submission file are written here.
# Set the TPS_DATA_DIR environment variable to run on another machine without
# editing the notebook; the original local folder remains the default.
path = os.environ.get(
    'TPS_DATA_DIR',
    r'C:\Users\Chen\Desktop\Kaggle\Classifier\Tabular Playground Series - Apr 2021',
)
os.chdir(path)

In [4]:
# load data: both CSV files are read relative to the current working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [5]:
# number of missing values per column of the training set
df_train.isna().sum()

PassengerId        0
Survived           0
Pclass             0
Name               0
Sex                0
Age             3292
SibSp              0
Parch              0
Ticket          4623
Fare             134
Cabin          67866
Embarked         250
dtype: int64

In [6]:
# preview the first five training rows
df_train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [7]:
# stack train on top of test with a fresh 0..n-1 index so both get the same preprocessing
all_data = pd.concat([df_train, df_test], ignore_index=True)

In [8]:
# 1. Age: fill missing values with the mean of the combined (train + test) column
vari = 'Age'
age_col = all_data[vari]
print(f"Type {vari} is {age_col.dtype}!!!")

print(age_col.describe())

# share of missing values, reported separately for each raw split
for split_name, split_df in (('train', df_train), ('test', df_test)):
    missing_ratio = split_df[vari].isnull().sum() / len(split_df)
    print(f"Missing ratio for {split_name} is {missing_ratio:.3f}")

all_data[vari] = age_col.fillna(age_col.mean())

# sanity check: number of missing values left after imputation
print(all_data[vari].isnull().sum())

Type Age is float64!!!
count    193221.000000
mean         34.464565
std          16.783847
min           0.080000
25%          22.000000
50%          31.000000
75%          48.000000
max          87.000000
Name: Age, dtype: float64
Missing ratio for train is 0.033
Missing ratio for test is 0.035
0


In [9]:
# 2. Ticket: keep only the leading token; missing or single-token tickets become 'X'
vari = 'Ticket'
print(f"Type {vari} is {all_data[vari].dtype}!!!")

print(all_data[vari].describe())

# share of missing values, reported separately for each raw split
for split_name, split_df in (('train', df_train), ('test', df_test)):
    print(f"Missing ratio for {split_name} is {split_df[vari].isnull().sum() / len(split_df):.3f}")


def ticket_prefix(raw):
    """Return the first whitespace-separated token, or 'X' when there is at most one token."""
    tokens = str(raw).split()
    return tokens[0] if len(tokens) > 1 else 'X'


all_data[vari] = all_data[vari].fillna('X').map(ticket_prefix)

print(f'There is {all_data[vari].nunique()} unique values in array : {all_data[vari].unique()} ')

# sanity check: number of missing values left after cleaning
print(all_data[vari].isnull().sum())

Type Ticket is object!!!
count     190196
unique    132613
top         A/5 
freq         646
Name: Ticket, dtype: object
Missing ratio for train is 0.046
Missing ratio for test is 0.052
There is 50 unique values in array : ['X' 'CA' 'A.' 'A/S' 'PC' 'W./C.' 'SC/PARIS' 'S.C./PARIS' 'SC/Paris' 'CA.'
 'SOTON/O.Q.' 'C.A.' 'A/5.' 'STON/O' 'A/4' 'C' 'AQ/4' 'STON/O2.' 'WE/P'
 'F.C.' 'F.C.C.' 'PP' 'S.O.C.' 'SC/AH' 'Fa' 'W.E.P.' 'C.A./SOTON' 'P/PP'
 'A/5' 'SOTON/O2' 'SW/PP' 'STON/OQ.' 'W/C' 'S.O./P.P.' 'SC' 'A./5.' 'A/4.'
 'S.O.P.' 'SOTON/OQ' 'SO/C' 'SCO/W' 'A.5.' 'S.W./PP' 'S.P.' 'LP' 'SC/A4'
 'AQ/3.' 'S.C./A.4.' 'A4.' 'SC/A.3'] 
0


In [10]:
# 3. Fare: fill missing values with the median fare of the passenger's Pclass, then log-transform
vari = 'Fare'
print(f"Type {vari} is {all_data[vari].dtype}!!!")

print(all_data[vari].describe())

# share of missing values, reported separately for each raw split
for split_name, split_df in (('train', df_train), ('test', df_test)):
    print(f"Missing ratio for {split_name} is {split_df[vari].isnull().sum() / len(split_df):.3f}")

# {'Fare': {pclass: median fare}} computed on rows where both Fare and Pclass are present
fare_map = (
    all_data[['Fare', 'Pclass']]
    .dropna()
    .groupby('Pclass')
    .median()
    .to_dict()
)
class_median_fare = all_data['Pclass'].map(fare_map['Fare'])
# log1p = log(x+1) : Gaussian transform
all_data['Fare'] = np.log1p(all_data['Fare'].fillna(class_median_fare))


# sanity check: number of missing values left after imputation
print(all_data[vari].isnull().sum())

Type Fare is float64!!!
count    199733.000000
mean         44.652071
std          67.436104
min           0.050000
25%          10.080000
50%          20.250000
75%          34.850000
max         744.660000
Name: Fare, dtype: float64
Missing ratio for train is 0.001
Missing ratio for test is 0.001
0


In [11]:
# 4. Cabin: missing values become 'X'; every value is reduced to its first character
vari = 'Cabin'
print(f"Type {vari} is {all_data[vari].dtype}!!!")

print(all_data[vari].describe())

# share of missing values, reported separately for each raw split
for split_name, split_df in (('train', df_train), ('test', df_test)):
    print(f"Missing ratio for {split_name} is {split_df[vari].isnull().sum() / len(split_df):.3f}")


def first_letter(cabin):
    """Return the first character of the cabin string with surrounding whitespace stripped."""
    return cabin[0].strip()


all_data[vari] = all_data[vari].fillna('X').map(first_letter)

print(f'There is {all_data[vari].nunique()} unique values in array : {all_data[vari].unique()} ')

# sanity check: number of missing values left after cleaning
print(all_data[vari].isnull().sum())

# or drop the feature directly
# for d in [df_train, df_test]:
#     d.drop(vari, axis=1, inplace=True)

Type Cabin is object!!!
count      61303
unique     45442
top       C10839
freq           7
Name: Cabin, dtype: object
Missing ratio for train is 0.679
Missing ratio for test is 0.708
There is 9 unique values in array : ['C' 'X' 'A' 'D' 'B' 'E' 'F' 'G' 'T'] 
0


In [12]:
# 5. Embarked: missing values get their own category 'X'
vari = 'Embarked'
embarked_col = all_data[vari]
print(f"Type {vari} is {embarked_col.dtype}!!!")

print(embarked_col.describe())

# share of missing values, reported separately for each raw split
for split_name, split_df in (('train', df_train), ('test', df_test)):
    missing_ratio = split_df[vari].isnull().sum() / len(split_df)
    print(f"Missing ratio for {split_name} is {missing_ratio:.3f}")

all_data[vari] = embarked_col.fillna('X')

print(f'There is {all_data[vari].nunique()} unique values in array : {all_data[vari].unique()} ')

# sanity check: number of missing values left after filling
print(all_data[vari].isnull().sum())

Type Embarked is object!!!
count     199473
unique         3
top            S
freq      140981
Name: Embarked, dtype: object
Missing ratio for train is 0.003
Missing ratio for test is 0.003
There is 4 unique values in array : ['S' 'C' 'Q' 'X'] 
0


In [13]:
# 6. Name: keep only the text before the first comma (the surname)
vari = 'Name'


def surname(full_name):
    """Return the part of the name that precedes the first comma."""
    return full_name.partition(',')[0]


all_data[vari] = all_data[vari].map(surname)

print(f'There is {all_data[vari].nunique()} unique values in array : {all_data[vari].unique()} ')

There is 26470 unique values in array : ['Oconnor' 'Bryan' 'Owens' ... 'Pecatoste' 'Conlisk' 'Peitz'] 


In [14]:
# Encoding and transform
TARGET = 'Survived'
num_cols = ['Pclass', 'Age', 'SibSp', 'Parch']   # standardised
label_cols = ['Name', 'Ticket', 'Sex']           # integer label-encoded
onehot_cols = ['Cabin', 'Embarked']              # expanded into dummy columns


def label_encoder(c):
    """Fit a fresh LabelEncoder on column ``c`` and return its integer codes."""
    return LabelEncoder().fit_transform(c)


scaler = StandardScaler()

# Build each feature group separately from the cleaned all_data frame.
target_df = all_data[TARGET]
fare_df = all_data['Fare']
scaled_values = scaler.fit_transform(all_data[num_cols])
num_df = pd.DataFrame(scaled_values, columns=num_cols)
label_encoded_df = all_data[label_cols].apply(label_encoder)
onehot_encoded_df = pd.get_dummies(all_data[onehot_cols])

# Re-assemble column-wise; all_data now holds only model-ready columns plus the target.
encoded_parts = [num_df, fare_df, label_encoded_df, onehot_encoded_df, target_df]
all_data = pd.concat(encoded_parts, axis=1)

In [15]:
# split train and test: the first len(df_train) rows of all_data are the training rows
n_train = len(df_train)
df_train = all_data.iloc[:n_train]
df_test = all_data.iloc[n_train:].drop(columns=TARGET)

# target as a single-column int64 DataFrame
y = df_train[[TARGET]].astype('int64')

# features: every column except the id and the target
excluded = ['PassengerId', 'Survived']
col = [name for name in df_train.columns if name not in excluded]
x = df_train[col]

In [16]:
# preview of the feature matrix followed by its (rows, columns) shape
print(x.head(), x.shape, sep='\n')

     Pclass           Age     SibSp     Parch      Fare   Name  Ticket  Sex  \
0 -1.425730 -8.614253e-16  1.901268 -0.505478  3.337192  17441      49    1   
1  0.877699 -8.614253e-16 -0.539572 -0.505478  2.663750   3063      49    1   
2  0.877699 -2.069149e+00  0.680848  1.628715  4.280686  17798      14    1   
3  0.877699 -9.374220e-01 -0.539572 -0.505478  2.641910  12742       0    1   
4  0.877699 -5.737175e-01 -0.539572 -0.505478  2.170196   2335      49    1   

   Cabin_A  Cabin_B  ...  Cabin_D  Cabin_E  Cabin_F  Cabin_G  Cabin_T  \
0        0        0  ...        0        0        0        0        0   
1        0        0  ...        0        0        0        0        0   
2        0        0  ...        0        0        0        0        0   
3        0        0  ...        0        0        0        0        0   
4        0        0  ...        0        0        0        0        0   

   Cabin_X  Embarked_C  Embarked_Q  Embarked_S  Embarked_X  
0        0           0   

In [18]:
# construct model
# AutoML : mljar
SEED = 20210601
RESULTS_PATH = 'mljar-20210601'

# 5-fold, shuffled, stratified cross-validation seeded with SEED
cv = dict(
    validation_type="kfold",
    k_folds=5,
    shuffle=True,
    stratify=True,
    random_seed=SEED,
)

automl_settings = {
    'results_path': RESULTS_PATH,
    'mode': "Optuna",                          # or 'Explain', 'Perform', 'Compete'
    'ml_task': 'binary_classification',        # or 'auto', 'binary_classification', 'regression'
    'algorithms': [
        'Baseline',
        'Linear',
        'Decision Tree',
        'Random Forest',
        'Extra Trees',
        'LightGBM',
        'Xgboost',
        'CatBoost',
        'Neural Network',
        'Nearest Neighbors',
    ],
    'train_ensemble': True,
    'stack_models': True,
    'eval_metric': 'accuracy',
    'validation_strategy': cv,
    'golden_features': True,
    'boost_on_errors': True,
    'optuna_time_budget': 60 * 60,       # 3600 seconds of Optuna tuning per algorithm
    'total_time_limit': 8 * 60 * 60,     # 28800 seconds for model training
    'optuna_verbose': False,
    'n_jobs': -1,
    'random_state': SEED,
}
automl = AutoML(**automl_settings)

In [19]:
# training model
# Fit AutoML on the training features x and the int64 target frame y.
# The captured log for this cell reports an expected total of 61200 seconds
# (Optuna tuning + model training), so this is a long-running cell.
automl.fit(x, y)

Linear algorithm was disabled.
AutoML directory: mljar-20210601
Expected computing time:
Total training time: Optuna + ML training = 61200 seconds
Total Optuna time: len(algorithms) * optuna_time_budget = 32400 seconds
Total ML model training time: 28800 seconds
The task is binary_classification with evaluation metric accuracy
AutoML will use algorithms: ['Baseline', 'Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble availabe models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'golden_features', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step simple_algorithms will try to check up to 2 models
1_Baseline accuracy 0.57226 trained in 21.2 seconds
2_DecisionTree accuracy 0.75798 trained in 20.76 seconds
* Step default_algorithms will try to check up to 7 models
3_Optuna_LightGBM accuracy 0.78386 trained in 39.31 seconds
4_Optuna_Xgboost accur

AutoML(algorithms=['Baseline', 'Linear', 'Decision Tree', 'Random Forest',
                   'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost',
                   'Neural Network', 'Nearest Neighbors'],
       boost_on_errors=True, eval_metric='accuracy', golden_features=True,
       ml_task='binary_classification', mode='Optuna', optuna_time_budget=3600,
       optuna_verbose=False, random_state=20210601,
       results_path='mljar-20210601', stack_models=True, total_time_limit=28800,
       validation_strategy={'k_folds': 5, 'random_seed': 20210601,
                            'shuffle': True, 'stratify': True,
                            'validation_type': 'kfold'})

In [20]:
# load well-trained model
# Re-create the AutoML object pointing at the same RESULTS_PATH used for training.
# NOTE(review): presumably mljar restores the models saved in that directory when
# only results_path is given — confirm against the mljar-supervised docs.
automl = AutoML(results_path=RESULTS_PATH)

In [23]:
# predict on the preprocessed test features and shape the output as a single column
x_test = df_test
result = automl.predict(x_test).reshape(-1, 1)

In [24]:
# submission: overwrite every column after the first in the sample file with the predictions
sub = pd.read_csv('sample_submission.csv')
prediction_columns = sub.columns[1:]
sub[prediction_columns] = result
submission_file = f'{RESULTS_PATH}.csv'
sub.to_csv(submission_file, index=False)