In [1]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)"""
    # set df_pred to None if it does not exist
    if not ((cat_fill=='mode') and (num_fill=='median')):
        print ('Imputation method not Implemented yet!')
        return None
    
    df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
    df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
    df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    if (df_pred is not None):
        df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())
        df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
    df_train[num_features+cat_features].count
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])"""
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """This function divides a continuous feature into quantile groups.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)"""
    # set df_pred to None if it does not exist
    _,bin = pd.qcut(df_train[feature], ntiles, retbins = True, labels = False, duplicates = 'drop')
    df_train[feature+'Ntile'] = pd.cut(df_train[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    df_test[feature+'Ntile'] = pd.cut(df_test[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (df_pred is not None):
        df_pred[feature+'Ntile'] = pd.cut(df_pred[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (delete_feature==True):
        df_train.drop(columns=[feature], inplace=True)
        df_test.drop(columns=[feature], inplace=True)
        df_pred.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin)-1, ' bins')


def log_transformer_mp_i1(df_train, df_test, df_pred, feature_subset=False, min_skew=3):
    """This function divides a continuous feature into quantile groups.
    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)"""
    # set df_pred to None if it does not exist
    if (feature_subset==False):
        features_totransform = df_train.columns
    else:
        features_totransform = feature_subset.copy()
    skewed_vars = list(df_train.skew()[abs(df_train.skew())>min_skew].index)
    for col in list(set(skewed_vars)&set(features_totransform)):
        df_train[col] = np.log1p(df_train[col])
        df_test[col] = np.log1p(df_test[col])
        if (df_pred is not None):
            df_pred[col] = np.log1p(df_pred[col])
    print('Skewed columns log-transformed: ', list(set(skewed_vars)&set(features_totransform)))
    
    
# 1. Load data #

time0 = time.time()

path = '../input/spaceship-titanic/train.csv'
df = pd.read_csv(path) 

#df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)
pred=pd.read_csv('../input/spaceship-titanic/test.csv')
pred0 = pred.copy()
#pred.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'],inplace=True)

print(df.shape, pred.shape)
df.head()

# 2. pEDA #
df[['Deck', 'Room', 'Side']] = df['Cabin'].str.split('/', 2, expand=True)
pred[['Deck', 'Room', 'Side']] = pred['Cabin'].str.split('/', 2, expand=True)
df['Age2'] = df['Age']**2
pred['Age2'] = pred['Age']**2
df.drop(columns = ['PassengerId', 'Cabin', 'Room', 'Name'], inplace = True)
pred.drop(columns = ['PassengerId', 'Cabin', 'Room', 'Name'], inplace = True)
df

(8693, 14) (4277, 13)


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Side,Age2
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,B,P,1521.0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,F,S,576.0
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,A,S,3364.0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,A,S,1089.0
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,F,S,256.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False,A,P,1681.0
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False,G,S,324.0
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True,G,S,676.0
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False,E,S,1024.0


In [2]:
# 3. Train-test split #

train_y = df[['Transported']]
train_x = df.drop(columns = ['Transported'])
X_pred = pred.copy()

cat_cols = [col for col in train_x.columns if train_x[col].nunique() in range(2,10)]
num_cols = list(set(train_x.columns)-set(cat_cols))

print('categorical features: ', cat_cols, 'numerical features: ', num_cols)

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.05, random_state=1)
print(X_train.shape, X_test.shape, y_train.shape, X_pred.shape)

X_train.info()

categorical features:  ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side'] numerical features:  ['Spa', 'VRDeck', 'RoomService', 'ShoppingMall', 'Age2', 'FoodCourt', 'Age']
(8258, 13) (435, 13) (8258, 1) (4277, 13)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8258 entries, 1534 to 235
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8069 non-null   object 
 1   CryoSleep     8057 non-null   object 
 2   Destination   8089 non-null   object 
 3   Age           8087 non-null   float64
 4   VIP           8068 non-null   object 
 5   RoomService   8083 non-null   float64
 6   FoodCourt     8082 non-null   float64
 7   ShoppingMall  8060 non-null   float64
 8   Spa           8084 non-null   float64
 9   VRDeck        8076 non-null   float64
 10  Deck          8070 non-null   object 
 11  Side          8070 non-null   object 
 12  Age2          8087 non-null   float64
dtypes: float64(7

In [3]:
# 4. Missing values #

#add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age', 'HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'RoomService',
#'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Deck', 'Side'])
fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
X_train.info()

Missing values imputed successfully
<class 'pandas.core.frame.DataFrame'>
Int64Index: 8258 entries, 1534 to 235
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8258 non-null   object 
 1   CryoSleep     8258 non-null   bool   
 2   Destination   8258 non-null   object 
 3   Age           8258 non-null   float64
 4   VIP           8258 non-null   bool   
 5   RoomService   8258 non-null   float64
 6   FoodCourt     8258 non-null   float64
 7   ShoppingMall  8258 non-null   float64
 8   Spa           8258 non-null   float64
 9   VRDeck        8258 non-null   float64
 10  Deck          8258 non-null   object 
 11  Side          8258 non-null   object 
 12  Age2          8258 non-null   float64
dtypes: bool(2), float64(7), object(4)
memory usage: 790.3+ KB


In [4]:
# 5. Feature engineering #

discretize_mp_i1(X_train, X_test, X_pred, 'Age', 10)
cat_cols.extend(['AgeNtile'])

X_train.describe()
X_train[:5]

log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)

feature_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), cat_cols),
    ])

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
X_pred = pd.DataFrame(feature_transformer.transform(X_pred), columns=feature_transformer.get_feature_names_out())

fewfeatures = []

X_train

Discretized  Age  into  10  bins
Skewed columns log-transformed:  ['Spa', 'VRDeck', 'RoomService', 'ShoppingMall', 'FoodCourt']


Unnamed: 0,num__Spa,num__VRDeck,num__RoomService,num__ShoppingMall,num__Age2,num__FoodCourt,num__Age,cat__HomePlanet_Earth,cat__HomePlanet_Europa,cat__HomePlanet_Mars,...,cat__AgeNtile_0,cat__AgeNtile_1,cat__AgeNtile_2,cat__AgeNtile_3,cat__AgeNtile_4,cat__AgeNtile_5,cat__AgeNtile_6,cat__AgeNtile_7,cat__AgeNtile_8,cat__AgeNtile_9
0,-0.662133,-0.386397,-0.636426,-0.623211,-0.702224,0.137587,-0.681202,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.191051,1.485313,-0.636426,-0.623211,-0.008829,2.496714,0.226115,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.662133,-0.639300,-0.636426,-0.623211,-0.661436,-0.647950,-0.611408,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.473152,-0.639300,1.795465,1.205350,-0.426121,-0.647950,-0.262440,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1.431702,1.476574,-0.636426,-0.623211,-0.372783,1.977648,-0.192647,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8253,-0.662133,-0.639300,1.856594,1.877423,0.430426,-0.647950,0.644877,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8254,2.052010,2.095139,-0.636426,-0.623211,0.510957,1.251716,0.714670,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8255,-0.662133,-0.639300,-0.636426,0.231544,-0.477367,0.544899,-0.332234,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8256,-0.662133,-0.639300,-0.636426,-0.623211,1.230498,-0.647950,1.273019,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# 6. Fit models #

lr = LogisticRegression()
param_grid = {'C':[0.01, 0.03, 0.1, 0.3, 1]}
lrm = GridSearchCV(lr, param_grid, cv=2)
lrm.fit(X_train, y_train)
print('Logistic ', lrm.best_params_, accuracy_score(y_train, lrm.predict(X_train)))

Logistic  {'C': 0.03} 0.7799709372729474


In [10]:
time1 = time.time()
svm = SVC()
param_grid = {'C':[0.3, 1, 3]}
svmm = GridSearchCV(svm, param_grid, cv=2)
svmm.fit(X_train, y_train)
print('SVM ', svmm.best_params_, accuracy_score(y_train, svmm.predict(X_train)), time.time()-time1)

SVM  {'C': 1} 0.8144829256478566 15.648156881332397


In [13]:
knn = KNeighborsClassifier()
param_grid = dict(n_neighbors=range(2,22,2))
knnm = GridSearchCV(knn, param_grid, cv=2)
knnm.fit(X_train, y_train)
print('KNN ', knnm.best_params_, accuracy_score(y_train, knnm.predict(X_train)))

KNN  {'n_neighbors': 14} 0.8038265923952531


In [14]:
time1 = time.time()
rf = RandomForestClassifier()
param_grid = {'n_estimators':[100], 'max_depth':[2,4,6,8], 'max_features':[5,6,7]}
rfm = GridSearchCV(rf, param_grid, cv=2)
rfm.fit(X_train, y_train)
print('RF ', rfm.best_params_, accuracy_score(y_train, rfm.predict(X_train)), time.time()-time1)

RF  {'max_depth': 8, 'max_features': 6, 'n_estimators': 100} 0.8228384596754662 10.099506378173828


In [None]:
time1 = time.time()
xgb = XGBClassifier()
# use 'gpu_hist' for more than 100,000 examples.
param_grid = {'n_estimators':[100,200], 'max_depth':[2,3,4], 'eta':[0.03, 0.04, 0.05]}
#'subsample':[0.4, 0.6],'colsample_bytree':[0.6, 0.8]
xgbm = GridSearchCV(xgb, param_grid, cv=4)
xgbm.fit(X_train, y_train)
print('XGB ', xgbm.best_params_, accuracy_score(y_train, xgbm.predict(X_train)), time.time()-time1)

In [None]:
# 7. accuracy #

print('Out of Sample:')
print('Logistic ', accuracy_score(y_test, lrm.predict(X_test)))
print('SVM ', accuracy_score(y_test, svmm.predict(X_test)))
print('KNN ', accuracy_score(y_test, knnm.predict(X_test[fewfeatures])))
print('RF ', accuracy_score(y_test, rfm.predict(X_test)))
print('XGB ', accuracy_score(y_test, xgbm.predict(X_test)))
print('Total time ', time.time()-time0)

In [None]:
# VotingClassifier:

estimator = []
#estimator.append(('LR', LogisticRegression(C=1)))
estimator.append(('SVM', SVC(C=1, probability = True)))
#estimator.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
estimator.append(('RF', RandomForestClassifier(max_depth=5, max_features=4, n_estimators=200)))
estimator.append(('XGB', XGBClassifier(eta=0.04, max_depth=3, n_estimators=200, 
                                       subsample=0.6, colsample_bytree=0.6)))
vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
vot_soft.fit(X_train, y_train)
print('VotingClassifier5 ', accuracy_score(y_train, vot_soft.predict(X_train)))
print('VotingClassifier5 ', accuracy_score(y_test, vot_soft.predict(X_test)))