In [48]:
##################################################################################################
### This script is ML Classification template, which should be applicable to most MLC projects ###
##################################################################################################

"""Structure of the script:
1.  Load all needed libraries and functions.
2.  Load data, do preliminary data exploration.
2.1 [Optional] Create more variables, delete variables.
3.  Deal with missing values, transform skewed variables.
4.  Trnasform features depending on their type. OHC.
5.  Create subsamples.
6.  Do scaling.
7.  Fit models, selecting hyperparameters via CV grid search.
8.  Evaluate performance of the selected models on test sample.
"""

### 1.Load main libraries ###

import time
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier



pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)

# Turn off warnings. Be warned!
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()

In [68]:
### 2.Load data ###

time1 = time.time()

path = '../input/spaceship-titanic/train.csv'
train = pd.read_csv(path) 
print(train.shape)
train.head(2)

(8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True


In [70]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [50]:
test_data=pd.read_csv('../input/spaceship-titanic/test.csv')

print(train.shape, test_data.shape)
test = test_data.copy()
train['sample']='train'
test['Transported'] = np.nan
test['sample']='test'

df=pd.concat([train, test])
df.reset_index(inplace=True, drop=True)
print(df.shape)
df.tail(3)

(8693, 14) (4277, 13)
(12970, 15)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,sample
12967,9271_01,Mars,True,D/296/P,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,Jayrin Pore,,test
12968,9273_01,Europa,False,D/297/P,,,False,0.0,2680.0,0.0,0.0,523.0,Kitakan Conale,,test
12969,9277_01,Earth,True,G/1498/S,PSO J318.5-22,43.0,False,0.0,0.0,0.0,0.0,0.0,Lilace Leonzaley,,test


In [51]:
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols = ['HomePlanet','CryoSleep','VIP','Transported']

print(df[num_cols].describe())
print(df[cat_cols].apply(pd.Series.value_counts))
print(df.shape)

# sns.pairplot(df[['Survived', 'Pclass', 'Age', 'Fare']])
#draw_histograms(df, df.columns, 4, 3)

                Age   RoomService     FoodCourt  ShoppingMall           Spa        VRDeck
count  12700.000000  12707.000000  12681.000000  12664.000000  12686.000000  12702.000000
mean      28.771969    222.897852    451.961675    174.906033    308.476904    306.789482
std       14.387261    647.596664   1584.370747    590.558690   1130.279641   1180.097223
min        0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
25%       19.000000      0.000000      0.000000      0.000000      0.000000      0.000000
50%       27.000000      0.000000      0.000000      0.000000      0.000000      0.000000
75%       38.000000     49.000000     77.000000     29.000000     57.000000     42.000000
max       79.000000  14327.000000  29813.000000  23492.000000  22408.000000  24133.000000
        HomePlanet  CryoSleep      VIP  Transported
False          NaN     8079.0  12401.0       4315.0
True           NaN     4581.0    273.0       4378.0
Earth       6865.0        NaN     

In [52]:
#%% 2.5 Create more features ###

df[['Group_Id', 'Passeng_Id']] = df['PassengerId'].str.split('_', 1, expand=True)
df[['Deck', 'Room', 'Side']] = df['Cabin'].str.split('/', 2, expand=True)
print(df.dtypes)

df[['Group_Id', 'Passeng_Id', 'Room']] = df[['Group_Id', 'Passeng_Id', 'Room']].apply(pd.to_numeric)
df.drop(columns=['Passeng_Id', 'Cabin', 'Group_Id', 'Room', 'Name'], inplace=True)
df

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported     float64
sample           object
Group_Id         object
Passeng_Id       object
Deck             object
Room             object
Side             object
dtype: object


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,sample,Deck,Side
0,0001_01,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0.0,train,B,P
1,0002_01,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1.0,train,F,S
2,0003_01,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0.0,train,A,S
3,0003_02,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0.0,train,A,S
4,0004_01,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1.0,train,F,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12965,9266_02,Earth,True,TRAPPIST-1e,34.0,False,0.0,0.0,0.0,0.0,0.0,,test,G,S
12966,9269_01,Earth,False,TRAPPIST-1e,42.0,False,0.0,847.0,17.0,10.0,144.0,,test,,
12967,9271_01,Mars,True,55 Cancri e,,False,0.0,0.0,0.0,0.0,0.0,,test,D,P
12968,9273_01,Europa,False,,,False,0.0,2680.0,0.0,0.0,523.0,,test,D,P


In [53]:
#%% 3.Deal with missing values ###

df.info()
df.dropna(inplace=True, subset=df.columns.drop(['Transported']))
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12970 entries, 0 to 12969
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   12970 non-null  object 
 1   HomePlanet    12682 non-null  object 
 2   CryoSleep     12660 non-null  object 
 3   Destination   12696 non-null  object 
 4   Age           12700 non-null  float64
 5   VIP           12674 non-null  object 
 6   RoomService   12707 non-null  float64
 7   FoodCourt     12681 non-null  float64
 8   ShoppingMall  12664 non-null  float64
 9   Spa           12686 non-null  float64
 10  VRDeck        12702 non-null  float64
 11  Transported   8693 non-null   float64
 12  sample        12970 non-null  object 
 13  Deck          12671 non-null  object 
 14  Side          12671 non-null  object 
dtypes: float64(7), object(8)
memory usage: 1.5+ MB


(10119, 15)

In [54]:
#%% Transform some skewed variables ###

df['RoomService'] = np.log1p(df.RoomService)
df['FoodCourt'] = np.log1p(df.FoodCourt)
df['ShoppingMall'] = np.log1p(df.ShoppingMall)
df['Spa'] = np.log1p(df.Spa)
df['VRDeck'] = np.log1p(df.VRDeck)

In [55]:
df['CryoSleep'] = df['CryoSleep'].astype(int)
df['VIP'] = df['VIP'].astype(int)
df.loc[~(df.Transported.isnull()),'Transported'] = df.loc[~(df.Transported.isnull()),'Transported'].astype(int)
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,sample,Deck,Side
0,0001_01,Europa,0,TRAPPIST-1e,39.0,0,0.0,0.0,0.0,0.0,0.0,0.0,train,B,P
1,0002_01,Earth,0,TRAPPIST-1e,24.0,0,4.70048,2.302585,3.258097,6.309918,3.806662,1.0,train,F,S
2,0003_01,Europa,0,TRAPPIST-1e,58.0,1,3.78419,8.18228,0.0,8.812248,3.912023,0.0,train,A,S
3,0003_02,Europa,0,TRAPPIST-1e,33.0,0,0.0,7.157735,5.918894,8.110728,5.267858,0.0,train,A,S
4,0004_01,Earth,0,TRAPPIST-1e,16.0,0,5.717028,4.26268,5.023881,6.338594,1.098612,1.0,train,F,S


In [56]:
#%% 4.Transform features depending on their type ###

# this is very important for ML application, where there are hundreds of features.
# If there are less than 20 features, can use standard approach.
# my approach of tackling one feature a time is not scalable. 

# use intuition to trim range or ordinary variables 
# can skip this step in general, since it is not scalable when number of features grows.

# identify binary and categorical variables
df_uniques = pd.DataFrame([[i, len(df[i].unique())] for i in df.columns], columns=['Variable', 'Unique Values']).set_index('Variable')
print(df_uniques)

binary_variables = list(df_uniques[df_uniques['Unique Values'] == 2].index)
categorical_variables = list(df_uniques[(10 >= df_uniques['Unique Values']) & (df_uniques['Unique Values'] > 2)].index)
numeric_variables = list(set(df.columns) - set(categorical_variables) - set(binary_variables))
print('Binary variables are ', binary_variables)
print('Categorical variables are ', categorical_variables)
print('Numeric variables are ', numeric_variables)

              Unique Values
Variable                   
PassengerId           10119
HomePlanet                3
CryoSleep                 2
Destination               3
Age                      80
VIP                       2
RoomService            1412
FoodCourt              1717
ShoppingMall           1235
Spa                    1484
VRDeck                 1438
Transported               3
sample                    2
Deck                      8
Side                      2
Binary variables are  ['CryoSleep', 'VIP', 'sample', 'Side']
Categorical variables are  ['HomePlanet', 'Destination', 'Transported', 'Deck']
Numeric variables are  ['Age', 'PassengerId', 'ShoppingMall', 'FoodCourt', 'VRDeck', 'Spa', 'RoomService']


In [57]:
# ohc for binary variables #
lb = LabelBinarizer()
binary_variables.remove('sample')
for column in binary_variables:
    df[column] = lb.fit_transform(df[column])

# ohc for categorical variables #
categorical_variables.remove('Transported')
df = pd.get_dummies(df, columns = categorical_variables, drop_first=True)

print(df.shape)
print(df.head())
print(df.dtypes)

(10119, 23)
  PassengerId  CryoSleep   Age  VIP  RoomService  FoodCourt  ShoppingMall       Spa    VRDeck  Transported  ... HomePlanet_Mars  Destination_PSO J318.5-22  Destination_TRAPPIST-1e  Deck_B  Deck_C  Deck_D  Deck_E  Deck_F  Deck_G  Deck_T
0     0001_01          0  39.0    0     0.000000   0.000000      0.000000  0.000000  0.000000          0.0  ...               0                          0                        1       1       0       0       0       0       0       0
1     0002_01          0  24.0    0     4.700480   2.302585      3.258097  6.309918  3.806662          1.0  ...               0                          0                        1       0       0       0       0       1       0       0
2     0003_01          0  58.0    1     3.784190   8.182280      0.000000  8.812248  3.912023          0.0  ...               0                          0                        1       0       0       0       0       0       0       0
3     0003_02          0  33.0    0     0.00

In [58]:
# %% 5.Creating subsamples ###

train = df[df['sample']=='train'].copy()
train.drop(columns=['sample'], inplace=True)
test = df[df['sample']=='test'].copy()
test.drop(columns=['sample'], inplace=True)

print(train.shape)
print(test.shape)
train.head(3)

(6764, 22)
(3355, 22)


Unnamed: 0,PassengerId,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,...,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
0,0001_01,0,39.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,1,0,0,0,0,0,0
1,0002_01,0,24.0,0,4.70048,2.302585,3.258097,6.309918,3.806662,1.0,...,0,0,1,0,0,0,0,1,0,0
2,0003_01,0,58.0,1,3.78419,8.18228,0.0,8.812248,3.912023,0.0,...,0,0,1,0,0,0,0,0,0,0


In [59]:
# %% 5.Creating subsamples ###

y_train = train['Transported']
X_train = train.drop(columns=['Transported'])
X_test = test.drop(columns=['Transported'])
print(X_train.shape)

X_train, X_traintest, y_train, y_traintest = train_test_split(X_train,y_train,test_size=0.1, random_state=2)

X_train_id = X_train.copy()
X_traintest_id = X_traintest.copy() 
X_test = X_test.copy()

X_train.drop(columns=['PassengerId'], inplace=True)
X_traintest.drop(columns=['PassengerId'], inplace=True)
X_test.drop(columns=['PassengerId'], inplace=True)


print(X_train.shape)
print(X_traintest.shape)
print(X_test.shape)
X_traintest.head(3)

# 'traintest' is hold-out sample to veify that chosen model indeed works.
# it is different from 'test', which is truly out of sample.

(6764, 21)
(6087, 20)
(677, 20)
(3355, 20)


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
4412,0,50.0,0,5.420535,0.0,6.680855,4.682131,0.0,1,0,1,0,1,0,0,0,0,1,0,0
5164,0,27.0,0,0.0,1.94591,0.0,6.703188,4.682131,1,0,0,0,1,0,0,0,0,0,1,0
6800,0,18.0,0,1.386294,6.535241,0.0,0.0,3.178054,1,0,0,0,1,0,0,0,0,0,1,0


In [61]:
ss = StandardScaler()
numeric_variables.remove('PassengerId')

for column in [numeric_variables]:
    X_train[column] = ss.fit_transform(X_train[column])
    X_traintest[column] = ss.transform(X_traintest[column])
    X_test[column] = ss.transform(X_test[column])

In [62]:
####################
### 7.Fit models ###
####################

time3 = time.time()

#%% Logistic regression ###

grid_values = {'penalty': ['l2'], 'C': list(np.arange(0.2,2,0.2))}
lr = LogisticRegression()
model_lr = GridSearchCV(lr, param_grid=grid_values, cv = 4)
model_lr.fit(X_train, y_train)
print('logistic ', model_lr.best_score_, model_lr.best_params_)

# model_lr.predict(X_test)

#%% KNN ###

grid_values = dict(n_neighbors=np.arange(10,41,2))
knnm = KNeighborsClassifier()   
model_knn = GridSearchCV(knnm, param_grid=grid_values, cv = 4)
model_knn.fit(X_train, y_train)
print('knn ', model_knn.best_score_, model_knn.best_params_)

#%% SVM ###

grid_values = {'C': np.arange(0.1, 2.0, 0.2)} 
svmm = svm.SVC(kernel='rbf')
model_svm = GridSearchCV(svmm, param_grid=grid_values, cv = 4)
model_svm.fit(X_train, y_train)
print('svm ', model_svm.best_score_, model_svm.best_params_)

#%% RF ###

# may look here: https://www.geeksforgeeks.org/hyperparameter-tuning/

grid_values = [{'max_depth': list(range(2, 9, 1)), 'max_features': list(np.arange(0.3,0.51,0.1))}]
rfc = RandomForestClassifier(random_state=42)
model_rf = GridSearchCV(rfc, grid_values, cv = 4, scoring='accuracy')
model_rf.fit(X_train, y_train)
print('rf ', model_rf.best_score_, model_rf.best_params_)
print('4 models time is ', time.time()-time3)

logistic  0.7798589134508471 {'C': 0.2, 'penalty': 'l2'}
knn  0.794644361332929 {'n_neighbors': 38}
svm  0.8053230463394215 {'C': 1.3000000000000003}
rf  0.80811477683003 {'max_depth': 8, 'max_features': 0.4}
4 models time is  87.27648830413818


In [56]:
#%% XGBoost ###
# run this code only on Kaggle with GPU

time4 = time.time()

estimator = XGBClassifier(
    nthread=4,
    seed=42,
    use_label_encoder=False
)

parameters = {
    'max_depth': range (2, 4, 1),
    'n_estimators': range(50, 301, 50),
    'learning_rate': [0.01, 0.03, 0.05]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = 4,
    verbose=True
)

grid_search.fit(X_train, y_train, eval_metric='rmse')
print(grid_search.best_score_, grid_search.best_params_)
print('XGB model time is ', time.time()-time4)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


KeyboardInterrupt: 

In [63]:
#%% 8.Evaluate performance oos ###

yhat_lm = model_lr.predict(X_traintest)
yhat_knn = model_knn.predict(X_traintest)
yhat_svm = model_svm.predict(X_traintest)
yhat_rf = model_rf.predict(X_traintest)
#yhat_bt = grid_search.predict(X_test)
print('Accuracy of logistic regression is ', 1-(np.abs(yhat_lm-y_traintest)).mean())
print('Accuracy of KNN is ', 1-(np.abs(yhat_knn-y_traintest)).mean())
print('Accuracy of SVM is ', 1-(np.abs(yhat_svm-y_traintest)).mean())
print('Accuracy of RF is ', 1-(np.abs(yhat_rf-y_traintest)).mean())
#print('Accuracy of Boosted Tree is ', 1-(np.abs(yhat_bt-y_test)).mean())
print('Total time is ', time.time()-time1)

Accuracy of logistic regression is  0.793205317577548
Accuracy of KNN is  0.7813884785819794
Accuracy of SVM is  0.7991137370753323
Accuracy of RF is  0.8064992614475628
Total time is  150.75584435462952


In [64]:
### Export results ###
yhat_knn = model_knn.predict(X_test).astype(int)
yhat_svm = model_svm.predict(X_test).astype(int)
yhat_rf = model_rf.predict(X_test).astype(int)
#yhat_bt = grid_search.predict(X_test).astype(int)

In [66]:
submission_df_knn = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_knn}, columns=['PassengerId', 'Transported'])
submission_df_svm = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_svm}, columns=['PassengerId', 'Transported'])
submission_df_rf = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_rf}, columns=['PassengerId', 'Transported'])
#submission_df_bt = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_bt}, columns=['PassengerId', 'Transported'])

submission_df_knn.to_csv('submissions_SpaceTitanic_i1_knn.csv',index=False)
submission_df_svm.to_csv('submissions_SpaceTitanic_i1_svm.csv',index=False)
submission_df_rf.to_csv('submissions_SpaceTitanic_i1_rf.csv',index=False)
#submission_df_bt.to_csv('submissions_Titanic_i10_bt1.csv',index=False)

In [67]:
os.chdir(r'/kaggle/working')

from IPython.display import FileLink
FileLink(r'submissions_SpaceTitanic_i1_rf.csv')

In [None]:
FileLink(r'submissions_Titanic_i10_rf1.csv')