In [1]:
##################################################################################################
### This script is ML Classification template, which should be applicable to most MLC projects ###
##################################################################################################

"""Structure of the script:
1.  Load all needed libraries and functions.
2.  Load data, do preliminary data exploration.
2.1 [Optional] Create more variables, delete variables.
3.  Deal with missing values, transform skewed variables.
4.  Trnasform features depending on their type. OHC.
5.  Create subsamples.
6.  Do scaling.
7.  Fit models, selecting hyperparameters via CV grid search.
8.  Evaluate performance of the selected models on test sample.
"""

'Structure of the script:\n1. Load all needed libraries and functions.\n2. Load data, do preliminary data exploration.\n3. [Optional] Transform skewed variables.\n4. Trnasform features depending on their type. OHC.\n5. Create subsamples.\n6. Do scaling.\n7. Fit models, selecting hyperparameters via CV grid search.\n8. Evaluate performance of the selected models on test sample.\n'

In [2]:
### 1.Load main libraries ###

import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)

In [3]:
def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()

In [49]:
### 2.Load data ###

path = '../input/spaceship-titanic/train.csv'
df = pd.read_csv(path) 
print(df.shape)
#df.drop(columns=['Name', 'Ticket', 'Cabin'],inplace=True)
#df.loc[df.Age.isnull(),'Age'] = df.Age.median()
df.head()

(8693, 14)


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [50]:
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols = ['HomePlanet','CryoSleep','VIP','Transported']

print(df[num_cols].describe())
print(df[cat_cols].apply(pd.Series.value_counts))
print(df.shape)

# sns.pairplot(df[['Survived', 'Pclass', 'Age', 'Fare']])
#draw_histograms(df, df.columns, 4, 3)

               Age   RoomService     FoodCourt  ShoppingMall           Spa        VRDeck
count  8514.000000   8512.000000   8510.000000   8485.000000   8510.000000   8505.000000
mean     28.827930    224.687617    458.077203    173.729169    311.138778    304.854791
std      14.489021    666.717663   1611.489240    604.696458   1136.705535   1145.717189
min       0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
25%      19.000000      0.000000      0.000000      0.000000      0.000000      0.000000
50%      27.000000      0.000000      0.000000      0.000000      0.000000      0.000000
75%      38.000000     47.000000     76.000000     27.000000     59.000000     46.000000
max      79.000000  14327.000000  29813.000000  23492.000000  22408.000000  24133.000000
        HomePlanet  CryoSleep     VIP  Transported
Earth       4602.0        NaN     NaN          NaN
Europa      2131.0        NaN     NaN          NaN
Mars        1759.0        NaN     NaN         

In [51]:
#%% 2.5 Create more features ###

df[['Group_Id', 'Passenger_Id']] = df['PassengerId'].str.split('_', 1, expand=True)
df[['Deck', 'Room', 'Side']] = df['Cabin'].str.split('/', 2, expand=True)
print(df.dtypes)

df[['Group_Id', 'Passenger_Id', 'Room']] = df[['Group_Id', 'Passenger_Id', 'Room']].apply(pd.to_numeric)
df.drop(columns=['PassengerId', 'Cabin'], inplace=True)
df

PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool
Group_Id         object
Passenger_Id     object
Deck             object
Room             object
Side             object
dtype: object


Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group_Id,Passenger_Id,Deck,Room,Side
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,1,B,0.0,P
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,1,F,0.0,S
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,1,A,0.0,S
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,2,A,0.0,S
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,1,F,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False,9276,1,A,98.0,P
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False,9278,1,G,1499.0,S
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True,9279,1,G,1500.0,S
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False,9280,1,E,608.0,S


In [52]:
#%% 3.Deal with missing values ###

df.info()
df.dropna(inplace=True)
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Name          8493 non-null   object 
 11  Transported   8693 non-null   bool   
 12  Group_Id      8693 non-null   int64  
 13  Passenger_Id  8693 non-null   int64  
 14  Deck          8494 non-null   object 
 15  Room          8494 non-null   float64
 16  Side          8494 non-null   object 
dtypes: bool(1), float64(7), int64(2), object(7)
memory usage: 1.1+ MB


(6606, 17)

In [53]:
#%% Transform some skewed variables ###

df['RoomService'] = np.log1p(df.RoomService)
df['FoodCourt'] = np.log1p(df.FoodCourt)
df['ShoppingMall'] = np.log1p(df.ShoppingMall)
df['Spa'] = np.log1p(df.Spa)
df['VRDeck'] = np.log1p(df.VRDeck)

In [None]:
#%% 4.Transform features depending on their type ###

# this is very important for ML application, where there are hundreds of features.
# If there are less than 20 features, can use standard approach.
# my approach of tackling one feature a time is not scalable. 

# use intuition to trim range or ordinary variables 
# can skip this step in general, since it is not scalable when number of features grows.
df.loc[df.SibSp>2,'SibSp']=3
df.loc[df.Parch>2,'Parch']=3

# identify binary and categorical variables
df_uniques = pd.DataFrame([[i, len(df[i].unique())] for i in df.columns], columns=['Variable', 'Unique Values']).set_index('Variable')
print(df_uniques)

binary_variables = list(df_uniques[df_uniques['Unique Values'] == 2].index)
categorical_variables = list(df_uniques[(6 >= df_uniques['Unique Values']) & (df_uniques['Unique Values'] > 2)].index)
numeric_variables = list(set(df.columns) - set(categorical_variables) - set(binary_variables))
print('Binary variables are ', binary_variables)
print('Categorical variables are ', categorical_variables)
print('Numeric variables are ', numeric_variables)

In [None]:
# ohc for binary variables #
lb = LabelBinarizer()
binary_variables.remove('Survived')
for column in binary_variables:
    df[column] = lb.fit_transform(df[column])

# ohc for categorical variables #
df = pd.get_dummies(df, columns = categorical_variables, drop_first=True)

print(df.shape)
print(df.head())

In [None]:
# %% 5.Creating subsamples ###

y = df['Survived']
X = df.drop(columns=['Survived'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [None]:
# %% 6.scaling numeric variables ###

draw_histograms(X_train, numeric_variables, 2, 3)

In [None]:
ss = StandardScaler()

for column in [numeric_variables]:
    X_train[column] = ss.fit_transform(X_train[column])
    X_test[column] = ss.transform(X_test[column])

In [None]:
####################
### 7.Fit models ###
####################

#%% Logistic regression ###

grid_values = {'penalty': ['l2'], 'C': list(np.arange(1,10.5,0.5))}
lr = LogisticRegression()
model_lr = GridSearchCV(lr, param_grid=grid_values, cv = 10)
model_lr.fit(X_train, y_train)
print(model_lr.best_score_, model_lr.best_params_)

# model_lr.predict(X_test)

In [None]:
#%% KNN ###

grid_values = dict(n_neighbors=np.arange(1,40))
knnm = KNeighborsClassifier()   
model_knn = GridSearchCV(knnm, param_grid=grid_values, cv = 10)
model_knn.fit(X_train, y_train)
print(model_knn.best_score_, model_knn.best_params_)

In [None]:
#%% SVM ###

grid_values = {'C': np.arange(0.05, 1, 0.05)} 
svmm = svm.SVC(kernel='rbf')
model_svm = GridSearchCV(svmm, param_grid=grid_values, cv = 10)
model_svm.fit(X_train, y_train)
print(model_svm.best_score_, model_svm.best_params_)

In [None]:
#%% RF ###

# may look here: https://www.geeksforgeeks.org/hyperparameter-tuning/

grid_values = [{'max_depth': list(range(2, 9, 2)), 'max_features': list(np.arange(0.3,0.71,0.1))}]
rfc = RandomForestClassifier(random_state=42)
model_rf = GridSearchCV(rfc, grid_values, cv = 5, scoring='accuracy')
model_rf.fit(X_train, y_train)
print(model_rf.best_score_, model_rf.best_params_)

In [None]:
#%% XGBoost ###
# run this code only on Kaggle with GPU

estimator = XGBClassifier(
    nthread=4,
    seed=42,
    use_label_encoder=False
)

parameters = {
    'max_depth': range (2, 5, 1),
    'n_estimators': range(5, 50, 5),
    'learning_rate': [0.01, 0.05, 0.1, 0.15]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 10,
    verbose=True
)

grid_search.fit(X_train, y_train, eval_metric='rmse')
print(grid_search.best_score_, grid_search.best_params_)

In [None]:
#%% 8.Evaluate performance oos ###

yhat_lm = model_lr.predict(X_test)
yhat_knn = model_knn.predict(X_test)
yhat_svm = model_svm.predict(X_test)
yhat_rf = model_rf.predict(X_test)
#yhat_bt = grid_search.predict(X_test)
print('Accuracy of logistic regression is ', 1-(np.abs(yhat_lm-y_test)).mean())
print('Accuracy of KNN is ', 1-(np.abs(yhat_knn-y_test)).mean())
print('Accuracy of SVM is ', 1-(np.abs(yhat_svm-y_test)).mean())
print('Accuracy of RF is ', 1-(np.abs(yhat_rf-y_test)).mean())
#print('Accuracy of Boosted Tree is ', 1-(np.abs(yhat_bt-y_test)).mean())