In [None]:
##################################################################################################
### This script is ML Classification template, which should be applicable to most MLC projects ###
##################################################################################################

"""Structure of the script:
1.  Load all needed libraries and functions.
2.  Load data, do preliminary data exploration.
2.1 [Optional] Create more variables, delete variables.
3.  Deal with missing values, transform skewed variables.
4.  Transform features depending on their type. OHC.
5.  Create subsamples.
6.  Do scaling.
7.  Fit models, selecting hyperparameters via CV grid search.
8.  Evaluate performance of the selected models on test sample.
"""

### 1.Load main libraries ###

import time
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier



pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)

# Turn off warnings. Be warned!
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and does nothing."""
    return None


# Rebind the module attribute so later ``warnings.warn(...)`` calls hit the stub above.
warnings.warn = warn

def draw_histograms(df, variables, n_rows, n_cols):
    """Draw one 10-bin histogram per entry of ``variables`` on an n_rows x n_cols grid.

    Each column ``df[name]`` is plotted in its own subplot, titled
    "<name> Distribution"; the finished figure is shown with ``plt.show()``.

    Adapted from
    https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    """
    figure = plt.figure()
    position = 0
    for column in variables:
        position += 1  # subplot slots are numbered from 1
        axis = figure.add_subplot(n_rows, n_cols, position)
        df[column].hist(bins=10, ax=axis)
        axis.set_title(column + " Distribution")
    figure.tight_layout()
    plt.show()

### 2.Load data ###

# Wall-clock start; the elapsed total is printed at the end of the script.
time1 = time.time()

path = '../input/spaceship-titanic/train.csv'
train = pd.read_csv(path)
print(train.shape)

test_data = pd.read_csv('../input/spaceship-titanic/test.csv')
print(train.shape, test_data.shape)

# Tag every row with its origin so both files can go through one preprocessing
# pass and be separated again later. The test rows get an all-NaN target column.
train['sample'] = 'train'
test = test_data.copy()
test['Transported'] = np.nan
test['sample'] = 'test'

df = pd.concat([train, test]).reset_index(drop=True)
print(df.shape)

num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols = ['HomePlanet','CryoSleep','VIP', 'Destination', 'Transported']

# Quick look at the data: summary statistics for the numeric columns and
# level counts for the categorical ones.
numeric_summary = df[num_cols].describe()
print(numeric_summary)
level_counts = df[cat_cols].apply(pd.Series.value_counts)
print(level_counts)
print(df.shape)

# Optional visual exploration (disabled by default):
# sns.pairplot(df[num_cols])
# draw_histograms(df, num_cols, 2, 3)

#%% 2.5 Create more features ###

# PassengerId is split once on '_' into (Group_Id, Passeng_Id); Cabin is split on
# '/' into (Deck, Room, Side). The split limit is passed as the keyword ``n``:
# pandas >= 2.0 no longer accepts it positionally.
df[['Group_Id', 'Passeng_Id']] = df['PassengerId'].str.split('_', n=1, expand=True)
df[['Deck', 'Room', 'Side']] = df['Cabin'].str.split('/', n=2, expand=True)
print(df.dtypes)

# The numeric id parts are converted (this also validates that they parse as
# numbers) but are not used as features: together with the raw Cabin and Name
# columns they are dropped straight away. Only Deck and Side survive.
id_parts = ['Group_Id', 'Passeng_Id', 'Room']
df[id_parts] = df[id_parts].apply(pd.to_numeric)
df.drop(columns=['Passeng_Id', 'Cabin', 'Group_Id', 'Room', 'Name'], inplace=True)

#%% 3.Deal with missing values ###

df.info()
# Alternative: drop incomplete rows instead of imputing.
#df.dropna(inplace=True, subset=df.columns.drop(['Transported']))

# Numeric features: impute with the column median. The median is computed on the
# numeric columns only -- calling df.median() on the whole frame fails on
# pandas >= 2.0 because of the string columns.
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Categorical features: impute with the most frequent level (value_counts() is
# sorted by descending frequency). The result is assigned back explicitly rather
# than via ``df[col].fillna(..., inplace=True)``, which operates on a temporary
# object and does nothing under pandas copy-on-write.
for column in ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Deck', 'Side']:
    most_frequent = df[column].value_counts().index[0]
    df[column] = df[column].fillna(most_frequent)
df.describe(include='all')

#%% Transform some skewed variables ###

# log(1 + x) compresses the long right tail of the spending columns and maps 0 to 0.
for column in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    df[column] = np.log1p(df[column])

# Boolean flags become 0/1 integers. The target is converted only where it is
# known; the rows coming from the test file keep NaN.
df['CryoSleep'] = df['CryoSleep'].astype(int)
df['VIP'] = df['VIP'].astype(int)
has_target = df['Transported'].notnull()
df.loc[has_target, 'Transported'] = df.loc[has_target, 'Transported'].astype(int)
df.head()

#%% 4.Transform features depending on their type ###

# This step matters most for ML applications with hundreds of features, where
# tackling one feature at a time does not scale; with fewer than ~20 features a
# manual approach would also do. Trimming the range of ordinal variables by
# intuition is skipped for the same scalability reason.

# Classify every column by its number of distinct values (NaN counts as a value):
#   exactly 2 -> binary, 3..10 -> categorical, everything else -> numeric.
df_uniques = pd.DataFrame(
    [[name, len(df[name].unique())] for name in df.columns],
    columns=['Variable', 'Unique Values'],
).set_index('Variable')
print(df_uniques)

unique_counts = df_uniques['Unique Values']
binary_variables = list(df_uniques[unique_counts == 2].index)
categorical_variables = list(df_uniques[(unique_counts > 2) & (unique_counts <= 10)].index)
# Keep the frame's column order instead of relying on set-difference ordering,
# so the list (and its printout) is the same on every run.
already_typed = set(categorical_variables) | set(binary_variables)
numeric_variables = [name for name in df.columns if name not in already_typed]
print('Binary variables are ', binary_variables)
print('Categorical variables are ', categorical_variables)
print('Numeric variables are ', numeric_variables)

# ohc for binary variables #
# 'sample' only marks the train/test origin of a row, so it is not encoded.
lb = LabelBinarizer()
binary_variables.remove('sample')
for column in binary_variables:
    # For a two-level column LabelBinarizer returns an (n, 1) array; flatten it.
    df[column] = lb.fit_transform(df[column]).ravel()

# ohc for categorical variables #
# The target is left as is. The dummy dtype is pinned to uint8 (the default
# before pandas 2.0): the newer bool default would turn the mixed
# numeric/indicator feature matrix into an object array when converted to NumPy.
categorical_variables.remove('Transported')
df = pd.get_dummies(df, columns=categorical_variables, drop_first=True, dtype=np.uint8)

print(df.shape)
print(df.head())
print(df.dtypes)

# %% 5.Creating subsamples ###

# Route the rows back to their source by the 'sample' tag, then discard the tag.
is_train_row = df['sample'] == 'train'
is_test_row = df['sample'] == 'test'
train = df.loc[is_train_row].drop(columns=['sample'])
test = df.loc[is_test_row].drop(columns=['sample'])

for subset in (train, test):
    print(subset.shape)
train.head(3)

# %% 5.Creating subsamples ###

# The target column was concatenated with the NaN-filled test target earlier, so
# it may carry object dtype; cast the (fully observed) training labels to int so
# every estimator receives a proper 0/1 label vector.
# NOTE(review): assumes no training row has a missing 'Transported' value.
y_train = train['Transported'].astype(int)
X_train = train.drop(columns=['Transported'])
X_test = test.drop(columns=['Transported'])
print(X_train.shape)

# 'traintest' is a hold-out sample used to verify that the chosen model indeed works.
# It is different from 'test', which is truly out of sample.
X_train, X_traintest, y_train, y_traintest = train_test_split(
    X_train, y_train, test_size=0.1, random_state=6)

# Keep copies that still contain PassengerId before the id column is removed
# from the model inputs.
X_train_id = X_train.copy()
X_traintest_id = X_traintest.copy()

X_train = X_train.drop(columns=['PassengerId'])
X_traintest = X_traintest.drop(columns=['PassengerId'])
X_test = X_test.drop(columns=['PassengerId'])

print(X_train.shape)
print(X_traintest.shape)
print(X_test.shape)

### 6.Do scaling ###

# PassengerId was classified as "numeric" by the unique-value heuristic but is an
# identifier, not a feature. Filtering (rather than list.remove) keeps this step
# safe to re-run.
numeric_variables = [name for name in numeric_variables if name != 'PassengerId']

# Fit the scaler on the training split only and reuse its statistics for the
# hold-out and test splits, so no information leaks from them. All numeric
# columns are scaled in one call.
ss = StandardScaler()
X_train[numeric_variables] = ss.fit_transform(X_train[numeric_variables])
X_traintest[numeric_variables] = ss.transform(X_traintest[numeric_variables])
X_test[numeric_variables] = ss.transform(X_test[numeric_variables])

####################
### 7.Fit models ###
####################

time3 = time.time()

#%% KNN ###

# Candidate neighbourhood sizes 10, 15, ..., 40, compared with 8-fold cross-validation.
grid_values = {'n_neighbors': np.arange(10, 41, 5)}
knnm = KNeighborsClassifier()
model_knn = GridSearchCV(estimator=knnm, param_grid=grid_values, cv=8)
model_knn.fit(X_train, y_train)

knn_best_score = model_knn.best_score_
knn_best_params = model_knn.best_params_
print('knn ', knn_best_score, knn_best_params)


#%% XGBoost ###
# run this code only on Kaggle with GPU

time4 = time.time()

# The evaluation metric is set on the estimator instead of being passed to fit():
# the fit(eval_metric=...) argument was deprecated and later removed from the
# XGBoost scikit-learn API, and 'rmse' is a regression metric. No eval_set is
# supplied, and model selection below is driven by scoring='roc_auc'.
estimator = XGBClassifier(
    nthread=4,
    seed=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# max_depth: range(2, 3, 1) yields the single value 2.
parameters = {
    'max_depth': range (2, 3, 1),
    'n_estimators': [400, 500, 600, 800, 1000],
    'learning_rate': [0.02, 0.03, 0.04]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = 8,
    verbose=True
)

grid_search.fit(X_train, y_train)
print('XGB tree ', grid_search.best_score_, grid_search.best_params_)

# Second boosted-tree model with hand-picked hyperparameters, kept for comparison
# with the grid-searched one.
xgbc = XGBClassifier(nthread=4, seed=42, use_label_encoder=False,
                     max_depth=2, n_estimators=400, learning_rate=0.02)
xgbc.fit(X_train, y_train)

#%% 8.Evaluate performance oos ###

yhat_knn = model_knn.predict(X_traintest)
yhat_btcv = grid_search.predict(X_traintest)
yhat_btm = xgbc.predict(X_traintest)

# Accuracy is reported as one minus the mean absolute difference between the
# predictions and the hold-out labels.
for label, predictions in (
    ('Accuracy of KNN is ', yhat_knn),
    ('Accuracy of Boosted Tree cv is ', yhat_btcv),
    ('Accuracy of Boosted Tree m is ', yhat_btm),
):
    error_rate = (np.abs(predictions - y_traintest)).mean()
    print(label, 1 - error_rate)

elapsed_total = time.time() - time1
print('Total time is ', elapsed_total)

# When dealing only with non-missing data, I reliably get 80.0-80.5% accuracy for svm and rf.

(8693, 14)
(8693, 14) (4277, 13)
(12970, 15)
                Age   RoomService     FoodCourt  ShoppingMall           Spa        VRDeck
count  12700.000000  12707.000000  12681.000000  12664.000000  12686.000000  12702.000000
mean      28.771969    222.897852    451.961675    174.906033    308.476904    306.789482
std       14.387261    647.596664   1584.370747    590.558690   1130.279641   1180.097223
min        0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
25%       19.000000      0.000000      0.000000      0.000000      0.000000      0.000000
50%       27.000000      0.000000      0.000000      0.000000      0.000000      0.000000
75%       38.000000     49.000000     77.000000     29.000000     57.000000     42.000000
max       79.000000  14327.000000  29813.000000  23492.000000  22408.000000  24133.000000
               HomePlanet  CryoSleep      VIP  Destination  Transported
False                 NaN     8079.0  12401.0          NaN       4315.0
T

In [78]:
# Refit the boosted-tree model with hand-picked hyperparameters and score it on
# the 'traintest' hold-out split.
xgb_settings = dict(
    nthread=4,
    seed=42,
    use_label_encoder=False,
    max_depth=2,
    n_estimators=800,
    learning_rate=0.03,
)
xgbc = XGBClassifier(**xgb_settings)
xgbc.fit(X_train, y_train)

yhat_btm = xgbc.predict(X_traintest)
holdout_error = (np.abs(yhat_btm - y_traintest)).mean()
print('Accuracy of Boosted Tree m is ', 1 - holdout_error)

Accuracy of Boosted Tree m is  0.8011494252873563


In [42]:
# here I will add ANN

# Every Keras name comes from tensorflow.keras: mixing the standalone ``keras``
# package with ``tensorflow.keras`` objects in one model can fail when the two
# installations differ.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras import optimizers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

# Hand Keras plain float32 arrays so the fit does not depend on the dtypes of
# the DataFrame columns or of the label Series.
X_train_nn = np.asarray(X_train, dtype='float32')
y_train_nn = np.asarray(y_train, dtype='float32')
X_traintest_nn = np.asarray(X_traintest, dtype='float32')
y_traintest_nn = np.asarray(y_traintest, dtype='float32')

# Three hidden blocks (50 -> 50 -> 10 units), each followed by dropout and batch
# normalisation, then a single sigmoid unit for the binary target. Only the
# first layer declares the input shape; later layers infer theirs.
model_2 = Sequential()
model_2.add(Dense(50, input_shape=(X_train_nn.shape[1],), activation="relu"))
model_2.add(Dropout(0.4))
model_2.add(BatchNormalization())
model_2.add(Dense(50, activation="relu"))
model_2.add(Dropout(0.4))
model_2.add(BatchNormalization())
model_2.add(Dense(10, activation="relu"))
model_2.add(Dropout(0.4))
model_2.add(BatchNormalization())
model_2.add(Dense(1, activation="sigmoid"))

# Stop once the validation loss has not improved for 20 epochs, and roll the
# model back to the best epoch instead of keeping the weights of the last one.
es = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

model_2.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
# NOTE(review): early stopping is validated on the same 'traintest' hold-out that
# is used to compare models, so the accuracy reported here is optimistic.
run_hist_2 = model_2.fit(
    X_train_nn, y_train_nn,
    validation_data=(X_traintest_nn, y_traintest_nn),
    epochs=300,
    callbacks=[es],
)


Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

In [17]:
### Export results ###

# Lookup table turning the integer classes into booleans; a value that is not in
# the table is passed through unchanged.
replacements = {1: True, 0: False}
replacer = replacements.get

# Predict the out-of-sample rows, cast to int, then translate each class.
test_classes = xgbc.predict(X_test).astype(int)
yhat_btm = list(map(lambda n: replacer(n, n), test_classes))

In [20]:
# Two-column submission table: the PassengerId of every test row and its predicted label.
submission_columns = ['PassengerId', 'Transported']
submission_payload = {
    'PassengerId': test.PassengerId,
    'Transported': yhat_btm,
}
submission_df_bt = pd.DataFrame(submission_payload, columns=submission_columns)
submission_df_bt.to_csv('submissions_SpaceTitanic_i1_bt.csv', index=False)

In [21]:
# Change the process working directory so the relative file name below resolves
# there (presumably Kaggle's writable output folder -- confirm for other environments).
os.chdir(r'/kaggle/working')

from IPython.display import FileLink
# Last expression of the cell: rendered as a link to the submission file.
FileLink(r'submissions_SpaceTitanic_i1_bt.csv')

In [None]:
# NOTE(review): no cell visible in this notebook writes 'submissions_Titanic_i10_rf1.csv'
# (the export step writes 'submissions_SpaceTitanic_i1_bt.csv'); this looks like a
# leftover from another project -- confirm, then fix the name or delete the cell.
FileLink(r'submissions_Titanic_i10_rf1.csv')

In [None]:
#%% XGBoost ###
# run this code only on Kaggle with GPU

time4 = time.time()

# The evaluation metric is set on the estimator instead of being passed to fit():
# the fit(eval_metric=...) argument was deprecated and later removed from the
# XGBoost scikit-learn API, and 'rmse' is a regression metric. No eval_set is
# supplied, and model selection below is driven by scoring='roc_auc'.
estimator = XGBClassifier(
    nthread=4,
    seed=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Grid: max_depth in {2, 3}, n_estimators in {50, 100, ..., 300}, three learning rates.
parameters = {
    'max_depth': range (2, 4, 1),
    'n_estimators': range(50, 301, 50),
    'learning_rate': [0.01, 0.03, 0.05]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = -1,
    cv = 8,
    verbose=True
)

grid_search.fit(X_train, y_train)
print(grid_search.best_score_, grid_search.best_params_)
print('XGB model time is ', time.time()-time4)