In [3]:
##################################################################################################
### This script is ML Classification template, which should be applicable to most MLC projects ###
##################################################################################################

"""Structure of the script:
1.  Load all needed libraries and functions.
2.  Load data, do preliminary data exploration.
2.1 [Optional] Create more variables, delete variables.
3.  Deal with missing values, transform skewed variables.
4.  Trnasform features depending on their type. OHC.
5.  Create subsamples.
6.  Do scaling.
7.  Fit models, selecting hyperparameters via CV grid search.
8.  Evaluate performance of the selected models on test sample.
"""

### 1.Load main libraries ###

import time, os, warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from xgboost import XGBRegressor, XGBClassifier


pd.set_option('display.max_columns', 25)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.simplefilter("ignore")


def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()

### 2.Load data ###

time1 = time.time()

path = '../input/spaceship-titanic/train.csv'
train = pd.read_csv(path) 
print(train.shape)
train.head(2)

test_data=pd.read_csv('../input/spaceship-titanic/test.csv')

print(train.shape, test_data.shape)
test = test_data.copy()
train['sample']='train'
test['Transported'] = np.nan
test['sample']='test'

df=pd.concat([train, test])
df.reset_index(inplace=True, drop=True)
print(df.shape)
df.tail(3)

num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
cat_cols = ['HomePlanet','CryoSleep','VIP', 'Destination', 'Transported']

print(df[num_cols].describe())
print(df[cat_cols].apply(pd.Series.value_counts))
print(df.shape)

# sns.pairplot(df[['Survived', 'Pclass', 'Age', 'Fare']])
#draw_histograms(df, df.columns, 4, 3)

#%% 2.5 Create more features ###

df[['Group_Id', 'Passeng_Id']] = df['PassengerId'].str.split('_', 1, expand=True)
df[['Deck', 'Room', 'Side']] = df['Cabin'].str.split('/', 2, expand=True)
print(df.dtypes)

df[['Group_Id', 'Passeng_Id', 'Room']] = df[['Group_Id', 'Passeng_Id', 'Room']].apply(pd.to_numeric)
df.drop(columns=['Passeng_Id', 'Cabin', 'Group_Id', 'Room', 'Name'], inplace=True)

#%% 3.Deal with missing values ###

df.info()
#df.dropna(inplace=True, subset=df.columns.drop(['Transported']))
df.shape

df.fillna(df.median()[num_cols], inplace=True)
df['HomePlanet'].fillna(df['HomePlanet'].value_counts().index[0], inplace=True)
df['CryoSleep'].fillna(df['CryoSleep'].value_counts().index[0], inplace=True)
df['Destination'].fillna(df['Destination'].value_counts().index[0], inplace=True)
df['VIP'].fillna(df['VIP'].value_counts().index[0], inplace=True)
df['Deck'].fillna(df['Deck'].value_counts().index[0], inplace=True)
df['Side'].fillna(df['Side'].value_counts().index[0], inplace=True)
df.describe(include='all')

#%% Transform some skewed variables ###

df['RoomService'] = np.log1p(df.RoomService)
df['FoodCourt'] = np.log1p(df.FoodCourt)
df['ShoppingMall'] = np.log1p(df.ShoppingMall)
df['Spa'] = np.log1p(df.Spa)
df['VRDeck'] = np.log1p(df.VRDeck)

df['CryoSleep'] = df['CryoSleep'].astype(int)
df['VIP'] = df['VIP'].astype(int)
df.loc[~(df.Transported.isnull()),'Transported'] = df.loc[~(df.Transported.isnull()),'Transported'].astype(int)
df.head()

#%% 4.Transform features depending on their type ###

# this is very important for ML application, where there are hundreds of features.
# If there are less than 20 features, can use standard approach.
# my approach of tackling one feature a time is not scalable. 

# use intuition to trim range or ordinary variables 
# can skip this step in general, since it is not scalable when number of features grows.

# identify binary and categorical variables
df_uniques = pd.DataFrame([[i, len(df[i].unique())] for i in df.columns], columns=['Variable', 'Unique Values']).set_index('Variable')
print(df_uniques)

binary_variables = list(df_uniques[df_uniques['Unique Values'] == 2].index)
categorical_variables = list(df_uniques[(10 >= df_uniques['Unique Values']) & (df_uniques['Unique Values'] > 2)].index)
numeric_variables = list(set(df.columns) - set(categorical_variables) - set(binary_variables))
print('Binary variables are ', binary_variables)
print('Categorical variables are ', categorical_variables)
print('Numeric variables are ', numeric_variables)

# ohc for binary variables #
lb = LabelBinarizer()
binary_variables.remove('sample')
for column in binary_variables:
    df[column] = lb.fit_transform(df[column])

# ohc for categorical variables #
categorical_variables.remove('Transported')
df = pd.get_dummies(df, columns = categorical_variables, drop_first=True)

print(df.shape)
print(df.head())
print(df.dtypes)

# %% 5.Creating subsamples ###

train = df[df['sample']=='train'].copy()
train.drop(columns=['sample'], inplace=True)
test = df[df['sample']=='test'].copy()
test.drop(columns=['sample'], inplace=True)

print(train.shape)
print(test.shape)
train.head(3)

# %% 5.Creating subsamples ###

y_train = train['Transported']
X_train = train.drop(columns=['Transported'])
X_test = test.drop(columns=['Transported'])
print(X_train.shape)

X_train, X_traintest, y_train, y_traintest = train_test_split(X_train,y_train,test_size=0.1, random_state=5)

X_train_id = X_train.copy()
X_traintest_id = X_traintest.copy() 
X_test = X_test.copy()

X_train.drop(columns=['PassengerId'], inplace=True)
X_traintest.drop(columns=['PassengerId'], inplace=True)
X_test.drop(columns=['PassengerId'], inplace=True)


print(X_train.shape)
print(X_traintest.shape)
print(X_test.shape)
X_traintest.head(3)

# 'traintest' is hold-out sample to veify that chosen model indeed works.
# it is different from 'test', which is truly out of sample.

ss = StandardScaler()
numeric_variables.remove('PassengerId')

for column in [numeric_variables]:
    X_train[column] = ss.fit_transform(X_train[column])
    X_traintest[column] = ss.transform(X_traintest[column])
    X_test[column] = ss.transform(X_test[column])

####################
### 7.Fit models ###
####################

time3 = time.time()

#%% Logistic regression ###

grid_values = {'penalty': ['l2'], 'C': list(np.arange(0.2,2,0.2))}
lr = LogisticRegression()
model_lr = GridSearchCV(lr, param_grid=grid_values, cv = 8)
model_lr.fit(X_train, y_train)
print('logistic ', model_lr.best_score_, model_lr.best_params_)

# model_lr.predict(X_test)

#%% KNN ###

grid_values = dict(n_neighbors=np.arange(10,41,4))
knnm = KNeighborsClassifier()   
model_knn = GridSearchCV(knnm, param_grid=grid_values, cv = 2)
model_knn.fit(X_train, y_train)
print('knn ', model_knn.best_score_, model_knn.best_params_)

#%% SVM ###

grid_values = {'C': [1,2,3,4]} 
svmm = svm.SVC(kernel='rbf')
model_svm = GridSearchCV(svmm, param_grid=grid_values, cv = 2)
model_svm.fit(X_train, y_train)
print('svm ', model_svm.best_score_, model_svm.best_params_)

#%% RF ###

# may look here: https://www.geeksforgeeks.org/hyperparameter-tuning/

grid_values = [{'max_depth': [4,6,8], 'max_features': [0.3, 0.4, 0.5],
               'n_estimators': [100, 200]}]
rfc = RandomForestClassifier(random_state=42)
model_rf = GridSearchCV(rfc, grid_values, cv = 2, scoring='accuracy')
model_rf.fit(X_train, y_train)
print('rf ', model_rf.best_score_, model_rf.best_params_)
print('4 models time is ', time.time()-time3)


#%% 8.Evaluate performance oos ###

yhat_lm = model_lr.predict(X_traintest)
yhat_knn = model_knn.predict(X_traintest)
yhat_svm = model_svm.predict(X_traintest)
yhat_rf = model_rf.predict(X_traintest)
#yhat_bt = grid_search.predict(X_test)
print('Accuracy of logistic regression is ', 1-(np.abs(yhat_lm-y_traintest)).mean())
print('Accuracy of KNN is ', 1-(np.abs(yhat_knn-y_traintest)).mean())
print('Accuracy of SVM is ', 1-(np.abs(yhat_svm-y_traintest)).mean())
print('Accuracy of RF is ', 1-(np.abs(yhat_rf-y_traintest)).mean())
#print('Accuracy of Boosted Tree is ', 1-(np.abs(yhat_bt-y_test)).mean())
print('Total time is ', time.time()-time1)

# when dealing only with nonmissing data, i reliable get 80.0-80.5% accuracy for svm and rf.

(8693, 14)
(8693, 14) (4277, 13)
(12970, 15)
                Age   RoomService     FoodCourt  ShoppingMall           Spa        VRDeck
count  12700.000000  12707.000000  12681.000000  12664.000000  12686.000000  12702.000000
mean      28.771969    222.897852    451.961675    174.906033    308.476904    306.789482
std       14.387261    647.596664   1584.370747    590.558690   1130.279641   1180.097223
min        0.000000      0.000000      0.000000      0.000000      0.000000      0.000000
25%       19.000000      0.000000      0.000000      0.000000      0.000000      0.000000
50%       27.000000      0.000000      0.000000      0.000000      0.000000      0.000000
75%       38.000000     49.000000     77.000000     29.000000     57.000000     42.000000
max       79.000000  14327.000000  29813.000000  23492.000000  22408.000000  24133.000000
               HomePlanet  CryoSleep      VIP  Destination  Transported
False                 NaN     8079.0  12401.0          NaN       4315.0
T

In [None]:
# for this problem boosted tree seems to work best, see 

In [None]:
### Export results ###
yhat_knn = model_knn.predict(X_test).astype(int)
yhat_svm = model_svm.predict(X_test).astype(int)
yhat_rf = model_rf.predict(X_test).astype(int)
#yhat_bt = grid_search.predict(X_test).astype(int)

replacements = {1:True, 0:False}
replacer = replacements.get

yhat_knn = [replacer(n, n) for n in yhat_knn]
yhat_svm = [replacer(n, n) for n in yhat_svm]
yhat_rf = [replacer(n, n) for n in yhat_rf]

In [None]:
submission_df_knn = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_knn}, columns=['PassengerId', 'Transported'])
submission_df_svm = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_svm}, columns=['PassengerId', 'Transported'])
submission_df_rf = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_rf}, columns=['PassengerId', 'Transported'])
#submission_df_bt = pd.DataFrame({'PassengerId': test.PassengerId, 'Transported': yhat_bt}, columns=['PassengerId', 'Transported'])

submission_df_knn.to_csv('submissions_SpaceTitanic_i1_knn.csv',index=False)
submission_df_svm.to_csv('submissions_SpaceTitanic_i1_svm.csv',index=False)
submission_df_rf.to_csv('submissions_SpaceTitanic_i1_rf.csv',index=False)
#submission_df_bt.to_csv('submissions_Titanic_i10_bt1.csv',index=False)

In [None]:
os.chdir(r'/kaggle/working')

from IPython.display import FileLink
FileLink(r'submissions_SpaceTitanic_i1_rf.csv')

In [None]:
FileLink(r'submissions_Titanic_i10_rf1.csv')

In [16]:
#%% XGBoost ###
# run this code only on Kaggle with GPU

time4 = time.time()

estimator = XGBClassifier(
    nthread=4,
    seed=42,
    use_label_encoder=False,
    tree_method = 'gpu_hist',
    gpu_id = 0
)

parameters = {
    'max_depth': [2,4,6,8],
    'n_estimators': [100,200,300],
    'learning_rate': [0.01, 0.03, 0.05]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 2,
    verbose=True
)

grid_search.fit(X_train, y_train, eval_metric='rmse')
print(grid_search.best_score_,grid_search.best_params_,accuracy_score(y_train, grid_search.predict(X_train)))
print('XGB model time is ', time.time()-time4)

Fitting 2 folds for each of 36 candidates, totalling 72 fits
0.8018658963052666 {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 300} 0.8306276364566023
XGB model time is  61.23125982284546


In [18]:
print('Accuracy of logistic regression is ', accuracy_score(y_traintest, yhat_lm))
print('Accuracy of KNN is ', accuracy_score(y_traintest, yhat_knn))
print('Accuracy of SVM is ', accuracy_score(y_traintest, yhat_svm))
print('Accuracy of RF is ', accuracy_score(y_traintest, yhat_rf))
print('xgboost', accuracy_score(y_traintest, grid_search.predict(X_traintest)))

Accuracy of logistic regression is  0.7505747126436781
Accuracy of KNN is  0.7850574712643679
Accuracy of SVM is  0.7873563218390804
Accuracy of RF is  0.7965517241379311
xgboost 0.8045977011494253


In [17]:
from sklearn.inspection import permutation_importance

In [19]:
### 8. feature importance ###

results = permutation_importance(model_lr, X_traintest, y_traintest, scoring='accuracy', n_jobs=-1)
fi_lr = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi_lr.sort_values('FI', ascending = False)

Unnamed: 0,col,FI
7,VRDeck,0.061839
6,Spa,0.06092
3,RoomService,0.042299
0,CryoSleep,0.017701
4,FoodCourt,0.015172
9,HomePlanet_Europa,0.010575
16,Deck_E,0.002299
14,Deck_C,0.002069
12,Destination_TRAPPIST-1e,0.001609
15,Deck_D,0.001379


In [20]:
results = permutation_importance(model_knn, X_traintest, y_traintest, scoring='accuracy', n_jobs=-1)
fi_lr = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi_lr.sort_values('FI', ascending = False)

Unnamed: 0,col,FI
7,VRDeck,0.07931
6,Spa,0.077931
3,RoomService,0.063448
4,FoodCourt,0.030345
5,ShoppingMall,0.027816
8,Side,0.015632
1,Age,0.008276
12,Destination_TRAPPIST-1e,0.007356
0,CryoSleep,0.007126
10,HomePlanet_Mars,0.006437


In [21]:
results = permutation_importance(model_svm, X_traintest, y_traintest, scoring='accuracy', n_jobs=-1)
fi_lr = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi_lr.sort_values('FI', ascending = False)

Unnamed: 0,col,FI
7,VRDeck,0.093333
6,Spa,0.074713
3,RoomService,0.054023
4,FoodCourt,0.034253
5,ShoppingMall,0.022069
0,CryoSleep,0.008506
8,Side,0.008276
14,Deck_C,0.006897
16,Deck_E,0.005287
10,HomePlanet_Mars,0.003678


In [22]:
results = permutation_importance(model_rf, X_traintest, y_traintest, scoring='accuracy', n_jobs=-1)
fi_lr = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi_lr.sort_values('FI', ascending = False)

Unnamed: 0,col,FI
0,CryoSleep,0.067586
7,VRDeck,0.057241
6,Spa,0.048276
4,FoodCourt,0.048046
5,ShoppingMall,0.029885
3,RoomService,0.017931
8,Side,0.008736
1,Age,0.008276
16,Deck_E,0.007356
17,Deck_F,0.005977


In [23]:
results = permutation_importance(grid_search, X_traintest, y_traintest, scoring='accuracy', n_jobs=-1)
fi_lr = pd.DataFrame({'col':X_test.columns, 'FI':results.importances_mean})
fi_lr.sort_values('FI', ascending = False)

Unnamed: 0,col,FI
0,CryoSleep,0.07241379
7,VRDeck,0.06850575
6,Spa,0.06390805
4,FoodCourt,0.04114943
5,ShoppingMall,0.02344828
3,RoomService,0.02321839
8,Side,0.01471264
14,Deck_C,0.01057471
16,Deck_E,0.008505747
1,Age,0.004597701


In [56]:
res = (X_train).copy()
res[['prob_0', 'prob_1']] = grid_search.predict_proba(X_train.iloc[:,0:20])
res['yhat'] = grid_search.predict(X_train.iloc[:,0:20])
res['y'] = y_train
res

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,y,prob_0,prob_1,yhat
2903,0,0.086635,0,-0.634238,-0.095711,2.277619,1.704508,2.071880,1,0,0,0,1,0,0,0,0,1,0,0,0.0,0.953091,0.046909,0
5986,0,0.225731,0,0.448412,-0.646203,-0.618896,1.962363,-0.638825,0,0,0,0,1,0,0,0,0,1,0,0,0.0,0.969460,0.030540,0
1109,1,0.364827,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,0,1,0,1,0,0,0,0,1,0,0,1.0,0.044450,0.955550,1
3906,1,0.225731,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,1,0,0,1,1,0,0,0,0,0,0,1.0,0.008993,0.991007,1
534,1,-0.122009,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,1,0,0,0,0,0,1,0,0,0,0,1.0,0.017495,0.982505,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,1,0.017087,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,0,0,0,1,0,0,0,0,1,0,0,0.0,0.087948,0.912052,1
3046,0,-0.261105,1,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,1,0,0,1,1,0,0,0,0,0,0,1.0,0.104958,0.895042,1
4079,1,-0.052461,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,1,0,0,0,1,0,0,0,0,0,0,1.0,0.005980,0.994020,1
2254,0,0.921211,0,-0.634238,1.923224,-0.187762,2.258880,2.208839,0,1,0,0,1,0,0,0,0,0,0,1,0.0,0.996754,0.003246,0


In [40]:
X_train.loc[X_train.y==1].describe()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,y
count,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0,3953.0
mean,0.5745,-0.078429,0.017961,-0.357542,-0.136909,-0.183689,-0.358177,-0.336959,0.570706,0.323299,0.209461,0.090311,0.662282,0.132305,0.116873,0.0468,0.069061,0.300278,0.30382,0.000253,1.0
std,0.494481,1.045238,0.132827,0.686648,0.985749,0.936655,0.689495,0.707766,0.495038,0.467795,0.406976,0.286663,0.472992,0.338864,0.32131,0.211237,0.253591,0.458437,0.459964,0.015905,0.0
min,0.0,-1.999805,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,0.0,-0.747941,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,-0.122009,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,0.573471,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
max,1.0,3.424939,1.0,2.384568,2.877741,3.330749,2.35662,2.474111,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [41]:
X_train.loc[X_train.y==0].describe()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,y
count,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0,3870.0
mean,0.126615,0.080111,0.028941,0.36521,0.139845,0.187629,0.365859,0.344186,0.468992,0.166408,0.193798,0.090181,0.747028,0.047804,0.05478,0.062791,0.127132,0.389406,0.28708,0.001034,0.0
std,0.332584,0.945149,0.167661,1.129754,0.995344,1.027685,1.127565,1.129508,0.499102,0.372495,0.395324,0.286478,0.434771,0.213378,0.22758,0.242617,0.333164,0.487679,0.452458,0.032137,0.0
min,0.0,-1.999805,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,-0.539297,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,-0.122009,0.0,-0.379373,-0.646203,-0.618896,-0.260787,-0.385866,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.643019,0.0,1.619171,1.101783,1.127998,1.52386,1.521971,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
max,1.0,3.494487,1.0,2.884574,2.682706,3.011897,2.971907,3.043957,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [42]:
false_pos = res.loc[(res.yhat==1)&(res.y==0)]
false_pos.describe()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,prob_1,prob_0,yhat,y
count,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0,94.0
mean,0.56383,-0.298839,0.0,-0.478567,-0.206325,-0.086963,-0.394244,-0.444932,0.5,0.074468,0.106383,0.180851,0.648936,0.0,0.010638,0.031915,0.138298,0.223404,0.56383,0.0,0.637418,0.362582,1.0,0.0
std,0.498568,1.023295,0.0,0.460333,0.891248,1.02896,0.616026,0.529295,0.502681,0.263939,0.30998,0.386959,0.479862,0.0,0.103142,0.176716,0.347063,0.41876,0.498568,0.0,0.104889,0.104889,0.0,0.0
min,0.0,-1.999805,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.502919,0.027693,1.0,0.0
25%,0.0,-0.921811,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.563296,0.299265,1.0,0.0
50%,1.0,-0.330653,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.617375,0.382625,1.0,0.0
75%,1.0,0.295279,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.700735,0.436704,1.0,0.0
max,1.0,3.494487,0.0,1.275176,2.106255,2.323922,2.132429,1.802501,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.972307,0.497081,1.0,0.0


In [43]:
false_neg = res.loc[(res.yhat==0)&(res.y==1)]
false_neg.describe()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,prob_1,prob_0,yhat,y
count,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0,76.0
mean,0.105263,0.186381,0.0,0.538913,-0.033123,0.302477,0.083276,0.07711,0.407895,0.013158,0.263158,0.078947,0.802632,0.0,0.013158,0.078947,0.197368,0.460526,0.25,0.0,0.26919,0.73081,0.0,1.0
std,0.308931,1.029394,0.0,1.057248,0.891161,1.104268,0.947333,0.989516,0.494709,0.114708,0.443273,0.271448,0.400657,0.0,0.114708,0.271448,0.400657,0.501751,0.43589,0.0,0.150898,0.150898,0.0,0.0
min,0.0,-1.860709,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.059446,0.50053,0.0,1.0
25%,0.0,-0.539297,0.0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138024,0.571952,0.0,1.0
50%,0.0,-0.017687,0.0,0.484794,-0.646203,-0.618896,-0.658997,-0.638825,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221892,0.778108,0.0,1.0
75%,0.0,0.938598,0.0,1.657902,0.624119,1.639111,1.157702,0.941555,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.25,0.0,0.428048,0.861976,0.0,1.0
max,1.0,3.146747,0.0,2.414783,1.822449,2.393438,1.930896,2.198098,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.49947,0.940554,0.0,1.0


In [57]:
res['error'] = np.abs(res.prob_1-res.y)
res = res.sort_values('error', ascending = False)
res.head(13)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,y,prob_0,prob_1,yhat,error
538,1,-0.678393,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,1,0,0,1,0,1,0,0,0,0,0,0.0,0.020571,0.979429,1,0.979429
2509,1,-0.122009,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,1,0,0,0,0,0,0,0,0,0,0,0.0,0.024052,0.975948,1,0.975948
3795,1,-0.261105,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,1,0,0,0,0,0,0,0,0,0,0,0.0,0.026953,0.973047,1,0.973047
4681,1,0.434375,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,1,0,0,1,1,0,0,0,0,0,0,0.0,0.032915,0.967085,1,0.967085
7457,1,0.225731,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,1,0,0,1,1,0,0,0,0,0,0,0.0,0.032915,0.967085,1,0.967085
6229,1,-0.330653,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,1,0,0,1,1,0,0,0,0,0,0,0.0,0.033545,0.966455,1,0.966455
3271,1,0.573471,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,0,1,0,1,0,0,0,0,1,0,0,0.0,0.036543,0.963457,1,0.963457
1475,1,-0.400201,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,0,1,0,1,0,0,0,0,1,0,0,0.0,0.04301,0.95699,1,0.95699
7061,1,-0.330653,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,0,1,0,1,0,0,0,0,1,0,0,0.0,0.045291,0.954709,1,0.954709
6831,1,-1.860709,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,1,0,0,1,0,1,0,0,0,0,0,0.0,0.050939,0.949061,1,0.949061


In [55]:
res.head(15)

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Side,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,prob_1,prob_0,yhat,y,error
2210,1,0.434375,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,0,1,0,1,0,0,0,0,1,0,0,0.972307,0.027693,1,0.0,0.972307
4761,1,0.017087,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,0,1,0,0,1,0,0,0,1,0,0,0,0.951071,0.048929,1,0.0,0.951071
5710,1,-0.261105,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,0,1,0,1,0,0,0,0,1,0,0,0.947745,0.052255,1,0.0,0.947745
524,0,1.060307,0,2.040028,1.297807,1.075437,0.608369,-0.638825,1,0,0,0,1,0,0,0,0,1,0,0,0.059446,0.940554,0,1.0,0.940554
1502,0,-0.122009,0,-0.634238,-0.172035,-0.618896,-0.260787,1.853269,0,0,0,0,1,0,0,0,0,0,1,0,0.069611,0.930389,0,1.0,0.930389
1633,0,-0.191557,0,0.502317,-0.646203,1.819933,1.640446,-0.638825,0,0,0,0,1,0,0,0,0,1,0,0,0.072303,0.927697,0,1.0,0.927697
4183,1,-1.860709,0,-0.634238,-0.646203,-0.618896,-0.658997,-0.638825,1,1,0,0,1,0,0,0,0,0,0,0,0.920416,0.079584,1,0.0,0.920416
792,0,0.156183,0,0.361493,-0.03335,-0.618896,1.762133,-0.638825,0,0,0,0,1,0,0,0,0,1,0,0,0.079775,0.920225,0,1.0,0.920225
1925,0,1.825335,0,-0.230286,-0.646203,-0.618896,-0.658997,1.846894,0,0,0,0,1,0,0,0,0,0,1,0,0.081501,0.918499,0,1.0,0.918499
1296,0,-0.469749,0,1.877824,-0.646203,-0.618896,-0.658997,0.955776,0,0,0,0,1,0,0,0,0,1,0,0,0.088931,0.911069,0,1.0,0.911069
