In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('max_rows',100)
pd.set_option('max_columns',100)
import seaborn as sns
import matplotlib.pyplot as  plt
import scipy
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

In [2]:
df_train = pd.read_csv('Data/train.csv')
df_test = pd.read_csv('Data/test.csv')
target = df_train['Transported']

df_train.drop(['Transported'], axis= 1, inplace = True)
cdata = pd.concat([df_train, df_test],axis = 0,ignore_index=True)
cdata.drop(['Name'], axis= 1, inplace = True)

In [3]:
cdata2 = cdata.copy()
cdata2[['pass_grp','pass_no']]= cdata2['PassengerId'].str.split('_', n = -1, expand = True)
cdata2.drop('PassengerId', axis = 1, inplace = True)
cdata2[['deck','num','side']]= cdata2['Cabin'].str.split('/', n = -1, expand = True)
cdata2.drop('Cabin', axis = 1, inplace = True)

In [4]:
cdata3 = cdata2.copy()

cdata3['Total_cost'] = pd.Series(np.zeros(cdata3.shape[0]))
for feat in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    cdata3[feat].fillna(0, inplace = True)
    cdata3['Total_cost'] += cdata3[feat]
#     cdata3.drop(feat, axis = 1,inplace = True )
cdata3.drop(['num', 'pass_grp'],axis = 1, inplace = True)


In [5]:
cdata4 = cdata3.copy()

categorical_features = cdata4.select_dtypes(object)
num_features = cdata4.select_dtypes(np.number)
for feat in categorical_features:
    cdata4[feat].fillna(cdata4[feat].mode()[0], inplace= True)

cdata4['Age'].fillna(int(cdata['Age'].mean()), inplace = True)

In [6]:
cdata5 = cdata4.copy()

def skew_log_transform(df):
  # using log1p transform to fix skew in data
  df = df.copy()
  numeric_features = df.select_dtypes(np.number).columns
  for column in numeric_features:
    skew = abs(scipy.stats.skew(df[column]))
  
    if skew >= 0.5:
      print(column)
      df[column] = np.log1p(df[column])
      # l1p = np.log1p(df[column])
   
      # tskew = abs(scipy.stats.skew(l1p))
      # # print(tskew)
      # # print(l1p)
      # if tskew < skew:
      #   df[column] = l1p
  return df

cdata5 = skew_log_transform(cdata5)

RoomService
FoodCourt
ShoppingMall
Spa
VRDeck
Total_cost


In [7]:
cdata6 = cdata5.copy()

cdata6[['CryoSleep', 'VIP']] = cdata6[['CryoSleep','VIP']].astype(int)
cdata6 = pd.get_dummies(cdata6,drop_first= True)

In [8]:
cdata7 = cdata6.copy()

scaler = StandardScaler()
scaler.fit(cdata7)
cdata7 = pd.DataFrame(scaler.transform(cdata7), index = cdata7.index, columns = cdata7.columns)

In [21]:
cdata7.head()

Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_cost,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,pass_no_02,pass_no_03,pass_no_04,pass_no_05,pass_no_06,pass_no_07,pass_no_08,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,side_S
0,-0.738967,0.719562,-0.146633,-0.640372,-0.648903,-0.624889,-0.663234,-0.63481,-1.151868,1.771949,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,3.219817,-0.304721,-0.242437,-0.337033,-0.733613,-0.641459,-0.029135,-1.030535
1,-0.738967,-0.334062,-0.146633,1.086538,0.136898,0.641264,1.620304,0.755227,0.638416,-0.56435,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,1.363117,-0.641459,-0.029135,0.970369
2,-0.738967,2.054151,6.819762,0.749902,2.143455,-0.624889,2.525889,0.7937,1.355722,1.771949,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,-0.733613,-0.641459,-0.029135,0.970369
3,-0.738967,0.298112,-0.146633,-0.640372,1.79381,1.675295,2.272011,1.288796,1.166992,1.771949,-0.51082,-0.316951,0.646731,2.252763,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,-0.733613,-0.641459,-0.029135,0.970369
4,-0.738967,-0.895994,-0.146633,1.460007,0.805817,1.327477,1.630682,-0.233642,0.745026,-0.56435,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,1.363117,-0.641459,-0.029135,0.970369


In [10]:
x_train = cdata7.iloc[:8693,:]
x_test = cdata7.iloc[8693:,:]
final = pd.concat([x_train,target], axis = 1)

In [11]:
from gokinjo import knn_kfold_extract
from gokinjo import knn_extract

KNN_trn_features = knn_kfold_extract(x_train.to_numpy(), target, k = 2, normalize = 'standard')
KNN_tst_features = knn_extract(x_train.to_numpy(), target, x_test.to_numpy(), k = 2, normalize = 'standard')

knn_cols = ['KNN_K1_01',
            'KNN_K1_02',
            'KNN_K2_01',
            'KNN_K2_02']

KNN_feat = pd.DataFrame(KNN_trn_features, columns = knn_cols)

x_train = pd.concat([x_train, KNN_feat], axis = 1)


KNN_feat = pd.DataFrame(KNN_tst_features, columns = knn_cols).set_index(x_test.index)

x_test = pd.concat([x_test, KNN_feat], axis = 1)
x_test.head()
x_train.head()


Unnamed: 0,CryoSleep,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Total_cost,HomePlanet_Europa,HomePlanet_Mars,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,pass_no_02,pass_no_03,pass_no_04,pass_no_05,pass_no_06,pass_no_07,pass_no_08,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T,side_S,KNN_K1_01,KNN_K1_02,KNN_K2_01,KNN_K2_02
0,-0.738967,0.719562,-0.146633,-0.640372,-0.648903,-0.624889,-0.663234,-0.63481,-1.151868,1.771949,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,3.219817,-0.304721,-0.242437,-0.337033,-0.733613,-0.641459,-0.029135,-1.030535,0.069709,0.487961,0.697087,1.812426
1,-0.738967,-0.334062,-0.146633,1.086538,0.136898,0.641264,1.620304,0.755227,0.638416,-0.56435,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,1.363117,-0.641459,-0.029135,0.970369,1.270899,2.748427,1.236608,2.770518
2,-0.738967,2.054151,6.819762,0.749902,2.143455,-0.624889,2.525889,0.7937,1.355722,1.771949,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,-0.733613,-0.641459,-0.029135,0.970369,2.462258,5.250767,2.707978,5.580019
3,-0.738967,0.298112,-0.146633,-0.640372,1.79381,1.675295,2.272011,1.288796,1.166992,1.771949,-0.51082,-0.316951,0.646731,2.252763,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,-0.733613,-0.641459,-0.029135,0.970369,2.320376,5.11923,2.559447,5.330534
4,-0.738967,-0.895994,-0.146633,1.460007,0.805817,1.327477,1.630682,-0.233642,0.745026,-0.56435,-0.51082,-0.316951,0.646731,-0.443899,-0.263154,-0.163577,-0.119961,-0.091634,-0.071517,-0.038302,-0.310577,-0.304721,-0.242437,-0.337033,1.363117,-0.641459,-0.029135,0.970369,0.849576,1.819217,1.290118,2.773648


In [12]:
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from sklearn.ensemble import VotingClassifier

In [13]:
models = {'catboost':CatBoostClassifier(),
           'gbc':GradientBoostingClassifier(),
           'ridge':RidgeClassifier(),
           'lr':LogisticRegression()}

In [14]:
# results = {}
# kf = KFold(n_splits = 10)
def age_groups(df, age_limit = 13):
    df['AgeGroup'] = np.where(df['Age'] < age_limit, 0, 1)
    return df
# for name,model in models.items():
    
#     result = cross_val_score(model, x_train, target, cv=5)
#     results[name] = result

In [15]:
# for name, result in results.items():
#     print("----------\n" + name)
#     print(np.mean(result))
#     print(np.std(result))

In [16]:
estimators = [('catboost', CatBoostClassifier()), ('gbc', GradientBoostingClassifier()),  ('lr', LogisticRegression())]
eclf = VotingClassifier(estimators=estimators, voting='soft', weights=[1, 1, 1])
eclf.fit(x_train,target)

Learning rate set to 0.025939
0:	learn: 0.6804795	total: 66.3ms	remaining: 1m 6s
1:	learn: 0.6701193	total: 70.5ms	remaining: 35.2s
2:	learn: 0.6591427	total: 74.3ms	remaining: 24.7s
3:	learn: 0.6488337	total: 77.9ms	remaining: 19.4s
4:	learn: 0.6387123	total: 82.1ms	remaining: 16.3s
5:	learn: 0.6296124	total: 85.4ms	remaining: 14.1s
6:	learn: 0.6199545	total: 89ms	remaining: 12.6s
7:	learn: 0.6120690	total: 92.3ms	remaining: 11.4s
8:	learn: 0.6027136	total: 95.8ms	remaining: 10.6s
9:	learn: 0.5951147	total: 99.2ms	remaining: 9.82s
10:	learn: 0.5882773	total: 103ms	remaining: 9.27s
11:	learn: 0.5812265	total: 107ms	remaining: 8.82s
12:	learn: 0.5749054	total: 111ms	remaining: 8.39s
13:	learn: 0.5682939	total: 114ms	remaining: 8.02s
14:	learn: 0.5629102	total: 117ms	remaining: 7.71s
15:	learn: 0.5571314	total: 121ms	remaining: 7.47s
16:	learn: 0.5516619	total: 125ms	remaining: 7.23s
17:	learn: 0.5465563	total: 129ms	remaining: 7.04s
18:	learn: 0.5410875	total: 132ms	remaining: 6.84s
19:

VotingClassifier(estimators=[('catboost',
                              <catboost.core.CatBoostClassifier object at 0x133ae0ac0>),
                             ('gbc', GradientBoostingClassifier()),
                             ('lr', LogisticRegression())],
                 voting='soft', weights=[1, 1, 1])

In [17]:
y_pred = eclf.predict(x_test)

In [18]:
sample_sub = pd.read_csv('Data/sample_submission.csv')
y_pred = pd.Series(y_pred, name='Transported')
sample_sub['Transported'] = y_pred
submission = sample_sub.copy()
submission.to_csv('submission.csv',index= False)