In [1]:
import pandas as pd
import numpy as np

import collections
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the training and test tables from their CSV files in one pass.
train, test = map(pd.read_csv, ("data/train.csv", "data/test.csv"))

##### Delete features with more than 20% missing values

In [66]:
# Stack train and test so every later transformation is applied to both.
features = pd.concat([train, test]).reset_index(drop=True)

# Flag columns whose missing-value count exceeds 20% of all rows.
feature_flag = features.isna().sum() > features.shape[0] * 0.2

# "Transported" is the target and must be kept even when it is flagged.
# Filtering it out here (instead of list.remove) also works when the target
# is not above the threshold, where remove() would raise ValueError.
features_omit = [col for col in feature_flag[feature_flag].index if col != "Transported"]
features = features.loc[:, ~features.columns.isin(features_omit)]

##### Fill missing values

In [67]:
# Partition the columns into text-like and numeric groups so each group can
# receive its own missing-value treatment.
num_features = []
str_features = []
for col in features.columns:
    column = features[col]
    # Since pandas 2.0, is_string_dtype returns False for an object column
    # that contains NaN, which is exactly the kind of column that needs
    # filling. Checking is_object_dtype as well keeps those columns in the
    # text-like group on both older and newer pandas versions.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        str_features.append(col)
    elif pd.api.types.is_numeric_dtype(column):
        num_features.append(col)

In [68]:
# Text-like columns: replace missing entries with the literal placeholder "NAN".
for col in str_features:
    features[col] = features[col].fillna("NAN")

# Numeric columns: replace missing entries with the column median.
# The result is assigned back to the column because calling
# fillna(..., inplace=True) on `features[col]` acts on an intermediate object
# and is not guaranteed to update `features` (it does not under Copy-on-Write).
for col in num_features:
    features[col] = features[col].fillna(features[col].median())

##### Feature Engineering

In [70]:
# Display the column labels currently present in `features`.
features.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')

Special handling for PassengerId, Cabin, and Name

In [72]:
# Break "Cabin" on "/" once and reuse the pieces: the first piece becomes the
# deck, the third piece becomes the side.
cabin_parts = features["Cabin"].str.split("/", expand=True)
features["Cabin_deck"] = cabin_parts.iloc[:, 0]
# Rows without a third piece get the same "NAN" placeholder used elsewhere.
features["Cabin_side"] = cabin_parts.iloc[:, 2].fillna("NAN")

In [73]:
# Categorical columns to expand into indicator columns.
onehot_cols = ["HomePlanet", "CryoSleep", "Destination", "VIP", "Cabin_deck", "Cabin_side"]
# One call encodes every listed column; each group of indicator columns is
# prefixed with the name of the column it came from and appended in list order.
features = pd.get_dummies(features, columns=onehot_cols, prefix=onehot_cols)

In [76]:
# Keep the part of PassengerId that precedes the first underscore
# (presumably a travel-group identifier; confirm against the data description).
features["PassengerId_family"] = features["PassengerId"].str.split("_").str[0]

In [89]:
# Take the final space-separated token of each name.
# Splitting with expand=True pads shorter rows with None, so picking the last
# expanded column only yields the last token for rows that have the maximum
# number of tokens. Indexing each row's own token list returns the true last
# token for every row (including the single-token "NAN" placeholder).
features["Last_Name"] = features["Name"].str.rsplit(" ", n=1).str[-1]

In [99]:
# Start every row at the neutral value 0.5; a later step overwrites it where
# other members of the same PassengerId_family group provide information.
features["Family_transported"] = 0.5

# Numeric view of the target: False -> 0, True -> 1, and the "NAN"
# placeholder -> 0.5.
mapping = {False: 0, True: 1, "NAN": 0.5}
features["Transported_num"] = features["Transported"].replace(mapping)

In [100]:
# For every passenger, summarise the Transported_num values of the OTHER rows
# sharing the same PassengerId_family:
#   * 1   if at least one other member has Transported_num == 1
#   * 0   otherwise, if at least one other member has Transported_num == 0
#   * 0.5 otherwise (no other members, or none with a 0/1 value)
#
# This is the vectorized equivalent of looping over each group, dropping the
# current row and checking the max/min of the remainder. It avoids scanning
# the whole frame once per row, and it groups by a plain column name rather
# than a one-element list (which changes the group key to a tuple in newer
# pandas versions).
family_key = features["PassengerId_family"]
is_one = (features["Transported_num"] == 1.0).astype(int)
is_zero = (features["Transported_num"] == 0.0).astype(int)

# Group totals minus the row's own contribution = counts among the other members.
others_one = is_one.groupby(family_key).transform("sum") - is_one
others_zero = is_zero.groupby(family_key).transform("sum") - is_zero

# np.select applies the first matching condition, mirroring the if/elif order.
features["Family_transported"] = np.select(
    [others_one > 0, others_zero > 0],
    [1.0, 0.0],
    default=0.5,
)

  for grp, grp_df in features[["PassengerId", "PassengerId_family", "Transported_num"]].groupby(["PassengerId_family"]):


In [106]:
# Remove the raw identifier/text columns and the intermediate helper columns
# now that the engineered columns have been derived from them.
features = features.drop(
    columns=[
        "PassengerId",
        "Cabin",
        "Name",
        "PassengerId_family",
        "Transported_num",
        "Last_Name",
    ]
)

##### K-nearest neighbors

In [111]:
# The first len(train) rows of the stacked frame are the training rows, so
# their "Transported" values are the training labels.
train_y = features["Transported"].head(train.shape[0])

In [112]:
# Drop the target from the feature matrix, then cut the stacked frame back
# into its training part (first len(train) rows) and test part (the rest).
n_train = train.shape[0]
features = features.drop(columns=["Transported"])
train_x, test_x = features.iloc[:n_train], features.iloc[n_train:]

In [113]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transformation to both splits.
std_scaler = StandardScaler().fit(train_x)
train_x = std_scaler.transform(train_x)
test_x = std_scaler.transform(test_x)

In [129]:
# Candidate values for each KNeighborsClassifier hyperparameter.
n_neighbors = [*range(6, 13), *range(14, 23, 2)]  # 6..12, then even values 14..22
algorithm = ['auto']
weights = ['uniform', 'distance']
# NOTE(review): leaf_size is documented as a tuning knob for tree
# construction/query; confirm that searching over it is intended, since its
# ten values multiply the number of fits by ten.
leaf_size = [*range(1, 50, 5)]

hyperparams = dict(
    algorithm=algorithm,
    weights=weights,
    leaf_size=leaf_size,
    n_neighbors=n_neighbors,
)

# Exhaustive search over the grid with 10-fold cross-validation, scored by ROC AUC.
gd = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=hyperparams,
    verbose=True,
    cv=10,
    scoring="roc_auc",
)
# Labels are cast to int before fitting.
gd.fit(train_x, train_y.astype(int))

# Report the best cross-validated score and the corresponding estimator.
print(gd.best_score_, gd.best_estimator_, sep="\n")

Fitting 10 folds for each of 240 candidates, totalling 2400 fits
0.8507640620782919
KNeighborsClassifier(leaf_size=1, n_neighbors=18)


In [131]:
# Predict with the refitted best estimator; GridSearchCV.predict delegates to
# best_estimator_ when refit is enabled (the default used here).
test["Transported"] = gd.predict(test_x)

In [135]:
# Convert the predictions to booleans, then write the two submission columns
# to disk without the index.
test["Transported"] = test["Transported"].astype(bool)
submission = test.loc[:, ["PassengerId", "Transported"]]
submission.to_csv("submission.csv", index=False)