In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import train_test_split,cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, auc, accuracy_score

In [None]:
# Read both competition splits from the working directory and show their (rows, columns) shapes.
train_data, test_data = (pd.read_csv(path) for path in ('./train.csv', './test.csv'))
display(train_data.shape, test_data.shape)
# Both frames are collected in one list so that later cells can apply the
# same in-place preprocessing to each of them.
dfs = [train_data, test_data]

In [None]:
def summary(df):
    """Print the dataset dimensions and return a per-column overview.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``df`` with:
        ``Missing`` (NaN count), ``Unique_values`` (distinct non-null values),
        ``Duplicated`` (number of fully duplicated rows, repeated on every line)
        and ``Types`` (column dtype).
    """
    n_rows, n_cols = df.shape
    print(f"dataset has {n_cols} features and {n_rows} examples")
    # A distinct local name avoids shadowing the function itself.
    overview = pd.DataFrame(index=df.columns)
    overview['Missing'] = df.isna().sum().values
    overview['Unique_values'] = df.nunique().values
    overview['Duplicated'] = df.duplicated().sum()
    # Positional assignment, like the columns above, so duplicate column
    # labels cannot break the index alignment.
    overview['Types'] = df.dtypes.values

    return overview

In [None]:
for df in dfs:
    # The part of PassengerId before the first '_' is stored as an integer group id.
    df['group_id'] = df['PassengerId'].str.split('_').str[0].astype(int)

In [None]:
# Summarise the training set by name. The bare loop variable `df` left over
# from the previous cell points at the last element of `dfs` (test_data),
# which would make this cell repeat the test-set summary shown next.
summary(train_data)

In [None]:
# Per-column overview (missing values, unique counts, duplicated rows, dtypes) of the test set.
summary(test_data)

In [None]:
for df in dfs:
    # First known HomePlanet within each travel group (groups with no known
    # planet are simply absent from this lookup).
    group_planet = (
        df.dropna(subset=['HomePlanet'])
          .groupby('group_id')['HomePlanet']
          .first()
    )
    # Fill only the missing entries with their group's planet; passengers
    # whose group has no known planet stay NaN. This is vectorised and avoids
    # the `np.NaN` alias, which NumPy 2.0 removed.
    df['HomePlanet'] = df['HomePlanet'].fillna(df['group_id'].map(group_planet))

In [None]:
for df in dfs:
    # Remaining gaps are sampled from the observed planet frequencies of this frame.
    # NOTE(review): no random seed is set in the visible notebook, so the
    # imputed values differ between runs.
    planet_share = df['HomePlanet'].value_counts(normalize=True)
    planet_missing = df['HomePlanet'].isna()
    df.loc[planet_missing, 'HomePlanet'] = np.random.choice(
        planet_share.index, planet_missing.sum(), p=planet_share.values
    )
    df['HomePlanet'] = df['HomePlanet'].astype('category')

In [None]:
# Amenity columns that make up a passenger's spending; later cells reuse this list.
spend_criteria = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for df in dfs:
    # Strict total: NaN as soon as any single amenity is missing.
    df['total_spend'] = df[spend_criteria].sum(axis=1, skipna=False)
    # True when at least one *known* amenity amount is positive. Deriving this
    # from total_spend would report False for a passenger with a recorded spend
    # whenever another amenity happens to be NaN.
    # NOTE(review): assumes amounts are never negative — confirm against the data.
    df['any_spend'] = (df[spend_criteria] > 0).any(axis=1)

In [None]:
for df in dfs:
    # `astype(bool)` maps NaN to True, so a missing CryoSleep would silently
    # become "asleep". Passengers with recorded spending were clearly awake:
    # resolve those gaps to False before the cast.
    cryo_missing = df['CryoSleep'].isna()
    df.loc[cryo_missing & df['any_spend'], 'CryoSleep'] = False
    # Any value still missing here becomes True through the cast.
    df['CryoSleep'] = df['CryoSleep'].astype(bool)
    # Missing amenity amounts among sleeping passengers. `display` is needed
    # because a bare expression inside a loop produces no cell output.
    display(df.loc[df['CryoSleep'], spend_criteria].isna().sum())

In [None]:
for df in dfs:
    # Build the comparison once, on its own. `&` binds tighter than `==`, so
    # an unparenthesised `a == True & b` is evaluated as `a == (True & b)` and
    # would also select awake passengers whose spend is known.
    asleep = df['CryoSleep'] == True
    for criteria in spend_criteria:
        # A passenger in cryosleep with an unknown amount is set to 0.
        df.loc[asleep & df[criteria].isna(), criteria] = 0

In [None]:
for df in dfs:
    # Passengers with recorded spending and an unknown CryoSleep flag are marked
    # as awake. Each condition is evaluated separately before combining: `&`
    # binds tighter than `==`, so `a == True & b` means `a == (True & b)`.
    spent_something = df['any_spend'] == True
    cryo_missing = df['CryoSleep'].isna()
    df.loc[spent_something & cryo_missing, 'CryoSleep'] = False
    df['CryoSleep'] = df['CryoSleep'].astype(int)
    # Show the number of remaining gaps; a bare expression inside a loop is discarded.
    display(df['CryoSleep'].isna().sum())

In [None]:
for df in dfs:
    # Sleeping passengers with an unknown total are given a total of 0.
    # The comparison is parenthesised because `&` binds tighter than `==`.
    asleep = df['CryoSleep'] == 1
    df.loc[asleep & df['total_spend'].isna(), 'total_spend'] = 0
    # Show how many totals are still missing; a bare expression inside a loop is discarded.
    display(df['total_spend'].isna().sum())

In [None]:
for df in dfs:
    # First known Cabin within each travel group (groups with no known cabin
    # are absent from this lookup).
    group_cabin = (
        df.dropna(subset=['Cabin'])
          .groupby('group_id')['Cabin']
          .first()
    )
    # Fill only the missing entries with their group's cabin; passengers whose
    # group has no known cabin stay NaN. This is vectorised and avoids the
    # `np.NaN` alias, which NumPy 2.0 removed.
    df['Cabin'] = df['Cabin'].fillna(df['group_id'].map(group_cabin))

In [None]:
for df in dfs:
    # Each cabin string is split on '/' into deck / number / side. Entries that
    # are floats (presumably NaN for a missing cabin) get three '-1'
    # placeholders so that every row yields exactly three parts.
    cabin_parts = np.array([
        ['-1', '-1', '-1'] if type(cabin) == float else cabin.split('/')
        for cabin in df['Cabin']
    ])
    for position, column in enumerate(['cabin_deck', 'cabin_num', 'cabin_side']):
        df[column] = cabin_parts[:, position]
    # Remove the raw column in place (df is an alias of the frame held in `dfs`).
    del df['Cabin']

In [None]:
for df in dfs:
    # Placeholder decks are replaced by 'F' or 'G' with equal probability.
    # NOTE(review): no random seed is set in the visible notebook, so the
    # imputed values differ between runs.
    deck_unknown = df['cabin_deck'] == '-1'
    df.loc[deck_unknown, 'cabin_deck'] = np.random.choice(
        ['F', 'G'], deck_unknown.sum(), p=[0.5, 0.5]
    )
    df['cabin_deck'] = df['cabin_deck'].astype('category')

In [None]:
for df in dfs:
    cabin_num = df['cabin_num'].astype(int)
    num_unknown = cabin_num == -1
    # Mean of the known cabin numbers of this frame (the -1 placeholders are excluded).
    known_mean = cabin_num[~num_unknown].mean()
    # `mask` upcasts to float as needed. Writing a float into the int64 column
    # through `.loc` is a deprecated incompatible-dtype assignment in recent pandas.
    df['cabin_num'] = cabin_num.mask(num_unknown, known_mean)

In [None]:
for df in dfs:
    # Placeholder sides are replaced by 'S' or 'P' with equal probability,
    # then the side is encoded numerically (S -> 0, P -> 1).
    # NOTE(review): no random seed is set in the visible notebook, so the
    # imputed values differ between runs.
    side_unknown = df['cabin_side'] == '-1'
    df.loc[side_unknown, 'cabin_side'] = np.random.choice(
        ['S', 'P'], side_unknown.sum(), p=[0.5, 0.5]
    )
    side_codes = {'S': 0, 'P': 1}
    df['cabin_side'] = df['cabin_side'].map(side_codes)

In [None]:
# Frequency of each destination. The frame is named explicitly: when the
# notebook runs top to bottom, the loop variable `df` left by the previous
# cell is the last element of `dfs`, i.e. test_data.
# NOTE(review): confirm the test set (rather than train_data) is the intended frame.
test_data['Destination'].value_counts()

In [None]:
# Candidate destinations and the fixed sampling weights used for missing values.
# NOTE(review): the weights are hard-coded rather than derived from the data —
# confirm they reflect the observed destination frequencies.
destination_labels = ['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22']
destination_weights = [0.5, 0.3, 0.2]

for df in dfs:
    destination_missing = df['Destination'].isna()
    df.loc[destination_missing, 'Destination'] = np.random.choice(
        destination_labels, destination_missing.sum(), p=destination_weights
    )
    df['Destination'] = df['Destination'].astype('category')

In [None]:
# Age distribution in 30 bins. The frame is named explicitly: when the notebook
# runs top to bottom, the leaked loop variable `df` is test_data at this point.
# NOTE(review): confirm the test set (rather than train_data) is the intended frame.
sns.histplot(test_data['Age'], bins=30)

In [None]:
# mean_age = df['Age'].mean()
# std_age = df['Age'].std()
# is_null = df['Age'].isnull().sum()
# random_sample = np.random.uniform(mean_age - std_age, mean_age + std_age, size=is_null)
# df.loc[df['Age'].isna(), 'Age'] = random_sample

In [None]:
for df in dfs:
    # Missing VIP flags take the most frequent value of this frame. The
    # assignment goes through `df.loc`: `df['VIP'].fillna(..., inplace=True)`
    # works on a temporary Series and does not update `df` under pandas
    # Copy-on-Write.
    most_common_vip = df['VIP'].mode()[0]
    df.loc[df['VIP'].isna(), 'VIP'] = most_common_vip
    df['VIP'] = df['VIP'].astype(int)

In [None]:
for df in dfs:
    # The result is assigned back instead of calling `fillna(..., inplace=True)`
    # on the column, which does not update `df` under pandas Copy-on-Write.
    df['Name'] = df['Name'].fillna('Unknown Unknown')
    df['PassengerId'] = df['PassengerId'].astype(str)
    # The helper spend columns are no longer needed. They are dropped in place
    # because `df` is an alias of the frame stored in `dfs`.
    df.drop(columns=['total_spend', 'any_spend'], inplace=True)

In [None]:
# inefficient code, takes too much time to run 
# df['group_size'] = df['group_id'].map(lambda x: df['group_id'].value_counts()[x])
# df['group_size'] = df['group_size'].astype(int)

In [None]:
for df in dfs:
    # Number of passengers sharing each group_id, looked up per row.
    members_per_group = df['group_id'].value_counts()
    df['group_size'] = df['group_id'].map(members_per_group)

In [None]:
# Numeric columns that are scaled and imputed together.
quant_features = [
    'Age',
    'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
    'group_size',
]

# Scale first, then impute with the 7 nearest neighbours.
quant_pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('imputer', KNNImputer(n_neighbors=7)),
])

In [None]:
# Fit the scaler and imputer on the training set only, then reuse the fitted
# pipeline for the test set. Re-fitting on test_data would scale and impute
# the two sets with different statistics, so the model would see test features
# on a different scale from the one it was trained on.
train_data[quant_features] = quant_pipeline.fit_transform(train_data[quant_features])
test_data[quant_features] = quant_pipeline.transform(test_data[quant_features])

In [None]:
# Names of all columns that were cast to the 'category' dtype in the cells above.
categorical_columns = train_data.select_dtypes(include='category').columns
cat_features = list(categorical_columns)
cat_features

In [None]:
# One-hot encode the categorical columns (first level dropped) and replace
# the originals with the indicator columns, appended on the right.
train_dummies = pd.get_dummies(train_data[cat_features], drop_first=True)
train_data = pd.concat([train_data.drop(columns=cat_features), train_dummies], axis=1)

In [None]:
# Same encoding for the test set: indicator columns (first level dropped)
# replace the original categorical columns.
test_dummies = pd.get_dummies(test_data[cat_features], drop_first=True)
test_data = pd.concat([test_data.drop(columns=cat_features), test_dummies], axis=1)

In [None]:
# df.columns
# summary(train_data)
# summary(test_data)

In [None]:
# Target column and feature matrix (identifier columns and the target removed).
y_train = train_data['Transported']
x_train = train_data.drop(columns=['Name', 'PassengerId', 'Transported'])
display(x_train.shape, y_train.shape)

In [None]:
# Drop the identifier columns, then align the feature columns with x_train.
# The indicator columns were generated separately for train and test, so a
# category present in only one of them would otherwise leave the two matrices
# with different columns. Indicators absent from the test set are filled with
# 0; columns unknown to the training matrix are discarded.
x_test = (
    test_data
    .drop(columns=['Name', 'PassengerId'])
    .reindex(columns=x_train.columns, fill_value=0)
)

In [None]:
# 80/20 train/validation split with a fixed seed. The labels are taken straight
# from train_data rather than from `y_train`, because this cell rebinds
# `y_train` to the training part of the split; using it as the input too would
# make a re-run pair the full x_train with already-split labels.
X_train, X_valid, y_train, y_valid = train_test_split(
    x_train, train_data['Transported'], test_size=0.2, random_state=123
)

In [None]:
# Hyper-parameter grid for the search below.
# The value lists multiply to 3 * 5 * 3 * 3 * 4 * 4 * 4 * 5 = 43,200 combinations.
params = dict(
    n_estimators=[100, 200, 400],
    learning_rate=[0.01, 0.1, 0.3, 0.5, 0.9],
    max_depth=[3, 5, 7],
    min_child_weight=[1, 3, 5],
    gamma=[0.1, 0.2, 0.3, 0.4],
    subsample=[0.6, 0.7, 0.8, 0.9],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9],
    reg_alpha=[1e-5, 1e-2, 0.1, 1, 100],
)

In [None]:
# Exhaustive search over `params` with 5-fold cross-validation, using a
# binary-logistic XGBoost classifier that runs on all available cores.
gSearch = GridSearchCV(
    estimator=xgb.XGBClassifier(n_jobs=-1, objective='binary:logistic'),
    cv=5,
    param_grid=params,
)

In [None]:
# Run the search: every combination in the grid is fitted and cross-validated
# on the training part of the split.
gSearch.fit(X_train, y_train)

In [None]:
# Report the best grid point, its mean cross-validated score, and the accuracy
# of the refitted best model on the hold-out split. The hold-out split is
# otherwise never used, so no score on unseen data would be reported.
valid_accuracy = accuracy_score(y_valid, gSearch.predict(X_valid))
gSearch.best_params_, gSearch.best_score_, valid_accuracy

In [None]:
# xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators =10, seed = 123)
# xg_cl.fit(X_train, y_train)