In [28]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [11]:
data_dir = f'{os.getcwd()}/data/'

ss_titanic_train_raw = pd.read_csv('/Users/jonyarber/Documents/Projects/spaceship_titanic/data/train.csv')
ss_titanic_test_raw = pd.read_csv('/Users/jonyarber/Documents/Projects/spaceship_titanic/data/test.csv')

In [12]:
def make_dummies(df, var_list):
    
    for var in var_list:
        
        df = pd.concat([df,
                        pd.get_dummies(df[var],
                                       prefix = var,
                                       dtype = 'int')],
                       axis = 1)

    df = df.drop(var_list, axis = 1)
        
    return df

In [14]:
def clean_dfs(df):

        # Create a copy of DF
        df = df.copy()

        # Convert True/False columns to binary
        binary_vars = ['CryoSleep', 'VIP', 'Transported']

        # 'Transported' won't be in test DF
        binary_vars = list(set(df.columns).intersection(binary_vars))

        #df.loc[:, binary_vars] = df[binary_vars].map(lambda x: 1 if x == True else 0 if x == False else x)
    
        for var in binary_vars:
            df[var] = [1 if x == True else 0 if x == False else x for x in df[var]]

    
        #### Billing ####
        # Billing Vars
        billing_vars = ['RoomService', 'FoodCourt', 'ShoppingMall', 'VRDeck', 'Spa']

        # Fill the NAs with 0
        df.loc[:, billing_vars] = df[billing_vars].fillna(0) 

        # Create TotalSpent
        df['TotalSpent'] = df[billing_vars].sum(axis = 1)

    
        #### Group ####
        # Check if in group
        df['GroupNo'] = [x.split('_')[0] for x in df['PassengerId']]
        df['InGroup'] = np.where((df['GroupNo'] == df['GroupNo'].shift(1))  | (df['GroupNo'] == df['GroupNo'].shift(-1)) , 1, 0)

    
        #### Cabin ####
        # Separate out cabin
        df[['Deck', 'CabinNo', 'Side']] = df['Cabin'].str.split("/", expand = True)


        #### Home Planet ####
        # Some Home Planets can be determined by Deck
        # If Deck is A-C, Europa
        df.loc[df['Deck'].isin(['A', 'B', 'C']), 'HomePlanet'] = df['HomePlanet'].fillna('Europa')

        # If Deck is D, Mars
        df.loc[df['Deck'] == 'D', 'HomePlanet'] = df['HomePlanet'].fillna('Mars')

        # If Deck is G, Earth
        df.loc[df['Deck'] == 'G', 'HomePlanet'] = df['HomePlanet'].fillna('Earth')


        #### Age ####
        # Impute Age based on Home Planet
        median_age_by_planet = df.groupby('HomePlanet')['Age'].median()

        for planet in median_age_by_planet.index:
            df.loc[df['HomePlanet'] == planet, 'Age'] = df['Age'].fillna(median_age_by_planet[planet])

        # Fill remaining with median age
        df['Age'] = df['Age'].fillna(df['Age'].median())


        #### Cryo Sleep ####
        # If money was spent, not in CryoSleep (where NA)
        df['CryoSleep'] = df.apply(lambda x: 0 if x['TotalSpent'] > 0 & pd.isna(x['CryoSleep']) else x['CryoSleep'], axis = 1)
    
        # If Age < 12 and no money spent, not in CryoSleep (where NA)
        df['CryoSleep'] = df.apply(lambda x: 1 if x['TotalSpent'] == 0 & int(x['Age']) < 12 & pd.isna(x['CryoSleep']) else x['CryoSleep'], axis = 1)
    
        # Fill rest of CyroSleep with 1 (no money spent - in CryoSleep)
        df['CryoSleep'] = df['CryoSleep'].fillna(1)

    
        #### VIP ####
        # Most people didn't travel VIP - can fill NA with 0
        df['VIP'] = df['VIP'].fillna(0)


        #### Final Clean Up ####
        # Clean the Destinations for final DF
        df['Destination'] = df['Destination'].apply(lambda x: re.sub('[^A-Za-z0-9]+', '', str(x)).upper() if pd.notnull(x) else np.nan)

        # Drop Cabin, Name, and GroupNo
        df.drop(['GroupNo', 'Name', 'Cabin', 'CabinNo'], axis = 1, inplace = True)

        # Dummy categorical variables
        df = make_dummies(df, ['HomePlanet', 'Destination', 'Deck', 'Side'])

        return df

In [77]:
def df_prep(df, scaler):
    
    vars_to_drop = ['PassengerId', 'Transported']
    
    vars_to_drop = list(set(vars_to_drop).intersection(df.columns))
    
    df_copy = df.copy().drop(vars_to_drop, axis = 1)
    
    # Scale
    if scaler == 'standard':
        df_scaled = StandardScaler().fit_transform(df_copy)
        df_scaled = pd.DataFrame(df_scaled, columns = df_copy.columns)
        
    if scaler == 'minmax':
        df_scaled = MinMaxScaler().fit_transform(df_copy)
        df_scaled = pd.DataFrame(df_scaled, columns = df_copy.columns)
    
    if 'Transported' in vars_to_drop:
        df_scaled['Transported'] = df['Transported']
     
    return df_scaled

In [56]:
def run_random_state_generator(df, n_states):
    
    best_score = 0
    worst_score = 1

    X = df.copy().drop('PassengerId', axis = 1)
    y = X.pop('Transported')

    X = df_prep(X, 'minmax')
      
    xgb = XGBClassifier(n_estimators = 150, random_state = 0)

    for n in np.arange(n_states + 1):

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, stratify = y, random_state = n)

        xgb.fit(X_train, y_train)
        
        y_pred = xgb.predict(X_test)

        score = accuracy_score(y_pred, y_test)

        if score > best_score:
            best_score = score
            best_ran_state = n

        if score < worst_score:
            worst_score = score
            worst_ran_state = n

    return best_ran_state, worst_ran_state, best_score, worst_score

In [25]:
ss_titanic_train, ss_titanic_test = [clean_dfs(df) for df in [ss_titanic_train_raw, ss_titanic_test_raw]]

In [115]:
best_random_state, worst_random_state, best_score, worst_score = run_random_state_generator(ss_titanic_train, 10000)

In [116]:
best_random_state #9447 - unscaled

9447

In [117]:
worst_random_state # 2317 - unscaled

2317

In [118]:
best_score

0.8292121909143185

In [119]:
worst_score

0.7625071880391029

Best Random State

In [120]:
X = df_prep(ss_titanic_train.copy(), 'minmax')
y = X.pop('Transported')

X_train_good, X_test_good, y_train_good, y_test_good = train_test_split(X, y, stratify = y, random_state = best_random_state)

In [121]:
# Good score
X.mean() - X_train_good.mean()

CryoSleep                 0.000764
Age                      -0.001211
VIP                      -0.000578
RoomService              -0.000136
FoodCourt                -0.000202
ShoppingMall              0.000036
Spa                      -0.000255
VRDeck                   -0.000203
TotalSpent               -0.000493
InGroup                   0.001789
HomePlanet_Earth         -0.001864
HomePlanet_Europa        -0.001294
HomePlanet_Mars           0.003160
Destination_55CANCRIE    -0.001711
Destination_PSOJ318522   -0.000011
Destination_TRAPPIST1E    0.001647
Deck_A                    0.001991
Deck_B                   -0.002426
Deck_C                   -0.000585
Deck_D                    0.001144
Deck_E                   -0.000625
Deck_F                    0.004642
Deck_G                   -0.003370
Deck_T                   -0.000192
Side_P                    0.001095
Side_S                   -0.000517
dtype: float64

In [122]:
xgb = XGBClassifier(n_estimators = 150, random_state = 0)
xgb.fit(X_train_good, y_train_good)
pd.DataFrame({'fis':xgb.feature_importances_}, index = X.columns).sort_values(by = 'fis', ascending = False)

Unnamed: 0,fis
HomePlanet_Earth,0.235229
CryoSleep,0.104588
TotalSpent,0.081048
Deck_E,0.068311
HomePlanet_Mars,0.046809
FoodCourt,0.033976
ShoppingMall,0.032126
Side_S,0.030786
HomePlanet_Europa,0.029978
Deck_C,0.029831


Worst Random state

In [123]:
X = df_prep(ss_titanic_train.copy(), 'minmax')
y = X.pop('Transported')

X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(X, y, stratify = y, random_state = worst_random_state)

In [124]:
# Bad score
X.mean() - X_train_bad.mean()

CryoSleep                 0.002911
Age                      -0.000755
VIP                       0.000649
RoomService              -0.000224
FoodCourt                 0.000203
ShoppingMall              0.000153
Spa                      -0.000464
VRDeck                   -0.000261
TotalSpent               -0.000285
InGroup                   0.004550
HomePlanet_Earth          0.002124
HomePlanet_Europa        -0.003135
HomePlanet_Mars           0.001472
Destination_55CANCRIE    -0.002938
Destination_PSOJ318522    0.006125
Destination_TRAPPIST1E   -0.003108
Deck_A                   -0.000770
Deck_B                    0.000028
Deck_C                   -0.000892
Deck_D                   -0.001464
Deck_E                    0.003210
Deck_F                    0.000193
Deck_G                    0.001385
Deck_T                   -0.000192
Side_P                    0.000788
Side_S                    0.000710
dtype: float64

In [125]:
xgb = XGBClassifier(n_estimators = 150, random_state = 0)
xgb.fit(X_train_bad, y_train_bad)
pd.DataFrame({'fis':xgb.feature_importances_}, index = X.columns).sort_values(by = 'fis', ascending = False)

Unnamed: 0,fis
HomePlanet_Earth,0.282607
Deck_E,0.086452
TotalSpent,0.080857
CryoSleep,0.075383
HomePlanet_Mars,0.04807
Side_S,0.035816
Deck_C,0.034211
FoodCourt,0.033005
HomePlanet_Europa,0.030099
ShoppingMall,0.029342
