In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import * 
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

In [28]:
def clean_data(path, id_col = 0, is_test = False):
    df = pd.read_csv(path, index_col=id_col)
    cabins = df["Cabin"].tolist()
    decks = [np.NaN] * len(cabins)
    rows = [np.NaN] * len(cabins)
    sides = [np.NaN] * len(cabins)
    for i in range(len(cabins)):
        try:
            decks[i] = cabins[i][0:1]
            rows[i] = float(cabins[i][2:-2])
            sides[i] = cabins[i][-1:]
        except TypeError:
            continue
    df["Decks"] = decks
    df["Rows"] = rows
    df["Sides"] = sides
    deckValues = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'T': 7}
    sideValues = {'P': 0, 'S': 1}
    planets = {"Earth": 0, "Mars": 1, "Europa": 2}
    TorF = {True: 1, False: 0}
    destinations = {'TRAPPIST-1e': 0,'PSO J318.5-22': 1, '55 Cancri e': 2}
    names = df.Name.values.tolist()
    surnames = []
    for name in names:
        try:
            surname = name.split(" ")[-1].strip()
        except AttributeError:
            surname = np.NaN
        surnames.append(surname)
    surname_dict = {surnames[i] : i for i in range(len(surnames))}
    df["HomePlanet"] = df["HomePlanet"].replace(planets)
    for col in ["CryoSleep", "VIP"]:
        df[col] = df[col].replace(TorF)
    if not is_test:
       df["Transported"] = df["Transported"].replace(TorF) 
    df["Destination"] = df["Destination"].replace(destinations)
    df["Name"] = surnames
    df["Name"] = df["Name"].replace(surname_dict)
    df["Decks"] = df["Decks"].replace(deckValues)
    df["Sides"] = df["Sides"].replace(sideValues)
    new_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
            'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
            'Name', 'Decks', 'Rows', 'Sides', 'Transported'] if not is_test else ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
            'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
            'Name', 'Decks', 'Rows', 'Sides']
    df = df[new_cols]
    df = df.drop("Cabin", axis=1)

    return df


In [29]:
train_df = clean_data('data/train.csv', is_test = False)
train_df

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Decks,Rows,Sides,Transported
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0001_01,2.0,0.0,0.0,39.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0
0002_01,0.0,0.0,0.0,24.0,0.0,109.0,9.0,25.0,549.0,44.0,7623,5.0,0.0,1.0,1
0003_01,2.0,0.0,0.0,58.0,1.0,43.0,3576.0,0.0,6715.0,49.0,7873,0.0,0.0,1.0,0
0003_02,2.0,0.0,0.0,33.0,0.0,0.0,1283.0,371.0,3329.0,193.0,7873,0.0,0.0,1.0,0
0004_01,0.0,0.0,0.0,16.0,0.0,303.0,70.0,151.0,565.0,2.0,6446,5.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9276_01,2.0,0.0,2.0,41.0,1.0,0.0,6819.0,0.0,1643.0,74.0,8688,0.0,98.0,0.0,0
9278_01,0.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,8689,6.0,1499.0,1.0,0
9279_01,0.0,0.0,0.0,26.0,0.0,0.0,0.0,1872.0,1.0,0.0,8690,6.0,1500.0,1.0,1
9280_01,2.0,0.0,2.0,32.0,0.0,0.0,1049.0,0.0,353.0,3235.0,8692,4.0,608.0,1.0,0


In [30]:
test_df = clean_data('data/test.csv', is_test = True)
test_df

Unnamed: 0_level_0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Decks,Rows,Sides
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0013_01,0.0,1.0,0.0,27.0,0.0,0.0,0.0,0.0,0.0,0.0,3953,6.0,3.0,1.0
0018_01,0.0,0.0,0.0,19.0,0.0,0.0,9.0,0.0,2823.0,0.0,1,5.0,4.0,1.0
0019_01,2.0,1.0,2.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,2,2.0,0.0,1.0
0021_01,2.0,0.0,0.0,38.0,0.0,0.0,6652.0,0.0,181.0,585.0,3,2.0,1.0,1.0
0023_01,0.0,0.0,0.0,20.0,0.0,10.0,0.0,635.0,0.0,0.0,1604,5.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9266_02,0.0,1.0,0.0,34.0,0.0,0.0,0.0,0.0,0.0,0.0,4272,6.0,1496.0,1.0
9269_01,0.0,0.0,0.0,42.0,0.0,0.0,847.0,17.0,10.0,144.0,4273,,,
9271_01,1.0,1.0,2.0,,0.0,0.0,0.0,0.0,0.0,0.0,4274,3.0,296.0,0.0
9273_01,2.0,0.0,,,0.0,0.0,2680.0,0.0,0.0,523.0,4275,3.0,297.0,0.0


In [34]:
train = train_df.values

X_train = train[:, :-1]
y_train = train[:, -1]

In [None]:
# Imputation
knn_imp = KNNImputer(missing_values=np.nan, n_neighbors = 4)
X_train_imp = knn_imp.fit_transform(X_train)

In [35]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_train_scaled

array([[ 1.53406268, -0.74724474, -0.62974294, ..., -1.8589168 ,
        -1.17296632, -1.00970093],
       [-0.84252407, -0.74724474, -0.62974294, ...,  0.39064002,
        -1.17296632,  0.99039228],
       [ 1.53406268, -0.74724474, -0.62974294, ..., -2.42130601,
        -1.17296632,  0.99039228],
       ...,
       [-0.84252407, -0.74724474, -0.62974294, ...,  0.95302922,
         1.75765363,  0.99039228],
       [ 1.53406268, -0.74724474,  1.80872029, ..., -0.17174919,
         0.01491164,  0.99039228],
       [ 1.53406268, -0.74724474, -0.62974294, ..., -0.17174919,
         0.01491164,  0.99039228]])

In [None]:
you see me typing