Initialization

In [2]:
import pandas as pd
import numpy as np
import csv
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

Data Cleaning

In [3]:
from sklearn.preprocessing import OneHotEncoder

def wrangle(df):
    df["group"] = df["PassengerId"].str.split("_", expand = True)[0]
    df = df.drop(columns=['Name','PassengerId'])
    
    df["deck"] = df["Cabin"].str.split("/", expand = True)[0]
    #df["num"] = df["Cabin"].str.split("/", expand = True)[1]
    df["side"] = df["Cabin"].str.split("/", expand = True)[2]
    df = df.drop(columns=['Cabin'])

    bool_features = ["CryoSleep","VIP"]
    for feature in bool_features:
        df[feature] *= 1
    df = df.drop(columns=['VIP'])
    
    df = pd.get_dummies(df, columns = ['deck', 'side','Destination','HomePlanet'])
    return df

Data Exploration & Visualization

In [4]:
df = wrangle(pd.read_csv("train.csv"))
df.head()

Unnamed: 0,CryoSleep,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,group,deck_A,...,deck_G,deck_T,side_P,side_S,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars
0,0,39.0,0.0,0.0,0.0,0.0,0.0,False,1,0,...,0,0,1,0,0,0,1,0,1,0
1,0,24.0,109.0,9.0,25.0,549.0,44.0,True,2,0,...,0,0,0,1,0,0,1,1,0,0
2,0,58.0,43.0,3576.0,0.0,6715.0,49.0,False,3,1,...,0,0,0,1,0,0,1,0,1,0
3,0,33.0,0.0,1283.0,371.0,3329.0,193.0,False,3,1,...,0,0,0,1,0,0,1,0,1,0
4,0,16.0,303.0,70.0,151.0,565.0,2.0,True,4,0,...,0,0,0,1,0,0,1,1,0,0


Modelling

In [5]:
target = "Transported"
y_train = df[target]
X_train = df[df.columns.drop(target)]

In [6]:
from sklearn.metrics import mean_absolute_error
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
mean_absolute_error(y_train, y_pred_baseline)

0.49997373897071506

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

model = make_pipeline(
    OneHotEncoder(handle_unknown = 'ignore'),
    SimpleImputer(),
    StandardScaler(with_mean=False),
    Ridge()
)
model.fit(X_train, y_train)
y_pred1 = model.predict(X_train)
mean_absolute_error(y_train, y_pred1 * 1)

0.028966898306554165

In [8]:
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier

def create_baseline():
	model = Sequential()
	model.add(Dense(20, input_dim=24, activation='relu'))
	model.add(Dense(10, activation='relu'))
	model.add(Dense(3, activation='relu'))
	model.add(Dense(1, activation='sigmoid'))
	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model

model2 = make_pipeline(
    SimpleImputer(),
    StandardScaler(with_mean=False),
    KerasClassifier(create_baseline, epochs=100, batch_size=50, verbose=0)
)
model2.fit(X_train, y_train)

Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('kerasclassifier',
                 KerasClassifier(batch_size=50, epochs=100, model=<function create_baseline at 0x000001E801E3C040>, verbose=0))])

In [9]:
y_pred1 = model2.predict_proba(X_train)
y_pred2 = []
for e in y_pred1:
    y_pred2.append(e[1])
mean_absolute_error(y_pred2, y_train)

0.2319693

Submit Answer

In [111]:
test_data = pd.read_csv("test.csv")
X_test = wrangle(test_data)
y_test = model2.predict_proba(X_test)
print(y_test)

[[4.2670792e-01 5.7329208e-01]
 [9.8510075e-01 1.4899224e-02]
 [8.5067749e-04 9.9914932e-01]
 ...
 [1.0036230e-03 9.9899638e-01]
 [3.2745457e-01 6.7254543e-01]
 [3.3000606e-01 6.6999394e-01]]


In [112]:
ans = pd.DataFrame({'PassengerId':[],'Transported':[]})
ptr = 0
for index, row in test_data.iterrows():
    s = "True"
    if y_test[ptr][0] > y_test[ptr][1]:
        s = "False"
    tmp = pd.DataFrame({'PassengerId':[row['PassengerId']],'Transported':[s]})
    ptr += 1
    ans = ans.append(tmp, ignore_index = True)
ans.to_csv('neural_net.csv', index=False) #change name here
ans.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False
