In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

In [2]:
# Seed for the row shuffle below, so that re-running the notebook yields the
# same training-row order instead of a fresh random permutation each time.
SEED = 42

# Load both splits; the training rows are shuffled (frac=1 keeps every row).
train_data = pd.read_csv("data/train.csv").sample(frac=1, random_state=SEED)
test_data = pd.read_csv("data/test.csv")

In [3]:
# Split each frame into its numeric part and the remaining columns.
# NOTE: the column list used to cut BOTH "categorical" frames comes from the
# test frame, so a numeric column that exists only in train is kept in
# train_cat_data as well as in train_num_data. Later cells rely on this:
# they drop "Transported" from both of those frames.
test_num_data = test_data._get_numeric_data()
numeric_columns = test_num_data.columns
test_cat_data = test_data.drop(columns=numeric_columns)

train_num_data = train_data._get_numeric_data()
train_cat_data = train_data.drop(columns=numeric_columns)

In [4]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler

In [5]:
# One instance of each preprocessing transformer, all with default settings;
# they are reused by the preprocessing cells that follow.
scaler = MinMaxScaler()
le = LabelEncoder()
onehot = OneHotEncoder()

In [6]:
# Mean-impute missing numeric values, each frame with its own column means.
#
# The previous version took `.index` of the boolean "has nulls" Series, which
# returns every column label (the mask was never applied), and then wrote
# into the frames in place. Here the mask really selects the affected columns
# and `fillna` returns a new frame, so the cell is safe to re-run and never
# writes through to the frame the numeric data was derived from.
null_col = train_num_data.columns[train_num_data.isnull().any()]
train_num_data = train_num_data.fillna(train_num_data[null_col].mean())

null_col = test_num_data.columns[test_num_data.isnull().any()]
test_num_data = test_num_data.fillna(test_num_data[null_col].mean())

In [7]:
# Fit the scaler on the numeric features of train and test stacked together
# (the "Transported" column is removed from the training part first).
train_num_features = train_num_data.drop(columns=["Transported"])
all_num_features = pd.concat([train_num_features, test_num_data], axis=0)
scaler.fit(all_num_features)

In [8]:
# Apply the fitted scaler to each split; the training frame is scaled
# without its "Transported" column.
train_num_features = train_num_data.drop(columns=["Transported"])
sca_train_num_data = scaler.transform(train_num_features)
sca_test_num_data = scaler.transform(test_num_data)

In [9]:
# Target as a column vector of 0/1 integers: 1 where Transported is True.
transported_mask = train_num_data["Transported"] == True
y_train = np.where(transported_mask, 1, 0).reshape(-1, 1)

In [10]:
# Number of rows in the training categorical frame.
len(train_cat_data)

8693

In [11]:
# Stack the categorical columns of both splits into one frame. Each part gets
# a two-level index (split label, original row label) so the splits can be
# separated again with .loc["train"] / .loc["test"].
train_part = train_cat_data.drop(columns=["Transported"])
train_part = train_part.set_index([["train"] * len(train_part), train_part.index])

test_part = test_cat_data.set_index([["test"] * len(test_cat_data), test_cat_data.index])

cat_data = pd.concat([train_part, test_part])

In [12]:
# Integer-encode the categorical columns that are actually used as features.
# PassengerId, Cabin and Name are discarded, so they are dropped BEFORE the
# encoding step instead of being encoded and thrown away afterwards.
# `apply` calls `le.fit_transform` once per column, re-fitting the same
# encoder each time, so `le` only remembers the mapping of the last column.
lab_cat_data = (
    cat_data
    .drop(columns=["PassengerId", "Cabin", "Name"])
    .apply(le.fit_transform)
)

In [13]:
# Split the encoded frame back into its two parts via the outer index level.
train_lab_cat_data, test_lab_cat_data = (
    lab_cat_data.loc[split_name] for split_name in ("train", "test")
)

In [14]:
# One-hot encode the label-encoded categorical features.
#
# The encoder is fitted ONCE, on train and test stacked together (the same
# way the scaler is fitted in this notebook), and then only `transform` is
# applied to each split. Re-fitting on the test split, as before, gives the
# test matrix its own column layout: if a label were missing from one split,
# the two matrices would have different or shifted columns.
onehot.fit(pd.concat([train_lab_cat_data, test_lab_cat_data], axis=0))
train_one_cat_data = onehot.transform(train_lab_cat_data).toarray()
test_one_cat_data = onehot.transform(test_lab_cat_data).toarray()

In [15]:
# Final design matrices: scaled numeric columns followed by the one-hot
# columns, joined side by side. The last line displays both shapes.
x_train = np.hstack((sca_train_num_data, train_one_cat_data))
x_test = np.hstack((sca_test_num_data, test_one_cat_data))
x_train.shape, x_test.shape

((8693, 20), (4277, 20))

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout
from keras.activations import relu, sigmoid
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy

In [17]:
# Fully connected binary classifier: three ReLU hidden layers (64 -> 32 -> 16)
# followed by a single sigmoid output unit.
model = Sequential([
    # Input width is read from the feature matrix instead of being hard-coded,
    # and is passed as a tuple: newer Keras releases reject a bare integer
    # for `shape`.
    Input(shape=(x_train.shape[1],)),
    Dense(64, activation=relu),
    Dense(32, activation=relu),
    Dense(16, activation=relu),
    Dense(1, activation=sigmoid),
])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 64)                1344      
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 1)                 17        
                                                                 
Total params: 3,969
Trainable params: 3,969
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Configure training: Adam with default settings, binary cross-entropy loss,
# accuracy reported per epoch.
model.compile(
    optimizer=Adam(),
    loss=BinaryCrossentropy(),
    metrics=["accuracy"],
)
# Train for 20 epochs on the full training matrix; the returned History
# object is the cell's displayed value.
model.fit(x=x_train, y=y_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x212dde99210>

In [19]:
# Flatten the model outputs to one value per test row, then threshold them:
# a row is True unless its output is <= 0.5.
probabilities = model.predict(x_test).reshape(-1)
prediction = ~(probabilities <= 0.5)



In [20]:
# Submission table: one boolean "Transported" column, indexed by the
# PassengerId values of the test frame.
passenger_ids = pd.Index(test_data["PassengerId"].values, name="PassengerId")
submission = pd.DataFrame({"Transported": prediction}, index=passenger_ids)

In [21]:
# Write the submission to disk; the PassengerId index becomes the first column.
submission.to_csv("submission.csv", index=True)