In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from IPython.display import display

In [2]:
# Fixed seed so the shuffle is identical on every run. Keras' validation_split
# takes the trailing rows of the training matrix, so an unseeded shuffle would
# change the validation set (and every reported score) between runs.
SHUFFLE_SEED = 42
train_data = (
    pd.read_csv("data/train.csv")
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)
test_data = pd.read_csv("data/test.csv")

In [3]:
def clean_data(data):
    """Turn a raw passenger table into an all-numeric feature frame.

    Two groups of columns are derived before encoding:

    * ``Group``: the text of ``PassengerId`` from the sixth character on.
    * ``Deck`` / ``Side``: the first and third ``/``-separated parts of
      ``Cabin`` (missing cabins stay missing).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PassengerId``, ``HomePlanet``, ``CryoSleep``, ``Cabin``,
        ``Destination``, ``Age``, ``RoomService``, ``FoodCourt``,
        ``ShoppingMall``, ``Spa`` and ``VRDeck``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data`` with columns sorted by name. Numeric columns have
        missing values replaced by that column's mean; every other column is
        expanded into one float 0/1 column per distinct non-missing value,
        named ``<column>_<value>``.

    Notes
    -----
    Means and category lists are computed from ``data`` itself, so two
    different input frames can produce different columns.
    """
    def feature_creation(data):
        def passengerid(feature):
            # Reuse the caller's index so the later column-wise concat lines
            # rows up by label rather than by a fresh 0..n-1 range.
            return pd.DataFrame(
                {"Group": [value[5:] for value in data[feature]]},
                index=data.index,
            )

        def cabin(feature):
            # pd.isna covers None and any NaN object, not only the np.nan
            # singleton that an identity check would recognise.
            parts = [
                None if pd.isna(value) else value.split("/")
                for value in data[feature]
            ]
            return pd.DataFrame(
                {
                    "Deck": [np.nan if p is None else p[0] for p in parts],
                    "Side": [np.nan if p is None else p[2] for p in parts],
                },
                index=data.index,
            )

        return_feature = lambda feature: data[feature]
        feature_func = {
            "PassengerId": passengerid,
            "HomePlanet": return_feature,
            "CryoSleep": return_feature,
            "Cabin": cabin,
            "Destination": return_feature,
            "Age": return_feature,
            "RoomService": return_feature,
            "FoodCourt": return_feature,
            "ShoppingMall": return_feature,
            "Spa": return_feature,
            "VRDeck": return_feature,
        }
        return [func(key) for key, func in feature_func.items()]

    def onehotencoding(f_df):
        pieces = []
        for col in f_df.columns:
            series = f_df[col]
            # Any integer/float width counts as numeric; booleans are kept on
            # the categorical path so they still expand to *_True / *_False.
            is_number = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            )
            if is_number:
                pieces.append(series.fillna(series.mean()))
                continue
            for uniq_value in series.unique():
                if pd.isna(uniq_value):
                    continue
                indicator = (series == uniq_value).astype(float)
                pieces.append(indicator.rename(f"{col}_{uniq_value}"))
        if not pieces:
            return pd.DataFrame(index=f_df.index)
        # Single concat instead of growing the frame one column at a time.
        temp = pd.concat(pieces, axis=1)
        return temp.reindex(temp.columns.sort_values(), axis=1)

    feature_df = pd.concat(feature_creation(data), axis=1)
    return onehotencoding(feature_df)

In [4]:
def new_clean_data(data):
    """Turn a raw passenger table into an all-numeric feature frame.

    Same pipeline as ``clean_data`` with one extra derived column, ``Num``
    (the middle part of ``Cabin`` parsed as an integer). Derived columns:

    * ``Group``: the text of ``PassengerId`` from the sixth character on.
    * ``Deck`` / ``Num`` / ``Side``: the three ``/``-separated parts of
      ``Cabin`` (missing cabins stay missing).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``PassengerId``, ``HomePlanet``, ``CryoSleep``, ``Cabin``,
        ``Destination``, ``Age``, ``RoomService``, ``FoodCourt``,
        ``ShoppingMall``, ``Spa`` and ``VRDeck``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``data`` with columns sorted by name. Numeric columns have
        missing values replaced by that column's mean; every other column is
        expanded into one float 0/1 column per distinct non-missing value,
        named ``<column>_<value>``.

    Notes
    -----
    Means and category lists are computed from ``data`` itself, so two
    different input frames can produce different columns.
    """
    def feature_creation(data):
        def passengerid(feature):
            # Reuse the caller's index so the later column-wise concat lines
            # rows up by label rather than by a fresh 0..n-1 range.
            return pd.DataFrame(
                {"Group": [value[5:] for value in data[feature]]},
                index=data.index,
            )

        def cabin(feature):
            # pd.isna covers None and any NaN object, not only the np.nan
            # singleton that an identity check would recognise.
            parts = [
                None if pd.isna(value) else value.split("/")
                for value in data[feature]
            ]
            return pd.DataFrame(
                {
                    "Deck": [np.nan if p is None else p[0] for p in parts],
                    "Num": [np.nan if p is None else int(p[1]) for p in parts],
                    "Side": [np.nan if p is None else p[2] for p in parts],
                },
                index=data.index,
            )

        return_feature = lambda feature: data[feature]
        feature_func = {
            "PassengerId": passengerid,
            "HomePlanet": return_feature,
            "CryoSleep": return_feature,
            "Cabin": cabin,
            "Destination": return_feature,
            "Age": return_feature,
            "RoomService": return_feature,
            "FoodCourt": return_feature,
            "ShoppingMall": return_feature,
            "Spa": return_feature,
            "VRDeck": return_feature,
        }
        return [func(key) for key, func in feature_func.items()]

    def onehotencoding(f_df):
        pieces = []
        for col in f_df.columns:
            series = f_df[col]
            # Any integer/float width counts as numeric (so an all-present
            # integer ``Num`` is never one-hot encoded); booleans are kept on
            # the categorical path so they still expand to *_True / *_False.
            is_number = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            )
            if is_number:
                pieces.append(series.fillna(series.mean()))
                continue
            for uniq_value in series.unique():
                if pd.isna(uniq_value):
                    continue
                indicator = (series == uniq_value).astype(float)
                pieces.append(indicator.rename(f"{col}_{uniq_value}"))
        if not pieces:
            return pd.DataFrame(index=f_df.index)
        # Single concat instead of growing the frame one column at a time.
        temp = pd.concat(pieces, axis=1)
        return temp.reindex(temp.columns.sort_values(), axis=1)

    feature_df = pd.concat(feature_creation(data), axis=1)
    return onehotencoding(feature_df)

In [5]:
# Encode both splits with the same pipeline; the label column is removed first
# so the two frames carry the same raw features.
clean_train_data = new_clean_data(train_data.drop(columns=["Transported"]))
# One-hot columns are derived from the values present in each frame, so a
# category that appears in only one split would leave the two matrices with
# different columns. Align the test frame to the training layout: indicator
# columns missing from the test split become all-zero, extras are dropped.
clean_test_data = new_clean_data(test_data).reindex(
    columns=clean_train_data.columns, fill_value=0.0
)

In [6]:
from sklearn.preprocessing import MinMaxScaler

In [7]:
# Learn the per-column min/max from train and test stacked together, so both
# splits are mapped with one shared scaling.
scaler = MinMaxScaler().fit(
    pd.concat([clean_train_data, clean_test_data], axis=0)
)
x_train, x_test = (
    scaler.transform(clean_train_data),
    scaler.transform(clean_test_data),
)
# Target as an (n, 1) column: 1 where Transported is truthy, 0 otherwise.
y_train = np.where(train_data.Transported.to_numpy(), 1, 0).reshape(-1, 1)
x_train.shape, y_train.shape, x_test.shape

((8693, 33), (8693, 1), (4277, 33))

In [8]:
from keras.models import Sequential, load_model
from keras.layers import Dense, Input, Dropout
from keras.activations import relu, sigmoid
from keras.optimizers import Adam
from keras.losses import BinaryCrossentropy
from keras.callbacks import EarlyStopping

In [9]:
def create_model(layers, input_dim=None):
    """Build an (uncompiled) feed-forward binary classifier.

    Parameters
    ----------
    layers : iterable
        Layer specification read in order. A float adds ``Dropout(rate)``;
        an integer adds ``Dense(units, activation=relu)``.
    input_dim : int, optional
        Number of input features. Defaults to ``x_train.shape[1]`` so existing
        calls that pass only ``layers`` keep working.

    Returns
    -------
    keras.models.Sequential
        The stacked model, ending in a single sigmoid unit.

    Raises
    ------
    TypeError
        If an entry of ``layers`` is neither a float nor an integer. Skipping
        it silently would build a different network than the one requested.
    """
    if input_dim is None:
        input_dim = x_train.shape[1]

    model = Sequential()
    # ``shape`` is a tuple of dimensions (batch axis excluded), not a bare int.
    model.add(Input(shape=(input_dim,)))
    for spec in layers:
        # bool is a subclass of int; reject it explicitly rather than treating
        # True/False as a one/zero-unit Dense layer.
        if isinstance(spec, bool):
            raise TypeError(f"Unsupported layer spec {spec!r}: expected float or int")
        if isinstance(spec, (float, np.floating)):
            model.add(Dropout(float(spec)))
        elif isinstance(spec, (int, np.integer)):
            model.add(Dense(int(spec), activation=relu))
        else:
            raise TypeError(f"Unsupported layer spec {spec!r}: expected float or int")
    model.add(Dense(1, activation=sigmoid))
    return model

In [10]:
# Early-stopping rule shared by the training runs: watch validation accuracy
# (higher is better), require an improvement of at least `min_delta`, give up
# after `patience` epochs without one, and roll back to the best weights seen.
callback = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0.0001,
    patience=20,
    verbose=1,
    mode="max",
    restore_best_weights=True
)

In [14]:
# Each entry is one architecture to try, in the format create_model expects
# (floats are dropout rates, ints are Dense widths).
layers_arr = [
    [0.1,1024,1024,1024,1024],
]
# Per-architecture training logs, combined once after the loop instead of
# re-concatenating the growing frame on every iteration.
history_frames = []
for layer_spec in layers_arr:
    model = create_model(layer_spec)
    model.compile(optimizer=Adam(), loss=BinaryCrossentropy(), metrics=["accuracy"])
    history = model.fit(
        x_train,
        y_train,
        epochs=100,
        shuffle=True,
        batch_size=10,
        validation_split=0.1,
        verbose=1,
        callbacks=[callback],
    )
    epoch_log = pd.DataFrame(history.history)
    # Two-level index: (architecture label without the surrounding brackets,
    # epoch number), so runs can be selected with all_history.loc[label].
    run_label = str(layer_spec)[1:-1]
    history_frames.append(
        epoch_log.set_index([[run_label] * epoch_log.shape[0], epoch_log.index])
    )
all_history = pd.concat(history_frames) if history_frames else pd.DataFrame()

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 45: early stopping


In [None]:
# Persist the collected training logs so they can be inspected later without retraining.
all_history.to_pickle("model_testing_results_dropout.pkl")

In [None]:
# Reload the training logs written by the to_pickle cell, replacing the in-memory frame.
# NOTE: unpickling can execute arbitrary code; only read pickle files you produced yourself.
all_history = pd.read_pickle("model_testing_results_dropout.pkl")

In [None]:
# One subplot per recorded metric, one line per training run.
nb_col = 1
metric_names = all_history.columns
# Ceiling division: enough rows to hold every metric at nb_col per row.
n_rows = -(-len(metric_names) // nb_col)
fig = make_subplots(rows=n_rows, cols=nb_col, subplot_titles=metric_names)

run_labels = all_history.index.get_level_values(0).unique()
for position, metric in enumerate(metric_names):
    # Fill the grid left to right, top to bottom (plotly cells are 1-based).
    grid_row, grid_col = divmod(position, nb_col)
    for run_label in run_labels:
        trace = go.Scatter(
            y=all_history.loc[run_label, metric],
            mode="lines+markers",
            name=run_label,
        )
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(height=1500 // nb_col)
fig.show()

In [None]:
# Top 50 (run, epoch) rows ranked by validation accuracy, best first.
all_history.sort_values(by="val_accuracy", ascending=False).head(50)

In [17]:
# Write the current network to an HDF5 file.
# NOTE(review): the number in the file name is presumably a score for this model — confirm.
model.save("model_0.80780.hdf5")

In [None]:
# Replace the in-memory network with one loaded from disk.
# NOTE(review): this file name differs from the one passed to model.save in this notebook — confirm which file is intended.
model = load_model("model1.hdf5")

In [15]:
# Loss and compiled metrics on the full training matrix; this is not a held-out estimate.
model.evaluate(x_train, y_train)



[0.3655907213687897, 0.8256068229675293]

In [None]:
# Network outputs for the scaled test matrix, flattened to one dimension.
# NOTE: `prediction` is reassigned by later model cells; the submission cell uses whichever ran last.
prediction = model.predict(x_test).reshape(-1)

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
# Logistic regression on the scaled features; the (n, 1) target is flattened to 1-D for the estimator.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train, y_train.reshape(-1))

In [13]:
# Score on the same data the model was fitted on, so this is not a held-out estimate.
logreg.score(x_train, y_train)

0.7723455653974463

In [14]:
# Pair each feature column name with its fitted coefficient (x_train was produced from clean_train_data, so the order matches).
list(zip(clean_train_data.columns, logreg.coef_[0]))

[('Age', -0.9214733715215058),
 ('CryoSleep_False', -0.551744088765526),
 ('CryoSleep_True', 1.3500144996706276),
 ('Deck_A', -0.3864996390218637),
 ('Deck_B', 0.4687110689660122),
 ('Deck_C', 0.6835940306360718),
 ('Deck_D', 0.019243026467506738),
 ('Deck_E', -0.26832646172707336),
 ('Deck_F', 0.15654923443992338),
 ('Deck_G', -0.21405342704288402),
 ('Deck_T', -0.16024728630302268),
 ('Destination_55 Cancri e', 0.035599806781618226),
 ('Destination_PSO J318.5-22', -0.3165992972540114),
 ('Destination_TRAPPIST-1e', -0.3173018316754298),
 ('FoodCourt', 4.753345077891745),
 ('Group_01', -0.028268218220210983),
 ('Group_02', 0.05486841186073854),
 ('Group_03', 0.2847833041748906),
 ('Group_04', 0.16167792512313078),
 ('Group_05', -0.1501668033648223),
 ('Group_06', 0.09507211935676513),
 ('Group_07', -0.2266600577103713),
 ('Group_08', -0.13765978576819163),
 ('HomePlanet_Earth', -0.34316243141680086),
 ('HomePlanet_Europa', 0.7841459313266589),
 ('HomePlanet_Mars', 0.08499345834105249),

In [15]:
# Logistic-regression predictions for the test matrix.
# NOTE: overwrites `prediction`; the submission cell uses whichever model cell ran last.
prediction = logreg.predict(x_test)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 30% of the training rows for validation. The fixed seed pins the
# split so that validation scores are comparable between runs.
x_tr, x_va, y_tr, y_va = train_test_split(
    x_train, y_train, test_size=0.3, random_state=42
)
x_tr.shape, x_va.shape, y_tr.shape, y_va.shape

((6085, 33), (2608, 33), (6085, 1), (2608, 1))

In [12]:
from catboost import CatBoostClassifier

In [13]:
# Candidate values per CatBoost hyper-parameter.
# NOTE(review): no cell visible in this notebook reads `params` — confirm it is still needed.
params = dict(
    depth=[3, 1, 2, 6, 4, 5],
    iterations=[250, 100, 500],
    learning_rate=[0.03, 0.001, 0.01, 0.1, 0.2, 0.3],
    l2_leaf_reg=[3, 1, 5, 10, 100],
    border_count=[32, 5, 10, 20, 50, 100, 200],
    bagging_temperature=[0.03, 0.09, 0.25, 0.75],
    random_strength=[0.2, 0.5, 0.8],
    max_ctr_complexity=[1, 2, 3, 4, 5],
)

In [45]:
# CatBoost classifier fitted on the training part of the hold-out split, with training output silenced.
# NOTE(review): these settings are hard-coded; the `params` grid defined earlier is not consulted here.
catb = CatBoostClassifier(
    iterations=2000,
    learning_rate=0.5,
    random_strength=0.1,
)
catb.fit(x_tr, y_tr, verbose=0)

<catboost.core.CatBoostClassifier at 0x2358b3ca5c0>

In [46]:
# Score on the 30% validation part of the split, with the target flattened to 1-D.
catb.score(x_va, y_va.reshape(-1))

0.8052147239263804

In [16]:
# CatBoost predictions for the test matrix.
# NOTE: overwrites `prediction`; the submission cell uses whichever model cell ran last.
prediction = catb.predict(x_test)

In [17]:
# Anything at or below 0.5 becomes False, everything else True. This works for
# both probability outputs and 0/1 class labels held in `prediction`.
prediction_bool = np.logical_not(prediction <= 0.5)
# One row per test passenger, keyed by PassengerId, with the boolean label.
submission = pd.DataFrame(
    {"Transported": prediction_bool},
    index=pd.Index(test_data.PassengerId.values, name="PassengerId"),
)
submission.to_csv("submission.csv")