## Notebook d'application du modèle pour la génération de l'output

In [10]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

In [11]:
train_df = pd.read_csv("./data/train.csv", sep=",")
test_df = pd.read_csv("./data/test.csv", sep=",")

In [14]:
def prepare_df(df):
    prepared_df = (
        df
        .copy(deep=True)
        .drop(columns=["Name", "Sex", "Ticket", "Cabin", "Embarked"])
        .dropna()
    )
    return prepared_df

In [35]:
prepared_train_df = train_df.pipe(prepare_df)
prepared_test_df = test_df.pipe(prepare_df)

X_train = prepared_train_df.drop(columns=["Survived"])
y_train = prepared_train_df.loc[:, "Survived"]
X_test = prepared_test_df.copy(deep=True)

model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [16]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0,

In [41]:
# Valeurs prédites par le modèle
y_pred_series = pd.Series(name="Survived", data=y_pred, index=X_test.PassengerId)
y_pred_series

PassengerId
892     0
893     0
894     0
895     0
896     0
       ..
1301    0
1303    1
1304    0
1306    1
1307    0
Name: Survived, Length: 331, dtype: int64

In [49]:
# Valeurs éliminées avant la prédiction
eliminated_mask = ~test_df.PassengerId.isin(y_pred_series.index)
eliminated_df = test_df[eliminated_mask]

eliminated_pred_series = pd.Series(name="Survived", data=[0]*eliminated_df.shape[0], index=eliminated_df.PassengerId)
eliminated_pred_series

PassengerId
902     0
914     0
921     0
925     0
928     0
       ..
1300    0
1302    0
1305    0
1308    0
1309    0
Name: Survived, Length: 87, dtype: int64

In [53]:
survival_pred_series = pd.concat([y_pred_series, eliminated_pred_series]).sort_index()
survival_pred_series.to_csv("./data/submission.csv", index=True)