In [None]:
# import libraries
from sklearn import linear_model, ensemble, neural_network as neural_network_module
from sklearn import model_selection, preprocessing, metrics, pipeline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import joblib

In [None]:
def exploratory_data_analysis(df: pd.DataFrame, inplace=False):
    """Normalise column names and dtypes of a raw Titanic frame.

    Column names are lower-cased, ``pclass``, ``sex`` and ``embarked`` are
    converted to pandas categoricals with a fixed set of categories, and a
    new categorical ``deck`` column is derived from the first character of
    ``cabin``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with (case-insensitive) ``pclass``, ``sex``, ``embarked`` and
        ``cabin`` columns.
    inplace : bool, default False
        When False a copy of ``df`` is modified and returned; when True the
        passed frame itself is modified.

    Returns
    -------
    pd.DataFrame
        The transformed frame (the same object as ``df`` when ``inplace``).
    """
    if not inplace:
        df = df.copy()

    # Work with lower-case column names from here on.
    df.columns = [name.lower() for name in df.columns]

    # Convert the appropriate columns to categoricals with explicit categories;
    # values outside the listed categories become missing.
    pclass_labels = df["pclass"].astype(str)
    df["pclass"] = pd.Categorical(pclass_labels, categories=["1", "2", "3"])
    df["sex"] = pd.Categorical(df["sex"], categories=["male", "female"])
    df["embarked"] = pd.Categorical(df["embarked"], categories=["C", "Q", "S"])

    # The deck is the first character of the cabin code; passengers without a
    # recorded cabin get the explicit "Unknown" category.
    def deck_of(cabin):
        return "Unknown" if pd.isna(cabin) else cabin[0]

    deck_letters = df["cabin"].map(deck_of)
    df["deck"] = pd.Categorical(
        deck_letters,
        categories=["A", "B", "C", "D", "E", "F", "T", "G", "Unknown"],
    )

    # Note: sibsp and parch are left numeric. Both have few unique values and
    # could alternatively be encoded as ordered categoricals.
    return df

    
def data_engineering(df: pd.DataFrame, df_info : dict, inplace=False):
    """Index, prune, impute and extend a frame produced by the EDA step.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with lower-case columns ``passengerid``, ``name``, ``cabin``,
        ``ticket``, ``age``, ``embarked``, ``fare``, ``sibsp`` and ``parch``.
    df_info : dict
        Imputation values, read under the keys ``'age_median'``,
        ``'embarked_mode'`` and ``'fare_median'``.
    inplace : bool, default False
        When False a copy of ``df`` is modified and returned; when True the
        passed frame itself is modified.

    Returns
    -------
    pd.DataFrame
        Frame indexed by ``passengerid``, without ``name``/``cabin``/``ticket``,
        with missing ``age``/``embarked``/``fare`` filled and a boolean
        ``alone`` column added.
    """
    if not inplace:
        df = df.copy()
    df.set_index('passengerid', inplace=True)

    # remove columns
    df.drop(columns=['name', 'cabin', 'ticket'], inplace=True)

    # fill nulls
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that form operates on a temporary
    # column selection and does not update df under pandas Copy-on-Write.
    df['age'] = df['age'].fillna(df_info['age_median'])
    df['embarked'] = df['embarked'].fillna(df_info['embarked_mode'])
    df['fare'] = df['fare'].fillna(df_info['fare_median'])

    # A passenger is alone when travelling with no siblings/spouse and no parents/children.
    df['alone'] = (df['sibsp'] == 0) & (df['parch'] == 0)

    return df

In [None]:
# Load the raw training data.
train_df = pd.read_csv('/kaggle/input/titanic/train.csv')

# Normalise column names and dtypes first so the statistics below are
# computed on the lower-cased columns.
df = exploratory_data_analysis(train_df)

# Imputation values taken from the training data; data_engineering reads
# them by key when filling missing values.
df_info = {
    'age_median': df['age'].median(),
    'embarked_mode': df['embarked'].mode().iloc[0],
    'fare_median': df['fare'].median(),
}

df = data_engineering(df, df_info)
df.head()

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the engineered training frame.
df.info()

In [None]:
# Split the engineered frame into its categorical features and its
# float-valued numerical features.
categorical = df.select_dtypes(include='category')
categorical_columns = categorical.columns.tolist()

# The target is dropped before selecting so it cannot end up among the features.
numerical = df.drop(columns='survived').select_dtypes(include='float64')
numerical_columns = numerical.columns.tolist()

In [None]:
# One-hot encode the categorical features (dense array output).
encoder = preprocessing.OneHotEncoder(sparse_output=False)
encoder.fit(categorical)

# Name every encoded column "<column>_<category>", walking the fitted
# categories column by column.
categorical_encoded_columns = [
    f'{column}_{category}'
    for column, categories in zip(categorical_columns, encoder.categories_)
    for category in categories
]
print(categorical_encoded_columns)
encoder.transform(categorical)

In [None]:
# Fit a standard scaler on the numerical features; the fitted scaler is
# reused later by scale_numerical.
scaler = preprocessing.StandardScaler().fit(numerical)
scaler.transform(numerical)

In [None]:
# useful functions

def as_function_transformer(func, *args, **kwargs):
    """Wrap ``func`` in a scikit-learn ``FunctionTransformer``.

    Usable as a decorator: the decorated name then refers to the transformer
    object rather than to the plain function. Any extra positional or keyword
    arguments are forwarded to ``FunctionTransformer`` unchanged.
    """
    transformer = preprocessing.FunctionTransformer(func, *args, **kwargs)
    return transformer

@as_function_transformer
def eda_process(df: pd.DataFrame):
    """Pipeline step that applies ``exploratory_data_analysis`` to ``df``.

    The decorator replaces this function with the ``FunctionTransformer``
    returned by ``as_function_transformer``, so ``eda_process`` is a
    transformer object. ``inplace`` is left at its default, so the input
    frame is copied rather than modified.
    """
    return exploratory_data_analysis(df)

@as_function_transformer
def data_engineering_process(df: pd.DataFrame):
    """Pipeline step that applies ``data_engineering`` to ``df``.

    Uses the module-level ``df_info`` dictionary for the imputation values.
    The decorator replaces this function with the ``FunctionTransformer``
    returned by ``as_function_transformer``.
    """
    return data_engineering(df, df_info)
    
@as_function_transformer
def scale_numerical(df: pd.DataFrame):
    """Pipeline step that standardises the numerical columns of ``df``.

    The columns listed in the module-level ``numerical_columns`` are
    transformed with the already-fitted module-level ``scaler`` and placed
    first in the result; all remaining columns follow unchanged.

    The result keeps ``df``'s index. The scaled frame is built on that same
    index so ``pd.concat`` lines the rows up by label, and the index is never
    converted into a column that would reach the model as a feature.
    """
    numerical_scaled = pd.DataFrame(
        scaler.transform(df[numerical_columns]),
        columns=numerical_columns,
        index=df.index,
    )

    df_complement = df.drop(columns=numerical_columns)

    return pd.concat([numerical_scaled, df_complement], axis=1)

@as_function_transformer
def encode_categorical(df : pd.DataFrame):
    """Pipeline step that one-hot encodes the categorical columns of ``df``.

    The columns listed in the module-level ``categorical_columns`` are
    transformed with the already-fitted module-level ``encoder`` and named
    after ``categorical_encoded_columns``; they come first in the result,
    followed by all remaining columns unchanged.

    The result keeps ``df``'s index. The encoded frame is built on that same
    index so ``pd.concat`` lines the rows up by label, and no column of row
    positions is added to the feature matrix.
    """
    categorical_encoded = pd.DataFrame(
        encoder.transform(df[categorical_columns]),
        columns=categorical_encoded_columns,
        index=df.index,
    )

    df_complement = df.drop(columns=categorical_columns)

    return pd.concat([categorical_encoded, df_complement], axis=1)
    

In [None]:
# split data into train and test data
# A fixed random_state keeps the hold-out split, and therefore every score
# computed on it, identical across notebook re-runs.
X, y = train_df.drop(columns='Survived'), train_df['Survived']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, random_state=42)

# build processing steps
# The same transformer objects are shared by every pipeline built below.
processing_steps = [
    ('exploratory_data_analysis', eda_process),
    ('data_engineering', data_engineering_process),
    ('scaler', scale_numerical),
    ('encoder', encode_categorical)
]

In [None]:
# Baseline: the preprocessing steps followed by a plain logistic regression.
baseline_model = linear_model.LogisticRegression(max_iter=1000)
steps = [*processing_steps, ('model', baseline_model)]

pipe = pipeline.Pipeline(steps)
pipe.fit(X_train, y_train)

# Score the baseline on the hold-out split.
accuracy = metrics.accuracy_score(y_test, pipe.predict(X_test))
print(f'Accuracy: {accuracy}')

In [None]:
# Logistic regression: grid search over penalty and solver with 5-fold cross validation.
param_grid = {
    'penalty': ['l2', None],
    'solver': ['lbfgs', 'newton-cg', 'sag', 'saga'],
    'max_iter': [8000],
}
gscv = model_selection.GridSearchCV(linear_model.LogisticRegression(), param_grid, cv=5)

# The search object is the final pipeline step, so it is fitted on the
# output of the preprocessing steps.
steps = [*processing_steps, ('model', gscv)]
pipe = pipeline.Pipeline(steps)
pipe.fit(X_train, y_train)

# Keep the winning parameters for the model comparison further down.
logistic_regression_params = gscv.best_params_
gscv.best_score_

In [None]:
# Neural network: grid search over layer layout and activation with 5-fold cross validation.
param_grid = {
    'hidden_layer_sizes': [(50, 50), (50, 50, 50), (100, 50, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'max_iter': [1000],
}
gscv = model_selection.GridSearchCV(neural_network_module.MLPClassifier(), param_grid, cv=5)

# The search object is the final pipeline step, so it is fitted on the
# output of the preprocessing steps.
steps = [*processing_steps, ('model', gscv)]
pipe = pipeline.Pipeline(steps)
pipe.fit(X_train, y_train)

# Keep the winning parameters for the model comparison further down.
neural_network_params = gscv.best_params_
gscv.best_score_

In [None]:
# Random forest: grid search over forest size and tree shape with 5-fold cross validation.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [10, 50, 100, 200, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 3, 5],
}
gscv = model_selection.GridSearchCV(ensemble.RandomForestClassifier(), param_grid, cv=5)

# The search object is the final pipeline step, so it is fitted on the
# output of the preprocessing steps.
steps = [*processing_steps, ('model', gscv)]
pipe = pipeline.Pipeline(steps)
pipe.fit(X_train, y_train)

# Keep the winning parameters for the model comparison further down.
random_forest_params = gscv.best_params_
gscv.best_score_

In [None]:
# Compare the tuned models on the hold-out split and keep the best one.
# Each entry pairs an estimator class with the parameters found by its grid search.
models = {
    'LogisticRegression': {
        'estimator_class': linear_model.LogisticRegression,
        'params': logistic_regression_params,
    },
    'NeuralNetwork': {
        'estimator_class': neural_network_module.MLPClassifier,
        'params': neural_network_params,
    },
    'RandomForest': {
        'estimator_class': ensemble.RandomForestClassifier,
        'params': random_forest_params,
    },
}

# (name, accuracy) of the best model so far; -1 is below any possible accuracy.
best_model_score = (None, -1)
for model, model_info in models.items():
    candidate = model_info['estimator_class'](**model_info['params'])
    steps = processing_steps + [('model', candidate)]
    pipe = pipeline.Pipeline(steps)
    pipe.fit(X_train, y_train)
    score = metrics.accuracy_score(y_test, pipe.predict(X_test))
    # Strict comparison: on a tie the model listed first is kept.
    if score > best_model_score[1]:
        best_model_score = (model, score)
    print(f':: {model} : {score}')

# Build a fresh, unfitted estimator of the winning kind.
best_model_info = models[best_model_score[0]]
best_model = best_model_info['estimator_class'](**best_model_info['params'])
print(f'\n Best model: {best_model}')

# Refit the winning configuration on the complete training data.
steps = processing_steps + [('model', best_model)]
final_pipe = pipeline.Pipeline(steps)
final_pipe.fit(X, y)

In [None]:
# output

# read test csv
test_df = pd.read_csv('/kaggle/input/titanic/test.csv')

# make predictions
result = final_pipe.predict(test_df)

# assign() returns a new frame, so the predictions are never written into a
# selection of test_df (the pattern that triggers SettingWithCopyWarning).
output = test_df[['PassengerId']].assign(Survived=result)

# save dataset
output.to_csv('output.csv', index=False)
output.head()