In [1]:
import pandas as pd

from datetime import datetime

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from xgboost import XGBClassifier

import numpy as np

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/gender_submission.csv')
print(f"Size of the training set: {train.shape[0]}")
print(f"Size of the test set: {test.shape[0]}")

Size of the training set: 891
Size of the test set: 418


In [79]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [80]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [81]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


# EDA

Done with Rapid Miner, cool tool and faster to check everything compare to using pandas.

## Feature Engineering

In [3]:
def fill_age(grouped_median_train, row):
    condition = (
        (grouped_median_train['Sex'] == row['Sex']) & 
        (grouped_median_train['Title'] == row['Title']) & 
        (grouped_median_train['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_train[condition]['Age'].values[0]


def process_age(df,grouped_median_train):
    # a function that fills the missing values of the Age variable
    df['Age'] = df.apply(lambda row: fill_age(grouped_median_train,row) if np.isnan(row['Age']) else row['Age'], axis=1)
    return df

def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

def feature_engineering(df):
    df.drop('PassengerId', axis=1, inplace=True)  
        
    df.drop('Ticket', axis=1, inplace=True)  
    df['Cabin'] = df['Cabin'].astype(str).str.extract(r'([A-Za-z])', expand=False)
    
    # Feature Engineer 'Name' feature.
    
    Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
    }
    
    df['Title'] = df['Name'].apply(extract_title)
    df['Title'] = df.Title.map(Title_Dictionary)
    df.drop('Name', axis=1, inplace=True) 
    
    # Feature Engineering for the 'Age' feature.
    grouped_train = df.groupby(['Sex', 'Pclass', 'Title'])    
    grouped_median_train = grouped_train['Age'].median().reset_index()
    df = process_age(df,grouped_median_train)
        
    df['Age_bin'] = pd.cut(df['Age'], bins=[0,12,20,40,120], labels=['Children','Teenage','Adult','Elder'])
    df.drop('Age', axis=1, inplace=True)
        
    # Feature Engineering for SibSP and Parch.
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['Singleton'] = df['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    df['SmallFamily'] = df['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
    df['LargeFamily'] = df['FamilySize'].map(lambda s: 1 if 5 <= s else 0)
    
    df['Fare_bin'] = pd.cut(df['Fare'], bins=[0,7.91,14.45,31,120], labels=['Low_fare','median_fare','Average_fare','high_fare'])
    df.drop('Fare', axis=1, inplace=True)  
    
    return df

## Preprocessing

In [4]:
train = feature_engineering(train)
test = feature_engineering(test)

In [5]:
numeric_features = ['Pclass', 'SibSp', 'Parch', 'FamilySize', 'Singleton','SmallFamily', 'LargeFamily']
categorical_features = ['Embarked','Title','Age_bin','Fare_bin', 'Sex', 'Cabin']

In [6]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Cabin,Embarked,Title,Age_bin,FamilySize,Singleton,SmallFamily,LargeFamily,Fare_bin
0,0,3,male,1,0,n,S,Mr,Adult,2,0,1,0,Low_fare
1,1,1,female,1,0,C,C,Mrs,Adult,2,0,1,0,high_fare
2,1,3,female,0,0,n,S,Miss,Adult,1,1,0,0,median_fare
3,1,1,female,1,0,C,S,Mrs,Adult,2,0,1,0,high_fare
4,0,3,male,0,0,n,S,Mr,Adult,1,1,0,0,median_fare


In [7]:
X = train.drop('Survived', axis=1)
y = train['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Validating set shape: {X_val.shape}")

Training set shape: (712, 13)
Validating set shape: (179, 13)


In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        # Pipeline for numeric features
        ('num', Pipeline([
            # Impute missing values in numeric columns with the median of the column
            ('imputer', SimpleImputer(strategy='median')),
            # Scale numeric features to have mean=0 and standard deviation=1
            ('scaler', StandardScaler())
        ]), numeric_features),
        
        # Pipeline for categorical features
        ('cat', Pipeline([
            # Impute missing values in categorical columns with the string 'missing'
            ('imputer', SimpleImputer(strategy='constant', fill_value='M')),
            # Convert categorical features to string type
            ('to_string', FunctionTransformer(lambda x: x.astype(str))),
            # One-hot encode categorical features, ignoring unknown categories
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Create a full pipeline that applies the preprocessor to the dataset
full_pipeline = Pipeline([
    ('preprocessor', preprocessor)  # Apply the preprocessor to the data
])

In [9]:
X_train = full_pipeline.fit_transform(X_train)
X_val = full_pipeline.transform(X_val)

In [10]:
test = full_pipeline.transform(test)

## Model building

### Logistic Regression

In [108]:
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Define the parameter grid for GridSearch
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'class_weight': [None, 'balanced']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    verbose=0,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions on the test set
lr_best_model = grid_search.best_estimator_

predictions_train = lr_best_model.predict(X_train)
predictions_val = lr_best_model.predict(X_val)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

y_pred = lr_best_model.predict(test)



Best parameters found:  {'C': 0.1, 'class_weight': None, 'penalty': 'l2', 'solver': 'newton-cg'}
Best cross-validation score: 0.83
Train accuracy: 0.8286516853932584.
Validation accuracy: 0.8212290502793296.


1560 fits failed out of a total of 2400.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/linear_model/_log

### Random Forest

In [38]:
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    verbose=0,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions on the test set
rf_best_model = grid_search.best_estimator_

predictions_train = rf_best_model.predict(X_train)
predictions_val = rf_best_model.predict(X_val)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

predictions_rfc = rf_best_model.predict(test)

6480 fits failed out of a total of 12960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4646 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File

Best parameters found:  {'bootstrap': True, 'class_weight': None, 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}
Best cross-validation score: 0.84
Train accuracy: 0.8497191011235955.
Validation accuracy: 0.8212290502793296.


## XGBoost

In [39]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.001,0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

xgb_best_model = grid_search.best_estimator_
predictions_xgb = xgb_best_model.predict(test)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Best parameters found:  {'colsample_bytree': 1.0, 'gamma': 0.0, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 300, 'subsample': 0.9}
Best cross-validation score: 0.84


Parameters: { "use_label_encoder" } are not used.



## SVC

In [40]:
# Define the SVC model
svc = SVC()

# Define the parameter grid for GridSearch
param_grid = {
    'C': [0.1, 1, 10, 50],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'degree': [2, 3, 4],  # Only relevant for poly kernel
    'class_weight': [None, 'balanced']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=0,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions on the test set
svc_best_model = grid_search.best_estimator_
predictions_svc = svc_best_model.predict(test)

svc_best_model = grid_search.best_estimator_

predictions_train = svc_best_model.predict(X_train)

predictions_val = svc_best_model.predict(X_val)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

Best parameters found:  {'C': 1, 'class_weight': None, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf'}
Best cross-validation score: 0.83
Train accuracy: 0.8342696629213483.
Validation accuracy: 0.8268156424581006.


## Voting Classifier

In [11]:
# Define parameter grids for each model
param_grids = {
    # 'log_reg': {
    #     'C': [0.1, 1, 10, 100],
    #     'penalty': ['l1', 'l2'],
    #     'solver': ['liblinear', 'newton-cg']
    # }, 
    'log_reg': {
        'C': [0.01, 0.1, 1, 10, 100, 1000],
        'penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'solver': ['liblinear', 'newton-cg', 'sag', 'saga', 'lbfgs'],
        'max_iter': [100, 200, 300, 500],
        'class_weight': [None, 'balanced']
    },
    'rf_clf': {
        'n_estimators': [10, 50, 100],
        'max_depth': [10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'xgb_clf': {
        'n_estimators': [10, 50, 100],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 6, 9]
    },
    'dt': {
        'max_depth': [3,5,7],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'knn': {
        'n_neighbors': [3, 5, 7],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    },
    'svc': {
        'C': [0.1, 1],
        'gamma': [1, 0.1, 0.01, 0.001],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
    }
}

# Define the models
models = {
    'log_reg': LogisticRegression(),
    'rf_clf': RandomForestClassifier(random_state=42),
    'xgb_clf': XGBClassifier(random_state=42),
    'dt': tree.DecisionTreeClassifier(random_state=1),
    'knn': KNeighborsClassifier(),
    'svc': SVC(probability=True)
}

# Dictionary to store the best models
best_params = {}

# Perform GridSearchCV for each model
for model_name, model in models.items():
    if model_name in param_grids:
        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, verbose=0, n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_params[model_name] = grid_search.best_estimator_
        print(f"Best parameters for {model_name}: {grid_search.best_params_}")
        print(f"Best score for {model_name}: {grid_search.best_score_}")
    else:
        model.fit(X_train, y_train)
        best_params[model_name] = model

3120 fits failed out of a total of 4800.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/linear_model/_log

Best parameters for log_reg: {'C': 0.1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score for log_reg: 0.8243967300305328
Best parameters for rf_clf: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 50}
Best score for rf_clf: 0.8370530877573131
Best parameters for xgb_clf: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 50}
Best score for xgb_clf: 0.8328474342558849
Best parameters for dt: {'max_depth': 7, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best score for dt: 0.8230276765488034
Best parameters for knn: {'algorithm': 'ball_tree', 'n_neighbors': 7, 'weights': 'uniform'}
Best score for knn: 0.8201713779178569
Best parameters for svc: {'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
Best score for svc: 0.8300108342361863


In [12]:
# Initialize again the new the models
log_reg =  LogisticRegression(C=0.1, max_iter=100 ,penalty='l2', solver='liblinear')
rf_clf = RandomForestClassifier(max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=50, random_state=42)
xgb_clf = XGBClassifier(learning_rate=0.1, max_depth=6, n_estimators=50, random_state=42)

dt = tree.DecisionTreeClassifier(max_depth=7, min_samples_leaf=4, min_samples_split=2, random_state=1)
knn = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=7, weights='uniform')
svc =  SVC(C=0.1, gamma=0.1, kernel='linear', probability=True)

voting_clf = VotingClassifier(estimators=[
    ('lr', log_reg),
    ('rf', rf_clf),
    ('xgb', xgb_clf),
    ('dt', dt),
    ('knn', knn),
    ('svc', svc)
], voting='soft')

# Train the models
log_reg.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)
dt.fit(X_train, y_train)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)
voting_clf.fit(X_train, y_train)

# Make predictions
log_reg_train_preds = log_reg.predict(X_train)
log_reg_test_preds = log_reg.predict(X_val)

rf_clf_train_preds = rf_clf.predict(X_train)
rf_clf_test_preds = rf_clf.predict(X_val)

xgb_clf_train_preds = xgb_clf.predict(X_train)
xgb_clf_test_preds = xgb_clf.predict(X_val)

dt_train_preds = dt.predict(X_train)
dt_test_preds = dt.predict(X_val)

knn_train_preds = knn.predict(X_train)
knn_test_preds = knn.predict(X_val)

svc_train_preds = svc.predict(X_train)
svc_test_preds = svc.predict(X_val)

voting_train_preds = voting_clf.predict(X_train)
voting_test_preds = voting_clf.predict(X_val)

# Calculate accuracies
log_reg_train_acc = accuracy_score(y_train, log_reg_train_preds)
log_reg_test_acc = accuracy_score(y_val, log_reg_test_preds)

rf_clf_train_acc = accuracy_score(y_train, rf_clf_train_preds)
rf_clf_test_acc = accuracy_score(y_val, rf_clf_test_preds)

xgb_clf_train_acc = accuracy_score(y_train, xgb_clf_train_preds)
xgb_clf_test_acc = accuracy_score(y_val, xgb_clf_test_preds)

dt_train_acc = accuracy_score(y_train, dt_train_preds)
dt_test_acc = accuracy_score(y_val, dt_test_preds)

knn_train_acc = accuracy_score(y_train, knn_train_preds)
knn_test_acc = accuracy_score(y_val, knn_test_preds)

svc_train_acc = accuracy_score(y_train, svc_train_preds)
svc_test_acc = accuracy_score(y_val, svc_test_preds)

voting_train_acc = accuracy_score(y_train, voting_train_preds)
voting_test_acc = accuracy_score(y_val, voting_test_preds)

In [13]:
data_2 = {
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost', 'Decision Tree', 'K-Nearest Neighbors', 'SVC', 'Voting Classifier'],
    'Training Accuracy': [log_reg_train_acc, rf_clf_train_acc, xgb_clf_train_acc, dt_train_acc, knn_train_acc, svc_train_acc, voting_train_acc],
    'Test Accuracy': [log_reg_test_acc, rf_clf_test_acc, xgb_clf_test_acc, dt_test_acc, knn_test_acc, svc_test_acc, voting_test_acc]
}

accuracy_table_2 = pd.DataFrame(data_2)
accuracy_table_2 = accuracy_table_2.sort_values(by='Test Accuracy', ascending=False)

# Print the table
print(accuracy_table_2)  

                 Model  Training Accuracy  Test Accuracy
1        Random Forest           0.882022       0.837989
3        Decision Tree           0.873596       0.837989
4  K-Nearest Neighbors           0.858146       0.826816
6    Voting Classifier           0.867978       0.826816
0  Logistic Regression           0.830056       0.821229
5                  SVC           0.830056       0.815642
2              XGBoost           0.882022       0.810056


In [14]:
log_reg_final_predictions = log_reg.predict(test)
rf_final_predictions = rf_clf.predict(test)
xgb_final_predictions = xgb_clf.predict(test)
dt_final_predictions = dt.predict(test)
knn_final_predictions = knn.predict(test)
svc_final_predictions = svc.predict(test)
voting_final_predictions = voting_clf.predict(test)

### Deep learning

In [43]:
model = Sequential([
    Dense(units=4096, activation='relu', name='L1'),  # Increased units
    Dense(units=2048, activation='relu', name='L2'),  # Increased units
    Dense(units=1024, activation='relu', name='L3'),  # Increased units
    Dense(units=512, activation='relu', name='L4'),  # Increased units
    Dense(units=256, activation='relu', name='L5'),  # Increased units
    Dense(units=128, activation='relu', name='L6'),  # Increased units
    Dense(units=64, activation='relu', name='L7'),  # Increased units
    Dense(units=32, activation='relu', name='L8'),  # Increased units
    Dense(units=16, activation='relu', name='L9'),  # Increased units
    Dense(units=1, activation='sigmoid', name='L10')  # Output layer
])

In [44]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2), loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='val_loss',  # You can also use 'val_accuracy'
    patience=10,  # Number of epochs to wait for improvement
    restore_best_weights=True  # Restore the best model weights
)

model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_val, y_val),  # Include validation data
    callbacks=[early_stopping]  # Include the early stopping callback
)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 73ms/step - accuracy: 0.5231 - loss: 2.5759 - val_accuracy: 0.5866 - val_loss: 0.6795
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.6222 - loss: 0.6347 - val_accuracy: 0.5866 - val_loss: 0.5470
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.6584 - loss: 0.5409 - val_accuracy: 0.8045 - val_loss: 0.5782
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.8256 - loss: 0.5125 - val_accuracy: 0.8156 - val_loss: 0.5575
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8157 - loss: 0.5223 - val_accuracy: 0.7989 - val_loss: 0.4738
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.8302 - loss: 0.4516 - val_accuracy: 0.8101 - val_loss: 0.4745
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x3129334a0>

In [45]:
model.summary()

In [46]:
predictions_train = model.predict(X_train)
predictions_train = (predictions_train >= 0.5).astype(int)

predictions_val = model.predict(X_val)
predictions_val = (predictions_val >= 0.5).astype(int)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Train accuracy: 0.8356741573033708.
Validation accuracy: 0.8379888268156425.


## Make predictions

In [47]:
predictions = model.predict(test)
predictions = (predictions >= 0.5).astype(int)
predictions = predictions.flatten()
print(predictions)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 1 0 0 1
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 1]


## Submit predictions

In [15]:
choosen_model_name = 'log_res'

submission = pd.DataFrame({
    'PassengerId': pd.read_csv('data/test.csv')['PassengerId'],  # Ensure PassengerId is correctly handled
    'Survived': log_reg_final_predictions  # or log_reg_test_preds, xgb_clf_test_preds
})

# Get the current date and time
now = datetime.now()
# Format the date and time as a string
date_time_str = now.strftime("%Y%m%d_%H%M%S")

# Save the DataFrame to a CSV file with the date and time in the filename
submission.to_csv(f'output/submission_{choosen_model_name}_{date_time_str}.csv', index=False)