In [12]:
import pandas as pd

from datetime import datetime

import numpy as np

import seaborn as sns

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import accuracy_score

In [5]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/gender_submission.csv')
print(f"Size of the training set: {train.shape[0]}")
print(f"Size of the test set: {test.shape[0]}")

Size of the training set: 891
Size of the test set: 418


In [60]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [61]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [62]:
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


# EDA

FE: 
Name
Ticket, get only the number
Cabin, have cabin, cabin number, and cabin letter. 

## Feature Engineering

In [27]:
def extract_title(name):
    return name.split(',')[1].split('.')[0].strip()

def feature_engineering(df):
    df.drop('PassengerId', axis=1, inplace=True)  
    
    df['Title'] = df['Name'].apply(extract_title)
    df.drop('Name', axis=1, inplace=True)  
    
    df.drop('Ticket', axis=1, inplace=True)  
    df.drop('Cabin', axis=1, inplace=True)  
    
    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona', 'the Countess'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    df['Title'] = df['Title'].replace('Master', 'YoungBoy')

    df['Age_bin'] = pd.cut(df['Age'], bins=[0,12,20,40,120], labels=['Children','Teenage','Adult','Elder'])
    df.drop('Age', axis=1, inplace=True)  
    
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    
    df['Fare_bin'] = pd.cut(df['Fare'], bins=[0,7.91,14.45,31,120], labels=['Low_fare','median_fare','Average_fare','high_fare'])
    df.drop('Fare', axis=1, inplace=True)  

    return df

In [6]:
def status(feature):
    print('Processing', feature, ': ok')

# Separating the target variable from the train set
targets = train['Survived']
train.drop(['Survived'], axis=1, inplace=True)

# Merging train and test data for future feature engineering
# Also removing the 'PassengerId' as it's not an informative feature
combined = pd.concat([train, test], ignore_index=True)
combined.drop(['PassengerId'], axis=1, inplace=True)

print(combined.shape)

(1309, 10)


In [7]:
titles = set()
for name in combined['Name']:
    titles.add(name.split(',')[1].split('.')[0].strip())

print(titles)
# set(['Sir', 'Major', 'the Countess', 'Don', 'Mlle', 'Capt', 'Dr', 'Lady', 'Rev', 'Mrs', 'Jonkheer', 'Master', 'Ms', 'Mr', 'Mme', 'Miss', 'Col'])

Title_Dictionary = {
    "Capt": "Officer",
    "Col": "Officer",
    "Major": "Officer",
    "Jonkheer": "Royalty",
    "Don": "Royalty",
    "Sir" : "Royalty",
    "Dr": "Officer",
    "Rev": "Officer",
    "the Countess":"Royalty",
    "Mme": "Mrs",
    "Mlle": "Miss",
    "Ms": "Mrs",
    "Mr" : "Mr",
    "Mrs" : "Mrs",
    "Miss" : "Miss",
    "Master" : "Master",
    "Lady" : "Royalty"
}

def get_titles():
    # we extract the title from each name
    combined['Title'] = combined['Name'].map(lambda name:name.split(',')[1].split('.')[0].strip())
    
    # a map of more aggregated title
    # we map each title
    combined['Title'] = combined.Title.map(Title_Dictionary)
    status('Title')
    return combined

combined = get_titles()

{'Ms', 'Rev', 'Sir', 'Mme', 'Jonkheer', 'Dona', 'Major', 'Dr', 'the Countess', 'Mlle', 'Capt', 'Lady', 'Col', 'Master', 'Mr', 'Don', 'Miss', 'Mrs'}
Processing Title : ok


In [8]:
combined.head()

Unnamed: 0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [15]:
# Grouping by 'Sex', 'Pclass', and 'Title'
grouped_train = combined.iloc[:891].groupby(['Sex', 'Pclass', 'Title'])

# Apply median only to the 'Age' column
grouped_median_train = grouped_train['Age'].median().reset_index()

# The resulting DataFrame contains 'Sex', 'Pclass', 'Title', and the median 'Age'
grouped_median_train.head()

Unnamed: 0,Sex,Pclass,Title,Age
0,female,1,Miss,30.0
1,female,1,Mrs,40.0
2,female,1,Officer,49.0
3,female,1,Royalty,40.5
4,female,2,Miss,24.0


In [16]:
def fill_age(row):
    condition = (
        (grouped_median_train['Sex'] == row['Sex']) & 
        (grouped_median_train['Title'] == row['Title']) & 
        (grouped_median_train['Pclass'] == row['Pclass'])
    ) 
    return grouped_median_train[condition]['Age'].values[0]


def process_age():
    global combined
    # a function that fills the missing values of the Age variable
    combined['Age'] = combined.apply(lambda row: fill_age(row) if np.isnan(row['Age']) else row['Age'], axis=1)
    status('age')
    return combined

combined = process_age()

Processing age : ok


In [19]:
def process_names():
    global combined
    # we clean the Name variable
    combined.drop('Name', axis=1, inplace=True)
    
    # encoding in dummy variable
    titles_dummies = pd.get_dummies(combined['Title'], prefix='Title')
    combined = pd.concat([combined, titles_dummies], axis=1)
    
    # removing the title variable
    combined.drop('Title', axis=1, inplace=True)
    
    status('names')
    return combined

combined = process_names()

combined.head()

Processing names : ok


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty
0,3,male,22.0,1,0,A/5 21171,7.25,,S,False,False,True,False,False,False
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,False,False,False,True,False,False
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,False,True,False,False,False,False
3,1,female,35.0,1,0,113803,53.1,C123,S,False,False,False,True,False,False
4,3,male,35.0,0,0,373450,8.05,,S,False,False,True,False,False,False


In [20]:
def process_fares():
    global combined
    # there's one missing fare value - replacing it with the mean.
    combined.Fare.fillna(combined.iloc[:891].Fare.mean(), inplace=True)
    status('fare')
    return combined

combined = process_fares()

Processing fare : ok


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined.Fare.fillna(combined.iloc[:891].Fare.mean(), inplace=True)


In [21]:
def process_embarked():
    global combined
    # two missing embarked values - filling them with the most frequent one in the train  set(S)
    combined.Embarked.fillna('S', inplace=True)
    # dummy encoding 
    embarked_dummies = pd.get_dummies(combined['Embarked'], prefix='Embarked')
    combined = pd.concat([combined, embarked_dummies], axis=1)
    combined.drop('Embarked', axis=1, inplace=True)
    status('embarked')
    return combined

combined = process_embarked()

combined.head()

Processing embarked : ok


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined.Embarked.fillna('S', inplace=True)


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Master,Title_Miss,Title_Mr,Title_Mrs,Title_Officer,Title_Royalty,Embarked_C,Embarked_Q,Embarked_S
0,3,male,22.0,1,0,A/5 21171,7.25,,False,False,True,False,False,False,False,False,True
1,1,female,38.0,1,0,PC 17599,71.2833,C85,False,False,False,True,False,False,True,False,False
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,False,True,False,False,False,False,False,False,True
3,1,female,35.0,1,0,113803,53.1,C123,False,False,False,True,False,False,False,False,True
4,3,male,35.0,0,0,373450,8.05,,False,False,True,False,False,False,False,False,True


In [22]:
train_cabin, test_cabin = set(), set()

for c in combined.iloc[:891]['Cabin']:
    try:
        train_cabin.add(c[0])
    except:
        train_cabin.add('U')
        
for c in combined.iloc[891:]['Cabin']:
    try:
        test_cabin.add(c[0])
    except:
        test_cabin.add('U')

print(train_cabin)
# set(['A', 'C', 'B', 'E', 'D', 'G', 'F', 'U', 'T'])

print(test_cabin)
# set(['A', 'C', 'B', 'E', 'D', 'G', 'F', 'U'])

{'B', 'T', 'U', 'D', 'F', 'G', 'E', 'A', 'C'}
{'B', 'U', 'D', 'F', 'G', 'E', 'A', 'C'}


In [23]:
def process_cabin():
    global combined    
    # replacing missing cabins with U (for Uknown)
    combined.Cabin.fillna('U', inplace=True)
    
    # mapping each Cabin value with the cabin letter
    combined['Cabin'] = combined['Cabin'].map(lambda c: c[0])
    
    # dummy encoding ...
    cabin_dummies = pd.get_dummies(combined['Cabin'], prefix='Cabin')    
    combined = pd.concat([combined, cabin_dummies], axis=1)

    combined.drop('Cabin', axis=1, inplace=True)
    status('cabin')
    return combined

In [24]:
combined = process_cabin()

Processing cabin : ok


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined.Cabin.fillna('U', inplace=True)


In [25]:
def process_sex():
    global combined
    # mapping string values to numerical one 
    combined['Sex'] = combined['Sex'].map({'male':1, 'female':0})
    status('Sex')
    return combined

combined = process_sex()

Processing Sex : ok


In [26]:
def process_pclass():
    
    global combined
    # encoding into 3 categories:
    pclass_dummies = pd.get_dummies(combined['Pclass'], prefix="Pclass")
    
    # adding dummy variable
    combined = pd.concat([combined, pclass_dummies],axis=1)
    
    # removing "Pclass"
    combined.drop('Pclass',axis=1,inplace=True)
    
    status('Pclass')
    return combined

combined = process_pclass()

Processing Pclass : ok


In [27]:
def cleanTicket(ticket):
    ticket = ticket.replace('.', '')
    ticket = ticket.replace('/', '')
    ticket = ticket.split()
    ticket = map(lambda t : t.strip(), ticket)
    ticket = list(filter(lambda t : not t.isdigit(), ticket))
    if len(ticket) > 0:
        return ticket[0]
    else: 
        return 'XXX'

tickets = set()
for t in combined['Ticket']:
    tickets.add(cleanTicket(t))

print(len(tickets))
#37


def process_ticket():
    
    global combined
    
    # a function that extracts each prefix of the ticket, returns 'XXX' if no prefix (i.e the ticket is a digit)
    def cleanTicket(ticket):
        ticket = ticket.replace('.','')
        ticket = ticket.replace('/','')
        ticket = ticket.split()
        ticket = map(lambda t : t.strip(), ticket)
        ticket = filter(lambda t : not t.isdigit(), ticket)
        if len(ticket) > 0:
            return ticket[0]
        else: 
            return 'XXX'
    

    # Extracting dummy variables from tickets:

    combined['Ticket'] = combined['Ticket'].map(cleanTicket)
    tickets_dummies = pd.get_dummies(combined['Ticket'], prefix='Ticket')
    combined = pd.concat([combined, tickets_dummies], axis=1)
    combined.drop('Ticket', inplace=True, axis=1)

    status('Ticket')
    return combined

combined = process_ticket()

37


TypeError: object of type 'filter' has no len()

In [28]:
def process_family():
    
    global combined
    # introducing a new feature : the size of families (including the passenger)
    combined['FamilySize'] = combined['Parch'] + combined['SibSp'] + 1
    
    # introducing other features based on the family size
    combined['Singleton'] = combined['FamilySize'].map(lambda s: 1 if s == 1 else 0)
    combined['SmallFamily'] = combined['FamilySize'].map(lambda s: 1 if 2 <= s <= 4 else 0)
    combined['LargeFamily'] = combined['FamilySize'].map(lambda s: 1 if 5 <= s else 0)
    
    status('family')
    return combined

In [29]:
combined = process_family()

print(combined.shape)
# (1309, 67)

Processing family : ok
(1309, 31)


In [32]:
combined = combined.drop(['Ticket'], axis=1)

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

def compute_score(clf, X, y, scoring='accuracy'):
    xval = cross_val_score(clf, X, y, cv = 5, scoring=scoring)
    return np.mean(xval)

def recover_train_test_target():
    global combined
    
    targets = pd.read_csv('data/train.csv', usecols=['Survived'])['Survived'].values
    train = combined.iloc[:891]
    test = combined.iloc[891:]
    
    return train, test, targets

train, test, targets = recover_train_test_target()

# turn run_gs to True if you want to run the gridsearch again.
run_gs = True

if run_gs:
    parameter_grid = {
                 'max_depth' : [4, 6, 8],
                 'n_estimators': [50, 10],
                 'max_features': ['sqrt', 'auto', 'log2'],
                 'min_samples_split': [2, 3, 10],
                 'min_samples_leaf': [1, 3, 10],
                 'bootstrap': [True, False],
                 }
    forest = RandomForestClassifier()
    cross_validation = StratifiedKFold(n_splits=5)

    grid_search = GridSearchCV(forest,
                               scoring='accuracy',
                               param_grid=parameter_grid,
                               cv=cross_validation,
                               verbose=1
                              )

    grid_search.fit(train, targets)
    model = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))
    
else: 
    parameters = {'bootstrap': False, 'min_samples_leaf': 3, 'n_estimators': 50, 
                  'min_samples_split': 10, 'max_features': 'sqrt', 'max_depth': 6}
    
    model = RandomForestClassifier(**parameters)
    model.fit(train, targets)
    
output = model.predict(test).astype(int)
df_output = pd.DataFrame()
aux = pd.read_csv('./data/test.csv')
df_output['PassengerId'] = aux['PassengerId']
df_output['Survived'] = output
df_output[['PassengerId','Survived']].to_csv('output/gridsearch_rf.csv', index=False)


Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best score: 0.8383842822170611
Best parameters: {'bootstrap': False, 'max_depth': 8, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 10}


540 fits failed out of a total of 1620.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/

## Preprocessing

In [28]:
train = feature_engineering(train)
test = feature_engineering(test)

In [29]:
numeric_features = ['Pclass', 'SibSp', 'Parch', 'FamilySize']
categorical_features = ['Sex', 'Title','Age_bin','Fare_bin', 'Embarked']

In [6]:
train.head()

Unnamed: 0,Survived,Pclass,Sex,SibSp,Parch,Embarked,Title,Age_bin,FamilySize,Fare_bin
0,0,3,male,1,0,S,Mr,Adult,2,Low_fare
1,1,1,female,1,0,C,Mrs,Adult,2,high_fare
2,1,3,female,0,0,S,Miss,Adult,1,median_fare
3,1,1,female,1,0,S,Mrs,Adult,2,high_fare
4,0,3,male,0,0,S,Mr,Adult,1,median_fare


In [30]:
X = train.drop('Survived', axis=1)
y = train['Survived']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set shape: {X_train.shape}")
print(f"Validating set shape: {X_val.shape}")

Training set shape: (712, 9)
Validating set shape: (179, 9)


In [31]:
preprocessor = ColumnTransformer(
    transformers=[
        # Pipeline for numeric features
        ('num', Pipeline([
            # Impute missing values in numeric columns with the median of the column
            ('imputer', SimpleImputer(strategy='median')),
            # Scale numeric features to have mean=0 and standard deviation=1
            ('scaler', StandardScaler())
        ]), numeric_features),
        
        # Pipeline for categorical features
        ('cat', Pipeline([
            # Impute missing values in categorical columns with the string 'missing'
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            # Convert categorical features to string type
            ('to_string', FunctionTransformer(lambda x: x.astype(str))),
            # One-hot encode categorical features, ignoring unknown categories
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_features)
    ])

# Create a full pipeline that applies the preprocessor to the dataset
full_pipeline = Pipeline([
    ('preprocessor', preprocessor)  # Apply the preprocessor to the data
])

In [32]:
X_train = full_pipeline.fit_transform(X_train)
X_val = full_pipeline.transform(X_val)

In [33]:
test = full_pipeline.transform(test)

## Model building

### Logistic Regression

In [None]:
# Define the Logistic Regression model
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Define the parameter grid for GridSearch
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'class_weight': [None, 'balanced']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions on the test set
best_model = grid_search.best_estimator_

predictions_train = best_model.predict(X_train)
predictions_val = best_model.predict(X_val)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

y_pred = best_model.predict(test)

### Random Forest

In [24]:
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=10,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions on the test set
best_model = grid_search.best_estimator_

predictions_train = best_model.predict(X_train)
predictions_val = best_model.predict(X_val)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

y_pred = best_model.predict(train)

Fitting 10 folds for each of 1296 candidates, totalling 12960 fits
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=   0.0s
[CV] END bootstrap=True, class_weight=None, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time

6480 fits failed out of a total of 12960.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1969 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/Users/geronimobasso/Desktop/extra/drones/code/paint-bb/.venv/lib/python3.12/site-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File

ValueError: could not convert string to float: 'male'

In [36]:
# Initialize the Random Forest Classifier
random_forest = RandomForestClassifier(
    n_estimators=400,
    bootstrap=True,
    class_weight=None,
    criterion='gini',
    max_depth=None,
    max_features='sqrt',  # Changed from 'auto' to 'sqrt'
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_samples_leaf=1,
    min_samples_split=2,
    min_weight_fraction_leaf=0.0,
    n_jobs=1,
    oob_score=False,
    random_state=None,
    verbose=0,
    warm_start=False
)

# Fit the model
random_forest.fit(X_train, y_train)

# Make predictions
y_pred = random_forest.predict(train)

# Calculate accuracy on training data
train_accuracy = round(random_forest.score(X_train, y_train) * 100, 2)

# Calculate accuracy on test data
test_accuracy = round(random_forest.score(X_val, y_val) * 100, 2)

print("Important features")
feature_importance = pd.Series(random_forest.feature_importances_, X_train.columns).sort_values(ascending=True)
feature_importance.plot.barh(width=0.8)
plt.title("Feature Importance")
plt.tight_layout()
plt.show()

print('__'*30)
print(f"Training Accuracy: {train_accuracy}%")
print(f"Test Accuracy: {test_accuracy}%")



ValueError: could not convert string to float: 'male'

## XGBoost

In [None]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.001,0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

best_model = grid_search.best_estimator_
y_pred = best_model.predict(test)

In [None]:
# Define the SVC model
svc = SVC()

# Define the parameter grid for GridSearch
param_grid = {
    'C': [0.1, 1, 10, 50],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'degree': [2, 3, 4],  # Only relevant for poly kernel
    'class_weight': [None, 'balanced']
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Use the best model to make predictions on the test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test)

In [49]:
print(y_pred)

[0 0 0 1 1 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0 1 0 1 1 0 1 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [75]:
best_model = grid_search.best_estimator_

predictions_train = best_model.predict(X_train)

predictions_val = best_model.predict(X_val)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

Train accuracy: 0.8876404494382022.
Validation accuracy: 0.8324022346368715.


### Deep learning

In [43]:
model = Sequential([
    Dense(units=4096, activation='relu', name='L1'),  # Increased units
    Dense(units=2048, activation='relu', name='L2'),  # Increased units
    Dense(units=1024, activation='relu', name='L3'),  # Increased units
    Dense(units=512, activation='relu', name='L4'),  # Increased units
    Dense(units=256, activation='relu', name='L5'),  # Increased units
    Dense(units=128, activation='relu', name='L6'),  # Increased units
    Dense(units=64, activation='relu', name='L7'),  # Increased units
    Dense(units=32, activation='relu', name='L8'),  # Increased units
    Dense(units=16, activation='relu', name='L9'),  # Increased units
    Dense(units=1, activation='sigmoid', name='L10')  # Output layer
])

In [44]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-2), loss='binary_crossentropy', metrics=['accuracy'])

early_stopping = EarlyStopping(
    monitor='val_loss',  # You can also use 'val_accuracy'
    patience=10,  # Number of epochs to wait for improvement
    restore_best_weights=True  # Restore the best model weights
)

model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=128,
    validation_data=(X_val, y_val),  # Include validation data
    callbacks=[early_stopping]  # Include the early stopping callback
)

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 73ms/step - accuracy: 0.5231 - loss: 2.5759 - val_accuracy: 0.5866 - val_loss: 0.6795
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.6222 - loss: 0.6347 - val_accuracy: 0.5866 - val_loss: 0.5470
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.6584 - loss: 0.5409 - val_accuracy: 0.8045 - val_loss: 0.5782
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.8256 - loss: 0.5125 - val_accuracy: 0.8156 - val_loss: 0.5575
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.8157 - loss: 0.5223 - val_accuracy: 0.7989 - val_loss: 0.4738
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.8302 - loss: 0.4516 - val_accuracy: 0.8101 - val_loss: 0.4745
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x3129334a0>

In [45]:
model.summary()

In [46]:
predictions_train = model.predict(X_train)
predictions_train = (predictions_train >= 0.5).astype(int)

predictions_val = model.predict(X_val)
predictions_val = (predictions_val >= 0.5).astype(int)

train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
Train accuracy: 0.8356741573033708.
Validation accuracy: 0.8379888268156425.


## Make predictions

In [47]:
predictions = model.predict(test)
predictions = (predictions >= 0.5).astype(int)
predictions = predictions.flatten()
print(predictions)

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 1 0 1
 0 0 0 0 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 1 1 0 1 0 1 0 0 1
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 1 0 0 1 0 0 1 0 0 1 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
 1 0 1 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 1 0 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 1]


## Submit predictions

In [49]:
choosen_model_name = 'nn_new_features'

submission = pd.DataFrame({
    'PassengerId': pd.read_csv('data/test.csv')['PassengerId'],  # Ensure PassengerId is correctly handled
    'Survived': predictions  # or log_reg_test_preds, xgb_clf_test_preds
})

# Get the current date and time
now = datetime.now()
# Format the date and time as a string
date_time_str = now.strftime("%Y%m%d_%H%M%S")

# Save the DataFrame to a CSV file with the date and time in the filename
submission.to_csv(f'output/submission_{choosen_model_name}_{date_time_str}.csv', index=False)