In [84]:
from datetime import datetime
from pathlib import Path

import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping

from xgboost import XGBClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.metrics import accuracy_score

In [85]:
# Read the three input CSVs: labelled training rows, unlabelled test rows,
# and the sample submission file.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/gender_submission.csv')

# Report the row counts as a quick sanity check on the loads.
print(f"Size of the training set: {len(train)}")
print(f"Size of the test set: {len(test)}")

Size of the training set: 891
Size of the test set: 418


In [60]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [61]:
# Preview the first rows of the test data (same columns, without `Survived`).
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [62]:
# Preview the sample submission to see the expected output columns.
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


# EDA

Feature-engineering (FE) ideas:
- Name: extract the title.
- Ticket: keep only the number.
- Cabin: derive whether the passenger has a cabin, the cabin number, and the cabin letter.

## Feature Engineering

In [86]:
def extract_title(name):
    """Return the title embedded in a passenger name.

    The title is the text between the first comma and the first period that
    follows it, with surrounding whitespace removed; for example
    ``"Braund, Mr. Owen Harris"`` gives ``"Mr"``.

    Raises:
        IndexError: if ``name`` contains no comma.
    """
    after_surname = name.split(',')[1]
    title, _, _ = after_surname.partition('.')
    return title.strip()

# Apply the same feature engineering to both frames, in place:
#   * add `Title`, derived from `Name`, then drop `Name`;
#   * reduce `Cabin` to its first character (missing cabins stay missing).
# NOTE: this cell is not re-runnable on its own, because `Name` is gone after
# the first run; reload the CSVs before executing it again.
for frame in (train, test):
    frame['Title'] = frame['Name'].apply(extract_title)
    frame.drop('Name', axis=1, inplace=True)
    frame['Cabin'] = frame['Cabin'].str[0]

## Preprocessing

In [87]:
# Columns routed through the numeric branch of the preprocessor.
numeric_features = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
]

# Columns routed through the categorical branch of the preprocessor.
categorical_features = [
    'Title',
    'Sex',
    'Cabin',
    'Embarked',
]

In [88]:
# Separate the target from the predictors; the target, `Ticket` and
# `PassengerId` columns are left out of the feature frame.
y = train['Survived']
X = train.drop(columns=['Survived', 'Ticket', 'PassengerId'])

# Hold out 20% of the rows for validation, using a fixed seed so the split is
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training set shape: {X_train.shape}")
print(f"Validating set shape: {X_val.shape}")

Training set shape: (712, 9)
Validating set shape: (179, 9)


In [89]:
# Numeric branch: fill missing values with the column median, then
# standardise each column.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the literal 'missing', cast
# everything to str, then one-hot encode. Categories not seen during fit are
# ignored at transform time.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('to_string', FunctionTransformer(lambda values: values.astype(str))),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
    ]
)

# Single-step pipeline wrapping the preprocessor; no estimator is attached.
full_pipeline = Pipeline([('preprocessor', preprocessor)])

In [90]:
# Fit the preprocessing on the training split only, then apply the same fitted
# transform to the validation split.
# NOTE: X_train / X_val are overwritten with the transformed matrices, so this
# cell must not be re-run without re-running the split cell first.
X_train = full_pipeline.fit_transform(X_train)
X_val = full_pipeline.transform(X_val)

In [91]:
# Drop the same two columns that were removed from the training features.
# NOTE: `test` is rebound here, so re-running this cell raises a KeyError
# because the columns are already gone.
test = test.drop(['PassengerId','Ticket'], axis=1)

In [92]:
# Apply the already-fitted preprocessing to the test data.
# NOTE: `test` now holds the transformed feature matrix, not the original
# DataFrame; PassengerId has to be re-read from the CSV later.
test = full_pipeline.transform(test)

## Model building

## XGBoost

In [None]:
# Candidate hyper-parameters for the exhaustive search:
# 4 * 5 * 3 * 3 * 3 * 3 * 3 = 4,860 combinations, i.e. 24,300 fits with
# 5-fold cross-validation.
param_grid = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'n_estimators': [100, 200, 300],
    'min_child_weight': [1, 3, 5],
    'gamma': [0.0, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
}

# NOTE(review): `use_label_encoder` is reported as deprecated by newer XGBoost
# releases; confirm against the installed version before removing it.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# 5-fold, accuracy-scored search over every combination, using all CPU cores.
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Predict the test set with the winning configuration. `grid_search`,
# `best_model` and `y_pred` are shared with the SVC cell, so whichever search
# cell runs last determines their values.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test)

In [None]:
# Candidate hyper-parameters for the SVC search:
# 4 * 3 * 4 * 3 * 2 = 288 combinations, i.e. 1,440 fits with 5-fold
# cross-validation.
param_grid = {
    'C': [0.1, 1, 10, 50],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.1, 1],
    'degree': [2, 3, 4],  # only used by the poly kernel
    'class_weight': [None, 'balanced'],
}

# Support-vector classifier with default settings; the search overrides them.
svc = SVC()

# 5-fold, accuracy-scored search over every combination, using all CPU cores.
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid,
                           cv=5, scoring='accuracy', verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Report the winning configuration and its cross-validated score.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Predict the test set with the winning configuration. `grid_search`,
# `best_model` and `y_pred` are shared with the XGBoost cell, so whichever
# search cell runs last determines their values.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(test)

In [49]:
# Show the test-set predictions produced by whichever search cell ran last.
print(y_pred)

[0 0 0 1 1 0 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 1 0 1 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0
 1 1 0 1 0 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0 1 0 1 1 0 1 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 1 0
 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0
 0 1 0 0 1 1 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 1 0 1 0 0 0]


In [75]:
# Take the best estimator from the most recently run grid search.
best_model = grid_search.best_estimator_

# Predict on both splits first, then score them.
predictions_train, predictions_val = (
    best_model.predict(features) for features in (X_train, X_val)
)

# Accuracy on the data the search was fitted on vs. the held-out split.
train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

Train accuracy: 0.8876404494382022.
Validation accuracy: 0.8324022346368715.


### Deep learning

In [None]:
# (layer name, width) for each hidden layer; the width halves at every step.
hidden_layer_specs = [
    ('L1', 4096),
    ('L2', 2048),
    ('L3', 1024),
    ('L4', 512),
    ('L5', 256),
    ('L6', 128),
    ('L7', 64),
    ('L8', 32),
    ('L9', 16),
]

# Fully connected ReLU stack followed by a single sigmoid unit for the
# binary output.
hidden_layers = [
    Dense(units=width, activation='relu', name=layer_name)
    for layer_name, width in hidden_layer_specs
]
output_layer = Dense(units=1, activation='sigmoid', name='L10')

model = Sequential(hidden_layers + [output_layer])

In [None]:
# Adam with a learning rate of 1e-2, binary cross-entropy loss, accuracy
# reported during training.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-2)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Stop once the validation loss has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train for up to 100 epochs, checking the held-out split after every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=128,
    epochs=100,
    callbacks=[early_stopping],
)

In [None]:
# Print the layer-by-layer summary of the network.
model.summary()

In [None]:
# Sigmoid outputs for both splits.
train_probabilities = model.predict(X_train)
val_probabilities = model.predict(X_val)

# Outputs of 0.5 or more count as the positive class.
predictions_train = (train_probabilities >= 0.5).astype(int)
predictions_val = (val_probabilities >= 0.5).astype(int)

# Accuracy on the training split vs. the held-out split.
train_accuracy = accuracy_score(y_train, predictions_train)
val_accuracy = accuracy_score(y_val, predictions_val)

print(f"Train accuracy: {train_accuracy}.")
print(f"Validation accuracy: {val_accuracy}.")

## Make predictions

In [None]:
# Score the preprocessed test data, threshold at 0.5, and flatten to a 1-D
# array of 0/1 labels.
test_probabilities = model.predict(test)
predictions = (test_probabilities >= 0.5).astype(int).flatten()
print(predictions)

## Submit predictions

In [63]:
# Label used in the output filename. It must be kept in sync by hand with the
# model that produced `y_pred`; nothing in this cell checks that.
choosen_model_name = 'svc'

# `test` was overwritten with the transformed feature matrix earlier, so the
# passenger ids are re-read from the original CSV.
submission = pd.DataFrame({
    'PassengerId': pd.read_csv('data/test.csv')['PassengerId'],
    'Survived': y_pred  # predictions from whichever search cell ran last
})

# Timestamp the filename so successive submissions never overwrite each other.
now = datetime.now()
date_time_str = now.strftime("%Y%m%d_%H%M%S")

# `to_csv` does not create missing directories, so make sure the target
# folder exists before writing.
output_dir = Path('output')
output_dir.mkdir(parents=True, exist_ok=True)

# Save the DataFrame to a CSV file with the model name and timestamp in the filename
submission.to_csv(output_dir / f'submission_{choosen_model_name}_{date_time_str}.csv', index=False)