In [14]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit the Kaggle input directory and print the full path of each file found
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [15]:
# Cell 1: Setup and Imports

# Importing pandas and the scikit-learn utilities used in the cells below
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the competition CSVs from the Kaggle input directory.
# Only the training frame is filtered: rows lacking 'Age' or 'Embarked' are
# discarded right after loading, while the test frame is kept as read.
# NOTE(review): because of this drop, the training-side Age/Embarked fills in
# the feature-engineering cell have nothing left to fill.
train_data = (
    pd.read_csv('/kaggle/input/titanic/train.csv')
    .dropna(subset=['Age', 'Embarked'])
)
test_data = pd.read_csv('/kaggle/input/titanic/test.csv')

print("Setup Complete. Data loaded and cleaned.")

Setup Complete. Data loaded and cleaned.


In [16]:
# Cell 2: Feature Engineering

# Target column, taken from the training frame
y = train_data['Survived']

# Work on independent copies so the loaded frames are never modified
X, X_test = train_data.copy(), test_data.copy()

# STEP 1: FEATURES ENGINEERING

# 1a. FamilySize & IsAlone
# FamilySize = siblings/spouses + parents/children + the passenger themself.
# IsAlone starts at 0 and is set to 1 only where FamilySize is exactly 1.
# The same two columns are added, in the same order, to both frames.
for frame in (X, X_test):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
    frame['IsAlone'] = 0
    frame.loc[frame['FamilySize'] == 1, 'IsAlone'] = 1

# 1b. Title
# Pull the honorific out of 'Name': the run of ASCII letters that follows a
# space and is terminated by a period (e.g. "Smith, Mr. John" -> "Mr").
# The pattern is a raw string so that `\.` reaches the regex engine as an
# escaped dot instead of being parsed as an invalid Python string escape
# (which emits a warning on recent Python versions).
title_pattern = r' ([A-Za-z]+)\.'

# Uncommon honorifics collapse into a single 'Rare' bucket; spelling variants
# are folded into their common form (Mlle/Ms -> Miss, Mme -> Mrs).
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
title_map = dict.fromkeys(rare_titles, 'Rare')
title_map.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Apply the identical extraction + normalisation to the training and test frames.
# Names that do not match the pattern yield NaN, which the mapping leaves untouched.
for data in [X, X_test]:
    data['Title'] = data['Name'].str.extract(title_pattern, expand=False).replace(title_map)

# STEP 2: IMPUTATION
# Missing ages are filled with the median age of passengers sharing the same
# title. The medians are computed from the training frame only and reused for
# the test frame.
# NOTE(review): train_data had its missing-'Age'/'Embarked' rows dropped when
# it was loaded, so the two training-side fills below currently change nothing.
title_age_median = X.groupby('Title')['Age'].median()

# Fallback for rows whose title has no usable training median (a title unseen
# in training, a NaN title, or a group whose ages are all missing). A direct
# title_age_median[title] lookup would raise KeyError in the first two cases.
overall_age_median = X['Age'].median()

# Vectorised fill: map each row's title to its median, use it only where Age is
# missing, then apply the overall median to anything still unfilled.
X['Age'] = X['Age'].fillna(X['Title'].map(title_age_median)).fillna(overall_age_median)

# Test-set names that produced no title are treated as 'Mr' before the same age logic is applied
X_test['Title'] = X_test['Title'].fillna('Mr')
X_test['Age'] = X_test['Age'].fillna(X_test['Title'].map(title_age_median)).fillna(overall_age_median)

# Remaining gaps: Embarked gets the fixed value 'S'; test-set Fare gets the training mean fare
X['Embarked'] = X['Embarked'].fillna('S')
X_test['Fare'] = X_test['Fare'].fillna(X['Fare'].mean())

# STEP 3: SELECT FINAL FEATURES
# Columns handed to the model, including the engineered 'FamilySize', 'IsAlone' and 'Title'
final_features = ['Pclass', 'Sex', 'FamilySize', 'IsAlone', 'Embarked', 'Title', 'Fare', 'Age']

# One-hot encode the categorical columns of both frames, then align on columns.
# join='left' keeps exactly the training columns: any of them missing from the
# test frame is added filled with 0, and test-only columns are discarded.
X, X_test = pd.get_dummies(X[final_features]).align(
    pd.get_dummies(X_test[final_features]),
    join='left',
    axis=1,
    fill_value=0,
)

print("V6 Features ready with IsAlone.")

V6 Features ready with IsAlone.


In [17]:
# Cell 3: Splitting Data

# Hold out part of the prepared training data for validation; the fixed
# random_state makes the partition reproducible across runs
(train_X, val_X,
 train_y, val_y) = train_test_split(X, y, random_state=1)

print("Data split into training and validation sets.")

Data split into training and validation sets.


In [18]:
# Cell 4: Hyperparameter Tuning

# Helper used by the tuning loop: one forest per candidate max_leaf_nodes value
def get_accuracy(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Fit a RandomForestClassifier and score it on the validation split.

    Args:
        max_leaf_nodes: value forwarded to RandomForestClassifier(max_leaf_nodes=...).
        train_X, train_y: features and labels the forest is fitted on.
        val_X, val_y: features and labels used to measure accuracy.

    Returns:
        The accuracy_score of the forest's predictions on val_X against val_y.
    """
    # random_state is pinned so every candidate is evaluated with the same seed
    forest = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes, random_state=1).fit(train_X, train_y)
    return accuracy_score(val_y, forest.predict(val_X))

# Candidate max_leaf_nodes values to compare
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]

# Validation accuracy for every candidate, keyed by the candidate value
scores = {
    n_leaves: get_accuracy(n_leaves, train_X, val_X, train_y, val_y)
    for n_leaves in candidate_max_leaf_nodes
}

# Candidate with the highest accuracy (on a tie, the one listed first wins)
best_tree_size = max(scores, key=scores.__getitem__)

print("All scores (leaf_size: accuracy):")
print(scores)
print(f"Best tree size: {best_tree_size}")

All scores (leaf_size: accuracy):
{5: 0.797752808988764, 25: 0.8033707865168539, 50: 0.797752808988764, 100: 0.7921348314606742, 250: 0.7865168539325843, 500: 0.7865168539325843}
Best tree size: 25


In [19]:
# Cell 5: Final Model Training and Submission

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd

#1. Helper repeated in this cell so the cell carries its own definition
# NOTE(review): this shadows the identically named function defined in the
# hyperparameter-tuning cell; the two bodies must be kept in sync.
def get_accuracy(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return validation accuracy of a RandomForest limited to max_leaf_nodes leaves.

    The forest is fitted on (train_X, train_y) with random_state=1 and its
    predictions for val_X are compared against val_y via accuracy_score.
    """
    forest = RandomForestClassifier(max_leaf_nodes=max_leaf_nodes, random_state=1)
    forest.fit(train_X, train_y)
    val_predictions = forest.predict(val_X)
    val_accuracy = accuracy_score(val_y, val_predictions)
    return val_accuracy

#2. Repeat the search over max_leaf_nodes, this time with 1000 added to the candidates
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500, 1000]

# Validation accuracy per candidate, collected in candidate order
scores = {}
for leaf_size in candidate_max_leaf_nodes:
    scores[leaf_size] = get_accuracy(leaf_size, train_X, val_X, train_y, val_y)

# Highest-scoring candidate (first one listed wins a tie)
best_tree_size = max(scores, key=scores.get)

print(f"Best tree size found: {best_tree_size}")

#3. Build the final model with the selected tree size and fit it on the
# entire training dataset (X and y), not just the training split
final_model = RandomForestClassifier(max_leaf_nodes=best_tree_size, random_state=1).fit(X, y)

#4. Predict & Submit
test_preds = final_model.predict(X_test)

# Submission file: the test set's PassengerId next to the predicted Survived
# value, written without the index column
submission_columns = {'PassengerId': test_data['PassengerId'], 'Survived': test_preds}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("V6 Submission file created with RandomForest.")

Best tree size found: 25
V6 Submission file created with RandomForest.
