In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic-machine-learning-u-lima/train.csv
/kaggle/input/titanic-machine-learning-u-lima/test.csv
/kaggle/input/titanic-machine-learning-u-lima/gender_submission.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import re

# --- 1. Load Data ---
# Locations of the competition CSV files inside the Kaggle input directory
train_path, test_path = (
    "/kaggle/input/titanic-machine-learning-u-lima/train.csv",
    "/kaggle/input/titanic-machine-learning-u-lima/test.csv",
)

# Parse both CSV files into DataFrames (training file first, then test file)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Hold on to the test-set PassengerId column; it is needed for the submission file
test_passenger_ids = test_df["PassengerId"]

# --- 2. Feature Engineering ---
# We'll create new features from the existing ones to improve model accuracy.
# This function will be applied to both train and test sets for consistency.

def engineer_features(df):
    """Derive model features from a raw passenger table.

    Adds four columns:
      * ``Title``      - the word preceding the first period in ``Name``
                         (e.g. Mr, Mrs, Miss), with rare titles merged into
                         broader groups. Names without such a pattern get a
                         missing value instead of raising.
      * ``FamilySize`` - ``SibSp + Parch + 1``.
      * ``IsAlone``    - 1 when ``FamilySize`` equals 1, otherwise 0.
      * ``Deck``       - first character of ``Cabin``; ``'U'`` when the cabin
                         is missing.

    ``Name``, ``Ticket``, ``Cabin`` and ``PassengerId`` are then dropped if
    they are present.

    Args:
        df: DataFrame with at least ``Name``, ``SibSp``, ``Parch`` and
            ``Cabin`` columns. The input frame is not modified.

    Returns:
        A new DataFrame containing the engineered columns and none of the
        dropped raw columns.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    df = df.copy()

    # Raw-string pattern avoids the invalid "\." escape of a plain literal.
    # str.extract yields NaN (rather than an AttributeError) when a name has
    # no "<space><letters>." segment.
    df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

    # Consolidate rare titles into more common categories
    title_mapping = {
        "Mlle": "Miss", "Ms": "Miss", "Mme": "Mrs", "Dr": "Officer",
        "Rev": "Officer", "Col": "Officer", "Major": "Officer",
        "Capt": "Officer", "Lady": "Royalty", "Countess": "Royalty",
        "Jonkheer": "Royalty", "Sir": "Royalty", "Don": "Royalty", "Dona": "Royalty"
    }
    df['Title'] = df['Title'].replace(title_mapping)

    # The passenger counts as one member of their own family group
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    # Flag passengers whose family group consists only of themselves
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

    # Deck is the leading letter of the cabin code; 'U' marks an unknown cabin
    df['Deck'] = df['Cabin'].str[0].fillna('U')

    # errors='ignore' lets the same function serve frames that lack some of
    # these columns
    df = df.drop(columns=['Name', 'Ticket', 'Cabin', 'PassengerId'], errors='ignore')

    return df

# Run the shared feature-engineering step on the training frame, then on the test frame
train_df, test_df = engineer_features(train_df), engineer_features(test_df)

# --- 3. Preprocessing ---
# Split the training frame into the feature matrix (X) and the target vector (y)
X, y = train_df.drop(columns=["Survived"]), train_df["Survived"]

# Give the test frame exactly the training feature columns, in the same order:
# a feature column absent from the test frame is added as a constant 0, and any
# test column that is not a training feature is discarded by the final selection.
train_cols = X.columns
test_cols = test_df.columns
missing_in_test = set(train_cols).difference(test_cols)
test_df = test_df.assign(**{col: 0 for col in missing_in_test})[train_cols]


# Column groups fed to the two preprocessing branches
# Note: 'Pclass' is treated as a categorical feature
num_features = ["Age", "SibSp", "Parch", "Fare", "FamilySize", "IsAlone"]
cat_features = ["Sex", "Embarked", "Title", "Deck", "Pclass"]

# Numeric branch: fill missing values with the column median, then standardize
median_imputer = SimpleImputer(strategy="median")
numeric_transformer = Pipeline([
    ("imputer", median_imputer),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode into a dense array; categories unseen during fit are ignored
mode_imputer = SimpleImputer(strategy="most_frequent")
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
categorical_transformer = Pipeline([
    ("imputer", mode_imputer),
    ("onehot", one_hot_encoder),
])

# Route each column group to its branch; columns in neither list pass through unchanged
column_branches = [
    ("num", numeric_transformer, num_features),
    ("cat", categorical_transformer, cat_features),
]
preprocessor = ColumnTransformer(column_branches, remainder='passthrough')

# --- 4. Model Training and Hyperparameter Tuning ---
# A random forest sits behind the preprocessing step in a single pipeline, and a
# cross-validated grid search chooses the forest's hyperparameters.

# Full pipeline: preprocessing followed by the classifier (seeded for repeatability)
forest = RandomForestClassifier(random_state=42)
model_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", forest),
])

# Candidate values per hyperparameter. The "classifier__" prefix targets the
# pipeline step named "classifier". 3 x 3 x 3 x 3 = 81 combinations in total.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__min_samples_split': [2, 5, 10]
}

# 5-fold cross-validation scored by accuracy; n_jobs=-1 uses all available CPU cores
search_settings = {"cv": 5, "scoring": 'accuracy', "n_jobs": -1, "verbose": 1}
grid_search = GridSearchCV(model_pipeline, param_grid, **search_settings)

# Run the search on the training data
print("Starting GridSearchCV to find the best model...")
grid_search.fit(X, y)

# Report the winning parameter combination and its mean cross-validation accuracy
best_cv_accuracy_pct = grid_search.best_score_ * 100
print("\nBest parameters found:", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}%".format(best_cv_accuracy_pct))

# --- 5. Generate Predictions and Submission File ---
# best_estimator_ is the pipeline with the winning parameters; GridSearchCV
# refits it on the full training data by default (refit is not overridden above).
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(test_df)

# Two-column submission table: the saved passenger ids next to the predicted labels
submission = test_passenger_ids.to_frame(name="PassengerId").assign(
    Survived=test_predictions
)

# Write the table to the working directory without the index column
submission.to_csv("submission.csv", index=False)

print("\nSuccessfully created and saved submission.csv")


Starting GridSearchCV to find the best model...
Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best parameters found: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 2, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 300}
Best cross-validation accuracy: 83.50%

Successfully created and saved submission.csv
