In [504]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import itertools
import time
from sklearn.datasets import make_classification 

In [None]:
data = pd.read_csv("data/train.csv",
                    index_col="PassengerId" 
                   ).drop(columns=["Name", "Ticket"] )


In [67]:
def show_plots(data: pd.DataFrame) -> None:
    """
    Display various plots based on Titanic dataset columns to visualize relationships 
    between features like Age, Pclass, Embarked, etc., and survival status.

    Args:
        data (pd.DataFrame): Titanic dataset containing relevant columns.
    """
    required_columns = ['Age', 'Pclass', 'Survived', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Cabin']
    missing_cols = [col for col in required_columns if col not in data.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in data: {missing_cols}")

    px.histogram(data, x="Age", color="Survived", title="Age Distribution by Survival").show()

    px.histogram(data, x="Pclass", color="Survived", title="Pclass Distribution by Survival").show()

    px.histogram(data, x="Survived", color="Sex", title="Survival Count by Sex").show()

    px.violin(
        data, x="Pclass", y="Age", color="Survived",
        box=True, points="all",
        title="Age Distribution by Pclass and Survival"
    ).show()

    data = data.copy()
    data['family_size'] = data["SibSp"] + data["Parch"] + 1

    px.histogram(
        data, x="family_size", color="Survived", title="Family Size Distribution by Survival"
    ).show()

    px.histogram(
        data, x="Embarked", color="Survived",
        histnorm='percent', title="Embarkation Port Distribution by Survival (in %)"
    ).show()

    data['Deck'] = data['Cabin'].astype(str).str[0]

    px.histogram(
        data, x="Deck", color="Survived", title="Cabin Deck Distribution by Survival"
    ).show()


In [90]:
def clean_data(dataframe: pd.DataFrame, method: str) -> pd.DataFrame:
    """
    Clean DataFrame using specified methods.
    
    Args:
        dataframe (pd.DataFrame): Input DataFrame. It remains unmodified.
        method (str): Cleaning method for 'Age'. Options: ['dropna', 'median', 'interpolate'].
    
    Returns:
        pd.DataFrame: A cleaned DataFrame with modified 'Age', 'Embarked', and 'Cabin' columns.
    
    Raises:
        ValueError: If an invalid method is provided.
    """
    if method not in ['dropna', 'median', 'interpolate']:
        raise ValueError(f"Invalid method: {method}. Use 'dropna', 'median', or 'interpolate'.")

    if method == "dropna":
        return dataframe.dropna(subset=['Age'])

    cleaned_data = dataframe.copy()
    
    for col in ['Embarked', 'Deck']:
        if col in cleaned_data.columns:
            cleaned_data[col] = cleaned_data[col].fillna('Unknown')

    if method == "median":
        if 'Age' in cleaned_data.columns:
            imputer = SimpleImputer(strategy="median")
            cleaned_data['Age'] = imputer.fit_transform(cleaned_data[['Age']])
        return cleaned_data

    if method == "interpolate":
        if 'Age' in cleaned_data.columns:
            cleaned_data['Age'] = cleaned_data['Age'].interpolate()
        return cleaned_data


In [None]:
def data_deck(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Adding a deck column

    Args:
        dataframe (pd.DataFrame): Input DataFrame. It remains unmodified.

    Returns:
        pd.DataFrame: Output DataFrame. It has the 'Deck' column but doesn't have the 'Cabin' column
    """
    new_df = dataframe.copy()
    new_df['Deck'] = new_df['Cabin'].apply(lambda x: str(x)[0] if pd.notnull(x) else np.nan)
    new_df.drop(columns=['Cabin'], inplace=True)
    return new_df

In [108]:
def encode_data(dataframe: pd.DataFrame, drop_unkown: bool = True) -> pd.DataFrame:
    
    """
    Encodes categorical columns in a DataFrame using one-hot encoding.

    This function identifies all columns of type 'object', applies one-hot encoding 
    to them using sklearn's OneHotEncoder, and returns a new DataFrame with the 
    encoded columns concatenated to the original numerical columns.

    Args:
        dataframe (pd.DataFrame): The input DataFrame containing categorical and/or numerical features.
        drop_unkown (bool): Yes if the encoder drop unkowns, No otherwise

    Returns:
        pd.DataFrame: A DataFrame with categorical columns one-hot encoded and original categorical columns removed.
    """

    cat_columns = list(dataframe.select_dtypes(include='object').columns)
    
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(dataframe[cat_columns])
    
    encoded_array = encoder.transform(dataframe[cat_columns])
    
    encoded_columns = encoder.get_feature_names_out(cat_columns)
    
    encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns, index=dataframe.index)
    
    non_cat_data = dataframe.drop(columns=cat_columns)
    
    final_df = pd.concat([non_cat_data, encoded_df], axis=1)
    if drop_unkown : 
        return final_df
    return final_df.drop(columns=['Deck_Unknown', 'Embarked_Unknown'])


In [None]:
def split_data (dataframe: pd.DataFrame, val_size: float = 0.25) -> list[pd.DataFrame] :
    """Spliting data to Training and Valuation data

    Args:
        dataframe (pd.DataFrame): Input DataFrame. It remains unmodified.
        val_size (float, optional): Test size. Defaults to 0.25.

    Returns:
        list[pd.DataFrame]: List of the training and the validation data sets
    """
    X = dataframe[dataframe.columns[1:]]
    y = dataframe[dataframe.columns[0]]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=42)
    return X_train, X_val, y_train, y_val

In [None]:
def tree_model_training (X_train: pd.DataFrame , y_train: pd.DataFrame, MAXDEPTH: int) -> DecisionTreeClassifier:
    """Training a tree model

    Args:
        X_train (pd.DataFrame): The input X we will train our model on
        y_train (pd.DataFrame): The input Y we will train our model on
        MAXDEPTH (int): The max depth of the tree

    Returns:
        DecisionTreeClassifier: Trained classifier Tree model 
    """
    model = DecisionTreeClassifier(max_depth=MAXDEPTH, random_state=42)
    model.fit(X_train, y_train)
    return model


In [337]:
def evaluate_model(model: DecisionTreeClassifier,
                    X_train:pd.DataFrame , 
                    X_val: pd.DataFrame,
                    y_train: pd.DataFrame,
                    y_val:pd.DataFrame,
                    printing: bool=False) -> list[float] : 
    
    """Evaluating the model on training data and validation data, and optiomally prints out the numbers 

    Args:
        model (DecisionTreeClassifier): _description_
        X_train (pd.DataFrame): The train input X 
        X_val (pd.DataFrame): The validation input Y
        y_train (pd.DataFrame): The train input Y 
        y_val (pd.DataFrame): The validation input Y
        printing (bool, optional): Wethere to print out the accuarcies. Defaults to False.

    Returns:
         list[flaot]: the accuarcies of the model
    """
    R1 = model.score(X_train, y_train)
    R2 = model.score(X_val, y_val)
    if printing :
        print(f"the accuarcy of the model on trained data is {round(R1*100,1)}%")
        print(f"the accuarcy of the model on validation data is {round(R2*100,1)}%")
    return R1,R2

In [None]:
def visualize_tree(model: DecisionTreeClassifier, 
                    X_train: pd.DataFrame,
                    FIGSIZE: tuple = (80,20),
                    depth: int = 3) -> None:
    """visualizing the tree model

    Args:
        model (DecisionTreeClassifier): the model
        X_train (pd.DataFrame): The input data (to use columns from)
        FIGSIZE (tuple, optional): Figure Size. Defaults to (80,20).
        depth (int, optional): How much depth should be showed. Defaults to 3.
    """
    plt.figure(figsize=FIGSIZE)
    plot_tree(model, feature_names=X_train.columns, max_depth= depth , filled= True, class_names=['No', 'Yes']) 

In [325]:
def find_optimal_tree_depth(X_train: pd.DataFrame,
                             X_val: pd.DataFrame,
                             y_train: pd.DataFrame,
                             y_val: pd.DataFrame,
                             MAX_DEPTH: int,
                             visualise: bool=False) -> list[list[tuple], int]:
    """Finding the optimal maximum depth for a tree classifiel model

    Args:
        X_train (pd.DataFrame): Input training X data
        X_val (pd.DataFrame): Input validation X data
        y_train (pd.DataFrame): Input training y data
        y_val (pd.DataFrame): Input validation y data
        MAX_DEPTH (int): maximum depth of the tree
        visualise (bool, optional): whether to visualise the accuracies accross multiple depths. Defaults to False.

    Returns:
        list[list[tuple], int]: outputs the accuracies and the best found depth 
    """

    accuracies = []
    best_depth = 1
    best_val_acc = 0

    for depth in range(1, MAX_DEPTH + 1):
        model = tree_model_training(X_train, y_train, MAXDEPTH=depth)
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        train_acc = accuracy_score(y_train, y_train_pred)
        val_acc = accuracy_score(y_val, y_val_pred)
        accuracies.append((train_acc, val_acc))

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_depth = depth

    if visualise:
        train_accuracies = [acc[0] for acc in accuracies]
        val_accuracies = [acc[1] for acc in accuracies]
        depths = list(range(1, MAX_DEPTH + 1))

        plt.figure(figsize=(10, 6))
        plt.plot(depths, train_accuracies, label='Training Accuracy', marker='o')
        plt.plot(depths, val_accuracies, label='Validation Accuracy', marker='s')
        plt.axvline(x=best_depth, color='r', linestyle='--', label=f'Best Depth: {best_depth}')
        plt.title('Decision Tree Accuracy vs Max Depth')
        plt.xlabel('Max Depth')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    return accuracies, best_depth

In [500]:
def random_forest_train (X_train: pd.DataFrame,
                        y_train: pd.DataFrame,
                        n_trees:int = 1,
                        maxdepth = 4,
                        maxfeatures = "log2"
                        ) :
    model = RandomForestClassifier(n_jobs=-1,
                                random_state=42,
                                n_estimators= n_trees,
                                max_depth=maxdepth,
                                max_features=maxfeatures)
    model.fit(X_train, y_train)
    return model


In [501]:
temp = encode_data(clean_data(data_deck(data), "median"))
X_train, X_val, y_train, y_val = split_data(temp, 0.25)
trained_model = tree_model_training(X_train, y_train, 
                                    MAXDEPTH=14)

new_trained_model2 = random_forest_train(X_train, y_train, n_trees=1, maxdepth = 4)
acc2 = evaluate_model(new_trained_model2, X_train, X_val, y_train, y_val,printing=False)[1]
acc2


0.8340807174887892

In [487]:
def show_different_variables(model, X_train, X_val, y_train, y_val) :
    n_trees = np.arange(1,13)
    maxdepths = np.arange(1,16)
    
    results = np.zeros((len(n_trees), len(maxdepths)))

    for i, ntree in enumerate(n_trees):
        for j, maxdepth in enumerate(maxdepths):
            model = random_forest_train(X_train, y_train, n_trees=ntree, maxdepth=maxdepth)
            acc = evaluate_model(model, X_train, X_val, y_train, y_val, printing=False)[1]
            results[i, j] = acc

    plt.figure(figsize=(12, 6))
    sns.heatmap(results, xticklabels=maxdepths, yticklabels=n_trees, cmap='viridis')
    plt.xlabel('Max Depth')
    plt.ylabel('N_trees')
    plt.title('Validation Accuracy Heatmap')
    plt.show()



In [None]:


def analyze_random_forest_hyperparameters(
    X_train: pd.DataFrame,
    y_train: pd.Series or pd.DataFrame,
    X_val: pd.DataFrame,
    y_val: pd.Series or pd.DataFrame,
    n_estimators_list: list = [50, 100, 200],
    max_depth_list: list = [4, 8, 12, None],
    max_features_list: list = ['sqrt', 'log2', 0.5],
    min_samples_split_list: list = [2, 5, 10],
    min_samples_leaf_list: list = [1, 3, 5],
    criterion_list: list = ['gini', 'entropy'],
    bootstrap_list: list = [True], # Usually True for RF, but can test False
    n_jobs: int = -1,
    random_state: int = 42
) -> pd.DataFrame:
    """
    Trains and evaluates RandomForestClassifiers with various hyperparameter combinations.

    This function iterates through all combinations of the provided hyperparameter lists,
    trains a RandomForestClassifier for each combination on the training data,
    evaluates its performance (accuracy) on both the training and validation sets,
    and measures the training time.

    Args:
        X_train: Training features DataFrame.
        y_train: Training target variable Series or DataFrame.
        X_val: Validation features DataFrame.
        y_val: Validation target variable Series or DataFrame.
        n_estimators_list: List of n_estimators values to try.
        max_depth_list: List of max_depth values to try (can include None).
        max_features_list: List of max_features values to try ('sqrt', 'log2', float, int).
        min_samples_split_list: List of min_samples_split values to try (int >= 2 or float).
        min_samples_leaf_list: List of min_samples_leaf values to try (int >= 1 or float).
        criterion_list: List of criterion values to try ('gini', 'entropy', 'log_loss').
        bootstrap_list: List of boolean values for the bootstrap parameter.
        n_jobs: Number of jobs to run in parallel for fit. -1 means using all processors.
        random_state: Controls randomness for reproducibility.

    Returns:
        A pandas DataFrame containing the results of each hyperparameter combination,
        sorted by validation accuracy in descending order. Columns include the tested
        hyperparameters, training accuracy, validation accuracy, and training time.
    """
    results = []

    if isinstance(y_train, pd.DataFrame):
         y_train_ravel = y_train.values.ravel()
         y_train_ravel = y_train

    param_grid = {
        'n_estimators': n_estimators_list,
        'max_depth': max_depth_list,
        'max_features': max_features_list,
        'min_samples_split': min_samples_split_list,
        'min_samples_leaf': min_samples_leaf_list,
        'criterion': criterion_list,
        'bootstrap': bootstrap_list
    }

    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    total_combinations = len(param_combinations)
    print(f"Starting hyperparameter analysis for {total_combinations} combinations...")

    for i, params in enumerate(param_combinations):
        print(f"Running combination {i+1}/{total_combinations}: {params}")
        start_time = time.time()

        model = RandomForestClassifier(
            **params,  
            n_jobs=n_jobs,
            random_state=random_state
        )

        # Train the model
        model.fit(X_train, y_train_ravel)
        end_time = time.time()
        train_time = end_time - start_time

        # Evaluate on training data
        y_train_pred = model.predict(X_train)
        train_accuracy = accuracy_score(y_train_ravel, y_train_pred)

        # Evaluate on validation data
        y_val_pred = model.predict(X_val)
        val_accuracy = accuracy_score(y_val, y_val_pred)

        # Store results - start with the parameters and add metrics
        result_entry = params.copy()
        result_entry['train_accuracy'] = round(train_accuracy, 5)
        result_entry['val_accuracy'] = round(val_accuracy, 5)
        result_entry['train_time_s'] = round(train_time, 3)
        results.append(result_entry)

        print(f"  -> Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}, Time: {train_time:.2f}s")


    results_df = pd.DataFrame(results)

    metric_cols = ['train_accuracy', 'val_accuracy', 'train_time_s']
    param_cols = list(param_grid.keys())
    # Ensure all expected columns are present before reordering
    final_cols = param_cols + [col for col in metric_cols if col in results_df.columns]
    results_df = results_df[final_cols]


    results_df = results_df.sort_values(by='val_accuracy', ascending=False).reset_index(drop=True)

    print("\nHyperparameter analysis complete.")
    return results_df

print("Generating synthetic data for example...")
X, y = make_classification(n_samples=1500, n_features=25, n_informative=15,
                           n_redundant=5, n_classes=2, random_state=42, flip_y=0.05)
X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
y = pd.Series(y, name='target')

print("Splitting data...")
X_train_ex, X_val_ex, y_train_ex, y_val_ex = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y 
)
print(f"Train set size: {X_train_ex.shape[0]}, Validation set size: {X_val_ex.shape[0]}")


n_estimators_to_try = [50, 100]       
max_depth_to_try = [5, 10, None]     
max_features_to_try = ['sqrt', 0.6]   
min_samples_split_to_try = [2, 8]     
min_samples_leaf_to_try = [1, 4]     
criterion_to_try = ['gini', 'entropy']

analysis_results = analyze_random_forest_hyperparameters(
    X_train=X_train_ex,
    y_train=y_train_ex,
    X_val=X_val_ex,
    y_val=y_val_ex,
    n_estimators_list=n_estimators_to_try,
    max_depth_list=max_depth_to_try,
    max_features_list=max_features_to_try,
    min_samples_split_list=min_samples_split_to_try,
    min_samples_leaf_list=min_samples_leaf_to_try,
    criterion_list=criterion_to_try,
    bootstrap_list=[True], 
    random_state=42,
    n_jobs=-1 
)

print("\n--- Hyperparameter Analysis Results ---")
print(analysis_results.head(10))

print("\n--- Best Hyperparameter Combination Found ---")
if not analysis_results.empty:
    print(analysis_results.iloc[0])
else:
    print("No results generated.")

Generating synthetic data for example...
Splitting data...
Train set size: 1050, Validation set size: 450
Starting hyperparameter analysis for 96 combinations...
Running combination 1/96: {'n_estimators': 50, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'gini', 'bootstrap': True}
  -> Train Acc: 0.9419, Val Acc: 0.8689, Time: 0.08s
Running combination 2/96: {'n_estimators': 50, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy', 'bootstrap': True}
  -> Train Acc: 0.9419, Val Acc: 0.8733, Time: 0.07s
Running combination 3/96: {'n_estimators': 50, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 4, 'criterion': 'gini', 'bootstrap': True}
  -> Train Acc: 0.9362, Val Acc: 0.8644, Time: 0.07s
Running combination 4/96: {'n_estimators': 50, 'max_depth': 5, 'max_features': 'sqrt', 'min_samples_split': 2, 'min_samples_leaf': 4, 'criterion': 'entrop

In [503]:
analysis_results

Unnamed: 0,n_estimators,max_depth,max_features,min_samples_split,min_samples_leaf,criterion,bootstrap,train_accuracy,val_accuracy,train_time_s
0,100,,0.6,2,1,gini,True,1.00000,0.90444,0.188
1,100,,0.6,8,1,gini,True,0.99429,0.90444,0.189
2,100,10.0,sqrt,2,4,gini,True,0.98000,0.90222,0.135
3,100,10.0,sqrt,8,4,gini,True,0.98000,0.90222,0.125
4,100,10.0,0.6,8,1,gini,True,0.99333,0.90222,0.184
...,...,...,...,...,...,...,...,...,...,...
91,50,5.0,sqrt,8,4,gini,True,0.93619,0.86444,0.072
92,50,5.0,sqrt,8,1,gini,True,0.93905,0.86222,0.072
93,50,5.0,0.6,2,1,entropy,True,0.93905,0.86222,0.101
94,50,5.0,0.6,2,4,entropy,True,0.93048,0.86000,0.108
