In [2]:
import pandas as pd
import plotly.express as px
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns
import itertools
import time
from sklearn.datasets import make_classification 

In [3]:
data = pd.read_csv("data/train.csv",
                    index_col="PassengerId" 
                   ).drop(columns=["Name", "Ticket"] )


In [4]:
def show_plots(data: pd.DataFrame) -> None:
    """
    Display various plots based on Titanic dataset columns to visualize relationships 
    between features like Age, Pclass, Embarked, etc., and survival status.

    Args:
        data (pd.DataFrame): Titanic dataset containing relevant columns.
    """
    required_columns = ['Age', 'Pclass', 'Survived', 'Sex', 'SibSp', 'Parch', 'Embarked', 'Cabin']
    missing_cols = [col for col in required_columns if col not in data.columns]
    if missing_cols:
        raise ValueError(f"Missing required columns in data: {missing_cols}")

    px.histogram(data, x="Age", color="Survived", title="Age Distribution by Survival").show()

    px.histogram(data, x="Pclass", color="Survived", title="Pclass Distribution by Survival").show()

    px.histogram(data, x="Survived", color="Sex", title="Survival Count by Sex").show()

    px.violin(
        data, x="Pclass", y="Age", color="Survived",
        box=True, points="all",
        title="Age Distribution by Pclass and Survival"
    ).show()

    data = data.copy()
    data['family_size'] = data["SibSp"] + data["Parch"] + 1

    px.histogram(
        data, x="family_size", color="Survived", title="Family Size Distribution by Survival"
    ).show()

    px.histogram(
        data, x="Embarked", color="Survived",
        histnorm='percent', title="Embarkation Port Distribution by Survival (in %)"
    ).show()

    data['Deck'] = data['Cabin'].astype(str).str[0]

    px.histogram(
        data, x="Deck", color="Survived", title="Cabin Deck Distribution by Survival"
    ).show()


In [5]:
def clean_data(dataframe: pd.DataFrame, method: str) -> pd.DataFrame:
    """
    Clean DataFrame using specified methods.
    
    Args:
        dataframe (pd.DataFrame): Input DataFrame. It remains unmodified.
        method (str): Cleaning method for 'Age'. Options: ['dropna', 'median', 'interpolate'].
    
    Returns:
        pd.DataFrame: A cleaned DataFrame with modified 'Age', 'Embarked', and 'Cabin' columns.
    
    Raises:
        ValueError: If an invalid method is provided.
    """
    if method not in ['dropna', 'median', 'interpolate']:
        raise ValueError(f"Invalid method: {method}. Use 'dropna', 'median', or 'interpolate'.")

    if method == "dropna":
        return dataframe.dropna(subset=['Age'])

    cleaned_data = dataframe.copy()
    
    for col in ['Embarked', 'Deck']:
        if col in cleaned_data.columns:
            cleaned_data[col] = cleaned_data[col].fillna('Unknown')

    if method == "median":
        if 'Age' in cleaned_data.columns:
            imputer = SimpleImputer(strategy="median")
            cleaned_data['Age'] = imputer.fit_transform(cleaned_data[['Age']])
        return cleaned_data

    if method == "interpolate":
        if 'Age' in cleaned_data.columns:
            cleaned_data['Age'] = cleaned_data['Age'].interpolate()
        return cleaned_data


In [6]:
def data_deck(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Adding a deck column

    Args:
        dataframe (pd.DataFrame): Input DataFrame. It remains unmodified.

    Returns:
        pd.DataFrame: Output DataFrame. It has the 'Deck' column but doesn't have the 'Cabin' column
    """
    new_df = dataframe.copy()
    new_df['Deck'] = new_df['Cabin'].apply(lambda x: str(x)[0] if pd.notnull(x) else np.nan)
    new_df.drop(columns=['Cabin'], inplace=True)
    return new_df

In [7]:
def encode_data(dataframe: pd.DataFrame, drop_unkown: bool = True) -> pd.DataFrame:
    
    """
    Encodes categorical columns in a DataFrame using one-hot encoding.

    This function identifies all columns of type 'object', applies one-hot encoding 
    to them using sklearn's OneHotEncoder, and returns a new DataFrame with the 
    encoded columns concatenated to the original numerical columns.

    Args:
        dataframe (pd.DataFrame): The input DataFrame containing categorical and/or numerical features.
        drop_unkown (bool): Yes if the encoder drop unkowns, No otherwise

    Returns:
        pd.DataFrame: A DataFrame with categorical columns one-hot encoded and original categorical columns removed.
    """

    cat_columns = list(dataframe.select_dtypes(include='object').columns)
    
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    encoder.fit(dataframe[cat_columns])
    
    encoded_array = encoder.transform(dataframe[cat_columns])
    
    encoded_columns = encoder.get_feature_names_out(cat_columns)
    
    encoded_df = pd.DataFrame(encoded_array, columns=encoded_columns, index=dataframe.index)
    
    non_cat_data = dataframe.drop(columns=cat_columns)
    
    final_df = pd.concat([non_cat_data, encoded_df], axis=1)
    if drop_unkown : 
        return final_df
    return final_df.drop(columns=['Deck_Unknown', 'Embarked_Unknown'])


In [8]:
def split_data (dataframe: pd.DataFrame, val_size: float = 0.25) -> list[pd.DataFrame] :
    """Spliting data to Training and Valuation data

    Args:
        dataframe (pd.DataFrame): Input DataFrame. It remains unmodified.
        val_size (float, optional): Test size. Defaults to 0.25.

    Returns:
        list[pd.DataFrame]: List of the training and the validation data sets
    """
    X = dataframe[dataframe.columns[1:]]
    y = dataframe[dataframe.columns[0]]
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=val_size, random_state=42)
    return X_train, X_val, y_train, y_val

In [9]:
def tree_model_training (X_train: pd.DataFrame , y_train: pd.DataFrame, MAXDEPTH: int) -> DecisionTreeClassifier:
    """Training a tree model

    Args:
        X_train (pd.DataFrame): The input X we will train our model on
        y_train (pd.DataFrame): The input Y we will train our model on
        MAXDEPTH (int): The max depth of the tree

    Returns:
        DecisionTreeClassifier: Trained classifier Tree model 
    """
    model = DecisionTreeClassifier(max_depth=MAXDEPTH, random_state=42)
    model.fit(X_train, y_train)
    return model


In [10]:
def evaluate_model(model: DecisionTreeClassifier,
                    X_train:pd.DataFrame , 
                    X_val: pd.DataFrame,
                    y_train: pd.DataFrame,
                    y_val:pd.DataFrame,
                    printing: bool=False) -> list[float] : 
    
    """Evaluating the model on training data and validation data, and optiomally prints out the numbers 

    Args:
        model (DecisionTreeClassifier): _description_
        X_train (pd.DataFrame): The train input X 
        X_val (pd.DataFrame): The validation input Y
        y_train (pd.DataFrame): The train input Y 
        y_val (pd.DataFrame): The validation input Y
        printing (bool, optional): Wethere to print out the accuarcies. Defaults to False.

    Returns:
         list[flaot]: the accuarcies of the model
    """
    R1 = model.score(X_train, y_train)
    R2 = model.score(X_val, y_val)
    if printing :
        print(f"the accuarcy of the model on trained data is {round(R1*100,1)}%")
        print(f"the accuarcy of the model on validation data is {round(R2*100,1)}%")
    return R1,R2

In [11]:
def visualize_tree(model: DecisionTreeClassifier, 
                    X_train: pd.DataFrame,
                    FIGSIZE: tuple = (80,20),
                    depth: int = 3) -> None:
    """visualizing the tree model

    Args:
        model (DecisionTreeClassifier): the model
        X_train (pd.DataFrame): The input data (to use columns from)
        FIGSIZE (tuple, optional): Figure Size. Defaults to (80,20).
        depth (int, optional): How much depth should be showed. Defaults to 3.
    """
    plt.figure(figsize=FIGSIZE)
    plot_tree(model, feature_names=X_train.columns, max_depth= depth , filled= True, class_names=['No', 'Yes']) 

In [12]:
def find_optimal_tree_depth(X_train: pd.DataFrame,
                             X_val: pd.DataFrame,
                             y_train: pd.DataFrame,
                             y_val: pd.DataFrame,
                             MAX_DEPTH: int,
                             visualise: bool=False) -> list[list[tuple], int]:
    """Finding the optimal maximum depth for a tree classifiel model

    Args:
        X_train (pd.DataFrame): Input training X data
        X_val (pd.DataFrame): Input validation X data
        y_train (pd.DataFrame): Input training y data
        y_val (pd.DataFrame): Input validation y data
        MAX_DEPTH (int): maximum depth of the tree
        visualise (bool, optional): whether to visualise the accuracies accross multiple depths. Defaults to False.

    Returns:
        list[list[tuple], int]: outputs the accuracies and the best found depth 
    """

    accuracies = []
    best_depth = 1
    best_val_acc = 0

    for depth in range(1, MAX_DEPTH + 1):
        model = tree_model_training(X_train, y_train, MAXDEPTH=depth)
        y_train_pred = model.predict(X_train)
        y_val_pred = model.predict(X_val)

        train_acc = accuracy_score(y_train, y_train_pred)
        val_acc = accuracy_score(y_val, y_val_pred)
        accuracies.append((train_acc, val_acc))

        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_depth = depth

    if visualise:
        train_accuracies = [acc[0] for acc in accuracies]
        val_accuracies = [acc[1] for acc in accuracies]
        depths = list(range(1, MAX_DEPTH + 1))

        plt.figure(figsize=(10, 6))
        plt.plot(depths, train_accuracies, label='Training Accuracy', marker='o')
        plt.plot(depths, val_accuracies, label='Validation Accuracy', marker='s')
        plt.axvline(x=best_depth, color='r', linestyle='--', label=f'Best Depth: {best_depth}')
        plt.title('Decision Tree Accuracy vs Max Depth')
        plt.xlabel('Max Depth')
        plt.ylabel('Accuracy')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    return accuracies, best_depth

In [13]:
def random_forest_train (X_train: pd.DataFrame,
                        y_train: pd.DataFrame,
                        n_trees:int = 1,
                        maxdepth = 4,
                        maxfeatures = "log2"
                        ) :
    model = RandomForestClassifier(n_jobs=-1,
                                random_state=42,
                                n_estimators= n_trees,
                                max_depth=maxdepth,
                                max_features=maxfeatures)
    model.fit(X_train, y_train)
    return model


In [501]:
temp = encode_data(clean_data(data_deck(data), "median"))
X_train, X_val, y_train, y_val = split_data(temp, 0.25)
trained_model = tree_model_training(X_train, y_train, 
                                    MAXDEPTH=14)

new_trained_model2 = random_forest_train(X_train, y_train, n_trees=1, maxdepth = 4)
acc2 = evaluate_model(new_trained_model2, X_train, X_val, y_train, y_val,printing=False)[1]
acc2


0.8340807174887892

In [14]:
def show_different_variables(model, X_train, X_val, y_train, y_val) :
    n_trees = np.arange(1,13)
    maxdepths = np.arange(1,16)
    
    results = np.zeros((len(n_trees), len(maxdepths)))

    for i, ntree in enumerate(n_trees):
        for j, maxdepth in enumerate(maxdepths):
            model = random_forest_train(X_train, y_train, n_trees=ntree, maxdepth=maxdepth)
            acc = evaluate_model(model, X_train, X_val, y_train, y_val, printing=False)[1]
            results[i, j] = acc

    plt.figure(figsize=(12, 6))
    sns.heatmap(results, xticklabels=maxdepths, yticklabels=n_trees, cmap='viridis')
    plt.xlabel('Max Depth')
    plt.ylabel('N_trees')
    plt.title('Validation Accuracy Heatmap')
    plt.show()



In [None]:
def analyze_random_forest_hyperparameters_simple(
    X_train: pd.DataFrame,
    y_train: pd.DataFrame,
    X_val: pd.DataFrame,
    y_val: pd.DataFrame
) -> pd.DataFrame:
    """
    Analyzes a few RandomForestClassifier hyperparameter combinations on a given dataset.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_val: Validation features.
        y_val: Validation labels.

    Returns:
        pd.DataFrame: Accuracy results with different hyperparameter settings.
    """
    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [5, 10, None],
        'max_features': ['sqrt', 0.6],
        'criterion': ['gini', 'entropy']
    }

    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    results = []

    for params in param_combinations:
        # Train model using the existing helper from Code 1
        model = RandomForestClassifier(
            **params,
            random_state=42,
            n_jobs=-1
        )
        model.fit(X_train, y_train)

        train_acc = model.score(X_train, y_train)
        val_acc = model.score(X_val, y_val)

        result = params.copy()
        result['train_accuracy'] = round(train_acc, 4)
        result['val_accuracy'] = round(val_acc, 4)

        results.append(result)

    results_df = pd.DataFrame(results)
    results_df = results_df.sort_values(by='val_accuracy', ascending=False).reset_index(drop=True)
    return results_df


In [19]:
# Reuse your existing synthetic dataset generation
X, y = make_classification(n_samples=1500, n_features=25, n_informative=15,
                           n_redundant=5, n_classes=2, random_state=42, flip_y=0.05)
X = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
y = pd.Series(y, name='target')

# Train/Validation split
X_train_ex, X_val_ex, y_train_ex, y_val_ex = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y 
)

# Run simple analysis
results_df = analyze_random_forest_hyperparameters_simple(
    X_train_ex, y_train_ex, X_val_ex, y_val_ex
)

results_df.head()


Unnamed: 0,n_estimators,max_depth,max_features,criterion,train_accuracy,val_accuracy
0,100,,0.6,gini,1.0,0.9044
1,100,10.0,0.6,gini,0.9971,0.9022
2,50,10.0,0.6,gini,0.9971,0.9
3,50,,sqrt,gini,1.0,0.9
4,50,,0.6,gini,1.0,0.9


In [22]:
best_model = RandomForestClassifier(n_estimators= 100, max_depth= None, max_features=0.6, criterion="gini")
best_model.fit(X_train_ex, y_train_ex)



In [23]:
evaluate_model(best_model, X_train_ex, X_val_ex, y_train_ex, y_val_ex, printing=True)

the accuarcy of the model on trained data is 100.0%
the accuarcy of the model on validation data is 91.3%


(1.0, 0.9133333333333333)

# This is the best performent Parameter !
