# Setup notebook

In [89]:
# Import libraries
import pandas as pd
# Set display options to show all rows
pd.set_option('display.max_rows', None)
import numpy as np
from collections import defaultdict
import joblib

import plotly.express as px
import plotly.io as pio
from plotly.subplots import make_subplots
pio.renderers.default = 'vscode'

from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

## Functions

In [2]:
def plot_pie(df, col_to_plot, title, hole_size, color_sequence, legend_dict):
    """
    Show a Plotly pie (or donut) chart of the values found in one column.

    Args:
        df (pd.DataFrame): Data to plot.
        col_to_plot (str): Column whose values name the slices.
        title (str): Chart title.
        hole_size (float): Size of the central hole (0 gives a full pie,
            larger values give a donut).
        color_sequence (list): Colors handed to Plotly for the slices.
        legend_dict (dict): Mapping from raw column values to the labels
            shown in the chart. Pass an empty dict to keep the raw values.

    Raises:
        ValueError: If `legend_dict` is non-empty and its length differs
            from the length of `color_sequence`.

    Returns:
        None: The figure is displayed with `fig.show()`.

    Example:
        >>> df = pd.DataFrame({'Category': ['A', 'B', 'C'], 'Values': [10, 20, 30]})
        >>> legend_dict = {'A': 'Alpha', 'B': 'Beta', 'C': 'Gamma'}
        >>> plot_pie(df, 'Category', 'Sample Pie Chart', 0.2, ['red', 'blue', 'green'], legend_dict)
    """

    if len(legend_dict) == 0:
        # No relabelling requested: slices carry the raw column values.
        slice_names = df[col_to_plot]
    else:
        # Validate before touching the data so a mismatch fails fast.
        if len(legend_dict) != len(color_sequence):
            raise ValueError('legend_dict and color_sequence must have the same length')
        slice_names = df[col_to_plot].map(legend_dict)

    fig = px.pie(
        df,
        names=slice_names,
        title=title,
        hole=hole_size,
        color_discrete_sequence=color_sequence,
    )

    # Fixed 800x800 canvas with a larger font.
    layout_options = dict(font=dict(size=18), autosize=False, width=800, height=800)
    fig.update_layout(**layout_options)
    fig.show()


In [3]:
def plot_sunburst(df, hierarchy, title, color, color_mapping):
    """
    Show a Plotly sunburst chart with one explicit color per sector.

    Args:
        df (pd.DataFrame): Data holding the hierarchy columns.
        hierarchy (list): Column names, outermost parent first, used as the
            sunburst path.
        title (str): Chart title.
        color (str): Name of the column passed to Plotly as the color input.
        color_mapping (dict): Maps every sector label of the built figure to
            the color that sector should get.

    Raises:
        KeyError: If a sector label of the figure is missing from
            `color_mapping`.

    Returns:
        None: The figure is displayed with `fig.show()`.

    Example:
        >>> df = pd.DataFrame({
        ...     'Category': ['A', 'A', 'B', 'B'],
        ...     'Subcategory': ['X', 'Y', 'X', 'Z'],
        ...     'Value': [10, 20, 30, 40]
        ... })
        >>> hierarchy = ['Category', 'Subcategory']
        >>> color_mapping = {'A': 'red', 'B': 'blue', 'X': 'green', 'Y': 'purple', 'Z': 'orange'}
        >>> plot_sunburst(df, hierarchy, 'Sunburst Chart', 'Category', color_mapping)
    """

    fig = px.sunburst(df, path=hierarchy, title=title, color=df[color])

    # Look up a color for each label of the last trace, in trace order.
    sector_labels = fig.data[-1].labels
    sector_colors = [color_mapping[label] for label in sector_labels]

    fig.update_traces(
        textinfo="label+percent parent",
        insidetextorientation='horizontal',
        marker_colors=sector_colors,
    )

    # Fixed 800x800 canvas with a larger font.
    layout_options = dict(font=dict(size=18), autosize=False, width=800, height=800)
    fig.update_layout(**layout_options)

    fig.show()


In [4]:
def extract_names(name):
    """
    Split a full-name string into family name, title and first name(s).

    Two layouts are handled:
      * "LastName, Title. FirstName"
      * "LastName, Title. ... (GivenNames LastName)"

    When the name contains both '(' and ')', the first name is taken from
    the text between the first '(' and the first ')' found after the comma,
    with its last word dropped (a single word is kept as is).

    Args:
        name (str): The full name to split.

    Returns:
        tuple: (Family Name, Title, First Name), each stripped of
        surrounding whitespace.
    """
    has_parentheses = '(' in name and ')' in name

    # Everything before the first comma is the family name.
    family_name, remainder = name.split(',', 1)

    if not has_parentheses:
        # Plain layout: "<title>. <first name(s)>"
        title, first_name = remainder.split('.', 1)
        return family_name.strip(), title.strip(), first_name.strip()

    # Parenthesised layout: the title is whatever precedes the first dot.
    title = remainder.split('.', 1)[0]

    # Cut out "(...)" and then drop the two bracket characters.
    start, stop = remainder.find('('), remainder.find(')')
    bracketed = remainder[start:stop + 1].strip()[1:-1]

    if ' ' in bracketed:
        # Several words: the last one is treated as a surname and dropped.
        first_name = ' '.join(bracketed.split()[:-1])
    else:
        first_name = bracketed

    return family_name.strip(), title.strip(), first_name.strip()

In [5]:
def split_name_column(df):
    """
    Replace the 'Name' column with 'Family Name', 'Title' and 'First Name'.

    Every value of 'Name' is parsed with `extract_names`, which accepts
    either of these layouts:
      1. "LastName, Title. FirstName"
      2. "LastName, Title. (GivenNames LastName)"

    The DataFrame is modified in place (columns added, 'Name' dropped) and
    the same object is returned for convenience.

    Args:
        df (pd.DataFrame): DataFrame with a 'Name' column.

    Returns:
        pd.DataFrame: The same DataFrame, now with 'Family Name', 'Title'
        and 'First Name' columns and without 'Name'.

    Example:
        >>> df = pd.DataFrame({'Name': ['Doe, Mr. John', 'Smith, Mrs. (Jane Smith)']})
        >>> split_name_column(df)
          Family Name Title First Name
        0         Doe    Mr       John
        1       Smith   Mrs       Jane
    """

    def _name_parts(full_name):
        # One Series per row, so that apply() expands it into three columns.
        return pd.Series(extract_names(full_name))

    name_parts = df['Name'].apply(_name_parts)
    df[['Family Name', 'Title', 'First Name']] = name_parts

    # The raw name is fully represented by the three new columns.
    df.drop(columns=['Name'], inplace=True)

    return df


In [6]:
def classify_passengers(df):
    """
    Label each passenger as solo traveler, family member or friend.

    The label is stored in a new 'Company' column:
      - 0: the passenger's ticket appears only once (solo traveler).
      - 1: the ticket is shared and at least one other holder of the same
           ticket has the same family name (family member).
      - 2: the ticket is shared but nobody else on it has the same family
           name (friend).

    The DataFrame is modified in place and returned.

    Args:
        df (pd.DataFrame): DataFrame with 'Ticket' and 'Family Name' columns.

    Returns:
        pd.DataFrame: The same DataFrame with the 'Company' column added.

    Example:
        >>> data = {'Ticket': ['A123', 'A123', 'B456', 'C789', 'C789'],
        ...         'Family Name': ['Smith', 'Smith', 'Doe', 'Brown', 'Taylor']}
        >>> df = pd.DataFrame(data)
        >>> classify_passengers(df)['Company'].tolist()
        [1.0, 1.0, 0.0, 2.0, 2.0]
    """

    # How many passengers hold each ticket.
    holders_per_ticket = df['Ticket'].value_counts()

    # Unique tickets are solo travelers; everyone else is decided below.
    df['Company'] = df['Ticket'].map(lambda t: 0 if holders_per_ticket[t] == 1 else None)

    undecided = df[df['Company'].isna()]

    for _ticket, ticket_rows in undecided.groupby('Ticket'):
        if len(ticket_rows) <= 1:
            continue

        for _surname, surname_rows in ticket_rows.groupby('Family Name'):
            if len(surname_rows) > 1:
                # Same ticket and same surname: a family.
                df.loc[surname_rows.index, 'Company'] = 1
                continue

            # A lone surname on a shared ticket: every holder of this ticket
            # that is still unlabelled becomes a friend. Families processed
            # later overwrite this value with 1.
            still_unlabelled = df.loc[ticket_rows.index, 'Company'].isna()
            df.loc[ticket_rows.index[still_unlabelled], 'Company'] = 2

    return df


In [7]:
def find_friends(df):
    """
    Return the passengers whose ticket is shared by several family names.

    A ticket counts as a "friends" ticket when the passengers holding it
    have more than one distinct 'Family Name'. Every passenger on such a
    ticket is returned.

    Args:
        df (pd.DataFrame): DataFrame with 'Ticket' and 'Family Name' columns.

    Returns:
        pd.DataFrame: The rows of `df` that travel on a "friends" ticket.

    Example:
        >>> data = {'Ticket': ['A123', 'A123', 'B456', 'C789', 'C789'],
        ...         'Family Name': ['Smith', 'Johnson', 'Doe', 'Brown', 'Taylor']}
        >>> df = pd.DataFrame(data)
        >>> find_friends(df)
          Ticket Family Name
        0   A123       Smith
        1   A123     Johnson
        3   C789       Brown
        4   C789      Taylor
    """

    # Number of distinct surnames on each ticket.
    surnames_per_ticket = df.groupby('Ticket')['Family Name'].nunique()

    # Tickets carrying at least two different surnames.
    mixed_tickets = surnames_per_ticket.index[surnames_per_ticket > 1]

    on_mixed_ticket = df['Ticket'].isin(mixed_tickets)
    return df[on_mixed_ticket]


In [8]:
def impute_ages_randomly(df, company):
    """
    Fill missing 'Age' values of one 'Company' category by random sampling.

    Rows whose 'Company' equals `company` and whose 'Age' is missing receive
    an age drawn at random (with replacement) from the observed ages of the
    same category, so the category's age distribution is roughly preserved.
    Rows of other categories are left untouched.

    The DataFrame is modified in place and returned. Sampling uses NumPy's
    global random state, so results differ between runs unless a seed is
    set beforehand with `np.random.seed`.

    Args:
        df (pd.DataFrame): DataFrame with at least 'Company' and 'Age' columns.
        company: Value of the 'Company' column selecting the rows to impute
            (e.g. 0 for solo travelers, 2 for friends).

    Returns:
        pd.DataFrame: The same DataFrame with the selected missing ages filled.

    Raises:
        ValueError: If the category has missing ages but no observed age to
            sample from.

    Example:
        >>> data = {'Company': ['A', 'A', 'B', 'A'],
        ...         'Age': [25, np.nan, 30, np.nan]}
        >>> df = pd.DataFrame(data)
        >>> impute_ages_randomly(df, 'A')
          Company   Age
        0       A  25.0
        1       A  25.0
        2       B  30.0
        3       A  25.0
    """
    # Positional boolean masks: unlike index labels they stay correct even
    # when the DataFrame index contains duplicate labels.
    in_company = (df['Company'] == company).to_numpy()
    age_missing = df['Age'].isna().to_numpy()
    to_fill = in_company & age_missing

    n_missing = int(to_fill.sum())
    if n_missing == 0:
        # Nothing to impute for this category.
        return df

    # Pool of known ages within the category.
    observed_ages = df.loc[in_company & ~age_missing, 'Age'].to_numpy()
    if observed_ages.size == 0:
        raise ValueError(
            f"Cannot impute 'Age' for Company == {company!r}: "
            "no observed ages to sample from"
        )

    # Sample with replacement to preserve the observed distribution.
    df.loc[to_fill, 'Age'] = np.random.choice(observed_ages, size=n_missing, replace=True)

    return df

In [9]:
def compute_reference_medians(df, family_identifier_cols=['Ticket', 'Family Name']):
    """
    Estimate typical "parent" and "child" ages from fully known families.

    Rows are grouped by `family_identifier_cols`. Groups containing any
    missing 'Age' are ignored. In each remaining group the two oldest
    members are counted as parents and all younger members as children;
    a group with a single member contributes that member as a parent.

    Args:
        df (pd.DataFrame): DataFrame with an 'Age' column and the grouping
            columns.
        family_identifier_cols (list): Columns that together identify a
            family group.

    Returns:
        tuple: (parent_median, child_median). Each entry is None when no
        age was collected for that role.
    """
    parents = []
    children = []

    for _key, members in df.groupby(family_identifier_cols):
        ages = members['Age']

        # Families with unknown ages cannot serve as a reference.
        if ages.isna().any():
            continue

        oldest_first = ages.sort_values(ascending=False).values

        if len(oldest_first) < 2:
            # A lone member is counted on the parent side.
            parents.append(oldest_first[0])
            continue

        # Two oldest members are assumed to be the parents, the rest children.
        parents.extend(oldest_first[:2])
        children.extend(oldest_first[2:])

    parent_median = np.median(parents) if parents else None
    child_median = np.median(children) if children else None
    return parent_median, child_median

In [10]:
def impute_family_ages(df, family_identifier_cols=['Ticket', 'Family Name']):
    """
    Fill missing 'Age' values using role-based medians within family groups.

    Reference medians for "parents" and "children" are obtained from
    `compute_reference_medians` and printed. The data is then grouped by
    `family_identifier_cols`; inside every group the rows are ordered by age
    (oldest first, unknown ages last), the first two rows are treated as
    parents and the remaining ones as children, and each missing age is
    replaced by the median of its inferred role.

    Args:
        df (pd.DataFrame): DataFrame with an 'Age' column and the grouping
            columns. It is copied, not modified.
        family_identifier_cols (list): Columns that together identify a
            family group.

    Returns:
        pd.DataFrame: A new DataFrame with imputed ages. The original index
        is discarded (`reset_index(drop=True)`) and rows come back in the
        order produced by the group-wise apply.

    NOTE(review): every group is processed, not only rows with
    Company == 1, although the inline comment below talks about families.
    NOTE(review): the exact columns/rows seen by `impute_group` depend on
    how the installed pandas version handles `groupby(...).apply` - confirm
    before relying on the output layout.
    """
    # First, compute reference medians using families with complete data
    parent_median, child_median = compute_reference_medians(df, family_identifier_cols)
    print("Reference Parent Median Age:", parent_median)
    print("Reference Child Median Age:", child_median)
    
    # Create a copy to avoid modifying original DataFrame
    df_imputed = df.copy()
    
    # Function to impute a single family group
    def impute_group(group):
        # Make a copy of group rows
        group = group.copy()
        # Identify rows with known and missing ages
        # NOTE(review): `known` and `missing` are never read afterwards.
        known = group[group['Age'].notna()]
        missing = group[group['Age'].isna()]
        
        # If the family has less than 2 members, there is no grouping to decide parent/child roles
        if len(group) < 2:
            # Single member: use the parent median; if that is None, fall back to
            # the group's own median (which is NaN when the only age is missing)
            group.loc[group['Age'].isna(), 'Age'] = parent_median if parent_median is not None else group['Age'].median()
            return group
        
        # Determine order. For known ages, sort descending.
        # For missing ages, we cannot sort by age so we tentatively append them.
        group['sort_order'] = group['Age']  # This will be NaN for missing ages
        group = group.sort_values(by='sort_order', ascending=False, na_position='last')
        
        # Now, assume that the top two positions are parents.
        # Create a new column to store inferred role: 'Parent' or 'Child'
        # NOTE(review): the column starts as NaN and then receives strings;
        # recent pandas versions may warn about the dtype change - confirm.
        group['role'] = np.nan
        # Get index of top two rows (by current sorted order)
        group_roles = group.index.tolist()
        if len(group_roles) >= 2:
            # For the two oldest positions, set as 'Parent'
            group.loc[group_roles[0:2], 'role'] = 'Parent'
            # The rest become 'Child'
            if len(group_roles) > 2:
                group.loc[group_roles[2:], 'role'] = 'Child'
        else:
            # NOTE(review): unreachable - groups with fewer than 2 rows
            # already returned above.
            # If there is only one member, assume parent.
            group['role'] = 'Parent'
        
        # Now, for missing ages, impute based on their inferred role
        def impute_age(row):
            if pd.notna(row['Age']):
                # Known ages are kept unchanged
                return row['Age']
            else:
                if row['role'] == 'Parent' and parent_median is not None:
                    return parent_median
                elif row['role'] == 'Child' and child_median is not None:
                    return child_median
                else:
                    # fallback if median not available: median age of the whole copied frame
                    return df_imputed['Age'].median()
                    
        group['Age'] = group.apply(impute_age, axis=1)
        # Remove the helper columns so the output has the input's columns
        group = group.drop(columns=['sort_order', 'role'])
        return group

    # Apply the imputation to families only (Company == 1, assumed to be families)
    # If you want to restrict to families, you can filter by another column if available.
    family_groups = df_imputed.groupby(family_identifier_cols)
    df_imputed = family_groups.apply(impute_group).reset_index(drop=True)
    
    return df_imputed

In [11]:
def impute_family_ages_randomly(df, family_identifier_cols=['Ticket', 'Family Name']):
    """
    Fill missing ages of family travelers by sampling ages from their family.

    Only rows with Company == 1 are considered. They are grouped by
    `family_identifier_cols`; each missing 'Age' in a group is replaced by a
    random draw (with replacement) from the ages known inside that group.
    If a group has no known age at all, the draw is made from the known
    ages of all Company == 1 rows instead.

    Sampling uses NumPy's global random state. The input DataFrame is not
    modified; a filled copy is returned.

    Args:
        df (pd.DataFrame): DataFrame with 'Company', 'Age' and the grouping
            columns.
        family_identifier_cols (list): Columns that together identify a
            family group.

    Returns:
        pd.DataFrame: Copy of `df` with the family members' missing ages
        imputed.
    """
    result = df.copy()

    # Company == 1 marks family travelers.
    is_family = result['Company'] == 1

    # Fallback pool: every known age among family travelers.
    fallback_pool = result.loc[is_family, 'Age'].dropna().values

    for _key, members in result.loc[is_family].groupby(family_identifier_cols):
        gaps = members.loc[members['Age'].isna()].index
        if len(gaps) == 0:
            continue

        # Prefer ages observed inside the same family.
        known_in_family = members['Age'].dropna().values
        pool = known_in_family if len(known_in_family) > 0 else fallback_pool

        result.loc[gaps, 'Age'] = np.random.choice(pool, size=len(gaps), replace=True)

    return result

In [12]:
def plot_age_distribution_by_company(df, title):
    """
    Show stacked age histograms, one per 'Company' category.

    Three subplots sharing the x-axis are drawn from top to bottom:
      - 0.0: Solo travelers (blue)
      - 1.0: Family (green)
      - 2.0: Friends (red)

    For each category the rows with that 'Company' value are selected and a
    histogram of their 'Age' is added to the matching subplot.

    Args:
        df (pd.DataFrame): DataFrame with at least:
                           - 'Company': values 0.0, 1.0 or 2.0.
                           - 'Age': passenger ages.
        title (str): Overall title of the figure.

    Returns:
        None: The figure is displayed with `fig.show()`.

    Example:
        >>> data = {
        ...     'Company': [0.0, 1.0, 2.0, 0.0, 1.0, 2.0],
        ...     'Age': [22, 45, 30, 18, 50, 28]
        ... }
        >>> df = pd.DataFrame(data)
        >>> plot_age_distribution_by_company(df, 'Age Distribution by Company')
    """
    # (company value, label, color) in display order, top to bottom.
    categories = [
        (0.0, 'Solo', 'blue'),
        (1.0, 'Family', 'green'),
        (2.0, 'Friends', 'red'),
    ]

    fig = make_subplots(
        rows=3,
        cols=1,
        shared_xaxes=True,
        subplot_titles=tuple(entry[1] for entry in categories),
    )

    for row, (company, label, colour) in enumerate(categories, start=1):
        subset = df[df['Company'] == company]

        # Build a standalone histogram and borrow its single trace.
        histogram = px.histogram(
            subset,
            x='Age',
            title=f'Age Distribution for {label}',
            color_discrete_sequence=[colour],
        )
        fig.add_trace(histogram.data[0], row=row, col=1)

    # Main title, no legend, fixed overall height.
    fig.update_layout(title_text=title, showlegend=False, height=900)

    fig.show()

In [13]:
def calculate_vif(df):
    """
    Compute the Variance Inflation Factor (VIF) of every column of `df`.

    The VIF of a feature grows with how well that feature can be explained
    by the other features, so large values point to multicollinearity.

    Args:
        df (pd.DataFrame): DataFrame holding only the numeric features to
            analyse.

    Returns:
        pd.DataFrame: Two columns, 'Feature' (column name) and 'VIF'
        (its variance inflation factor), sorted by 'VIF' from highest to
        lowest.

    Example:
        >>> data = {
        ...     'X1': [1, 2, 3, 4, 5],
        ...     'X2': [2, 4, 6, 8, 10],
        ...     'X3': [5, 3, 6, 2, 7]
        ... }
        >>> vif_table = calculate_vif(pd.DataFrame(data))
        >>> list(vif_table.columns)
        ['Feature', 'VIF']
    """
    # One VIF per column position of the underlying value matrix.
    feature_matrix = df.values
    scores = [
        variance_inflation_factor(feature_matrix, position)
        for position in range(df.shape[1])
    ]

    vif_table = pd.DataFrame({"Feature": df.columns, "VIF": scores})

    # Worst offenders first.
    return vif_table.sort_values(by="VIF", ascending=False)

In [14]:
def remove_high_vif_features(df, threshold=5.0):
    """
    Drop features one at a time until every VIF is below `threshold`.

    On each pass the VIFs of the remaining columns are recomputed with
    `calculate_vif`; unless the largest one is below the threshold, that
    feature is reported with `print` and removed, and the process repeats.

    Args:
        df (pd.DataFrame): DataFrame holding only the numeric features to
            analyse. It is not modified; dropping returns new DataFrames.
        threshold (float, optional): Largest VIF that is not accepted;
            the loop stops once the highest VIF is strictly below it.
            Default is 5.0.

    Returns:
        pd.DataFrame: DataFrame containing only the retained features.

    Example:
        >>> data = {
        ...     'X1': [1, 2, 3, 4, 5],
        ...     'X2': [2, 4, 6, 8, 10],
        ...     'X3': [5, 3, 6, 2, 7]
        ... }
        >>> reduced = remove_high_vif_features(pd.DataFrame(data), threshold=5.0)
    """
    # calculate_vif returns rows sorted by VIF, highest first.
    worst = calculate_vif(df).iloc[0]

    while not (worst["VIF"] < threshold):
        max_vif = worst["VIF"]
        feature_to_remove = worst["Feature"]

        print(f"Dropping '{feature_to_remove}' with VIF={max_vif:.2f}")

        df = df.drop(columns=[feature_to_remove])

        # Re-rank the remaining features before the next check.
        worst = calculate_vif(df).iloc[0]

    return df

# Process data

| Variable   | Definition                           | Key                                    |
|------------|--------------------------------------|----------------------------------------|
| Survived   | Survival                            | 0 = No, 1 = Yes                        |
| Pclass     | Ticket class                        | 1 = 1st, 2 = 2nd, 3 = 3rd              |
| Name     | Name of the passenger                        |               |
| Sex        | Gender of the passenger                               | 1 = female, 0 = male                                       |
| Age        | Age in years                        | Age is fractional if less than 1. If the age is estimated, it is in the form xx.5                                       |
| sibsp      | # of siblings/spouses aboard       | Sibling = brother, sister, stepbrother, stepsister; Spouse = husband, wife (mistresses and fiancés were ignored)                                       |
| parch      | # of parents/children aboard       | Parent = mother, father; Child = daughter, son, stepdaughter, stepson; Some children travelled only with a nanny, therefore parch=0 for them                                      |
| ticket     | Ticket number                       |                                        |
| fare       | Passenger fare                      |                                        |
| cabin      | Cabin number                        |                                        |
| embarked   | Port of Embarkation                | C = Cherbourg, Q = Queenstown, S = Southampton |

### Load data

In [15]:
# Load the raw train and test DataFrames from CSV (paths are relative to the working directory)
train_raw_df = pd.read_csv('../Data/train.csv')
test_raw_df = pd.read_csv('../Data/test.csv')
# Optional complete dataset, currently not loaded:
# total_raw_df = pd.read_excel('../Data/Complete_dataset.xls')

In [16]:
 # 'Sex' column: encode as integers in both raw frames (male --> 0, female --> 1)
sex_dict = {'male': 0, 'female': 1}
for raw_df in (train_raw_df, test_raw_df):
    raw_df['Sex'] = raw_df['Sex'].map(sex_dict)

### Exploratory Data Analysis

In [17]:
# Report the dataset size: one row per passenger, one column per variable
num_pass, num_var = train_raw_df.shape
print(f'Number of passengers = {num_pass}\nNumber of variables = {num_var}')

Number of passengers = 891
Number of variables = 12


In [18]:
# Preview the first rows of the raw training data
train_raw_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S


In [19]:
# Count the missing values of every column in the raw training data
null_counts = train_raw_df.isnull().sum()
for el, num_null in null_counts.items():
    print(f'Column {el} has {num_null} nulls')

Column PassengerId has 0 nulls
Column Survived has 0 nulls
Column Pclass has 0 nulls
Column Name has 0 nulls
Column Sex has 0 nulls
Column Age has 177 nulls
Column SibSp has 0 nulls
Column Parch has 0 nulls
Column Ticket has 0 nulls
Column Fare has 0 nulls
Column Cabin has 687 nulls
Column Embarked has 2 nulls


### Manipulate data

- Age will be imputed. I will analyze the passengers' surnames to identify family units and impute the missing ages based on them. SibSp and Parch should also be considered in this analysis;
- Cabin could be related to Pclass, and thus redundant, or it could give some insight into the survival rate, as one part of the ship went down before the other. To explore;
- Fare should also be related to Pclass. To explore;

In [20]:
# Work on a copy of the raw data and split 'Name' into family name, title and first name
train_df = split_name_column(train_raw_df.copy())

In [21]:
# Put the columns in a fixed order, then sort the rows by family name
sorted_columns = [
    'PassengerId', 'Family Name', 'Title', 'First Name',
    'Survived', 'Pclass', 'Sex', 'Age',
    'SibSp', 'Parch', 'Ticket', 'Fare',
    'Cabin', 'Embarked',
]
train_df = train_df[sorted_columns].sort_values(by='Family Name')

In [22]:
# Preview the first ten rows after reordering and sorting by family name
train_df.head(10)

Unnamed: 0,PassengerId,Family Name,Title,First Name,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
845,846,Abbing,Mr,Anthony,0,3,0,42.0,0,0,C.A. 5547,7.55,,S
279,280,Abbott,Mrs,Rosa,1,3,1,35.0,1,1,C.A. 2673,20.25,,S
746,747,Abbott,Mr,Rossmore Edward,0,3,0,16.0,1,1,C.A. 2673,20.25,,S
874,875,Abelson,Mrs,Hannah,1,2,1,28.0,1,0,P/PP 3381,24.0,,C
308,309,Abelson,Mr,Samuel,0,2,0,30.0,1,0,P/PP 3381,24.0,,C
365,366,Adahl,Mr,Mauritz Nils Martin,0,3,0,30.0,0,0,C 7076,7.25,,S
401,402,Adams,Mr,John,0,3,0,26.0,0,0,341826,8.05,,S
40,41,Ahlin,Mrs,Johanna Persdotter,0,3,1,40.0,1,0,7546,9.475,,S
855,856,Aks,Mrs,Leah,1,3,1,18.0,0,1,392091,9.35,,S
207,208,Albimona,Mr,Nassef Cassem,1,3,0,26.0,0,0,2699,18.7875,,C


#### --- Age ---

In [23]:
# Classify passengers into solo travelers, families, or friends.
# This is done because solo travelers may have a different survival rate than families or friends.
# Moreover, solo travelers may have different ages than families or friends,
# so this should help to fill in missing ages more accurately.

# Remember: 0 = solo traveler, 1 = family member, 2 = friend
train_df = classify_passengers(train_df)

There wasn't an official **minimum age** requirement to travel alone on the Titanic, but historical records suggest that most **unaccompanied minors** were at least teenagers.  

From survivor lists and passenger data:  
- The **youngest recorded solo traveler** was **13-year-old Marie Grice Young** (a first-class passenger).  
- Most **children under 12** were traveling with family or guardians.  
- Some **teenagers (around 14-17)** were traveling alone, often as third-class emigrants.  

In [24]:
# Force passengers with a known age < 13 to be classified as family members
# (rows with a missing Age do not match the comparison and keep their label)
train_df.loc[train_df['Age'] < 13, 'Company'] = 1

In [25]:
# Preview the data with the new 'Company' column
train_df.head()

Unnamed: 0,PassengerId,Family Name,Title,First Name,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Company
845,846,Abbing,Mr,Anthony,0,3,0,42.0,0,0,C.A. 5547,7.55,,S,0.0
279,280,Abbott,Mrs,Rosa,1,3,1,35.0,1,1,C.A. 2673,20.25,,S,1.0
746,747,Abbott,Mr,Rossmore Edward,0,3,0,16.0,1,1,C.A. 2673,20.25,,S,1.0
874,875,Abelson,Mrs,Hannah,1,2,1,28.0,1,0,P/PP 3381,24.0,,C,1.0
308,309,Abelson,Mr,Samuel,0,2,0,30.0,1,0,P/PP 3381,24.0,,C,1.0


In [26]:
# Survival rate (in percent) for solo travelers (0), families (1), and friends (2)
survival_rate = train_df.groupby('Company')['Survived'].mean()*100
print(survival_rate)

Company
0.0    29.020333
1.0    50.769231
2.0    58.888889
Name: Survived, dtype: float64


In [27]:
# Show Age distributions by Company before any age imputation
plot_age_distribution_by_company(train_df, 'Age Distribution by Company (ORIGINAL DATA)')

In [28]:
# Impute missing ages for solo travelers (Company == 0)
train_df = impute_ages_randomly(train_df, 0)
# Impute missing ages for families (Company == 1), sampling within each family
train_df = impute_family_ages_randomly(train_df)
# Impute missing ages for friends (Company == 2)
train_df = impute_ages_randomly(train_df, 2)
# NOTE(review): the imputation samples via np.random without a seed being set in this cell,
# so results presumably change between runs - confirm whether a seed is set elsewhere.

In [29]:
# Show Age distributions by Company again, now that the missing ages have been imputed
plot_age_distribution_by_company(train_df, 'Age Distribution by Company (IMPUTED DATA)')

#### --- Cabin ---

In [30]:
# Group passengers by the first letter of their Cabin (example: 'A') and
# compute the survival rate of each group, to see how survival varies across cabin letters.
# Passengers without a Cabin value get a missing Cabin_Letter and are left out of the groups.
train_df['Cabin_Letter'] = train_df['Cabin'].str.get(0)
# Survival rate (in percent) by Cabin letter
survived_by_cabin_letter = train_df.groupby('Cabin_Letter')['Survived']
cabin_survival_rate = survived_by_cabin_letter.mean() * 100
print(cabin_survival_rate)

Cabin_Letter
A    46.666667
B    74.468085
C    59.322034
D    75.757576
E    75.000000
F    61.538462
G    50.000000
T     0.000000
Name: Survived, dtype: float64


In [31]:
# Drop the 'Cabin' and 'Cabin_Letter' columns (modifies train_df in place)
train_df.drop(columns=['Cabin', 'Cabin_Letter'], inplace=True)

#### --- Embarked ----

In [32]:
# Could there be some relationship between the Embarked site and survival rate?
# Survival rate (as a fraction between 0 and 1) for each Embarked site
survived = train_df['Survived']
embarked_survival_rate = survived.groupby(train_df['Embarked']).mean()
print(embarked_survival_rate)

Embarked
C    0.553571
Q    0.389610
S    0.336957
Name: Survived, dtype: float64


In [33]:
# Replace missing Embarked values with the most frequent embarkation site
embarked = train_df['Embarked']
most_common_embarked = embarked.mode()[0]
train_df['Embarked'] = embarked.fillna(most_common_embarked)

### Wrap up

In [34]:
# Report how many missing values remain in each column after data manipulation
null_counts = train_df.isnull().sum()
for column_name, num_null in null_counts.items():
    print(f'Column {column_name} has {num_null} nulls')

Column PassengerId has 0 nulls
Column Family Name has 0 nulls
Column Title has 0 nulls
Column First Name has 0 nulls
Column Survived has 0 nulls
Column Pclass has 0 nulls
Column Sex has 0 nulls
Column Age has 0 nulls
Column SibSp has 0 nulls
Column Parch has 0 nulls
Column Ticket has 0 nulls
Column Fare has 0 nulls
Column Embarked has 0 nulls
Column Company has 0 nulls


# Classifier

- PassengerId will NOT be considered in the train dataset, as it is irrelevant;
- Cabin will NOT be considered in the train dataset, as it has too many missing values, and it does not seem to be related to survival rates;

## Get data

In [35]:
# Preview the cleaned training data that feeds the classifier
train_df.head()

Unnamed: 0,PassengerId,Family Name,Title,First Name,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Company
845,846,Abbing,Mr,Anthony,0,3,0,42.0,0,0,C.A. 5547,7.55,S,0.0
279,280,Abbott,Mrs,Rosa,1,3,1,35.0,1,1,C.A. 2673,20.25,S,1.0
746,747,Abbott,Mr,Rossmore Edward,0,3,0,16.0,1,1,C.A. 2673,20.25,S,1.0
874,875,Abelson,Mrs,Hannah,1,2,1,28.0,1,0,P/PP 3381,24.0,C,1.0
308,309,Abelson,Mr,Samuel,0,2,0,30.0,1,0,P/PP 3381,24.0,C,1.0


In [36]:
# Feature matrix: an independent copy of train_df restricted to the model inputs
# 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Company'
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Company']
X = train_df.loc[:, feature_columns].copy()
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Company
845,3,0,42.0,0,0,7.55,S,0.0
279,3,1,35.0,1,1,20.25,S,1.0
746,3,0,16.0,1,1,20.25,S,1.0
874,2,1,28.0,1,0,24.0,C,1.0
308,2,0,30.0,1,0,24.0,C,1.0


In [37]:
# Extract data labels (the 'Survived' target) as an independent copy
y = train_df['Survived'].copy()
y.head()

845    0
279    1
746    0
874    1
308    0
Name: Survived, dtype: int64

## Process data

In [38]:
# Identify categorical columns
cat_cols = [
    'Embarked',
    'Company'
]
# Identify numerical columns: every feature that is not categorical
# (Index.difference returns them sorted, so the order differs from X.columns)
num_cols = X.columns.difference(cat_cols)
# Processing pipelines
# Encoding categorical variables & scaling numerical features
num_pipeline = Pipeline([
    # Alternative scalers, kept for experimentation:
    # ("scaler", StandardScaler())
    # ("scaler", MinMaxScaler())
    ("scaler", RobustScaler())
])
cat_pipeline = Pipeline([
    # handle_unknown="ignore": categories unseen during fit do not raise at transform time
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])
# Combine transformers
# sparse_threshold=0 forces a dense array. With the default threshold the
# ColumnTransformer may return a scipy sparse matrix when the one-hot block
# dominates, and wrapping that in pd.DataFrame would not yield one numeric
# column per feature.
preprocessor = ColumnTransformer(
    [
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols)
    ],
    sparse_threshold=0,
)
# NOTE(review): the preprocessor is fitted on all of X before the train/test split,
# so the scaler statistics also include the rows later used as hold-out data.
# The resulting frame has positional integer column labels (numerical features first,
# then the one-hot columns) and a fresh RangeIndex.
X_proc = pd.DataFrame(preprocessor.fit_transform(X))

In [39]:
# Get VIF and remove features with high collinearity (threshold of 5)
# NOTE(review): the exact removal rule lives in remove_high_vif_features - confirm there
X_proc = remove_high_vif_features(X_proc, threshold=5)

Dropping '6.0' with VIF=inf



divide by zero encountered in double_scalars



## Train classifier

### Support Vector Machine

In [51]:
# Train-test split: 20% hold-out, seeded so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X_proc, y, test_size=0.2, random_state=42)

# Define models
# The trailing comment on each line records the best parameters reported by the
# corresponding (now commented-out) grid search.
# NOTE(review): the SVM grid search reported C=10, but C=1.0 is used here - confirm this is intentional.
# `degree` only applies to kernel='poly', so it has no effect with kernel='rbf'.
svm = SVC(C=1.0, kernel='rbf', degree=2, gamma='scale', random_state=0, probability=True) # {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}
# Seed the forest like the split and the SVM: without random_state every run grows
# different trees, so the reported scores and the saved predictions cannot be reproduced.
rf = RandomForestClassifier(max_depth=None, bootstrap=True, min_samples_leaf=1, min_samples_split=10, n_estimators=200, random_state=0) # {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
nb = GaussianNB(var_smoothing=1e-05) # {'var_smoothing': 1e-05}

In [52]:
# The grid search below is kept commented out for reference; only the plain fit runs.
# NOTE(review): VotingClassifier is documented to fit clones of its estimators, so this
# standalone fit is presumably not reused by the ensemble - confirm.
# # Define hyperparameter grids
# svm_params = {
#     'C': [0.1, 1, 10, 100],
#     'kernel': ['linear', 'rbf', 'poly'],
#     'degree': [2, 3, 4],  # Only applies if kernel='poly'
#     'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1]  # Only for 'rbf', 'poly', 'sigmoid'
#     }
# svm_grid = GridSearchCV(svm, svm_params, cv=5, scoring='accuracy', n_jobs=-1)
svm.fit(X_train, y_train)

In [53]:
# Random forest: the grid search is kept commented out for reference; only the plain fit runs.
# rf_params = {
#     'n_estimators': [50, 100, 200, 300],
#     'max_depth': [None, 10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'bootstrap': [True, False]
#     }
# rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy', n_jobs=-1)
rf.fit(X_train, y_train)

In [54]:
# Gaussian naive Bayes: the grid search is kept commented out for reference; only the plain fit runs.
# nb_params = {
#     'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]
#     }
# nb_grid = GridSearchCV(nb, nb_params, cv=5, scoring='accuracy', n_jobs=-1)
nb.fit(X_train, y_train)

In [55]:
# # Best models
# best_svm = svm_grid.best_estimator_
# best_rf = rf_grid.best_estimator_
# best_nb = GaussianNB(var_smoothing=1e-05)

# print(f"Best SVM: {svm_grid.best_params_}")
# print(f"Best RF: {rf_grid.best_params_}")
# print(f"Best NB: {nb_grid.best_params_}")

Best SVM: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf'}

Best RF: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

Best NB: {'var_smoothing': 1e-05}

In [None]:
# Ensemble Voting Classifier: soft voting over the SVM and the random forest.
# Other combinations that can be swapped in:
#   estimators=[('svm', svm), ('rf', rf), ('nb', nb)], voting='soft'
#   estimators=[('svm', svm), ('rf', rf), ('nb', nb)], voting='hard'
#   estimators=[('svm', svm), ('rf', rf)], voting='hard'
#   estimators=[('svm', svm), ('nb', nb)], voting='soft'
#   estimators=[('svm', svm), ('nb', nb)], voting='hard'
#   estimators=[('rf', rf), ('nb', nb)], voting='soft'
#   estimators=[('rf', rf), ('nb', nb)], voting='hard'
voting_members = [('svm', svm), ('rf', rf)]
ensemble = VotingClassifier(estimators=voting_members, voting='soft')

# 10-fold cross-validated accuracy of the ensemble on the training split
ensemble_scores = cross_val_score(ensemble, X_train, y_train, scoring='accuracy', cv=10)
mean_cv_accuracy = ensemble_scores.mean()
print(f"Ensemble CV Accuracy: {mean_cv_accuracy:.4f}")

# Fit the final ensemble on the whole training split
ensemble.fit(X_train, y_train)

# Accuracy on the hold-out split
test_score = ensemble.score(X_test, y_test)
print(f"Final Ensemble Test Accuracy: {test_score:.4f}")

Ensemble CV Accuracy: 0.7879
Final Ensemble Test Accuracy: 0.7989


Best classifier: SVM + RF - Voting SOFT

## Test set

In [65]:
# Preview the raw test set
test_raw_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",0,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",1,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",0,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",0,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",1,22.0,1,1,3101298,12.2875,,S


### Manipulate data

In [66]:
# Remove 'Survived' from sorted_columns (the test set has no 'Survived' column).
# Guarded so that re-running this cell does not raise ValueError once the entry is gone.
if 'Survived' in sorted_columns:
    sorted_columns.remove('Survived')

In [67]:
# Work on a copy of the raw test set and split the name into
# first name, family name and title
test_df = split_name_column(test_raw_df.copy())

# Apply the column order given by sorted_columns, then order the rows by family name
test_df = test_df[sorted_columns].sort_values(by='Family Name')

# Classify passengers into solo travelers, families, or friends
test_df = classify_passengers(test_df)

# Passengers younger than 13 are always classified as family members
is_under_13 = test_df['Age'] < 13
test_df.loc[is_under_13, 'Company'] = 1

# Impute missing ages: solo travelers first, then families, then friends
test_df = impute_ages_randomly(test_df, 0)
test_df = impute_family_ages_randomly(test_df)
test_df = impute_ages_randomly(test_df, 2)

# Drop the 'Cabin' column (no 'Cabin_Letter' column is created for the test set)
test_df.drop(columns='Cabin', inplace=True)

# Fill missing 'Embarked' and 'Fare' values with the most common value.
# NOTE(review): both modes are computed on the test set itself, not taken from train_df.
most_common_embarked = test_df['Embarked'].mode()[0]
most_common_fare = test_df['Fare'].mode()[0]
test_df['Embarked'] = test_df['Embarked'].fillna(most_common_embarked)
test_df['Fare'] = test_df['Fare'].fillna(most_common_fare)

## Process data

In [68]:
# Feature matrix for the test set: an independent copy of test_df restricted to
# 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Company'.
# NOTE(review): this rebinds X_test, replacing the hold-out split created by train_test_split.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Company']
X_test = test_df.loc[:, feature_columns].copy()
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Company
392,3,0,13.0,0,2,20.25,S,0.0
57,3,0,25.0,0,0,7.65,S,0.0
345,3,1,16.0,0,0,7.65,S,0.0
251,3,0,20.0,0,0,7.925,S,0.0
8,3,1,18.0,0,0,7.2292,C,0.0


In [80]:
# Apply the already-fitted preprocessor to the test features (transform only, no refit),
# then drop column 6 (Embarked_C) to match the number of columns in X_train.
# NOTE(review): the hard-coded 6 must stay in sync with the column removed from X_proc
# by remove_high_vif_features.
X_test_proc = pd.DataFrame(preprocessor.transform(X_test)).drop(columns=6)

In [83]:
# Make predictions for the processed test set with the fitted ensemble
y_pred = ensemble.predict(X_test_proc)

In [85]:
# Display the predicted labels
y_pred

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,

In [87]:
# Write the predictions to a CSV file with two columns: PassengerId, Survived.
# y_pred is paired with the PassengerId values in test_df's current row order.
submission_columns = {
    'PassengerId': test_df['PassengerId'],
    'Survived': y_pred,
}
output = pd.DataFrame(submission_columns)
output.to_csv('ClaudioCrema_20250212_01.csv', index=False)

In [90]:
# Save model and train dataset
# The fitted ensemble and the processed training features (X_proc) go to separate pickle files
joblib.dump(ensemble, 'ClaudioCrema_20250212_01_MODEL.pkl')
joblib.dump(X_proc, 'ClaudioCrema_20250212_01.pkl')

['ClaudioCrema_20250212_01.pkl']