# Spaceship Titanic Kaggle Competition

## Introduction

This notebook is a submission for the Spaceship Titanic Kaggle competition.

**Goal**: Predict whether the passenger was transported to another dimension based on the features provided.

## Import Libraries

In [27]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer


## Data Exploration and Cleaning

In [28]:
# Load the training set; the path is relative to the notebook's working directory.
data_og = pd.read_csv('spaceship_titanic_data/train.csv')
# Work on a copy so the frame as read from disk stays available in `data_og`.
data = data_og.copy()

Quick look at the data.

In [29]:
# Bare last expression: the notebook renders the first five rows as a table.
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [30]:
def describe_dataset(df: pd.DataFrame) -> None:
    """Print a plain-text overview of ``df``.

    The report lists the column dtypes, the shape, the number of fully
    duplicated rows, the per-column missing-value counts and percentages, and
    ``describe()`` summaries for the numerical and the object-typed columns.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    separator = "=" * 40

    # Lift the pandas display limits only while the report is printed, so that
    # calling this function does not change how later cells render frames.
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', None,
        'display.max_colwidth', None,
    ):
        print(separator)
        print("🔍 Dataset Overview")
        print(separator)

        print("\n📌 Data Types:")
        print(df.dtypes.to_string())

        print("\n📌 Number of Rows and Columns:")
        print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

        print("\n📌 Duplicate Rows:")
        print(df.duplicated().sum())

        print("\n📌 Missing Values:")
        missing_values = df.isnull().sum()
        missing_percent = (missing_values / len(df)) * 100
        missing_df = pd.DataFrame({'Missing Values': missing_values, 'Percentage': missing_percent})
        # Only columns that actually have gaps are worth listing.
        print(missing_df[missing_df['Missing Values'] > 0].to_string())

        print("\n📌 Summary Statistics (Numerical Features):")
        if df.select_dtypes(include='number').shape[1] > 0:
            print(df.describe().to_string())
        else:
            # Without numeric columns describe() would fall back to the object
            # columns, which does not belong under this heading.
            print("(no numerical columns)")

        print("\n📌 Summary Statistics (Categorical Features):")
        if df.select_dtypes(include=['O']).shape[1] > 0:
            print(df.describe(include=['O']).to_string())
        else:
            # describe(include=['O']) raises when there is nothing to describe.
            print("(no categorical columns)")

        print(separator)


In [31]:
# Print the overview report (dtypes, shape, duplicates, missing values, summaries) for the training data.
describe_dataset(data)

🔍 Dataset Overview

📌 Data Types:
PassengerId      object
HomePlanet       object
CryoSleep        object
Cabin            object
Destination      object
Age             float64
VIP              object
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Name             object
Transported        bool

📌 Number of Rows and Columns:
Rows: 8693, Columns: 14

📌 Duplicate Rows:
0

📌 Missing Values:
              Missing Values  Percentage
HomePlanet               201    2.312205
CryoSleep                217    2.496261
Cabin                    199    2.289198
Destination              182    2.093639
Age                      179    2.059128
VIP                      203    2.335212
RoomService              181    2.082135
FoodCourt                183    2.105142
ShoppingMall             208    2.392730
Spa                      183    2.105142
VRDeck                   188    2.162660
Name                     200    2.300702

📌

Conclusions:
- Convert data types (str → bool) and extract relevant information from raw features (e.g. `PassengerId` → whether the passenger travelled in a group).
- There are no duplicate records.
- There are missing values, roughly 2% in each affected feature -> impute missing values.
- There seem to be outliers in the bills, check later with graphs.
- There are no missing values in the target variable.
- The remaining categorical features have a reasonable number of unique values, so we can use one-hot encoding.
- Some passengers share the same name; check whether they are actually the same person.

In [32]:
# Passengers whose name occurs more than once (value_counts ignores missing
# names), ordered by name so that namesakes end up next to each other.
double_names = data[
    data['Name'].isin(
        data['Name'].value_counts().loc[lambda counts: counts > 1].index
    )
].sort_values(by='Name')

In [None]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.metrics.pairwise import cosine_similarity

def jaccard_similarity(row1, row2, categorical_cols):
    """Return the fraction of ``categorical_cols`` on which two records agree.

    Note that, despite the function name, this is a simple matching ratio
    (matching columns / compared columns) rather than a set-based Jaccard index.

    Parameters
    ----------
    row1, row2 : mapping-like
        Records indexable by column name (e.g. a ``pd.Series`` or a ``dict``).
    categorical_cols : iterable of str
        Columns to compare.

    Returns
    -------
    float
        Share of columns with equal values. A pair of NaN values does not count
        as a match because NaN never compares equal to itself. ``0.0`` is
        returned when no columns are given, instead of dividing by zero.
    """
    # Materialise once so generators work and the length is known.
    columns = list(categorical_cols)
    if not columns:
        return 0.0
    matches = sum(row1[col] == row2[col] for col in columns)
    return matches / len(columns)

def compute_pairwise_similarity(group, numeric_cols, categorical_cols, num_weight=0.7, cat_weight=0.3):
    """Score every pair of records in ``group`` (records sharing one Name).

    For each pair the cosine similarity of the numeric columns and the matching
    ratio of the categorical columns are combined into a weighted sum.

    Parameters
    ----------
    group : pd.DataFrame
        Records to compare; must contain a ``Name`` column plus all columns in
        ``numeric_cols`` and ``categorical_cols``. It is not modified.
    numeric_cols : list of str
        Columns compared with cosine similarity.
    categorical_cols : list of str
        Columns compared with ``jaccard_similarity``.
    num_weight, cat_weight : float
        Weights of the numeric and the categorical score in the final score.

    Returns
    -------
    pd.DataFrame or None
        One row per pair with the columns ``Name``, ``Index1``, ``Index2``,
        ``Numerical_Similarity``, ``Categorical_Similarity`` and
        ``Final_Similarity``; ``None`` when the group has fewer than two rows.
    """
    if len(group) < 2:
        return None  # Skip if only one record with that Name

    # Impute on a separate float frame instead of writing back into `group`,
    # which may be a slice of the caller's data. Missing values take the group
    # median; a column that is missing for the whole group has no median, so it
    # falls back to 0 to keep NaN out of the cosine similarity.
    numeric_values = group[numeric_cols].astype(float)
    numeric_values = numeric_values.fillna(numeric_values.median()).fillna(0.0)
    numeric_matrix = numeric_values.to_numpy()

    similarities = []

    # enumerate() supplies the row position used to look up the imputed vector.
    positioned_rows = enumerate(group.iterrows())
    for (pos1, (idx1, row1)), (pos2, (idx2, row2)) in combinations(positioned_rows, 2):
        # Indexing with a one-element list keeps the (1, n_features) shape.
        num_sim = cosine_similarity(numeric_matrix[[pos1]], numeric_matrix[[pos2]])[0][0]

        cat_sim = jaccard_similarity(row1, row2, categorical_cols)

        final_sim = (num_sim * num_weight) + (cat_sim * cat_weight)

        similarities.append({
            'Name': row1['Name'],  # Keep track of which Name these belong to
            'Index1': idx1, 'Index2': idx2,
            'Numerical_Similarity': num_sim,
            'Categorical_Similarity': cat_sim,
            'Final_Similarity': final_sim
        })

    return pd.DataFrame(similarities)

# Define relevant feature lists
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'Destination', 'CryoSleep', 'VIP']

# Keep only records whose name is known and shared with at least one other
# record. `duplicated` on its own would also flag every row with a missing
# Name, because missing values count as duplicates of each other.
filtered_data = data[data['Name'].notna() & data.duplicated('Name', keep=False)]

# Iterate over the groups explicitly rather than via groupby().apply(): the
# helper reads the 'Name' column, and apply() on the grouping column is
# deprecated in recent pandas versions.
pair_frames = [
    compute_pairwise_similarity(group, numeric_features, categorical_features)
    for _, group in filtered_data.groupby('Name')
]
pair_frames = [frame for frame in pair_frames if frame is not None]

similarity_results = (
    pd.concat(pair_frames, ignore_index=True)
    if pair_frames
    # No shared names at all: keep the expected columns so later cells still work.
    else pd.DataFrame(columns=[
        'Name', 'Index1', 'Index2',
        'Numerical_Similarity', 'Categorical_Similarity', 'Final_Similarity',
    ])
)

# Most similar pairs first.
similarity_results.sort_values(by='Final_Similarity', ascending=False).head(10)


  similarity_results = filtered_data.groupby('Name').apply(


Unnamed: 0,Name,Index1,Index2,Numerical_Similarity,Categorical_Similarity,Final_Similarity
9,Glena Hahnstonsen,6702,8002,1.0,0.75,0.925
3,Apix Wala,2559,4108,0.751787,0.75,0.751251
16,Loree Wolfernan,4730,7650,0.155435,1.0,0.408804
6,Cuses Pread,6171,6987,0.068923,1.0,0.348246
15,Keitha Josey,220,5432,0.048367,1.0,0.333857
13,Gwendy Sykess,1292,3309,0.00193,1.0,0.301351
17,Sharie Gallenry,1812,2700,0.0,1.0,0.3
7,Dia Cartez,1795,4392,0.069712,0.75,0.273799
2,Anton Woody,2991,8488,0.043321,0.75,0.255325
14,Juane Popelazquez,3534,7495,0.042539,0.75,0.254777


In [46]:
# Attach one similarity score per name. A name shared by more than two
# passengers produces several pairs; as before, the first pair listed for that
# name is used. Names without a scored pair get NaN.
double_names['similarity'] = double_names['Name'].map(
    similarity_results.drop_duplicates(subset='Name').set_index('Name')['Final_Similarity']
)

double_names.sort_values(by = 'similarity', ascending = False)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,similarity
8002,8560_01,Earth,True,G/1391/P,55 Cancri e,18.0,False,0.0,0.0,0.0,0.0,0.0,Glena Hahnstonsen,True,0.925
6702,7073_01,Earth,True,G/1161/S,TRAPPIST-1e,40.0,False,0.0,0.0,0.0,0.0,0.0,Glena Hahnstonsen,True,0.925
2559,2746_01,Mars,False,F/569/P,TRAPPIST-1e,52.0,False,758.0,0.0,356.0,5.0,0.0,Apix Wala,False,0.751251
4108,4387_01,Mars,False,F/902/P,PSO J318.5-22,32.0,False,192.0,0.0,441.0,18.0,0.0,Apix Wala,False,0.751251
4730,5050_01,Earth,False,F/1028/P,TRAPPIST-1e,31.0,False,433.0,111.0,238.0,86.0,0.0,Loree Wolfernan,False,0.408804
7650,8164_01,Earth,False,G/1314/S,TRAPPIST-1e,31.0,False,0.0,918.0,0.0,0.0,900.0,Loree Wolfernan,False,0.408804
6987,7429_01,Mars,False,,TRAPPIST-1e,38.0,False,1126.0,0.0,23.0,34.0,0.0,Cuses Pread,False,0.348246
6171,6513_01,Mars,False,F/1244/S,TRAPPIST-1e,20.0,False,57.0,0.0,1185.0,0.0,0.0,Cuses Pread,False,0.348246
220,0234_01,Earth,False,F/50/P,TRAPPIST-1e,43.0,False,0.0,888.0,0.0,0.0,0.0,Keitha Josey,False,0.333857
5432,5808_01,Earth,False,G/934/P,TRAPPIST-1e,2.0,False,0.0,0.0,0.0,0.0,0.0,Keitha Josey,False,0.333857


After checking the duplicate names using the additional similarity scores and the features, the passengers are not the same person, as they are staying in different cabins and belong to different groups.

It is an interesting discrepancy, as some of the passengers are even the same age from the same planet.

Actions to take:
- Drop the `PassengerId` column -> create bool feature of travelling in group or not
- Drop the `Name` column
- Split `Cabin` into `Deck`/`Num`/`Side`
- `Age`: impute missing values with median
- Impute missing values in the billing columns `RoomService`, `FoodCourt`, `ShoppingMall`, `Spa`, `VRDeck` with 0 if the passenger is in `CryoSleep`, else with the median
- If `CryoSleep` is missing, impute with False if the passenger has bills, True otherwise
- Convert bool features represented as strings to bool: `CryoSleep`, `VIP`
- Impute mode for `HomePlanet`, `Destination`

As I will have to do the same modifications to the test data, I will do it in a pipeline.

In [34]:
def travelling_in_group(df):
    """Replace ``PassengerId`` with a boolean ``InGroup`` feature.

    ``PassengerId`` has the form ``<group>_<member>``. A passenger counts as
    travelling in a group when the member number is not ``'01'`` or when the
    group id occurs more than once in ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``PassengerId`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` without ``PassengerId`` and with ``InGroup`` appended.
    """
    id_parts = df['PassengerId'].str.split('_')
    group_id = id_parts.str[0]
    member_no = id_parts.str[1]

    # Count each group id once and look the size up per row. Comparing whole
    # ids (not prefixes) keeps e.g. group '0003' apart from group '00030'.
    group_size = group_id.map(group_id.value_counts())

    in_group = (member_no != '01') | (group_size > 1)
    return df.drop(columns=['PassengerId']).assign(InGroup=in_group)

def drop_name(df):
    """Return a copy of ``df`` without its ``Name`` column."""
    without_name = df.drop(labels=['Name'], axis='columns')
    return without_name

def split_cabin(df):
    """Split ``Cabin`` (``deck/num/side``) into ``Deck``, ``Num`` and ``Side``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a string-valued ``Cabin`` column. It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` without ``Cabin`` and with the three new columns
        appended. The parts stay strings; rows without a cabin get missing
        values in all three columns.
    """
    # reindex guarantees exactly three part columns even when no row yields
    # three parts (e.g. every cabin is missing).
    parts = (
        df['Cabin']
        .str.split('/', n=2, expand=True)
        .reindex(columns=range(3))
    )
    return df.drop(columns=['Cabin']).assign(Deck=parts[0], Num=parts[1], Side=parts[2])

def impute_cryosleep(df):
    """Fill missing ``CryoSleep`` values from the passenger's spending.

    A passenger with an unknown ``CryoSleep`` status is assumed to be awake
    (``False``) when the billing columns sum to more than zero, and asleep
    (``True``) otherwise. Known values are kept as they are.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``CryoSleep`` and the five billing columns. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with no missing values left in ``CryoSleep``.
    """
    billing_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    result = df.copy()

    unknown = result['CryoSleep'].isna()
    # Missing bills are skipped by sum(), so a row without any recorded bill
    # totals 0 and is treated as "no spending".
    has_spending = result[billing_cols].sum(axis=1) > 0

    filled = np.where(unknown, ~has_spending, result['CryoSleep'])
    # infer_objects turns the all-boolean object column into a plain bool column.
    result['CryoSleep'] = pd.Series(filled, index=result.index).infer_objects()
    return result

def impute_billing(df):
    """Impute the billing columns using the passenger's ``CryoSleep`` status.

    For passengers in cryosleep every billing column is set to 0. Remaining
    missing values are filled with the column median, computed after the
    cryosleep rows were zeroed.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``CryoSleep`` and the five billing columns. It is not
        modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` with imputed billing columns.
    """
    billing_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    result = df.copy()

    # Only an explicit True counts as asleep. A missing CryoSleep value must not
    # zero the bills (NaN is truthy in a plain `if`), so it is treated as awake.
    asleep = result['CryoSleep'].eq(True).fillna(False).astype(bool)

    for col in billing_cols:
        result.loc[asleep, col] = 0
        # Assign the filled column back; an in-place fillna on `result[col]`
        # would act on a temporary object under pandas Copy-on-Write.
        result[col] = result[col].fillna(result[col].median())
    return result

def convert_bool(df):
    """Cast ``CryoSleep`` and ``VIP`` to pandas' nullable ``boolean`` dtype.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``CryoSleep`` and ``VIP`` columns holding bool-like values.
        It is not modified.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` in which both columns have dtype ``boolean``; missing
        values are kept as ``pd.NA``.
    """
    # DataFrame.astype returns a new frame, so the caller's columns keep their dtype.
    return df.astype({'CryoSleep': 'boolean', 'VIP': 'boolean'})

# Numeric features: fill remaining gaps with the median, then standardise.
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical features: the raw 'Cabin' string is replaced by its 'Deck' and
# 'Side' parts, which have few distinct values; one-hot encoding the full cabin
# string would create roughly one column per cabin.
# NOTE(review): the cabin number 'Num' is left out here because it is still a
# string after the split - convert it to a number first if it should be used.
categorical_features = ['HomePlanet', 'Destination', 'Deck', 'Side']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Boolean features must be listed explicitly: ColumnTransformer drops every
# column that is not assigned to a transformer, which would otherwise discard
# the engineered 'InGroup' flag together with 'CryoSleep' and 'VIP'.
boolean_features = ['CryoSleep', 'VIP', 'InGroup']


def bools_to_float(X):
    """Cast boolean columns to float so missing values become NaN for the imputer."""
    return X.astype('float64')


boolean_transformer = Pipeline(steps=[
    ('to_float', FunctionTransformer(bools_to_float)),
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('bool', boolean_transformer, boolean_features)
    ])

full_pipeline = Pipeline(steps=[
    ('drop_name', FunctionTransformer(drop_name)),  # Drop 'Name' first
    ('group', FunctionTransformer(travelling_in_group)),  # Create 'InGroup' & drop 'PassengerId'
    ('impute_cryosleep', FunctionTransformer(impute_cryosleep)),  # Impute CryoSleep before it is used below
    ('impute_billing', FunctionTransformer(impute_billing)),  # Zero bills in cryosleep, median otherwise
    ('split_cabin', FunctionTransformer(split_cabin)),  # 'Cabin' -> 'Deck' / 'Num' / 'Side'
    ('convert_bool', FunctionTransformer(convert_bool)),  # Convert boolean after imputation
    ('preprocessor', preprocessor)  # Apply numeric, categorical & boolean transformations
])


preprocessed_data = full_pipeline.fit_transform(data)