### Environment Setup

In [None]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether the notebook is running inside Google Colab.

    Detection is a substring test: the string form of the object returned by
    ``get_ipython()`` must contain ``"google.colab"``.

    Returns:
        True when the marker is present, False otherwise.
    """
    shell_description = str(get_ipython())
    return "google.colab" in shell_description

def clone_repository() -> None:
    repo_dir = Path("mlfs-book")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd mlfs-book
    else:
        print("Cloning repository...")
        !git clone https://github.com/featurestorebook/mlfs-book.git
        %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's Python dependencies using ``uv``.

    Runs two shell commands in order from the current working directory,
    which must contain ``pyproject.toml`` for the second command to find it.
    """
    # Install (or upgrade) the `uv` tool first; the next command relies on it.
    !pip install --upgrade uv
    # Install the requirements declared in pyproject.toml, including all extras,
    # targeting the system Python environment (--system) rather than a virtualenv.
    !uv pip install --all-extras --system --requirement pyproject.toml

if not is_google_colab():
    # Local run: start from the current working directory and walk up to the
    # repository root if the notebook was started from `notebooks/titanic`
    # or from `notebooks`.
    local_root = Path().absolute()
    for nested_dir in ('titanic', 'notebooks'):
        if local_root.name == nested_dir:
            local_root = local_root.parent
    root_dir = str(local_root)
    print("Local environment")
else:
    # Colab run: fetch the repository, install its dependencies, and treat the
    # resulting working directory as the project root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")

# Put the project root on `sys.path` so the `mlfs` Python package can be imported from the notebook.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

### Imports

In [None]:
import os
import hopsworks
import pandas as pd
from mlfs import config

### Feature Store
A Feature Store is the centralized system for managing and serving features to machine learning models.
It provides a consistent interface for storing, retrieving, and sharing features across teams and projects.

In [None]:
# Build the project settings object from the `.env` file at the project root.
# NOTE(review): `settings` is not referenced again in the visible cells; presumably
# constructing it makes the Hopsworks configuration available to `hopsworks.login()`
# below — confirm before reordering or removing.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

# Log in to Hopsworks and obtain the feature store handle used by the later cells.
project = hopsworks.login()
fs = project.get_feature_store()

### Get Backfill Data

In [None]:
# Read the backfill CSV from the repository's data directory and preview the first rows.
titanic_csv_path = f"{root_dir}/data/titanic.csv"
titanic_df = pd.read_csv(titanic_csv_path)
titanic_df.head()

### Feature Engineering

 * Impute any missing values for `Age` and `Embarked`

In [None]:
# Restrict the frame to the columns used downstream.
selected_columns = [
    'PassengerId', 'Sex', 'Age', 'Pclass', 'Fare',
    'Parch', 'SibSp', 'Embarked', 'Survived',
]
titanic_df = titanic_df[selected_columns]

# Imputation values: the mean for `Age`, the most frequent value for `Embarked`.
mean_age = titanic_df['Age'].mean()
most_common_port = titanic_df['Embarked'].value_counts().idxmax()
def_values = {'Age': mean_age, 'Embarked': most_common_port}

# Fill the missing entries column by column and display the result.
titanic_df = titanic_df.fillna(value=def_values)
titanic_df

## Hopsworks

### Feature Group
A Feature Group is a logical collection of related features built from a single dataset or source.
Each feature group typically represents one entity or domain (e.g., passengers, transactions, sensors).
Feature Groups are versioned, stored, and maintained in the Feature Store, ensuring consistency and reusability.

In [None]:
# Obtain the feature group handle for "titanic" version 1, keyed by `PassengerId`.
# NOTE(review): per its name, `get_or_create_feature_group` presumably returns the
# existing group when one with this name/version is already registered — confirm.
titanic_fg = fs.get_or_create_feature_group(
    name="titanic",
    version=1,
    primary_key=['PassengerId'],
    description="Titanic passengers dataset"
)

# Write the imputed DataFrame into the feature group.
# NOTE(review): `wait=True` presumably blocks until the ingestion has finished — confirm.
titanic_fg.insert(titanic_df, wait=True)

### Feature Exploration

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

Demographics vs. Survival

In [None]:
# Two side-by-side panels: survival rate by sex/class, and the age distribution.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
class_ax, age_ax = axes

# Left panel: mean of `Survived` per sex, split by passenger class.
sns.barplot(data=titanic_df, x='Sex', y='Survived', hue='Pclass', ax=class_ax)
class_ax.set(
    title='Survival Rate by Gender and Passenger Class',
    xlabel='',
    ylabel='Survival Rate',
)

# Right panel: stacked age histogram (30 bins) with KDE curves, coloured by survival.
sns.histplot(
    data=titanic_df,
    x='Age',
    hue='Survived',
    multiple='stack',
    bins=30,
    kde=True,
    ax=age_ax,
)
age_ax.set(
    title='Age Distribution by Survival',
    xlabel='Age',
    ylabel='Frequency',
)

plt.tight_layout()
plt.show()

Socioeconomic Signals

In [None]:
# Two side-by-side panels: fare distribution by survival, and passenger counts
# per embarkation port split by survival.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Human-readable names for the encoded values. The key order of each mapping is
# passed to seaborn as the plotting order, so the hard-coded tick and legend
# labels below are guaranteed to line up with what is drawn.
survival_labels = {0: 'Died', 1: 'Survived'}
port_labels = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}

# Fare vs. survival
sns.boxplot(x="Survived", y="Fare", data=titanic_df, ax=axes[0], color="mediumseagreen",
            order=list(survival_labels))
axes[0].set_title("Fare Distribution by Survival")
axes[0].set_xlabel("")
axes[0].set_ylabel("Ticket Fare (British Pounds)")
# Fix the tick positions before relabelling them, as matplotlib requires.
axes[0].set_xticks(range(len(survival_labels)))
axes[0].set_xticklabels(list(survival_labels.values()))

# Embarkation port vs. survival
# Without an explicit `order`, string categories are drawn in order of first
# appearance in the data, which would not match the alphabetical C/Q/S labels.
sns.countplot(x='Embarked', hue="Survived", data=titanic_df, ax=axes[1], palette="husl",
              order=list(port_labels), hue_order=list(survival_labels))
axes[1].set_title("Survival by Embarkation Port")
axes[1].set_xlabel("Embarkation Port")
axes[1].set_ylabel("Number of Passengers")
# Pass the legend handles explicitly: the labels-only form of `legend()` pairs
# labels with whatever artists come first on the axes, which can mix up colours.
legend_handles, _ = axes[1].get_legend_handles_labels()
axes[1].legend(handles=legend_handles, labels=list(survival_labels.values()),
               title="Survival Status")
axes[1].set_xticks(range(len(port_labels)))
axes[1].set_xticklabels(list(port_labels.values()))

plt.tight_layout()
plt.show()

Family Structure & Group Travel

In [None]:
# Two side-by-side panels: survival rate by family size, and alone vs. with family.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Derive the family features once so both panels use the same definition.
# Family size = siblings/spouses + parents/children + the passenger.
# (Previously `FamilySize` was read without ever being created, raising KeyError.)
# These columns are added after the feature-group insert above, so they are
# exploration-only and not part of the data written to the feature store.
titanic_df["FamilySize"] = titanic_df["SibSp"] + titanic_df["Parch"] + 1
titanic_df["IsAlone"] = titanic_df["FamilySize"] == 1

# Survival rate by family size
sns.barplot(x="FamilySize", y="Survived", data=titanic_df,
            ax=axes[0], palette="husl")

axes[0].set_title("Survival Rate by Family Size")
axes[0].set_xlabel("Family Size")
axes[0].set_ylabel("Average Survival Rate")

# Solo vs. group travelers
# Pin the bar order to (False, True) so it always matches the tick labels below.
sns.barplot(x="IsAlone", y="Survived", data=titanic_df, ax=axes[1], palette="Set2",
            order=[False, True])
axes[1].set_title("Survival Rate: Alone vs. With Family")
axes[1].set_xlabel("")
axes[1].set_ylabel("Average Survival Rate")
# Fix the tick positions before relabelling them, as matplotlib requires.
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['With Family', 'Alone'])

plt.tight_layout()
plt.show()