### Environment Setup

In [None]:
import sys
from pathlib import Path

def is_google_colab() -> bool:
    """Report whether this code is running inside a Google Colab kernel.

    Colab is detected by looking for ``google.colab`` in the string form of
    the active IPython shell object.

    Returns:
        True when the shell's string form mentions ``google.colab``;
        False otherwise, including when no IPython shell is available.
    """
    try:
        shell = get_ipython()
    except NameError:
        # ``get_ipython`` is injected by IPython; it does not exist when the
        # code runs under a plain Python interpreter (e.g. exported script).
        return False
    return "google.colab" in str(shell)

def clone_repository() -> None:
    repo_dir = Path("mlfs-book")
    if repo_dir.exists():
        print(f"Repository already exists at {repo_dir.absolute()}")
        %cd mlfs-book
    else:
        print("Cloning repository...")
        !git clone https://github.com/featurestorebook/mlfs-book.git
        %cd mlfs-book

def install_dependencies() -> None:
    """Install the project's Python dependencies by shelling out to pip and uv.

    Both commands run through IPython's ``!`` shell escape. The second one
    refers to ``pyproject.toml`` by relative path, so it must be executed
    from the directory that contains that file.
    """
    # Install (or upgrade) the uv package manager with pip.
    !pip install --upgrade uv
    # Have uv install the requirements read from pyproject.toml, with all
    # extras, passing --system to target the system Python environment.
    !uv pip install --all-extras --system --requirement pyproject.toml

# Work out the repository root for the environment the notebook runs in.
if not is_google_colab():
    # Start from the current directory and walk up past a trailing
    # 'titanic' directory and then a trailing 'notebooks' directory, if present.
    repo_root = Path().absolute()
    for trailing_dir in ("titanic", "notebooks"):
        if repo_root.name == trailing_dir:
            repo_root = repo_root.parent
    root_dir = str(repo_root)
    print("Local environment")
else:
    # On Colab: fetch the repository, install its dependencies, and use the
    # resulting working directory as the root.
    clone_repository()
    install_dependencies()
    root_dir = str(Path().absolute())
    print("Google Colab environment")

# Make the repository root importable without adding a duplicate entry.
if root_dir not in sys.path:
    sys.path.append(root_dir)
print(f"Added the following directory to the PYTHONPATH: {root_dir}")

### Imports

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
from matplotlib import pyplot
import os
import hopsworks
import pandas as pd
from mlfs import config
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

### Feature Store

In [None]:
# Load the Hopsworks settings from the .env file located directly under root_dir.
# NOTE(review): `settings` is not referenced again in the visible cells; presumably
# constructing it makes credentials available to hopsworks.login() — confirm
# before removing or reordering.
settings = config.HopsworksSettings(_env_file=f"{root_dir}/.env")

# Log in to Hopsworks and obtain a handle to the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

### Feature View
A Feature View defines the set of features used as input to a model.
Features can be sourced from multiple Feature Groups, which represent different datasets.
By selecting and joining features across these groups, you create a unified Feature View
that serves as the training and serving interface for your model.

In [None]:
# A feature view is the set of input features for the model. Its features may
# be selected from several feature groups and joined together.

from hopsworks.hsfs.builtin_transformations import label_encoder

# Select features from version 1 of the "titanic" feature group.
titanic_fg = fs.get_feature_group(name="titanic", version=1)
selected_features = titanic_fg.select_features()

# Apply the built-in label encoder to each of these two columns.
categorical_encoders = [label_encoder(column) for column in ("sex", "embarked")]

# Reuse the feature view if it already exists; otherwise create it.
feature_view = fs.get_or_create_feature_view(
    name="titanic",
    version=1,
    description="Read from Titanic Passengers Dataset",
    labels=["survived"],
    transformation_functions=categorical_encoders,
    query=selected_features,
)

### Split data for testing and training
Load the training data from the Feature View and split it into features (X) and labels (y).
The train_test_split method randomly divides the dataset into training and test sets.
Here, 20% of the data is reserved for testing, while the remaining 80% is used for training.

In [None]:
# Split the feature view's data into training and test sets (test size 0.2).
X_train, X_test, y_train, y_test = feature_view.train_test_split(test_size=0.2)

# Show the training features as the cell output.
X_train

### Train Model
Initialize and train an XGBoost Classifier on the training data.
The model is created using xgb.XGBClassifier(), which provides gradient boosting for classification tasks.
The fit method trains the model on the feature set (X_train) and the corresponding labels (y_train).
The .values.ravel() call ensures the labels are in the correct 1D format expected by the classifier.

In [None]:
# Flatten the label frame's values into a 1-D array before fitting.
train_labels = y_train.values.ravel()

# Fit an XGBoost classifier with its default hyperparameters.
model = xgb.XGBClassifier()
model.fit(X_train, train_labels)

### Prediction
Generate predictions on the unseen test features to evaluate model performance.

In [None]:
# Run the trained model on the test features.
y_pred = model.predict(X_test)
# Show the predictions as the cell output.
y_pred

### Evaluation
Compare predictions with the true test labels.
The classification_report summarizes precision, recall, f1-score, and accuracy for each class, 
while the confusion_matrix highlights correct predictions and misclassifications.

In [None]:
# Score the predictions (y_pred) against the test-set labels (y_test):
# a per-class report returned as a dict, plus the raw confusion matrix.
metrics = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
results = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Show the report as the cell output.
metrics

Convert the confusion matrix into a labeled DataFrame and then plot it as a heatmap to make it easier to interpret.

In [None]:
# Wrap the confusion matrix in a labelled DataFrame and draw it as a heatmap.
# The resulting figure is kept in `fig` so it can be stored as a PNG later.
actual_labels = ['True Deceased', 'True Survivor']
predicted_labels = ['Pred Deceased', 'Pred Survivor']
df_cm = pd.DataFrame(results, index=actual_labels, columns=predicted_labels)

cm = sns.heatmap(df_cm, annot=True)
fig = cm.get_figure()

fig.show()

### Model Registry
The Model Registry is a centralized repository for managing machine learning models.
It stores trained models along with their metadata, versions, and lineage, ensuring reproducibility.
By registering models, teams can track experiments, promote models from development to production,
and serve them consistently in different environments.

In [None]:
# Upload the trained model to the Hopsworks Model Registry. First get a handle
# to the registry for this project.
mr = project.get_model_registry()

# Everything inside the 'titanic_model' directory is uploaded to the registry.
# Create it together with its 'images' subdirectory; exist_ok=True keeps this
# cell safe to re-run when the directories are already there.
model_dir = "titanic_model"
images_dir = os.path.join(model_dir, "images")
os.makedirs(images_dir, exist_ok=True)

# Save the XGBoost classifier as a JSON file and the confusion-matrix figure
# as a PNG, both under 'model_dir'.
model.save_model(os.path.join(model_dir, "titanic_model.json"))
fig.savefig(os.path.join(images_dir, "confusion_matrix.png"))

# Create a registry entry carrying the model's name, description and metrics,
# linked to the feature view it was trained from.
titanic_model = mr.python.create_model(
    name="titanic",
    metrics={
        "accuracy": metrics['accuracy'],
        'f1 score': metrics['weighted avg']['f1-score'],
    },
    feature_view=feature_view,
    description="Titanic Survivor Predictor",
)

# Upload the model to the registry, including all files in 'model_dir'.
titanic_model.save(model_dir)