In [1]:
import os
import pickle

import wandb
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Start a W&B run for this training job; the handle is kept for inspection.
run = wandb.init(
    project="mlops-zoomcamp-wandb",
    job_type="train",
    name="baseline_experiment_1",
)

# Declare the dataset artifact as an input of this run and fetch it locally.
artifact = wandb.use_artifact(
    "kade/mlops-zoomcamp-wandb/Titanic:latest",
    type="dataset",
)
artifact_dir = artifact.download()

# Read the two CSV files shipped inside the downloaded artifact directory.
train_csv_path = os.path.join(artifact_dir, "train.csv")
test_csv_path = os.path.join(artifact_dir, "test.csv")
train_val_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

In [3]:
# Columns fed to the model.
features = ["Pclass", "Sex", "SibSp", "Parch"]

# Row masks for the two partitions recorded in the "Split" column.
is_train = train_val_df["Split"] == "Train"
is_val = train_val_df["Split"] == "Validation"

# get_dummies one-hot encodes any string/categorical feature columns
# (presumably "Sex" here -- confirm against the dataset).
X_train = pd.get_dummies(train_val_df.loc[is_train, features])

# Encoding each split independently can yield different column sets when a
# category is absent from one split. Align validation to the training columns
# so the fitted model always sees the same features in the same order;
# categories unseen in validation become all-zero columns, and categories
# unseen in training are dropped.
X_val = pd.get_dummies(train_val_df.loc[is_val, features]).reindex(
    columns=X_train.columns, fill_value=0
)

# Targets, selected with the same masks so rows stay aligned with X_*.
y_train = train_val_df.loc[is_train, "Survived"]
y_val = train_val_df.loc[is_val, "Survived"]

In [4]:
# Hyperparameters for the baseline random forest (fixed seed for repeatability).
model_params = {"n_estimators": 100, "max_depth": 10, "random_state": 1}

# Record the hyperparameters on the active run. Assigning
# `wandb.config = model_params` would only rebind the module attribute to a
# plain dict and leave the run's config empty; `update` writes into the run's
# config object instead.
wandb.config.update(model_params)

# Fit the classifier on the training split.
model = RandomForestClassifier(**model_params)
model.fit(X_train, y_train)

# Hard predictions and class probabilities for both splits; the later
# metric and plotting cells consume these.
y_pred_train = model.predict(X_train)
y_probas_train = model.predict_proba(X_train)
y_pred_val = model.predict(X_val)
y_probas_val = model.predict_proba(X_val)

In [5]:
# Metric name -> scoring function; insertion order fixes the logged key order.
metric_fns = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1": f1_score,
}

# Split prefix -> (ground truth, predictions).
eval_splits = {
    "train": (y_train, y_pred_train),
    "val": (y_val, y_pred_val),
}

# Log every "<split>/<metric>" pair in a single call.
wandb.log({
    f"{split}/{metric}": score_fn(y_true, y_hat)
    for split, (y_true, y_hat) in eval_splits.items()
    for metric, score_fn in metric_fns.items()
})

In [6]:
# Display names passed to the plotting helpers for the two target classes.
label_names = ["Not-Survived", "Survived"]

# Short alias for the W&B scikit-learn plotting module.
sk_plots = wandb.sklearn.plot

# Class balance of the training vs. validation targets.
sk_plots.plot_class_proportions(y_train, y_val, label_names)
# Summary metrics for the fitted model on the validation split.
sk_plots.plot_summary_metrics(model, X_train, y_train, X_val, y_val)
# ROC curve from the validation class probabilities.
sk_plots.plot_roc(y_val, y_probas_val, label_names)
# Confusion matrix from the validation hard predictions.
sk_plots.plot_confusion_matrix(y_val, y_pred_val, label_names)

In [7]:
# Local file the fitted model is serialized to before upload.
model_file = "random_forest_classifier.pkl"

with open(model_file, "wb") as model_fh:
    pickle.dump(model, model_fh)

# Wrap the pickle in a "model" artifact and attach it to the current run.
artifact = wandb.Artifact("titanic-random_forest_classifier", type="model")
artifact.add_file(model_file)
wandb.log_artifact(artifact)

# Close the run now that everything has been logged.
wandb.finish()