In [1]:
import os
import pickle

import wandb
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Start a W&B run whose only job is to log the raw dataset.
wandb.init(project="mlops-zoomcamp-wandb", job_type="log_data")

# Package the local "data" directory as a dataset artifact named "Titanic",
# recording the Kaggle page it came from in the artifact metadata.
artifact = wandb.Artifact("Titanic", type='dataset', metadata={"Source": "https://www.kaggle.com/c/titanic/data"})
artifact.add_dir("data")
wandb.log_artifact(artifact)

# finish must be *called*; a bare `wandb.finish` only evaluates to the function
# object and leaves the run open.
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkade[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s


<function wandb.sdk.wandb_run.finish(exit_code: Optional[int] = None, quiet: Optional[bool] = None) -> None>

# versioning data

In [2]:
# Open a "log_data" run in the same project.
wandb.init(job_type="log_data", project="mlops-zoomcamp-wandb")

# Declare version v0 of the Titanic dataset artifact as an input of this run
# and download its files; download() returns the local directory path.
artifact = wandb.use_artifact("kade/mlops-zoomcamp-wandb/Titanic:v0", type="dataset")
artifact_dir = artifact.download()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkade[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [3]:
# read data
# Load both CSV files from the downloaded artifact directory.
train_df, test_df = (
    pd.read_csv(os.path.join(artifact_dir, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

In [4]:
# 80% of the rows (truncated to an integer) go to training; the remainder
# becomes the validation set.
num_train_examples = int(0.8 * len(train_df))
num_val_examples = len(train_df) - num_train_examples

print(
    f"Training examples: {num_train_examples}",
    f"Validation examples: {num_val_examples}",
    sep="\n",
)

Training examples: 712
Validation examples: 179


In [5]:
# Label rows positionally: the first num_train_examples rows are "Train",
# the following num_val_examples rows are "Validation".
train_df["Split"] = [
    "Train" if row_pos < num_train_examples else "Validation"
    for row_pos in range(num_train_examples + num_val_examples)
]
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Split
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Train
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Train
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Train
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Train
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Train


In [6]:
# Persist the frame (now carrying the "Split" column) into the local data
# directory, without writing the DataFrame index as an extra column.
train_df.to_csv(
    "data/train.csv",
    index=False,
    encoding='utf-8',
)

In [7]:
# Log the 'data' dir again under the same artifact name ("Titanic"),
# then close the run.
artifact = wandb.Artifact(
    name="Titanic",
    type='dataset',
    metadata={"Source": "https://www.kaggle.com/c/titanic/data"},
)
artifact.add_dir("data")
wandb.log_artifact(artifact)

wandb.finish()

[34m[1mwandb[0m: Adding directory to artifact (./data)... Done. 0.0s


# explore the dataset

In [2]:
# Open a run dedicated to dataset exploration.
wandb.init(job_type="explore_data", project="mlops-zoomcamp-wandb")

# Fetch whatever version of the dataset artifact is currently tagged "latest".
artifact = wandb.use_artifact("kade/mlops-zoomcamp-wandb/Titanic:latest", type="dataset")
artifact_dir = artifact.download()

# Load both CSV files from the downloaded artifact directory.
train_val_df, test_df = (
    pd.read_csv(os.path.join(artifact_dir, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkade[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [3]:
# create tables
# Wrap each DataFrame in a W&B Table.
train_val_table, test_table = (
    wandb.Table(dataframe=frame) for frame in (train_val_df, test_df)
)

# Log both tables in a single call, then close the run.
wandb.log({"train-val-table": train_val_table, "test-table": test_table})

wandb.finish()

# fit a baseline model

In [2]:
# Open a named training run for the baseline experiment.
wandb.init(
    name="baseline_experiment_1",
    job_type="train",
    project="mlops-zoomcamp-wandb",
)

# Use the dataset artifact version currently tagged "latest" as this run's input.
artifact = wandb.use_artifact("kade/mlops-zoomcamp-wandb/Titanic:latest", type="dataset")
artifact_dir = artifact.download()

# Load both CSV files from the downloaded artifact directory.
train_val_df, test_df = (
    pd.read_csv(os.path.join(artifact_dir, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkade[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   2 of 2 files downloaded.  


In [3]:
features = ["Pclass", "Sex", "SibSp", "Parch"]

# One-hot encode the whole frame once, *before* splitting. Encoding each split
# separately can yield different dummy columns when a category is absent from
# one split, which would leave X_train and X_val with mismatched columns.
X_all = pd.get_dummies(train_val_df[features])

# Boolean row masks derived from the "Split" column.
is_train = train_val_df["Split"] == "Train"
is_val = train_val_df["Split"] == "Validation"

# .loc with a mask avoids chained indexing (df[cols][mask]).
X_train = X_all.loc[is_train]
X_val = X_all.loc[is_val]
y_train = train_val_df.loc[is_train, "Survived"]
y_val = train_val_df.loc[is_val, "Survived"]

In [4]:
# Hyperparameters for the baseline random forest; random_state pins the seed.
model_params = {"n_estimators": 100, "max_depth": 10, "random_state": 1}

# Record the hyperparameters on the active run. Assigning `wandb.config = ...`
# would only rebind the module attribute and leave the run's config untouched,
# so update the existing config object instead.
wandb.config.update(model_params)

model = RandomForestClassifier(**model_params)
model.fit(X_train, y_train)

# Hard class predictions and per-class probabilities for both splits.
y_pred_train = model.predict(X_train)
y_probas_train = model.predict_proba(X_train)
y_pred_val = model.predict(X_val)
y_probas_val = model.predict_proba(X_val)

In [5]:
# Log accuracy / precision / recall / F1 for both splits in one call.
# The "train/" and "val/" key prefixes keep the two splits apart.
wandb.log(
    {
        "train/accuracy": accuracy_score(y_true=y_train, y_pred=y_pred_train),
        "train/precision": precision_score(y_true=y_train, y_pred=y_pred_train),
        "train/recall": recall_score(y_true=y_train, y_pred=y_pred_train),
        "train/f1": f1_score(y_true=y_train, y_pred=y_pred_train),
        "val/accuracy": accuracy_score(y_true=y_val, y_pred=y_pred_val),
        "val/precision": precision_score(y_true=y_val, y_pred=y_pred_val),
        "val/recall": recall_score(y_true=y_val, y_pred=y_pred_val),
        "val/f1": f1_score(y_true=y_val, y_pred=y_pred_val),
    }
)

In [6]:
# Display names passed to the plotting helpers below.
# NOTE(review): presumably index 0 maps to Survived == 0 and index 1 to Survived == 1 — confirm.
label_names = ["Not-Survived", "Survived"]

# Call the W&B scikit-learn plotting helpers for the fitted model: class
# proportions of the two splits, summary metrics, and the ROC curve and
# confusion matrix computed from the validation predictions.
wandb.sklearn.plot.plot_class_proportions(y_train, y_val, label_names)
wandb.sklearn.plot.plot_summary_metrics(model, X_train, y_train, X_val, y_val)
wandb.sklearn.plot.plot_roc(y_val, y_probas_val, label_names)
wandb.sklearn.plot.plot_confusion_matrix(y_val, y_pred_val, label_names)



In [7]:
# Serialize the fitted model to a local pickle file.
with open("random_forest_classifier.pkl", mode="wb") as f:
    pickle.dump(model, f)

# Attach the pickle to a "model" artifact, log it, and close the run.
artifact = wandb.Artifact(
    name="titanic-random_forest_classifier",
    type="model",
)
artifact.add_file(local_path="random_forest_classifier.pkl")
wandb.log_artifact(artifact)

wandb.finish()

0,1
train/accuracy,▁
train/f1,▁
train/precision,▁
train/recall,▁
val/accuracy,▁
val/f1,▁
val/precision,▁
val/recall,▁

0,1
train/accuracy,0.8118
train/f1,0.73307
train/precision,0.82143
train/recall,0.66187
val/accuracy,0.82123
val/f1,0.72881
val/precision,0.7963
val/recall,0.67188
