# Part 1

In [1]:
import wandb
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, mean_squared_error

In [2]:
# Display the installed wandb version as the cell output.
wandb.__version__

'0.15.3'

In [2]:
# Start a W&B run called "experiment-1" inside the "mlops-zoomcamp-wandb" project.
wandb.init(
    project="mlops-zoomcamp-wandb",
    name="experiment-1",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mg-broughton[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Human-readable class names passed to the W&B plotting helpers further down.
label_names = [
    "Setosa",
    "Versicolour",
    "Virginica",
]
# return_X_y=True makes load_iris hand back the (features, targets) pair directly.
X, y = load_iris(return_X_y=True)

In [4]:
# Hyperparameters forwarded to LogisticRegression when the model is built.
params = {"C": 0.1, "random_state": 42}
# Record the hyperparameters on the active run's config object.
# Plain assignment (`wandb.config = params`) would only rebind the module
# attribute to a dict and leave the run's config untouched, so the values
# would never be attached to the run.
wandb.config.update(params)

In [5]:
# Build the classifier from the shared hyperparameters, then fit it on the
# full iris data (there is no hold-out split in this part).
model = LogisticRegression(**params)
model.fit(X, y)

# Predictions are made on the same samples used for fitting, so everything
# derived from them below is a training-set figure.
y_probas = model.predict_proba(X)
y_pred = model.predict(X)

In [6]:
# Collect the scalar metrics first, then send them to W&B in a single call.
# NOTE(review): mean_squared_error is computed on integer class ids here —
# confirm this is the intended metric for a classifier.
training_metrics = {
    "accuracy": accuracy_score(y, y_pred),
    "mean_squared_error": mean_squared_error(y, y_pred),
}
wandb.log(training_metrics)

In [7]:
# ROC chart from the true targets and the predicted class probabilities.
wandb.sklearn.plot_roc(
    y,
    y_probas,
    labels=label_names,
)



In [8]:
# Precision-recall chart from the true targets and the predicted class probabilities.
wandb.sklearn.plot_precision_recall(
    y,
    y_probas,
    labels=label_names,
)

In [9]:
# Confusion matrix from the true targets and the hard class predictions.
wandb.sklearn.plot_confusion_matrix(
    y,
    y_pred,
    labels=label_names,
)

In [11]:
# Serialize the fitted model to a local pickle file.
model_file = open("logistic_regression_model.pkl", 'wb')
with model_file:
    pickle.dump(model, model_file)

# Wrap the pickle in a "model"-typed artifact and attach it to the active run.
artifact = wandb.Artifact(
    "logistic_regression_model",
    type="model",
)
artifact.add_file("logistic_regression_model.pkl")
wandb.log_artifact(artifact)

<wandb.sdk.wandb_artifacts.Artifact at 0x7f8fc97c2f10>

In [12]:
# Close the active W&B run; the cell output shows the upload progress and a
# summary of the logged metrics.
wandb.finish()

VBox(children=(Label(value='0.010 MB of 0.012 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.782158…

0,1
accuracy,▁
mean_squared_error,▁

0,1
accuracy,0.96
mean_squared_error,0.04


# Part 2

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

import os
import pandas as pd

In [None]:
# Open a run whose job type marks it as a data-logging step.
wandb.init(
    project="mlops-zoomcamp-wandb",
    job_type="log_data",
)

# Bundle everything under the local `data` directory into a dataset artifact
# named "Titanic", recording where the data came from in its metadata.
artifact = wandb.Artifact(
    'Titanic',
    type='dataset',
    metadata={"Source": "https://www.kaggle.com/competitions/titanic/data"},
)
artifact.add_dir('data')
wandb.log_artifact(artifact)

# Close the run.
wandb.finish()

In [None]:
# Initialize a WandB Run in the same project the dataset artifact was logged to.
wandb.init(project="mlops-zoomcamp-wandb", job_type="log_data")

# Fetch version v0 of the `Titanic` dataset artifact.
# The name is left unqualified (no `entity/project/` prefix) so it is looked up
# in the current run's own entity and project, i.e. the artifact this notebook
# logged, rather than in a hard-coded account belonging to someone else.
artifact = wandb.use_artifact('Titanic:v0', type='dataset')
# Download the artifact contents; `artifact_dir` is the local directory holding them.
artifact_dir = artifact.download()

In [None]:
# Read both CSVs out of the downloaded artifact directory.
train_csv_path = os.path.join(artifact_dir, "train.csv")
test_csv_path = os.path.join(artifact_dir, "test.csv")
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

# First 80% of the rows (in file order, no shuffling) become the training
# portion; the remainder is validation.
total_examples = len(train_df)
num_train_examples = int(0.8 * total_examples)
num_val_examples = total_examples - num_train_examples

# Tag each row with the split it belongs to.
train_df["Split"] = [
    "Train" if row_position < num_train_examples else "Validation"
    for row_position in range(total_examples)
]
# Write the annotated frame over the train.csv in the local `data` directory.
train_df.to_csv("data/train.csv", encoding='utf-8', index=False)

In [None]:
# Bundle the local `data` directory again under the same artifact name,
# keeping the source URL in the metadata.
artifact = wandb.Artifact(
    'Titanic',
    type='dataset',
    metadata={"Source": "https://www.kaggle.com/competitions/titanic/data"},
)
artifact.add_dir('data')
wandb.log_artifact(artifact)

# Close the run.
wandb.finish()

In [None]:
# Initialize a WandB Run for the data-exploration step.
wandb.init(project="mlops-zoomcamp-wandb", job_type="explore_data")

# Fetch the latest version of the `Titanic` dataset artifact.
# The name is left unqualified (no `entity/project/` prefix) so `latest` refers
# to the newest version in the current run's own entity and project, rather
# than to an artifact in a hard-coded account belonging to someone else.
artifact = wandb.use_artifact('Titanic:latest', type='dataset')
# Download the artifact contents; `artifact_dir` is the local directory holding them.
artifact_dir = artifact.download()

# Read the train/validation and test CSVs from the downloaded directory.
train_val_df = pd.read_csv(os.path.join(artifact_dir, "train.csv"))
test_df = pd.read_csv(os.path.join(artifact_dir, "test.csv"))

In [None]:
# Wrap each DataFrame in a W&B Table.
test_table = wandb.Table(dataframe=test_df)
train_val_table = wandb.Table(dataframe=train_val_df)

# Send both tables to Weights & Biases in one log call.
tables_to_log = {
    "Train-Val-Table": train_val_table,
    "Test-Table": test_table,
}
wandb.log(tables_to_log)

# Close the run.
wandb.finish()