In [1]:
import os
import pickle

import wandb
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Open a W&B run in the "mlops-zoomcamp-wandb" project, tagged as a data-logging job.
wandb.init(
    project="mlops-zoomcamp-wandb",
    job_type="log_data",
)

# Reference version v0 of the Titanic dataset artifact from this run and
# download it; `artifact_dir` is used below as the directory holding its files.
artifact = wandb.use_artifact(
    'kade/mlops-zoomcamp-wandb/Titanic:v0',
    type='dataset',
)
artifact_dir = artifact.download()

In [3]:
# Load both CSV files found in the downloaded artifact directory.
train_df, test_df = (
    pd.read_csv(os.path.join(artifact_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

In [4]:
# Size of each split: 80% of the rows (truncated to an integer) go to training,
# whatever is left over goes to validation.
num_total_examples = len(train_df)
num_train_examples = int(num_total_examples * 0.8)
num_val_examples = num_total_examples - num_train_examples

print(f"Training examples: {num_train_examples}")
print(f"Validation examples: {num_val_examples}")

In [5]:
# Label rows by position: the leading `num_train_examples` rows are "Train",
# the following `num_val_examples` rows are "Validation".
train_df["Split"] = [
    "Train" if row_pos < num_train_examples else "Validation"
    for row_pos in range(num_train_examples + num_val_examples)
]
train_df.head()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  Split  
0      0         A/5 21171   7.2500   NaN        S  Train  
1      0          PC 17599  71.2833   C85        C  Train  
2      0  STON/O2. 3101282   7.9250   NaN        S  Train  
3      0            113803  53.1000  C123        S  Train  
4      0            

In [6]:
# Make sure the output directory exists before writing: `to_csv` does not create
# missing parent directories, so this cell would fail on a fresh checkout.
os.makedirs("data", exist_ok=True)

# Persist the training frame (now including the "Split" column) without the index.
# NOTE(review): only train.csv is written here; if test.csv should also be part of
# the artifact logged from 'data', it must already be in that directory — confirm.
train_df.to_csv("data/train.csv", encoding='utf-8', index=False)

In [7]:
# Bundle the contents of the local 'data' dir into a "Titanic" dataset artifact,
# recording the Kaggle source URL as metadata, and log it to the active run.
artifact = wandb.Artifact(
    "Titanic",
    type='dataset',
    metadata={"Source": "https://www.kaggle.com/c/titanic/data"},
)
artifact.add_dir("data")
wandb.log_artifact(artifact)

# Close the W&B run started at the top of the notebook.
wandb.finish()