### Goals
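The goal of this notebook is to preprocess the Titanic dataset, split it into train and test sets, and upload the raw file together with both splits to our data bucket.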

In [6]:
pip install google-cloud-storage

Note: you may need to restart the kernel to use updated packages.


In [7]:
from google.cloud import storage

def upload_data(
    *,
    bucket: str,
    filepath_in_bucket: str,
    local_filepath: str,
):
    """Upload one local file to a Google Cloud Storage bucket.

    All arguments are keyword-only.

    Args:
        bucket: Name of the destination bucket.
        filepath_in_bucket: Object name (path) to store the file under
            inside the bucket.
        local_filepath: Path of the local file to upload.

    A ``storage.Client()`` is created with no arguments on every call, and a
    confirmation line is printed once the upload call has returned.
    """
    client = storage.Client()
    # Resolve bucket -> blob handle in one step, then push the file contents.
    destination = client.bucket(bucket).blob(filepath_in_bucket)
    destination.upload_from_filename(local_filepath)
    print(f"{local_filepath} uploaded succesfully.")

In [11]:
import pandas as pd
import numpy as np

def preprocess_data(df):
    """Return a cleaned copy of the Titanic passenger table.

    Steps, in order:
      1. Keep only the columns Pclass, Sex, Age, SibSp, Parch and Survived.
      2. Drop every row that has a missing value in any of those columns.
      3. Encode Sex as an integer code (male -> 0, female -> 1); any other
         value is not in the mapping and therefore becomes NaN.
      4. Rename SibSp / Parch to self-explanatory column names.

    Args:
        df: DataFrame containing at least the six columns listed above.

    Returns:
        A new DataFrame; rows keep their original index labels.
    """
    kept_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Survived"]
    sex_codes = {"male": 0, "female": 1}
    readable_names = {
        # SibSp is the number of siblings or partners onboard
        "SibSp": "no_of_siblings_or_partners_onboard",
        # Parch is the number of parents or children onboard
        "Parch": "no_of_parents_or_children_onboard",
    }
    return (
        df[kept_columns]
        .dropna()
        .assign(Sex=lambda frame: frame["Sex"].map(sex_codes))
        .rename(columns=readable_names)
    )

In [12]:
from sklearn.model_selection import train_test_split

# Read the raw passenger file and reduce it to the cleaned modelling columns.
titanic_raw = pd.read_csv("titanic.csv")
whole_dataset = preprocess_data(titanic_raw)

# Hold out 20% of the rows as the test set. The split is stratified on the
# "Survived" label and uses a fixed random_state so it is reproducible.
train, test = train_test_split(
    whole_dataset,
    test_size=0.2,
    random_state=42,
    stratify=whole_dataset["Survived"],
)

In [13]:
# Write both splits to CSV files next to the notebook (relative paths).
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra leading column — confirm downstream readers expect that.
for csv_name, split_df in (("train.csv", train), ("test.csv", test)):
    split_df.to_csv(csv_name)

In [14]:
BUCKET = "de2025-group10-a1-data"

# (local file, object name inside the bucket) pairs, uploaded in this order:
# the raw dataset first, then the train and test splits.
UPLOADS = [
    ("titanic.csv", "titanic/data.csv"),
    ("train.csv", "titanic/train.csv"),
    ("test.csv", "titanic/test.csv"),
]

for local_path, bucket_path in UPLOADS:
    upload_data(
        bucket=BUCKET,
        filepath_in_bucket=bucket_path,
        local_filepath=local_path,
    )

titanic.csv uploaded succesfully.
train.csv uploaded succesfully.
test.csv uploaded succesfully.
