# 01 Ingest Data
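This notebook loads the Iris dataset from scikit-learn and saves it as the raw data source. It can be parameterized (via the `dataset_version` parameter) to generate either the original dataset or a simulated dataset with 50% more rows, where the extra rows are sampled with replacement from the original data.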

In [None]:
# Papermill parameters (this cell carries the 'parameters' tag).
# dataset_version selects which dataset to produce: 'original' or 'simulated'.
# When no value is supplied, it falls back to 'original'.
dataset_version = "original"

In [None]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import pandas as pd
from sklearn.datasets import load_iris
from ds_logger import start_logging, end_logging

# Open the run log. Both the description and the logged notebook name embed
# dataset_version, so each dataset variant is recorded under its own name.
notebook_description = (
    f"Loads the {dataset_version} Iris dataset and saves it to the raw data directory."
)
start_logging(
    notebook_name=f'01_ingest_data_{dataset_version}.ipynb',
    notebook_description=notebook_description,
)

In [None]:
# Load the Iris dataset into a single DataFrame: one column per feature
# plus a 'target' column holding the class labels.
iris = load_iris()
original_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
original_df['target'] = iris.target

if dataset_version == 'simulated':
    # Simulate receiving 50% more data by sampling rows (with replacement)
    # from the original data. The fixed random_state makes the sampled rows
    # reproducible from run to run.
    new_samples_count = int(len(original_df) * 0.5)
    new_samples_df = original_df.sample(n=new_samples_count, replace=True, random_state=42)
    df = pd.concat([original_df, new_samples_df], ignore_index=True)
    raw_data_path = '../data/raw/iris_raw_simulated.csv'
elif dataset_version == 'original':
    df = original_df
    raw_data_path = '../data/raw/iris_raw_original.csv'
else:
    # Fail loudly on an unrecognised value (e.g. a typo such as 'Simulated')
    # instead of silently writing the original data under a mismatched label.
    raise ValueError(
        f"Unknown dataset_version {dataset_version!r}; expected 'original' or 'simulated'."
    )

# Save raw data. Create the target directory first so the write also succeeds
# when the raw data directory does not exist yet.
os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
df.to_csv(raw_data_path, index=False)

In [None]:
# Close the run log, recording the number of rows in the saved DataFrame
# and which dataset variant this run produced.
end_logging(
    results={
        'rows_ingested': len(df),
        'dataset_version': dataset_version,
    }
)