# 01b Ingest Data (New Data Simulation)
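This notebook simulates receiving 50% more data by resampling rows from the Iris dataset with replacement, then saves the combined dataset (original rows plus the simulated new rows) as the raw data source for experiment B.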

In [1]:
import sys
import os
# Put ../src on the import path *before* importing ds_logger below;
# presumably that is where the project-local ds_logger module lives (verify against repo layout).
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import pandas as pd
from sklearn.datasets import load_iris
from ds_logger import start_logging, end_logging

# Call start_logging with this notebook's file name and a one-line description,
# before any data work runs. The matching end_logging call is in the last cell.
notebook_description = "Simulates receiving 50% more data and saves it to the raw data directory for experiment B."
start_logging(notebook_name='01b_ingest_data.ipynb', notebook_description=notebook_description)

In [2]:
# Load the Iris dataset into a DataFrame: one column per feature, plus a 'target' label column.
iris = load_iris()
original_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
original_df['target'] = iris.target

# Simulate receiving 50% more data by sampling rows from the original data.
# Sampling is with replacement, so every "new" row duplicates an existing one and
# rows may repeat; the fixed random_state makes the simulated batch reproducible.
new_samples_count = int(len(original_df) * 0.5)
new_samples_df = original_df.sample(n=new_samples_count, replace=True, random_state=42)

# Concatenate original data with new data; ignore_index rebuilds a clean 0..n-1 index
# instead of carrying over the sampled rows' original index labels.
df = pd.concat([original_df, new_samples_df], ignore_index=True)
assert len(df) == len(original_df) + new_samples_count, "unexpected row count after concat"

# Save raw data. DataFrame.to_csv does not create missing parent directories, so
# create the target directory first; otherwise a fresh checkout without ../data/raw
# fails here with an OSError.
raw_data_path = '../data/raw/iris_raw_b.csv'
os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)
df.to_csv(raw_data_path, index=False)

In [3]:
# Call end_logging with the row count of the combined dataset as this run's result.
end_logging(results=dict(rows_ingested=len(df)))