In [2]:
import os
import pandas as pd
import numpy as np
import random

def introduce_errors(df):
    
    error_rate = 0.1  

    # 1. Missing values
    for col in ['thalach', 'cp', 'ca', 'thal']:
        df.loc[df.sample(frac=error_rate).index, col] = np.nan

    # 2. Outliers
    for col in ['thalach', 'cp', 'ca', 'thal']:
        indices = df.sample(frac=error_rate).index
        df.loc[indices, col] *= np.random.choice([3, -3], size=len(indices))

    # 4. Incorrect data types
    df['thalach'] = df['thalach'].apply(lambda x: str(x) if np.random.rand() < error_rate else x)

    # 5. Negative values
    df['cp'] = df['cp'].apply(lambda x: -abs(x) if np.random.rand() < error_rate else x)

    # 6. Strings in numerical columns
    df['ca'] = df['ca'].apply(lambda x: 'ERROR' if np.random.rand() < error_rate else x)

    # 7. Duplicated rows
    duplicates = df.sample(frac=error_rate)
    df = pd.concat([df, duplicates], ignore_index=True)

    # 8. Corrupted categorical values
    df['thal'] = df['thal'].apply(lambda x: 'abnormal' if np.random.rand() < error_rate else x)

    # 9. Rows with all zeros
    num_all_zero_rows = int(len(df) * error_rate)
    df.loc[df.sample(n=num_all_zero_rows).index, ['thalach', 'cp', 'ca', 'thal']] = 0
    return df


def split_dataset(dataset_path, raw_data_folder, num_files):
    """
    Splits the dataset into specified number of files, introduces errors, and saves to raw-data folder.
    """
    df = pd.read_csv(dataset_path)
    rows_per_file = len(df) // num_files
    if not os.path.exists(raw_data_folder):
        os.makedirs(raw_data_folder)
    
    for i in range(num_files):
        start_idx = i * rows_per_file
        end_idx = (i + 1) * rows_per_file if i < num_files - 1 else len(df)
        df_part = df.iloc[start_idx:end_idx]
        
        df_part = introduce_errors(df_part)
        
        part_file_path = os.path.join(raw_data_folder, f"part_{i + 1}.csv")
        df_part.to_csv(part_file_path, index=False)
    
    print(f"Data ingestion completed. {num_files} files have been created in '{raw_data_folder}'.")

if __name__ == "__main__":
    dataset_path = '../dataset/test.csv'
    raw_data_folder = '../data_ingestion_folder/raw_folder'
    num_files = 200
    split_dataset(dataset_path, raw_data_folder, num_files)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['thalach'] = df['thalach'].apply(lambda x: str(x) if np.random.rand() < error_rate else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cp'] = df['cp'].apply(lambda x: -abs(x) if np.random.rand() < error_rate else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ca'] = df['ca'].apply(

Data ingestion completed. 200 files have been created in '../data_ingestion_folder/raw_folder'.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['thalach'] = df['thalach'].apply(lambda x: str(x) if np.random.rand() < error_rate else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cp'] = df['cp'].apply(lambda x: -abs(x) if np.random.rand() < error_rate else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['ca'] = df['ca'].apply(