In [1]:
from pathlib import Path

import pandas as pd

In [None]:
def load_data(input_file_path):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    input_file_path : str or path-like
        Location of the CSV file; passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame on success. On any failure a message is printed
        and ``None`` is returned, so callers must check the result before
        using it.
    """
    try:
        df = pd.read_csv(input_file_path)
        print("Data loaded successfully.")
        return df
    except FileNotFoundError:
        print(f"Error: File not found at path '{input_file_path}'")
    except pd.errors.ParserError:
        print("Error: Failed to parse CSV file.")
    except Exception as e:
        # Catch-all keeps the notebook running; the message carries the cause.
        print(f"Unexpected error while loading data: {e}")
    # Every failure path ends here; make the sentinel explicit.
    return None


In [None]:
# Load the dataset from the relative path below; `df` is None if loading failed
# (load_data prints the reason instead of raising).
df = load_data(r"data/raw/fetal_health.csv")  # Change path as needed
df


Data loaded successfully.


Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.000,0.000,0.000,0.000,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.000,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.000,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.000,0.008,0.000,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2121,140.0,0.000,0.000,0.007,0.000,0.0,0.0,79.0,0.2,25.0,...,137.0,177.0,4.0,0.0,153.0,150.0,152.0,2.0,0.0,2.0
2122,140.0,0.001,0.000,0.007,0.000,0.0,0.0,78.0,0.4,22.0,...,103.0,169.0,6.0,0.0,152.0,148.0,151.0,3.0,1.0,2.0
2123,140.0,0.001,0.000,0.007,0.000,0.0,0.0,79.0,0.4,20.0,...,103.0,170.0,5.0,0.0,153.0,148.0,152.0,4.0,1.0,2.0
2124,140.0,0.001,0.000,0.006,0.000,0.0,0.0,78.0,0.4,27.0,...,103.0,169.0,6.0,0.0,152.0,147.0,151.0,4.0,1.0,2.0


In [11]:
def split_num_cat_data(df):
    """Split a DataFrame into its numeric and non-numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.

    Returns
    -------
    tuple
        ``(numerical_df, categorical_df)`` where the first holds the columns
        selected by ``select_dtypes(include=['number'])`` and the second holds
        all remaining columns. If the split fails, a message is printed and
        ``(None, None)`` is returned.
    """
    try:
        numerical_df = df.select_dtypes(include=['number'])
        categorical_df = df.select_dtypes(exclude=['number'])
        print("Data split into numerical and categorical successfully.")
        return numerical_df, categorical_df
    except Exception as e:
        print(f"Error while splitting the data: {e}")
        # Callers unpack two values; returning a bare None would turn this
        # already-reported failure into a second, confusing TypeError.
        return None, None


In [12]:
# Split the loaded frame into numeric and non-numeric columns, then display both parts.
numerical_df, categorical_df = split_num_cat_data(df)
numerical_df, categorical_df


Data split into numerical and categorical successfully.


(      baseline value  accelerations  fetal_movement  uterine_contractions  \
 0              120.0          0.000           0.000                 0.000   
 1              132.0          0.006           0.000                 0.006   
 2              133.0          0.003           0.000                 0.008   
 3              134.0          0.003           0.000                 0.008   
 4              132.0          0.007           0.000                 0.008   
 ...              ...            ...             ...                   ...   
 2121           140.0          0.000           0.000                 0.007   
 2122           140.0          0.001           0.000                 0.007   
 2123           140.0          0.001           0.000                 0.007   
 2124           140.0          0.001           0.000                 0.006   
 2125           142.0          0.002           0.002                 0.008   
 
       light_decelerations  severe_decelerations  prolongued_d

In [13]:
def generate_metadata(numerical_df, categorical_df):
    """Summarise the shapes of the two split frames.

    Returns a dict with the keys ``numerical_shape`` and
    ``categorical_shape``, each holding the ``.shape`` of the matching
    argument. If reading a shape fails, the error is printed and ``None``
    is returned.
    """
    labelled_frames = (
        ("numerical_shape", numerical_df),
        ("categorical_shape", categorical_df),
    )
    try:
        shape_summary = {label: frame.shape for label, frame in labelled_frames}
        print("Metadata generated successfully.")
        return shape_summary
    except Exception as err:
        print(f"Error while generating metadata: {err}")


In [14]:
# Build the shape summary for both frames and display it.
metadata = generate_metadata(numerical_df, categorical_df)
metadata


Metadata generated successfully.


{'numerical_shape': (2126, 22), 'categorical_shape': (2126, 0)}

In [15]:
def save_data(numerical_df, categorical_df, metadata, output_dir="."):
    """Write both frames and the metadata summary to disk.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Written to ``numerical_data.csv`` without the index.
    categorical_df : pandas.DataFrame
        Written to ``categorical_data.csv`` without the index.
    metadata : object
        Written to ``metadata.txt`` as ``str(metadata)``.
    output_dir : str or path-like, optional
        Directory that receives the three files. Defaults to the current
        working directory, so existing callers keep their behaviour. The
        directory is created if it does not exist.

    Returns
    -------
    None
        The outcome is reported with a printed message; errors are caught
        and printed rather than raised.
    """
    try:
        target_dir = Path(output_dir)
        target_dir.mkdir(parents=True, exist_ok=True)
        numerical_df.to_csv(target_dir / "numerical_data.csv", index=False)
        categorical_df.to_csv(target_dir / "categorical_data.csv", index=False)
        # Explicit encoding so the file does not depend on the platform default.
        with open(target_dir / "metadata.txt", "w", encoding="utf-8") as f:
            f.write(str(metadata))
        print("Data and metadata saved successfully.")
    except Exception as e:
        print(f"Error while saving data: {e}")


In [16]:
# Persist both frames and the metadata; save_data prints the outcome.
save_data(numerical_df, categorical_df, metadata)

Data and metadata saved successfully.


In [None]:
def run(input_file_path):
    """Run the pipeline end to end: load, split, summarise, save.

    Each stage reports its own failure by printing a message and returning
    ``None`` (or a pair of ``None``); the pipeline stops quietly at the first
    stage that fails. Anything unexpected is caught and printed here.

    Parameters
    ----------
    input_file_path : str or path-like
        CSV file handed to ``load_data``.
    """
    try:
        df = load_data(input_file_path)
        if df is None:
            return

        split_result = split_num_cat_data(df)
        # Check before unpacking: a failed split may return a bare None,
        # which would otherwise raise "cannot unpack" and mask the real error.
        if split_result is None:
            return
        numerical_df, categorical_df = split_result
        if numerical_df is None or categorical_df is None:
            return

        metadata = generate_metadata(numerical_df, categorical_df)
        if metadata is None:
            return

        save_data(numerical_df, categorical_df, metadata)
    except Exception as e:
        print(f"Error during run: {e}")


In [22]:
# Execute the whole pipeline on the file below.
# NOTE(review): this is a machine-specific absolute path, so the cell will not
# run elsewhere as-is; consider a path relative to the project — confirm the intended location.
run(r"C:\Users\MaesakD\Documents\Learning\MLOps\project_setup\data_folder\fetal_health.csv")  # Replace with your actual file


Data loaded successfully.
Data split into numerical and categorical successfully.
Metadata generated successfully.
Data and metadata saved successfully.
