In [14]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline

# Load the antenatal-care dataset from the project's data directory.
df = pd.read_csv('./data/ANC_Data.csv')

# Column names, non-null counts and dtypes (info() prints its report directly).
df.info()

# A notebook cell only renders its LAST bare expression, so a bare `df.head()`
# or `df.describe()` in the middle of the cell is silently discarded.
# display() (available in the IPython/Jupyter namespace without an import)
# renders each frame explicitly.
display(df.head())      # first five rows as a sample
display(df.describe())  # summary statistics of the numeric columns

print(df.columns)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7123 entries, 0 to 7122
Data columns (total 19 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             7123 non-null   int64  
 1   parity                          7123 non-null   int64  
 2   gravidity                       7123 non-null   int64  
 3   marital_status                  7123 non-null   object 
 4   education_level                 6429 non-null   object 
 5   number_of_antenatal_visits      7123 non-null   int64  
 6   household_income_level          7123 non-null   object 
 7   occupation                      7123 non-null   object 
 8   gestational_age_at_delivery     7123 non-null   int64  
 9   apgar_score                     7123 non-null   float64
 10  preterm_birth                   7123 non-null   int64  
 11  birth_complications             7123 non-null   object 
 12  residence                       71

In [17]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

# Preprocessing pipeline: load the CSV, label-encode the categorical columns,
# make a stratified 80/20 train/test split, min-max scale the numerical
# features, and save the encoders, scaler, splits and split statistics to the
# working directory.

# Loading the dataset
data = pd.read_csv('./data/ANC_Data.csv')
print("Dataset Shape:", data.shape)
# NOTE: no imputation is performed in this cell. With the object dtype reported
# for these columns, astype(str) in the encoding loop turns a missing value
# into the literal string 'nan', which LabelEncoder then treats as a category
# of its own.
print("Missing Values Before Imputation:\n", data.isnull().sum())

# Encoding categorical variables
# Every listed column (including the target, risk_level) gets its own
# LabelEncoder; each fitted encoder is kept in `label_encoders` and saved as
# le_<column>.pkl so the same mapping can be re-applied later.
categorical_cols = ['marital_status', 'education_level', 'occupation', 'residence', 'birth_complications', 'previous_complications', 'has_diabetes', 'has_hypertension', 'risk_level', 'household_income_level']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    label_encoders[col] = le
    joblib.dump(le, f'le_{col}.pkl')  # Save encoders
print("Sample Encoded Data (First 5 rows):\n", data.head())

# Defining features (X) and target (y)
X = data.drop('risk_level', axis=1)
y = data['risk_level']

# Splitting data into training and testing sets
# The split is done BEFORE scaling so the scaler's min/max are learned from the
# training rows only; fitting on the full matrix would leak test-set
# information into the training features and into the saved scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Work on independent copies so the column assignments below do not touch (or
# warn about) the frame the splits were taken from.
X_train = X_train.copy()
X_test = X_test.copy()

# Scaling numerical features (excluding household_income_level)
numerical_cols = ['age', 'parity', 'gravidity', 'number_of_antenatal_visits', 'gestational_age_at_delivery', 'apgar_score', 'gestational_age_at_first_visit', 'birth_weight_kg']
scaler = MinMaxScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
# The test rows are only transformed with the training min/max, so a test value
# outside the training range can fall slightly outside [0, 1].
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
joblib.dump(scaler, 'scaler.pkl')  # Save scaler
print("Sample Scaled Data (First 5 rows of numerical columns):\n", X_train[numerical_cols].head())
print("Data Types After Scaling:\n", X_train[numerical_cols].dtypes)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)
print("Class Distribution in Training Set:\n", y_train.value_counts(normalize=True))

# Saving preprocessed data
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

# Calculating and saving data split statistics
split_stats = {
    'Training Samples': len(X_train),
    'Testing Samples': len(X_test),
    'Training Percentage': len(X_train) / len(data) * 100,
    'Testing Percentage': len(X_test) / len(data) * 100
}
with open('data_split_stats.txt', 'w') as f:
    for key, value in split_stats.items():
        f.write(f"{key}: {value:.2f}\n")
print("Data Split Statistics:\n", split_stats)

print("Data preprocessing completed. Training and testing sets saved.")

Dataset Shape: (7123, 19)
Missing Values Before Imputation:
 age                                 0
parity                              0
gravidity                           0
marital_status                      0
education_level                   694
number_of_antenatal_visits          0
household_income_level              0
occupation                          0
gestational_age_at_delivery         0
apgar_score                         0
preterm_birth                       0
birth_complications                 0
residence                           0
gestational_age_at_first_visit      0
previous_complications              0
has_diabetes                        0
has_hypertension                    0
birth_weight_kg                     0
risk_level                          0
dtype: int64
Sample Encoded Data (First 5 rows):
    age  parity  gravidity  marital_status  education_level  \
0   30       3          5               1                2   
1   27       2          4               1  

In [18]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib

# Preprocessing pipeline with extra diagnostics: load the CSV, label-encode the
# categorical columns that are present, make a stratified 80/20 train/test
# split, min-max scale the numerical features, and save the encoders, scaler,
# splits and split statistics to the working directory.

# Loading the dataset
data = pd.read_csv('./data/ANC_Data.csv')
print("Dataset Shape:", data.shape)
print("Initial Column Data Types:\n", data.dtypes)
# NOTE: no imputation is performed in this cell. With the object dtype reported
# for these columns, astype(str) in the encoding loop turns a missing value
# into the literal string 'nan', which LabelEncoder then treats as a category
# of its own.
print("Missing Values Before Imputation:\n", data.isnull().sum())

# Encoding categorical variables
# Every listed column that exists in the frame (including the target,
# risk_level) gets its own LabelEncoder; each fitted encoder is kept in
# `label_encoders` and saved as le_<column>.pkl so the same mapping can be
# re-applied later. Absent columns are reported and skipped.
categorical_cols = ['marital_status', 'education_level', 'occupation', 'residence', 'birth_complications', 'previous_complications', 'has_diabetes', 'has_hypertension', 'risk_level', 'household_income_level']
label_encoders = {}
for col in categorical_cols:
    if col in data.columns:  # Check if column exists
        le = LabelEncoder()
        data[col] = le.fit_transform(data[col].astype(str))
        label_encoders[col] = le
        joblib.dump(le, f'le_{col}.pkl')  # Save encoders
        print(f"Encoded {col} (Sample unique values):", data[col].unique())
    else:
        print(f"Warning: {col} not found in dataset")
print("Sample Encoded Data (First 5 rows):\n", data.head())

# Defining features (X) and target (y)
X = data.drop('risk_level', axis=1)
y = data['risk_level']

# Splitting data into training and testing sets
# The split is done BEFORE scaling so the scaler's min/max are learned from the
# training rows only; fitting on the full matrix would leak test-set
# information into the training features and into the saved scaler.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Work on independent copies so the column assignments below do not touch (or
# warn about) the frame the splits were taken from.
X_train = X_train.copy()
X_test = X_test.copy()

# Scaling numerical features (excluding categorical columns)
numerical_cols = ['age', 'parity', 'gravidity', 'number_of_antenatal_visits', 'gestational_age_at_delivery', 'apgar_score', 'gestational_age_at_first_visit', 'birth_weight_kg']
print("Numerical Columns for Scaling:", numerical_cols)
scaler = MinMaxScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
# The test rows are only transformed with the training min/max, so a test value
# outside the training range can fall slightly outside [0, 1].
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])
joblib.dump(scaler, 'scaler.pkl')  # Save scaler
print("Sample Scaled Data (First 5 rows of numerical columns):\n", X_train[numerical_cols].head())
print("Data Types After Scaling:\n", X_train[numerical_cols].dtypes)

print("Training Set Shape:", X_train.shape)
print("Testing Set Shape:", X_test.shape)
print("Class Distribution in Training Set:\n", y_train.value_counts(normalize=True))

# Saving preprocessed data
X_train.to_csv('X_train.csv', index=False)
X_test.to_csv('X_test.csv', index=False)
y_train.to_csv('y_train.csv', index=False)
y_test.to_csv('y_test.csv', index=False)

# Calculating and saving data split statistics
split_stats = {
    'Training Samples': len(X_train),
    'Testing Samples': len(X_test),
    'Training Percentage': len(X_train) / len(data) * 100,
    'Testing Percentage': len(X_test) / len(data) * 100
}
with open('data_split_stats.txt', 'w') as f:
    for key, value in split_stats.items():
        f.write(f"{key}: {value:.2f}\n")
print("Data Split Statistics:\n", split_stats)

print("Data preprocessing completed. Training and testing sets saved.")

Dataset Shape: (7123, 19)
Initial Column Data Types:
 age                                 int64
parity                              int64
gravidity                           int64
marital_status                     object
education_level                    object
number_of_antenatal_visits          int64
household_income_level             object
occupation                         object
gestational_age_at_delivery         int64
apgar_score                       float64
preterm_birth                       int64
birth_complications                object
residence                          object
gestational_age_at_first_visit      int64
previous_complications             object
has_diabetes                        int64
has_hypertension                    int64
birth_weight_kg                   float64
risk_level                         object
dtype: object
Missing Values Before Imputation:
 age                                 0
parity                              0
gravidity              