In [1]:
# 1. Import libraries
from pathlib import Path

import numpy as np
import pandas as pd

# 2. Load datasets
# Relative paths: they are resolved against the current working directory.
pima_path = '../data/raw/pima.csv'
germany_path = '../data/raw/germany.csv'

# Read both CSVs with default parsing, PIMA first and Germany second.
pima, germany = (pd.read_csv(csv_path) for csv_path in (pima_path, germany_path))

# Show (rows, columns) of each frame as a quick check on the load.
print("PIMA shape:", pima.shape)
print("Germany shape:", germany.shape)

PIMA shape: (768, 9)
Germany shape: (2000, 9)


In [7]:
# 3. Standardize column names
# Both frames receive the same nine headers so they share one schema.
# NOTE(review): the rename is positional; it assumes each CSV already stores
# its columns in exactly this order - confirm against the raw files.
standard_columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

pima.columns = list(standard_columns)
germany.columns = list(standard_columns)

print("PIMA columns:", pima.columns.tolist())
print("Germany columns:", germany.columns.tolist())

PIMA columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
Germany columns: ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']


In [8]:
# 4. Combine datasets
# Stack the two frames row-wise (PIMA rows first, then Germany rows);
# ignore_index=True discards the source indexes and renumbers rows from 0.
# NOTE(review): no de-duplication happens here - confirm the two sources do
# not contain overlapping records.
combined = pd.concat([pima, germany], axis=0, ignore_index=True)
print("Combined shape:", combined.shape)

Combined shape: (2768, 9)


In [9]:
# 5. Basic cleaning
# In these measurement columns a zero is treated as a missing reading rather
# than a real value (Pregnancies and Outcome are deliberately left untouched).
zero_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Convert the zeros in all listed columns to NaN with one vectorized call
# instead of a per-column Python loop.
combined[zero_cols] = combined[zero_cols].replace(0, np.nan)

# Impute missing values with each column's median. The medians are computed
# after the zeros became NaN (NaN is skipped by median), so the placeholder
# zeros do not drag the fill value down.
# numeric_only=True keeps this from raising if a non-numeric column is ever
# present; reassigning (rather than inplace=True) avoids in-place mutation.
combined = combined.fillna(combined.median(numeric_only=True))

In [10]:
# 6. Save the cleaned dataset
output_path = '../data/processed/diabetes_combined.csv'

# Create the destination folder first: to_csv does not create missing parent
# directories, so a checkout without data/processed would otherwise fail here.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the row index out of the written CSV.
combined.to_csv(output_path, index=False)

print("✅ Cleaned dataset saved to:", output_path)

✅ Cleaned dataset saved to: ../data/processed/diabetes_combined.csv


In [11]:
# Count the missing values in every column of the combined frame.
null_counts = combined.isna().sum()
print(null_counts)

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64
