In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')   

  machar = _get_machar(dtype)


In [2]:
# Load datasets
diabetes_dataset = pd.read_csv('../data/raw/diabetes.csv')
heart_disease_dataset = pd.read_csv('../data/raw/heart.csv')

In [3]:
print(diabetes_dataset.columns)
print(heart_disease_dataset.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')


In [4]:
# Table for diabetes dataset columns
diabetes_info = {
    "Unit": [
        "count", "mg/dL", "mmHg", "mm", "mu U/mL", "kg/m^2", "ratio", "years", "binary"
    ],
    "Meanings": [
        "Number of pregnancies",
        "Plasma glucose concentration",
        "Diastolic blood pressure",
        "Triceps skin fold thickness",
        "2-Hour serum insulin",
        "Body mass index",
        "Diabetes pedigree function",
        "Age",
        "Diabetes outcome"
    ],
    "Expected Range": [
        "0+", "70-200", "40-120", "10-99", "0-846", "15-50", "0.1-2.5", "21-81", "0 or 1"
    ]
}
diabetes_table = pd.DataFrame(diabetes_info)
display(diabetes_table)

Unnamed: 0,Unit,Meanings,Expected Range
0,count,Number of pregnancies,0+
1,mg/dL,Plasma glucose concentration,70-200
2,mmHg,Diastolic blood pressure,40-120
3,mm,Triceps skin fold thickness,10-99
4,mu U/mL,2-Hour serum insulin,0-846
5,kg/m^2,Body mass index,15-50
6,ratio,Diabetes pedigree function,0.1-2.5
7,years,Age,21-81
8,binary,Diabetes outcome,0 or 1


In [5]:
# Table for heart disease dataset columns
# Table for heart disease dataset columns
heart_info = {
    "Unit": [
        "years", "binary", "category", "mmHg", "mg/dL", "binary", "category", "bpm",
        "binary", "mm", "category", "count", "category", "binary"
    ],
    "Meanings": [
        "Age",
        "Sex (1=male, 0=female)",
        "Chest pain type",
        "Resting blood pressure",
        "Serum cholesterol",
        "Fasting blood sugar > 120 mg/dL",
        "Resting ECG results",
        "Maximum heart rate achieved",
        "Exercise induced angina",
        "ST depression induced by exercise",
        "Slope of the peak exercise ST segment",
        "Number of major vessels colored by fluoroscopy",
        "Thalassemia",
        "Heart disease presence"
    ],
    "Expected Range": [
        "29-77", "0 or 1", "0-3", "94-200", "126-564", "0 or 1", "0-2", "71-202",
        "0 or 1", "0.0-6.2", "0-2", "0-3", "1-3", "0 or 1"
    ]
}
heart_table = pd.DataFrame(heart_info)
display(heart_table)

Unnamed: 0,Unit,Meanings,Expected Range
0,years,Age,29-77
1,binary,"Sex (1=male, 0=female)",0 or 1
2,category,Chest pain type,0-3
3,mmHg,Resting blood pressure,94-200
4,mg/dL,Serum cholesterol,126-564
5,binary,Fasting blood sugar > 120 mg/dL,0 or 1
6,category,Resting ECG results,0-2
7,bpm,Maximum heart rate achieved,71-202
8,binary,Exercise induced angina,0 or 1
9,mm,ST depression induced by exercise,0.0-6.2


In [6]:
#Diabetes_Dataset Sanity and Structure Check
print("Diabetes Dataset Shape:", diabetes_dataset.shape)
diabetes_dataset.head()
print("Heart Disease Dataset Info: ")
heart_disease_dataset.info()

Diabetes Dataset Shape: (768, 9)
Heart Disease Dataset Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [7]:
print("Heart Disease Dataset Shape:", heart_disease_dataset.shape)
heart_disease_dataset.head()
print("Heart Disease Dataset Info: ")
heart_disease_dataset.info()

Heart Disease Dataset Shape: (1025, 14)
Heart Disease Dataset Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [8]:
# Data Wrangling and Exploration
heart_disease_dataset.rename(columns = {"age": "Age"}, inplace = True)

In [9]:
#Check data types of columns
print(heart_disease_dataset.dtypes)
print(diabetes_dataset.dtypes)

Age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object


In [10]:
# For heart disease dataset
heart_disease_dataset['sex'] = heart_disease_dataset['sex'].astype('category')
heart_disease_dataset['fbs'] = heart_disease_dataset['fbs'].astype('category')
heart_disease_dataset['exang'] = heart_disease_dataset['exang'].astype('category')
heart_disease_dataset['target'] = heart_disease_dataset['target'].astype('category')
heart_disease_dataset['cp'] = heart_disease_dataset['cp'].astype('category')
heart_disease_dataset['restecg'] = heart_disease_dataset['restecg'].astype('category')
heart_disease_dataset['slope'] = heart_disease_dataset['slope'].astype('category')
heart_disease_dataset['thal'] = heart_disease_dataset['thal'].astype('category')
heart_disease_dataset['ca'] = heart_disease_dataset['ca'].astype('category')

# For diabetes dataset
diabetes_dataset['Outcome'] = diabetes_dataset['Outcome'].astype('category')

In [None]:
#Remove duplicates from heart disease dataset
duplicates_sorted = duplicates.sort_values(list(duplicates.columns))
display(duplicates_sorted.head(20))

Unnamed: 0,Age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
60,29,1,1,130,204,0,0,202,0,0.0,2,0,2,1
64,29,1,1,130,204,0,0,202,0,0.0,2,0,2,1
118,29,1,1,130,204,0,0,202,0,0.0,2,0,2,1
668,29,1,1,130,204,0,0,202,0,0.0,2,0,2,1
12,34,0,1,118,210,0,1,192,0,0.7,2,0,2,1
15,34,0,1,118,210,0,1,192,0,0.7,2,0,2,1
779,34,0,1,118,210,0,1,192,0,0.7,2,0,2,1
143,34,1,3,118,182,0,0,174,0,0.0,2,0,2,1
201,34,1,3,118,182,0,0,174,0,0.0,2,0,2,1
572,34,1,3,118,182,0,0,174,0,0.0,2,0,2,1


In [17]:
heart_disease_dataset.drop_duplicates(inplace=True)

In [18]:
# Find true duplicates in diabetes_dataset
diabetes_duplicates = diabetes_dataset[diabetes_dataset.duplicated(keep=False)]

# Sort duplicates by all columns so identical rows are grouped together
diabetes_duplicates_sorted = diabetes_duplicates.sort_values(list(diabetes_duplicates.columns))

# Display the first 20 sorted duplicate rows
display(diabetes_duplicates_sorted.head(20))

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [None]:
#Statistical Summary of Diabetes Dataset
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


In [20]:
# Replace zero values with NaN for specific columns in diabetes dataset
# Replacing impossible zero values with NaN
cols_with_zero_invalid = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for col in cols_with_zero_invalid:
    diabetes_dataset[col] = diabetes_dataset[col].replace(0, np.nan)

In [None]:
#Checking the number of missing values for eeach column
diabetes_dataset.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [23]:
for col in ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']:
    diabetes_dataset[col].fillna(diabetes_dataset[col].median(), inplace=True)

In [24]:
heart_disease_dataset.describe()

Unnamed: 0,Age,trestbps,chol,thalach,oldpeak
count,302.0,302.0,302.0,302.0,302.0
mean,54.42053,131.602649,246.5,149.569536,1.043046
std,9.04797,17.563394,51.753489,22.903527,1.161452
min,29.0,94.0,126.0,71.0,0.0
25%,48.0,120.0,211.0,133.25,0.0
50%,55.5,130.0,240.5,152.5,0.8
75%,61.0,140.0,274.75,166.0,1.6
max,77.0,200.0,564.0,202.0,6.2


In [None]:
# Check class balance for diabetes dataset
print("Diabetes Dataset Class Balance:")
print(diabetes_dataset['Outcome'].value_counts())
print(diabetes_dataset['Outcome'].value_counts(normalize=True))  # Proportion

Diabetes Dataset Class Balance:
Outcome
0    500
1    268
Name: count, dtype: int64
Outcome
0    0.651042
1    0.348958
Name: proportion, dtype: float64


In [None]:
# Check class balance for heart disease dataset
print("Heart Disease Dataset Class Balance:")
print(heart_disease_dataset['target'].value_counts())
print(heart_disease_dataset['target'].value_counts(normalize=True))  # Proportion

Heart Disease Dataset Class Balance:
target
1    164
0    138
Name: count, dtype: int64
target
1    0.543046
0    0.456954
Name: proportion, dtype: float64


In [27]:
# Save processed diabetes dataset
diabetes_dataset.to_csv('../data/processed/diabetes_processed.csv', index=False)

# Save processed heart disease dataset
heart_disease_dataset.to_csv('../data/processed/heart_processed.csv', index=False)