In [1]:
import kagglehub
import os
import shutil
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Fetch the Pima Indians Diabetes dataset through kagglehub
path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")

# The current working directory is where the CSV is copied to
# (expected to be the directory that contains this notebook)
working_dir = os.getcwd()

# Build the source and destination paths once and reuse them below
csv_name = "diabetes.csv"
source_csv = os.path.join(path, csv_name)
target_csv = os.path.join(working_dir, csv_name)

# Place a copy of the CSV directly in the working directory
shutil.copy(source_csv, target_csv)

print("Dataset copied to:", target_csv)


Dataset copied to: c:\Users\osaro\OneDrive\Documenten\Universiteit\Master\Blok 1\Data Wrangling and Data Analysis\Assignment\Assignment 1\DaWra\DaWra\Assignment 2\diabetes.csv


In [3]:
# Configuration: columns in which a recorded 0 is converted to NaN
# and subsequently imputed further down in the notebook.
cols_with_zeros = [
    "BloodPressure",
    "SkinThickness",
    "BMI",
]

In [4]:
# Load the CSV (resolved relative to the current working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="diabetes.csv")

In [5]:
# Baseline correlation matrix of the unmodified data, with the
# Outcome column left out of the computation
corr_before = df.drop("Outcome", axis="columns").corr()

In [6]:
# Work on a separate frame so the original `df` stays available for the
# before/after correlation comparison later on. `assign` returns a new
# DataFrame, and in each configured column every 0 is swapped for NaN.
df_Zero = df.assign(
    **{name: df[name].replace(0, np.nan) for name in cols_with_zeros}
)

In [7]:
# Impute each NaN with the mean of that column, computed only over the
# records that share the same class label (Outcome).
def _fill_with_class_mean(values):
    """Return `values` with its NaN entries replaced by the mean of `values`."""
    return values.fillna(values.mean())


outcome_groups = df_Zero.groupby("Outcome")
for column in cols_with_zeros:
    df_Zero[column] = outcome_groups[column].transform(_fill_with_class_mean)

In [8]:
# Correlation matrix of the imputed frame, again leaving out the
# Outcome column so it is comparable with `corr_before`
corr_after = df_Zero.drop("Outcome", axis="columns").corr()


In [9]:
# Element-wise change in correlation (after - before), rounded to 6 decimals
diff = (corr_after - corr_before).round(6)
print("Max abs change in correlation:", diff.abs().to_numpy().max())
print("\nTop differences (absolute):")

# Collect every unordered pair of distinct columns exactly once by walking
# the strict upper triangle of the matrix, row by row.
cols = diff.columns
diff_values = diff.to_numpy()
pairs = []
for row, left in enumerate(cols):
    for col, right in enumerate(cols[row + 1:], start=row + 1):
        change = float(diff_values[row, col])
        pairs.append((left, right, change, abs(change)))

# Largest absolute change first; list.sort is stable, so ties keep their
# traversal order.
pairs.sort(key=lambda entry: entry[3], reverse=True)
for left, right, change, magnitude in pairs[:10]:
    print(f"{left} ↔ {right}: Δ={change:.6f} |Δ|={magnitude:.6f}")


Max abs change in correlation: 0.332766

Top differences (absolute):
SkinThickness ↔ Insulin: Δ=-0.332766 |Δ|=0.332766
SkinThickness ↔ Age: Δ=0.249886 |Δ|=0.249886
Pregnancies ↔ SkinThickness: Δ=0.175844 |Δ|=0.175844
SkinThickness ↔ BMI: Δ=0.172869 |Δ|=0.172869
Glucose ↔ SkinThickness: Δ=0.163615 |Δ|=0.163615
BloodPressure ↔ Insulin: Δ=-0.137039 |Δ|=0.137039
BloodPressure ↔ Age: Δ=0.084911 |Δ|=0.084911
SkinThickness ↔ DiabetesPedigreeFunction: Δ=-0.081501 |Δ|=0.081501
Glucose ↔ BloodPressure: Δ=0.069828 |Δ|=0.069828
Pregnancies ↔ BloodPressure: Δ=0.067653 |Δ|=0.067653
