1: Imports and loading the dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Display settings: seaborn grid theme and a default size for every figure
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Make the project root (the parent of the notebook's working directory)
# importable so that the `src` package resolves. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Load the dataset through the project's loader (import must follow the sys.path change)
from src.load_data import load_diabetes_data

# Build the CSV path from the project root instead of a machine-specific
# absolute path, so the notebook runs on any checkout and no backslash
# escapes end up inside a string literal.
# NOTE(review): assumes the kernel's working directory is the notebook's
# folder and that data/diabetes.csv sits directly under the project root — confirm.
DATA_PATH = os.path.join(PROJECT_ROOT, "data", "diabetes.csv")
df = load_diabetes_data(DATA_PATH)


2: Dataset overview

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)


In [None]:
# General information about the dataset: column dtypes, non-null counts and memory usage
df.info()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()


3: Check for missing or unusual values

In [None]:
# Number of missing (NaN) entries in each column
df.isna().sum()


In [None]:
# Count the zero entries per column, for the columns that should not contain zeros
cols_to_check = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_to_check].eq(0).sum()


Convert zeros to NaN in the columns where zero is not a valid measurement, so they are counted as missing values

In [None]:
# Mark zeros in these columns as missing so later null counts and statistics
# treat them as absent values rather than as real readings
cols_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_to_replace] = df[cols_to_replace].replace(to_replace=0, value=np.nan)


In [None]:
# Missing values per column, re-counted after zeros were converted to NaN
df.isna().sum()


4: Distribution of numerical variables

In [None]:
# Grid of histograms, one panel per numeric column, 30 bins each
df.hist(bins=30, figsize=(15, 10), color='skyblue')

# The title and layout calls below act on the current figure,
# i.e. the one df.hist just drew on
hist_fig = plt.gcf()
hist_fig.suptitle("Distribution of Numerical Features", fontsize=16)
hist_fig.tight_layout()
plt.show()


5: Comparison between Outcome 0 vs 1

In [None]:
# Compare the 'Glucose' distribution between patients with and without diabetes
fig, ax = plt.subplots()
sns.boxplot(x='Outcome', y='Glucose', data=df, ax=ax)
ax.set_title("Glucose levels by Diabetes Outcome")
# Finish with plt.show() like the other plotting cells, so the cell renders
# only the figure instead of also echoing the Text(...) object of the title
plt.show()


6: Correlation matrix

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
corr = df.corr()
fig, ax = plt.subplots()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Matrix")
# Finish with plt.show() like the other plotting cells, so the cell renders
# only the figure instead of also echoing the Text(...) object of the title
plt.show()


7: Analysis of the target variable (Outcome)

In [None]:
# Number of rows per Outcome class, to see how balanced the target is
fig, ax = plt.subplots()
sns.countplot(x="Outcome", data=df, ax=ax)
ax.set_title("Distribution of Diabetes Outcome")
# Finish with plt.show() like the other plotting cells, so the cell renders
# only the figure instead of also echoing the Text(...) object of the title
plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Violin plot of Glucose for each Outcome class, with quartile lines inside each violin
# NOTE(review): `palette` is passed without `hue`; newer seaborn releases
# reportedly deprecate that combination — confirm against the installed version.
ax = sns.violinplot(data=df, x='Outcome', y='Glucose', inner='quartile', palette='pastel')

# Label the axes object returned by seaborn directly
ax.set_title("Glucose Distribution by Diabetes Outcome")
ax.set_xlabel("Diabetes Outcome (0 = No, 1 = Yes)")
ax.set_ylabel("Glucose Level")
plt.show()


Scatter plot of Glucose vs Age, colored by Outcome

In [None]:
# Glucose against Age, one point per row, colored by Outcome class
fig, ax = plt.subplots()
sns.scatterplot(x='Glucose', y='Age', hue='Outcome', data=df, ax=ax)
# Give the figure a title so it can be read on its own, and finish with
# plt.show() so the cell does not echo the Axes object as text output
ax.set_title("Glucose vs Age by Diabetes Outcome")
plt.show()


Boxplot: glucose vs pregnancies


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One Glucose box per distinct Pregnancies value, on an explicitly created 10x6 figure
# NOTE(review): `palette` is passed without `hue`; newer seaborn releases
# reportedly deprecate that combination — confirm against the installed version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x="Pregnancies", y="Glucose", palette="pastel", ax=ax)

ax.set_title("Glucose distribution according to number of pregnancies")
ax.set_xlabel("Pregnancies")
ax.set_ylabel("Glucose Level")
fig.tight_layout()
plt.show()


Pearson correlation between Pregnancies and Glucose

In [None]:
# Pearson's r between Pregnancies and Glucose (pearson is Series.corr's default method)
correlation = df["Pregnancies"].corr(df["Glucose"], method="pearson")
print(f"Correlation (Pearson) between pregnancies and glucose level: {correlation:.3f}")


 Pearson correlation between Glucose and BMI

In [None]:
# Pearson's r between Glucose and BMI (pearson is Series.corr's default method)
correlation = df["Glucose"].corr(df["BMI"], method="pearson")
print(f"Correlación (Pearson) entre Glucose y BMI: {correlation:.3f}")


Scatter plot of Glucose vs BMI, colored by Outcome

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Glucose against BMI, colored by Outcome class, with semi-transparent markers
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x="Glucose",
    y="BMI",
    hue="Outcome",
    palette="Set1",
    alpha=0.7,
    ax=ax,
)

ax.set_title("Relación entre Glucosa y BMI según Diabetes")
ax.set_xlabel("Glucose")
ax.set_ylabel("BMI")
ax.legend(title="Diabetes (0 = No, 1 = Sí)")
fig.tight_layout()
plt.show()


Scatter plot of Age vs BMI, colored by Outcome

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of Age vs BMI, with a fixed color per Outcome class
outcome_colors = {0: "green", 1: "orange"}
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x="Age", y="BMI", hue="Outcome", palette=outcome_colors, ax=ax)

# Title, axis labels, legend title and grid are set on the axes object
ax.set_title("Relación entre Edad y BMI según Diabetes")
ax.set_xlabel("Edad")
ax.set_ylabel("Índice de Masa Corporal (BMI)")
ax.legend(title="Diabetes (0 = No, 1 = Sí)")
ax.grid(True)
fig.tight_layout()
plt.show()


Pearson correlation between Age and BMI

In [None]:
# Pearson's r between Age and BMI (pearson is Series.corr's default method)
correlation = df["Age"].corr(df["BMI"], method="pearson")
print(f"Correlación (Pearson) entre Age y BMI: {correlation:.3f}")
