In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Paths of the cleaned dataset and of the original dataset (relative to the notebook).
clean_file_path, original_file_path = (
    './data/diabetes_prediction_dataset_clean.csv',
    './data/diabetes_prediction_dataset.csv',
)

# Read both CSV files: `data` holds the cleaned file, `original_data` the original one.
data, original_data = (
    pd.read_csv(csv_path) for csv_path in (clean_file_path, original_file_path)
)

<h1 style='color: gold'>
    Differences in datasets
</h1>

In [None]:
# Compare the cleaned frame with the original one: row counts first, then column sets.
n_original, n_clean = len(original_data), len(data)
print(f'Removed {n_original - n_clean} rows out of {n_original} leaving {n_clean}.')

original_columns = set(original_data.columns)
clean_columns = set(data.columns)

# Columns present only in the cleaned frame / only in the original frame.
extra_columns = clean_columns.difference(original_columns)
missing_columns = original_columns.difference(clean_columns)

print("\nAdditional columns after cleaning:", extra_columns)
print("Missing columns after cleaning:", missing_columns)

<h1 style='color: gold'>
    Basic data info
</h1>

In [None]:
# Summary statistics of `data`, shown as the cell's output (pandas `describe` defaults).
data.describe()

<h1 style='color: gold'>
    Histograms
</h1>

In [None]:
# Columns of `data` selected by the 'number' dtype filter; one histogram is drawn per column.
numeric_columns = data.select_dtypes(include=['number']).columns

# Grid shape: a fixed number of columns and as many rows as needed (ceiling division).
n_cols = 3
n_rows = (len(numeric_columns) + n_cols - 1) // n_cols

if len(numeric_columns) == 0:
    # A grid with zero rows cannot be created, so report and skip plotting.
    print('No numeric columns to plot.')
else:
    # squeeze=False keeps `axs` two-dimensional even when the grid has a single row;
    # with the default, a one-row grid is returned as a 1-D array and indexing it
    # with (row, col) fails.
    fig, axs = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 3 * n_rows), squeeze=False
    )
    flat_axes = axs.ravel()  # row-major order: left to right, then top to bottom

    # One histogram per selected column; missing values are dropped before binning.
    for ax, column in zip(flat_axes, numeric_columns):
        ax.hist(data[column].dropna(), bins=30, edgecolor='k', alpha=0.7)
        ax.set_title(f'Histogram of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')
        ax.grid(True)

    # Remove the trailing axes of the grid that received no histogram.
    for unused_ax in flat_axes[len(numeric_columns):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

<h1 style='color: gold'>
    Histograms for one-hot features
</h1>

In [None]:
# Boolean columns of `data`, converted to 0/1 integers so they can be binned.
data_numeric = data.select_dtypes(include=['bool']).astype(int)

# Grid shape: a fixed number of columns and as many rows as needed (ceiling division).
n_cols = 3
n_rows = (len(data_numeric.columns) + n_cols - 1) // n_cols

if len(data_numeric.columns) == 0:
    # A grid with zero rows cannot be created, so report and skip plotting.
    print('No boolean columns to plot.')
else:
    # squeeze=False keeps `axs` two-dimensional even when the grid has a single row;
    # with the default, a one-row grid is returned as a 1-D array and indexing it
    # with (row, col) fails.
    fig, axs = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 3 * n_rows), squeeze=False
    )
    flat_axes = axs.ravel()  # row-major order: left to right, then top to bottom

    # Explicit bin edges centre the two bars on 0 and 1, so they line up with the
    # 'False'/'True' ticks below. With bins=2 the edges are derived from the data
    # range, which puts the bars between the ticks (or elsewhere for a constant column).
    bool_bin_edges = [-0.5, 0.5, 1.5]

    for ax, column in zip(flat_axes, data_numeric.columns):
        ax.hist(data_numeric[column], bins=bool_bin_edges, edgecolor='k', alpha=0.7)
        ax.set_title(f'Histogram of {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')
        ax.set_xticks([0, 1])
        ax.set_xticklabels(['False', 'True'])
        ax.grid(True)

    # Remove the trailing axes of the grid that received no histogram.
    for unused_ax in flat_axes[len(data_numeric.columns):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

<h1 style='color: gold'>
    Correlation matrix
</h1>

In [None]:
# Pairwise correlations between the columns of `data` (pandas `corr` defaults).
correlation_matrix = data.corr()

# Draw the matrix as a heatmap with each coefficient printed to two decimals.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation matrix')
plt.show()

<h1 style='color: gold'>
    Analysis of two features
</h1>

In [None]:
# (y-axis column, panel title) for each panel, listed in row-major grid order.
boxplot_panels = [
    ('bmi', 'BMI vs Diabetes'),
    ('age', 'Age vs Diabetes'),
    ('HbA1c_level', 'HbA1c level vs Diabetes'),
    ('blood_glucose_level', 'Blood Glucose Level vs Diabetes'),
]

fig, axs = plt.subplots(2, 2, figsize=(6, 5))

# One boxplot per panel: the chosen column grouped by the `diabetes` column.
for ax, (feature, panel_title) in zip(axs.flat, boxplot_panels):
    sns.boxplot(x='diabetes', y=feature, data=data, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()


In [None]:
# Row counts per `gender_Male` value, split by the `diabetes` column.
fig, ax = plt.subplots(figsize=(4, 3))

sns.countplot(x='gender_Male', hue='diabetes', data=data, ax=ax)

ax.set_title('Gender vs Diabetes')
ax.set_xlabel('Gender (Male)')
ax.set_ylabel('Count')

# Relabel the legend seaborn created for the hue instead of building a new one with
# labels only: a labels-only legend call pairs the labels with whichever artists come
# first on the axes, which can show the wrong colour next to each label.
# NOTE(review): assumes the hue levels are ordered 0 then 1 (i.e. 'No' then 'Yes') —
# confirm against the dtype of `diabetes` and the installed seaborn version.
legend = ax.get_legend()
if legend is not None:
    legend.set_title('Diabetes')
    for legend_text, new_label in zip(legend.get_texts(), ['No', 'Yes']):
        legend_text.set_text(new_label)

plt.show()

In [None]:
# Pair plot over the columns of `data`, coloured by the `diabetes` column.
pair_grid = sns.pairplot(data=data, hue='diabetes')
plt.show()