<h1 style='color: red'>
    Code block for downloading the dataset if it is not present in the files
</h1>

In [None]:
import os
from kaggle.api.kaggle_api_extended import KaggleApi

# Base name used to build the expected local dataset path under download_path
dataset_name = 'diabetes_prediction_dataset'
# Kaggle dataset identifier that is passed to the download helper
kaggle_data_set_name = 'iammustafatz/diabetes-prediction-dataset'

# Directory the dataset is downloaded into
download_path = './data'

def download_kaggle_csv_dataset(dataset, download_path):
    """Download a Kaggle dataset and unzip it into ``download_path``.

    Args:
        dataset: Dataset identifier handed to ``KaggleApi.dataset_download_files``.
        download_path: Target directory; it is created if it does not exist yet.
    """
    # Make sure the target directory is there before anything is written to it
    os.makedirs(download_path, exist_ok=True)

    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        dataset,
        path=download_path,
        unzip=True,
    )

# The notebook later reads '<download_path>/<dataset_name>.csv', so the presence
# check has to look for that file (including the '.csv' suffix).
dataset_csv_path = f'{download_path}/{dataset_name}.csv'

if not os.path.exists(dataset_csv_path):
    # No explicit os.makedirs here: download_kaggle_csv_dataset already creates
    # the directory with exist_ok=True, so re-running the cell with an existing
    # data directory does not raise FileExistsError.
    download_kaggle_csv_dataset(kaggle_data_set_name, download_path)

    print(f'Data has been downloaded to {dataset_csv_path}')
else:
    print(f'Data already exists in {dataset_csv_path}')

<h1>
    Preparation
</h1>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the raw CSV inside the local data directory
file_path = './data/diabetes_prediction_dataset.csv'
# Working DataFrame; later cells rebind `data` after each cleaning step
data = pd.read_csv(file_path)

def save_cleaned_data(cleaned_file_path='./data/diabetes_prediction_dataset_clean.csv', df=None):
    """Write a DataFrame to CSV (without the index) and report the target path.

    Args:
        cleaned_file_path: Destination CSV path.
        df: DataFrame to write. When omitted, the notebook-level ``data`` frame
            is used, which keeps existing ``save_cleaned_data()`` calls working.
    """
    # Resolve the frame at call time so the latest rebinding of `data` is saved
    frame = data if df is None else df
    frame.to_csv(cleaned_file_path, index=False)
    print(f'Data has been saved to file {cleaned_file_path}')

In [None]:
columns = data.columns

n_cols = 3                                                  # Number of columns in grid
n_rows = (len(columns) + n_cols - 1) // n_cols              # Number of rows in grid (ceiling division)

# Creating the plots grid.
# squeeze=False keeps `axs` two-dimensional even when the grid has a single row,
# so the axs[row, col] indexing below works for any number of data columns.
fig, axs = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 3 * n_rows), squeeze=False)

# Generating histograms for each column
for i, column in enumerate(columns):
    ax = axs[i // n_cols, i % n_cols]
    ax.hist(data[column].dropna(), bins=30, edgecolor='k', alpha=0.7)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.grid(True)

# Removing the unused axes at the end of the grid, if any.
# The range starts at len(columns) so it does not depend on the loop variable
# left over from the plotting loop.
for j in range(len(columns), n_rows * n_cols):
    fig.delaxes(axs[j // n_cols, j % n_cols])

print('Original data histograms')

# Lay out and render the figure once
plt.tight_layout()
plt.show()

<h1 style='color: gold'>
    Feature Smoking
</h1>

<h2>
    Conversion of the smoking_history feature to one-hot encoding
</h2>

In [None]:
# One-hot encode 'smoking_history' into indicator columns prefixed with 'smoking'
data = pd.get_dummies(
    data,
    prefix='smoking',
    columns=['smoking_history'],
)

save_cleaned_data()

<h3>
    Removing rows where the value of 'smoking_No Info' == True
</h3>

In [None]:
# Keep only the rows whose 'smoking_No Info' indicator equals False
data_filtered = data.loc[data['smoking_No Info'].eq(False)]
# Every remaining row has the same indicator value, so the column is dropped
data = data_filtered.drop('smoking_No Info', axis=1)

save_cleaned_data()

<h1 style='color: gold'>
    Feature gender
</h1>

<h2>
    Removing rows where the value of the gender column is 'Other'
</h2>

In [None]:
# Keep every row whose 'gender' value differs from 'Other'
data = data.loc[data['gender'].ne('Other')]

save_cleaned_data()

<h2>
    One-hot encoding the gender feature, because its categories have no natural order (neither Female > Male nor Male > Female)
</h2>

In [None]:
# One-hot encode 'gender' into indicator columns prefixed with 'gender'
data = pd.get_dummies(
    data,
    prefix='gender',
    columns=['gender'],
)
save_cleaned_data()

<h1 style='color: gold'>
    Feature age
</h1>

<h2>
    Checking unique values in the age column and rounding them
</h2>

In [None]:
# Occurrences of each distinct age, computed once instead of once per unique value.
# dropna=False keeps a NaN bucket, so a missing age (which unique() would report)
# still has an entry to look up.
age_counts = data['age'].value_counts(dropna=False)

# Dictionary for storing the counts of unique values, keyed in the order
# in which data['age'].unique() yields them
unique_values_counts = {value: age_counts[value] for value in data['age'].unique()}

print("Unique values in the 'age' column along with their counts:")
for value, count in unique_values_counts.items():
    print(f"{value}: {count}")


In [None]:
# Round ages to whole numbers, then store the column with an integer dtype
data['age'] = data['age'].round(decimals=0).astype(int)
save_cleaned_data()

<h1 style='color: gold'>
    Operations on the entire dataset
</h1>

<h2>
    Finding and removing unnecessary duplicates
</h2>

In [None]:
# Boolean mask flagging every row that repeats an earlier row
duplicate_rows = data.duplicated()
# Keeping the unflagged rows retains the first occurrence of each distinct row
data = data.loc[~duplicate_rows]

save_cleaned_data()
print(f'Removed duplicate rows: {duplicate_rows.sum()}')

<h2>
    Number of unique values per column
</h2>

In [None]:
# Report how many distinct values each column holds
for column, num_dist_values in data.nunique().items():
    print(f'{column}: {num_dist_values} unique values')

<h2>
    Missing data
</h2>

In [None]:
# Number of missing entries in each column
print(data.isna().sum())

<h2>
    Basic statistics
</h2>

In [None]:
# Summary statistics of the cleaned frame, displayed as the cell's output
data.describe()

<h2>
    Min - max difference
</h2>

In [None]:
# Restrict the frame to its numeric-dtype columns
numeric_columns = data.select_dtypes(include=['number']).columns
numeric_data = data[numeric_columns]

# Per-column value range: largest value minus smallest value
diff_min_max = numeric_data.max() - numeric_data.min()

print("Difference between min and max for each column (excluding boolean values):")
print(diff_min_max)