In [701]:
# Imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, average_precision_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import shuffle
from sklearn.ensemble import GradientBoostingRegressor
from tqdm import tqdm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn import tree
from sklearn.naive_bayes import CategoricalNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.pipeline import make_pipeline
from sklearn.metrics import auc
from sklearn.calibration import LabelEncoder
import re

In [702]:
# Data loading: compose the URL of the raw CSV and read it into a DataFrame.

dataset_name = "diabetes.csv"
root_path = "https://raw.githubusercontent.com/matzim95/ML-datasets/master/"

# pandas fetches the file directly from the URL.
path_to_data = f"{root_path}{dataset_name}"
df = pd.read_csv(path_to_data)

In [703]:
# Shuffle all rows (frac=1 keeps every row, only the order changes).
# A fixed random_state makes the row order - and therefore everything derived
# from it further down - reproducible after "Restart Kernel -> Run All".
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Diabetes pedigree function,Age,Class
686,3,130,64,0,0,23.1,0.314,22,negative
105,1,126,56,29,152,28.7,0.801,21,negative
369,1,133,102,28,140,32.8,0.234,45,positive
141,5,106,82,30,0,39.5,0.286,38,negative
722,1,149,68,29,127,29.3,0.349,42,positive


In [704]:
# Delete all-unique ID columns

def is_column_unique(col):
    """Return True when no value in ``col`` repeats.

    The distinct-value count comes from ``col.nunique()``, which leaves
    missing values out by default, so a column that contains NaN is
    reported as not unique.
    """
    distinct_count = col.nunique()
    return len(col) == distinct_count

# Columns whose values are all distinct are candidate ID columns.
unique_columns = df.apply(is_column_unique, axis=0)
unique_columns_list = unique_columns[unique_columns].index.tolist()

# Keep asking until no all-unique column is left or the user declines.
while len(unique_columns_list) > 0:
    id_present = input(f'Found columns with all unique values: {unique_columns_list}. Does the list contain ID column/s? (y/n)')

    # Only an explicit "y" continues; any other answer leaves the columns in place.
    if id_present != 'y':
        break

    if len(unique_columns_list) == 1:
        # A single candidate needs no further selection.
        id_col_name = unique_columns_list[0]
    else:
        # input() always returns text, so look the answer up by the text form
        # of each label. This also lets non-string labels (e.g. integer
        # column names) be selected.
        candidates_by_text = {str(candidate): candidate for candidate in unique_columns_list}
        answer = input(f"Please select the id column, so that it is deleted from analysis: {unique_columns_list}.")
        if answer not in candidates_by_text:
            # A typo used to crash df.drop with a KeyError; ask again instead.
            print(f"Column {answer!r} is not one of {unique_columns_list}. Please try again.")
            continue
        id_col_name = candidates_by_text[answer]

    df.drop(id_col_name, axis=1, inplace=True)
    print(f"Column {id_col_name} has been removed from analysis")

    # Re-evaluate the candidates on the reduced frame.
    unique_columns = df.apply(is_column_unique, axis=0)
    unique_columns_list = unique_columns[unique_columns].index.tolist()


print("Columns with all unique values:")
print(unique_columns_list)

Columns with all unique values:
[]


In [705]:
print("The list of available columns:", list(df.columns))

# Ask for the target (Y) column. The answer is checked immediately so that a
# typo is caught here instead of surfacing later as a KeyError when the column
# is dropped from the features.
# NOTE(review): the comparison is against the raw text typed by the user, so
# this assumes the column labels are strings - confirm for other datasets.
label_col = input(f'The dataframe has following columns: {list(df.columns)}.\nSelect the labels (Y) column: ')
while label_col not in df.columns:
    label_col = input(f'Column {label_col!r} was not found in {list(df.columns)}.\nSelect the labels (Y) column: ')

The list of available columns: ['Number of times pregnant', 'Plasma glucose concentration', 'Diastolic blood pressure', 'Triceps skin fold thickness', '2-Hour serum insulin', 'Body mass index', 'Diabetes pedigree function', 'Age', 'Class']


In [706]:
# Missing-value handling

# Text markers that are treated as "missing" and normalised to NaN (exact matches only).
missing_values = ['?', 'n/a', 'NA', "nan", 'null', '-', '']
df.replace(missing_values, np.nan, inplace=True)

# Candidate frame without any row that holds a NaN; used both for the
# statistics below and as the result if the user agrees to drop.
df_dropped = df.dropna()

# Row counts before / after, and the share of rows that would be lost.
total_rows_before = df.shape[0]
total_rows_after = df_dropped.shape[0]
rows_deleted = total_rows_before - total_rows_after
percentage_deleted = (rows_deleted / total_rows_before) * 100

dropping = ''
if rows_deleted == 0:
    print("No missing values found in the database.")
else:
    dropping = input(F"Warning! The database contains {rows_deleted} rows with MISSING VALUES. It makes {percentage_deleted:.2f}% of the database.\nWould you like to delete these rows? (y/n)")
    if dropping != 'y':
        # Any answer other than "y" keeps the rows and fills the gaps with 0.
        print('Proceeding without dropping the missing values. The missing values are replaced with zeros.')
        df.replace(np.nan, 0, inplace=True)
    else:
        df = df_dropped.copy()
        print(f"Number of rows deleted: {rows_deleted}")
        print(f"Percentage of database deleted: {percentage_deleted:.2f}%")

No missing values found in the database.


In [707]:
# Dropping duplicates

# Number of rows that repeat an earlier row (first occurrences are not counted).
total_duplicates_before = int(df.duplicated().sum())

# Candidate frame that keeps only the first occurrence of every row.
df_deduplicated = df.drop_duplicates()

# Duplicates still present after de-duplication. The previous code computed
# "rows removed" here, which always equals total_duplicates_before, so the
# difference below was always 0 and duplicates were never reported.
total_duplicates_after = int(df_deduplicated.duplicated().sum())

# Rows that dropping would remove, and their share of the whole table
# (the prompt below says "% of the database", so divide by the row count).
duplicates_deleted = total_duplicates_before - total_duplicates_after
total_rows = df.shape[0]
percentage_duplicates_deleted = (duplicates_deleted / total_rows) * 100 if total_rows > 0 else 0

dropping = ''
if duplicates_deleted > 0:
    dropping = input(f"Warning! The database contains {duplicates_deleted} DUPLICATE ROWS. It makes {percentage_duplicates_deleted:.2f}% of the database.\nWould you like to delete these rows? (y/n)")
    if dropping == 'y':
        df = df_deduplicated.copy()
        print(f"Number of duplicate rows deleted: {duplicates_deleted}")
        print(f"Percentage of duplicates deleted: {percentage_duplicates_deleted:.2f}%")
    else:
        print('Proceeding without dropping duplicate rows.')
else:
    print('No duplicate rows found in the database.')



No duplicate rows found in the database.


In [708]:
# Detect columns in which every value is an integer or integer-looking text,
# then make sure those columns carry a numeric dtype.

_INTEGER_TEXT = re.compile(r'^-?\d+$')


def _is_integer_like(value):
    """Return True for an int/bool/str whose text form is an optionally signed run of digits."""
    return isinstance(value, (int, bool, str)) and _INTEGER_TEXT.match(str(value)) is not None


integer_columns = []
for col in df.columns:
    every_value_matches = df[col].apply(_is_integer_like).all()
    if every_value_matches:
        integer_columns.append(col)

# Convert the detected columns; anything unparsable would become NaN.
for col in integer_columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Show the resulting dtypes and the list of converted columns.
print(df.dtypes)

print(integer_columns)

Number of times pregnant          int64
Plasma glucose concentration      int64
Diastolic blood pressure          int64
Triceps skin fold thickness       int64
2-Hour serum insulin              int64
Body mass index                 float64
Diabetes pedigree function      float64
Age                               int64
Class                            object
dtype: object
['Number of times pregnant', 'Plasma glucose concentration', 'Diastolic blood pressure', 'Triceps skin fold thickness', '2-Hour serum insulin', 'Age']


In [709]:
# Separate the target column from the features.
y = df[label_col]
X = df.drop(columns=label_col).copy()

# Distinct labels in order of first appearance in y.
label_list = list(y.unique())

In [710]:
# X encoding

# Minimum share of values that must parse as numbers for a column to count as numeric.
threshold_percentage = 0.9  # 90%
# Cut-off on the number of distinct values, used when classifying columns.
unique_values_threshold = 6


def try_convert_to_numeric(column):
    """Parse ``column`` as numbers; values that cannot be parsed become NaN."""
    converted = pd.to_numeric(column, errors='coerce')
    return converted


def is_numeric_column(column, threshold_percentage):
    """Tell whether enough of ``column`` can be parsed as numbers.

    Parameters:
        column: the Series to inspect.
        threshold_percentage: required fraction of parseable values.

    Returns:
        True when the fraction of values that are not NaN after numeric
        conversion is at least ``threshold_percentage``.
    """
    parsed_mask = try_convert_to_numeric(column).notna()
    parsed_share = parsed_mask.sum() / len(column)
    return parsed_share >= threshold_percentage

# Step 3: Sort every feature column into "numeric" or "categorical".
# A column that is mostly parseable as numbers but has fewer distinct values
# than unique_values_threshold is still treated as categorical.
numeric_columns, categorical_columns = [], []

for col in X.columns:
    if not is_numeric_column(X[col], threshold_percentage):
        categorical_columns.append(col)
        continue

    column_numeric = try_convert_to_numeric(X[col])
    if column_numeric.nunique() >= unique_values_threshold:
        numeric_columns.append(col)
        # Store the converted values, so unparsable entries become NaN.
        X[col] = column_numeric
    else:
        categorical_columns.append(col)

# Report the outcome.
print("Numeric Columns:", numeric_columns, sep="\n")
print("\nCategorical Columns:", categorical_columns, sep="\n")


Numeric Columns:
['Number of times pregnant', 'Plasma glucose concentration', 'Diastolic blood pressure', 'Triceps skin fold thickness', '2-Hour serum insulin', 'Body mass index', 'Diabetes pedigree function', 'Age']

Categorical Columns:
[]


In [711]:
# Categorical columns with more distinct values than this are flagged as
# probably mis-formatted.
threshold_categories = 15
categorical_columns_toformat = [
    col for col in categorical_columns
    if len(X[col].unique()) > threshold_categories
]

if len(categorical_columns_toformat) > 0:
    dropping = ''
    dropping = input(f"Warning! Found {len(categorical_columns_toformat)} categorical columns where the number of categories is higher than {threshold_categories}: {categorical_columns_toformat}. You may need to check the formatting of the columns. Drop the columns for the analysis? (y/n)")
    if dropping == 'y':
        X.drop(columns=categorical_columns_toformat, inplace=True)

        # The column set changed, so rebuild both classification lists.
        numeric_columns, categorical_columns = [], []

        for col in X.columns:
            if not is_numeric_column(X[col], threshold_percentage):
                categorical_columns.append(col)
                continue

            column_numeric = try_convert_to_numeric(X[col])
            if column_numeric.nunique() >= unique_values_threshold:
                numeric_columns.append(col)
                # Store the converted values, so unparsable entries become NaN.
                X[col] = column_numeric
            else:
                categorical_columns.append(col)

        # Report the outcome.
        print("Numeric Column after dropping:", numeric_columns, sep="\n")
        print("\nCategorical Columns after dropping:", categorical_columns, sep="\n")


In [712]:
# Encoding categorical variables

if categorical_columns:
    print(X[categorical_columns].dtypes)
    print("\n")
    # Dtypes that are already numeric and need no encoding
    # (despite the name, the list also contains float dtypes).
    integer_dtypes = [int, np.int8, np.int16, np.int32, np.int64, 
                    np.uint, np.uint8, np.uint16, np.uint32, np.uint64,
                    float, np.float16, np.float32, np.float64]

    for col in categorical_columns:
        if X[col].dtype not in integer_dtypes:
            # Step 1: Initialize and fit the LabelEncoder
            encoder = LabelEncoder()
            X[col] = encoder.fit_transform(X[col])

            # Step 2: Read the mapping from the fitted encoder.
            # LabelEncoder gives code i to encoder.classes_[i], and classes_
            # is sorted - it is NOT the order of first appearance that
            # X[col].unique() returns. Building the table from unique() could
            # therefore print a mapping that does not match the encoded data.
            original_labels = encoder.classes_
            encoded_values = list(range(len(original_labels)))

            # Step 3: Create and display the reference table
            reference_table = dict(zip(original_labels, encoded_values))

            print(f"The values of column {col} are not numerical and have been encoded.\nReference Table of {col}:")
            for label, encoded in reference_table.items():
                print(f"{label} -> {encoded}")
            print('\n')
        else:
            print(f'The labels of column {col} are numerical.')



In [713]:
# Optionally standardise the numeric feature columns.
scaling = input("Scale the numerical data using the StandardScaler? (y/n)")
if scaling == "y":
    if numeric_columns:
        scaler = StandardScaler()
        scaler.fit(X[numeric_columns])
        X[numeric_columns] = scaler.transform(X[numeric_columns])
        print("Numerical data has been normalized.")
    else:
        # StandardScaler.fit raises on an input with zero feature columns,
        # so skip scaling when every feature was classified as categorical.
        print("No numerical columns found, scaling has been skipped.")


Numerical data has been normalized.


In [714]:
# Checking for label balance

# A label is reported as under-represented when its share of the rows is at
# or below this percentage.
imbalance_threshold_percentage = 10

# Per-label row counts and percentages, indexed by label.
try:
    label_counts = pd.concat([y.value_counts(), 
                y.value_counts(normalize=True).mul(100).round(2)],axis=1, keys=('counts','percentage'))
except Exception as exc:
    # Fail here with a clear message; the old bare "except" only printed and
    # then crashed on the next line because label_counts was undefined.
    raise ValueError("Error, check the labels column.") from exc

# Take label and percentage together from the same row of label_counts.
# value_counts() orders by frequency, while label_list is in order of first
# appearance, so pairing the two by position could blame the wrong label.
label_low = {
    label: percentage
    for label, percentage in label_counts['percentage'].items()
    if percentage <= imbalance_threshold_percentage
}
unbalanced = len(label_low) > 0

if unbalanced:
    print('Warning! The dataset is unbalanced in terms of labels!\n')
    for i in label_low:
        print(i,"makes",label_low[i],'% of the dataset.')
else:
    print('The dataset is balanced.')

print("\n",label_counts)

The dataset is balanced.

           counts  percentage
Class                       
negative     500        65.1
positive     268        34.9


In [715]:
# Label encoding

# Encode only when the labels are not integers already. The former test
# `y.any != int` compared a bound method with a type and was therefore always
# True, so integer labels were re-encoded and reported as "not numerical".
if not pd.api.types.is_integer_dtype(y):
    # Step 1: Initialize and fit the LabelEncoder
    encoder = LabelEncoder()
    y = encoder.fit_transform(y)

    # Step 2: Read the mapping from the fitted encoder.
    # LabelEncoder gives code i to encoder.classes_[i] (sorted order), which is
    # not necessarily the order of first appearance stored in label_list.
    # NOTE(review): label_list itself is left untouched; if later cells use
    # label_list[i] as the name of class i, they should use encoder.classes_.
    original_labels = list(encoder.classes_)
    encoded_values = list(range(len(original_labels)))

    # Step 3: Create and display the reference table
    reference_table = dict(zip(original_labels, encoded_values))

    print("The labels are not numerical and have been encoded.\n\nReference Table:")
    for label, encoded in reference_table.items():
        print(f"{label} -> {encoded}")
else:
    print('The labels are numerical.')

The labels are not numerical and have been encoded.

Reference Table:
negative -> 0
positive -> 1


In [716]:
# Attach the (encoded) labels to the features as the column "<label> (Label)"
# and keep the combined table under the name df.
label_column_name = label_col + " (Label)"
X[label_column_name] = y
df = X.copy()
df

Unnamed: 0,Number of times pregnant,Plasma glucose concentration,Diastolic blood pressure,Triceps skin fold thickness,2-Hour serum insulin,Body mass index,Diabetes pedigree function,Age,Class (Label)
686,-0.250952,0.284975,-0.263941,-1.288212,-0.692891,-1.128639,-0.476805,-0.956462,0
105,-0.844885,0.159787,-0.677523,0.530902,0.626910,-0.417892,0.993993,-1.041549,0
369,-0.844885,0.378867,1.700573,0.468173,0.522715,0.102477,-0.718415,1.000557,1
141,0.342981,-0.466156,0.666618,0.593630,-0.692891,0.952836,-0.561368,0.404942,0
722,-0.844885,0.879621,-0.057150,0.530902,0.409837,-0.341740,-0.371101,0.745293,1
...,...,...,...,...,...,...,...,...,...
263,-0.250952,0.660541,0.563223,-0.347291,-0.692891,0.051710,-0.821099,2.532136,0
227,-0.250952,1.286484,-0.884314,1.095454,-0.692891,0.660922,0.543995,-0.786286,1
143,1.827813,-0.403562,-0.160546,-1.288212,-0.692891,0.051710,-0.603650,0.745293,1
258,-0.844885,2.256695,-0.987710,-0.284563,2.563195,-0.773265,0.553055,-0.786286,0


In [717]:
# Optionally write the processed DataFrame to "<name>.csv" (row index not included).
filename = input("Save the processed data? If yes, provide the name for csv file: ")

# An empty answer means nothing is saved.
if filename:
    csv_name = f'{filename}.csv'
    df.to_csv(csv_name, index=False)
    print(f"Data saved as file: {csv_name}")

Data saved as diabetes_processed.csv
