In [1]:
# Libraries for data loading, data manipulation and data visulisation
import pandas as pd  # Data processing and manipulation 
import matplotlib.pyplot as plt  # Data visualization
import seaborn as sns  # Statistical data visualization
import numpy as np  # Numerical operations and linear algebra 

# Libraries for data preparation and model building
from scipy.stats import norm  # Statistical functions
from sklearn.preprocessing import StandardScaler  # Data preprocessing
import warnings  # Warning handling
warnings.filterwarnings('ignore')  # Ignore warnings

# LOADING DATA

In [84]:
# load the data, used the pandas read_csv() to import our dataset 
data = pd.read_excel('DiabetesDB.xlsx', sheet_name=None)
#df.head() #Preview the dataset

In [85]:
# Cocantenation

df = pd.concat(data.values(), ignore_index=True)


In [86]:
df.head()

Unnamed: 0,NAME,SURNAME,GENDER,YOB,PHONE,EMAIL,STATE OF ORIGIN,PHONE CALL,NAME.1,CONTACT,RELATIONSHIP,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT,FILE LINK
0,ALICE,JOSEPH,,70 YEARS,9021451814,NILL,FCT,Yet to Call,ALICE,9021452000.0,DAUGTHER,8.2MMOL,104/67MMHG,,,48.3,
1,AMOS,KPAKACHI,,60 YRS,9076753570,NILL,FCT,Yet to Call,MUSA,9072494000.0,BROTHER,10.1MMOL,127/74MMHG,,,62.7,
2,AYUBA,BARDE,,50 YRS,8076708464,NILL,FCT,Yet to Call,IKO AYUBA,9012614000.0,SON,7.3MM0L,90/60MMHG,,,68.9,
3,AYUBA,GWAZAWA,,1982,8074457724,AYUBAGWAZAWA@GMAIL.COM,FCT,Yet to Call,ESTHER AYUBA,7014740000.0,WIFE,6.9MMOL,123/79MMGH,,,59.8,
4,BEAUTY,SUNDAY,,1998,7011418074,NILL,FCT,Yet to Call,MATHEW SUNDAY,7087563000.0,BROTHER,4.4MMOL,104/80MMHG,,,72.4,


In [87]:
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   NAME             208 non-null    object 
 1   SURNAME          206 non-null    object 
 2   GENDER           2187 non-null   object 
 3   YOB              2233 non-null   object 
 4   PHONE            198 non-null    object 
 5   EMAIL            168 non-null    object 
 6   STATE OF ORIGIN  2245 non-null   object 
 7   PHONE CALL       208 non-null    object 
 8   NAME             157 non-null    object 
 9   CONTACT          158 non-null    float64
 10  RELATIONSHIP     2187 non-null   object 
 11  GLUCOSE          2241 non-null   object 
 12  BLOOD P.         2242 non-null   object 
 13  BMI              1047 non-null   float64
 14  HEIGHT           2143 non-null   object 
 15  WEIGHT           2238 non-null   object 
 16  FILE  LINK       0 non-null      float64
dtypes: float64(3),

Unnamed: 0,CONTACT,BMI,FILE LINK
count,158.0,1047.0,0.0
mean,9533310000.0,27.929809,
std,11731650000.0,6.641164,
min,70867280.0,15.6,
25%,7063358000.0,22.6,
50%,8064963000.0,27.3,
75%,9012270000.0,32.6,
max,91579320000.0,46.5,


In [88]:
df.columns

Index(['NAME ', 'SURNAME', 'GENDER', 'YOB ', 'PHONE ', 'EMAIL',
       'STATE OF ORIGIN', 'PHONE CALL ', 'NAME', 'CONTACT', 'RELATIONSHIP',
       'GLUCOSE ', 'BLOOD P.', 'BMI', 'HEIGHT', 'WEIGHT', 'FILE  LINK'],
      dtype='object')

## DATA PREPROCESSING

In [89]:
# Step 1: Create 'Full_Name' column
df['FULL_NAME'] = df['NAME '].fillna('') + ' ' + df['SURNAME'].fillna('')

# Step 2: Reorder columns to place 'Full_Name' first
columns = ['FULL_NAME'] + [col for col in df.columns if col not in ['FULL_NAME']]
df = df[columns]


In [90]:
print(df.columns)

Index(['FULL_NAME', 'NAME ', 'SURNAME', 'GENDER', 'YOB ', 'PHONE ', 'EMAIL',
       'STATE OF ORIGIN', 'PHONE CALL ', 'NAME', 'CONTACT', 'RELATIONSHIP',
       'GLUCOSE ', 'BLOOD P.', 'BMI', 'HEIGHT', 'WEIGHT', 'FILE  LINK'],
      dtype='object')


In [91]:
df = df.drop(['FULL_NAME', 'NAME ', 'SURNAME', 'PHONE ', 'EMAIL','STATE OF ORIGIN', 'NAME', 'PHONE CALL ', 'CONTACT', 'RELATIONSHIP', 'FILE  LINK'], axis=1)

In [92]:
print(df.columns)

Index(['GENDER', 'YOB ', 'GLUCOSE ', 'BLOOD P.', 'BMI', 'HEIGHT', 'WEIGHT'], dtype='object')


### Code to Clean BMI Column

In [93]:
import re

def clean_and_convert_to_meters(value):
    # Remove non-numeric characters and decimal points
    cleaned_value = re.sub(r'\D', '', str(value))
    
    if cleaned_value:
        # Convert to float and treat it as centimeters
        height_in_cm = float(cleaned_value)
        # Convert cm to meters
        return height_in_cm / 100
    else:
        return None  # Return None if value is empty or non-numeric

# Apply the function to the 'height' column
df['HEIGHT'] = df['HEIGHT'].apply(clean_and_convert_to_meters)


# Apply the function to the 'height(cm)' column
#df['HEIGHT (cm)'] = df['HEIGHT (cm)'].apply(clean_and_convert_to_meters)



In [94]:
# Create a new 'height' column
#df['HEIGHT'] = df['HEIGHT'].fillna(df['HEIGHT (cm)'])

# Drop the 'height (cm)' column now that data is merged
#df.drop(columns=['HEIGHT (cm)'], inplace=True)

# Calculate the mean height from available values and round to 2 decimal places
mean_height = round(df['HEIGHT'].mean(), 2)

# Fill missing values in 'height' with the rounded mean
df['HEIGHT'].fillna(mean_height, inplace=True)

In [95]:
# Function to remove non-numeric characters from weight column
def clean_weight(value):
    # Remove all non-numeric characters, keep only digits and decimal points
    cleaned_value = re.sub(r'[^0-9.]', '', str(value))
    return float(cleaned_value) if cleaned_value else None



# Apply the function to the 'weight' column
df['WEIGHT'] = df['WEIGHT'].apply(clean_weight)


In [96]:
# Calculate the mean height from available values and round to 2 decimal places
mean_weight = round(df['WEIGHT'].mean(), 2)

df['WEIGHT'].fillna(mean_weight, inplace=True)

In [97]:
# Calculate BMI and add it as a new column in the DataFrame
df['BMI'] = df['WEIGHT'] / (df['HEIGHT'] ** 2)

# Round BMI to 2 decimal places
df['BMI'] = df['BMI'].round(2)

In [98]:
df.head(227)

Unnamed: 0,GENDER,YOB,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT
0,,70 YEARS,8.2MMOL,104/67MMHG,0.61,8.90,48.3
1,,60 YRS,10.1MMOL,127/74MMHG,0.79,8.90,62.7
2,,50 YRS,7.3MM0L,90/60MMHG,0.87,8.90,68.9
3,,1982,6.9MMOL,123/79MMGH,0.75,8.90,59.8
4,,1998,4.4MMOL,104/80MMHG,0.91,8.90,72.4
...,...,...,...,...,...,...,...
222,FEMALE,1982,5.2MMOL,159/93,41.27,1.64,111.0
223,FEMALE,1982,5.7MMOL,106/75,25.20,1.58,62.9
224,MALE,1986,2,107/75,26.16,1.74,79.2
225,MALE,1970,21.1,125/85,30.22,1.80,97.9


In [99]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   GENDER    2187 non-null   object 
 1   YOB       2233 non-null   object 
 2   GLUCOSE   2241 non-null   object 
 3   BLOOD P.  2242 non-null   object 
 4   BMI       2247 non-null   float64
 5   HEIGHT    2247 non-null   float64
 6   WEIGHT    2247 non-null   float64
dtypes: float64(3), object(4)
memory usage: 123.0+ KB


Unnamed: 0,BMI,HEIGHT,WEIGHT
count,2247.0,2247.0,2247.0
mean,15.133934,8.90142,81.424428
std,15.895716,7.487699,20.02398
min,0.05,1.49,4.3
25%,0.29,1.69,64.6
50%,15.17,1.9,80.2
75%,28.67,16.82,97.3
max,53.29,18.99,141.8


### Code to Clean Gender Column

In [100]:
# Fill missing values in 'Gender' with 'Unknown'
df['GENDER'].fillna('Unknown', inplace=True)

# Define mapping for 'Male' and 'Female', with 'Unknown' as a separate category if needed
gender_mapping = {'MALE': 0, 'FEMALE': 1, 'MAKE': 0, 'Unknown': 0, 'FAMELE': 1, 'FEMAE': 1}
df['GENDER'] = df['GENDER'].map(gender_mapping)


In [101]:
df.head(227)

Unnamed: 0,GENDER,YOB,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT
0,0.0,70 YEARS,8.2MMOL,104/67MMHG,0.61,8.90,48.3
1,0.0,60 YRS,10.1MMOL,127/74MMHG,0.79,8.90,62.7
2,0.0,50 YRS,7.3MM0L,90/60MMHG,0.87,8.90,68.9
3,0.0,1982,6.9MMOL,123/79MMGH,0.75,8.90,59.8
4,0.0,1998,4.4MMOL,104/80MMHG,0.91,8.90,72.4
...,...,...,...,...,...,...,...
222,1.0,1982,5.2MMOL,159/93,41.27,1.64,111.0
223,1.0,1982,5.7MMOL,106/75,25.20,1.58,62.9
224,0.0,1986,2,107/75,26.16,1.74,79.2
225,0.0,1970,21.1,125/85,30.22,1.80,97.9


In [102]:
df.head(50)

Unnamed: 0,GENDER,YOB,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT
0,0.0,70 YEARS,8.2MMOL,104/67MMHG,0.61,8.9,48.3
1,0.0,60 YRS,10.1MMOL,127/74MMHG,0.79,8.9,62.7
2,0.0,50 YRS,7.3MM0L,90/60MMHG,0.87,8.9,68.9
3,0.0,1982,6.9MMOL,123/79MMGH,0.75,8.9,59.8
4,0.0,1998,4.4MMOL,104/80MMHG,0.91,8.9,72.4
5,0.0,70 YRS,,108/65MMHG,0.72,8.9,57.3
6,0.0,1991,7.4MMOL,97/66MMHG,1.23,8.9,97.7
7,0.0,2005,5.5MMOL,104/61MMHG,0.77,8.9,61.0
8,0.0,35 YRS,4.8MMOL,99/76MMHG,0.66,8.9,52.1
9,0.0,50 YRS,7.7MMOL,138/75MMHG,0.73,8.9,57.6


### Code to Clean Age Column

In [103]:
import pandas as pd
from datetime import datetime

# Sample data with 'YOB' column (containing both year of birth and age)
#data = {
    #'YOB ': ['1995', '74 years', '1985', 'N/A', '30', '2000', '45 years', '1982', '56', 'unknown']
#}
#df = pd.DataFrame(data)

# Get the current year
current_year = datetime.now().year

# Function to clean and convert the YOB data
def clean_yob(value):
    if isinstance(value, str):  # Check if the value is a string
        # Try to extract the numeric part from the string
        digits = ''.join([ch for ch in value if ch.isdigit()])
        if len(digits) == 4:  # If it's a 4-digit number, assume it's a year of birth
            return int(digits)
        elif len(digits) > 0 and len(digits) < 3:  # If it's age (usually 2 digits), calculate YOB
            try:
                return current_year - int(digits)
            except ValueError:
                return None
    elif isinstance(value, (int, float)):  # If the value is already numeric
        if value < current_year:  # If the number is less than the current year, assume it's a year of birth
            return int(value)
    return None  # For any invalid cases (e.g., 'N/A', 'unknown', etc.)

# Apply the function to clean the 'YOB' column
df['YOB_cleaned'] = df['YOB '].apply(clean_yob)

# Display the cleaned DataFrame
print(df)


      GENDER      YOB   GLUCOSE     BLOOD P.   BMI  HEIGHT  WEIGHT  \
0        0.0  70 YEARS   8.2MMOL  104/67MMHG  0.61    8.90    48.3   
1        0.0    60 YRS  10.1MMOL  127/74MMHG  0.79    8.90    62.7   
2        0.0    50 YRS   7.3MM0L   90/60MMHG  0.87    8.90    68.9   
3        0.0      1982   6.9MMOL  123/79MMGH  0.75    8.90    59.8   
4        0.0      1998   4.4MMOL  104/80MMHG  0.91    8.90    72.4   
...      ...       ...       ...         ...   ...     ...     ...   
2242     0.0      1982   6.3MMOL  155/92MMHG  0.16   18.93    57.9   
2243     1.0      1967   6.3MMOL  117/79MMHG  0.29   17.26    85.4   
2244     1.0      1984   5.5MMOL  115/84MMHG  0.37   15.43    87.8   
2245     1.0      1974   7.2MMOL  140/67MMHG  0.29   15.56    70.3   
2246     0.0      1961   7.4MMOL  136/86MMHG  0.25   18.98    90.3   

      YOB_cleaned  
0          1954.0  
1          1964.0  
2          1974.0  
3          1982.0  
4          1998.0  
...           ...  
2242       1982.0  

In [104]:
# Assuming `df` is the DataFrame with the cleaned 'YOB_cleaned' column
current_year = datetime.now().year

# Calculate age by subtracting the year of birth from the current year
df['Age'] = current_year - df['YOB_cleaned']

# Convert the 'Age' column to integer, handling missing values
df['Age'] = df['Age'].fillna(df['Age'].median()).astype(int)

# Display the DataFrame with the new 'Age' column
print(df)


      GENDER      YOB   GLUCOSE     BLOOD P.   BMI  HEIGHT  WEIGHT  \
0        0.0  70 YEARS   8.2MMOL  104/67MMHG  0.61    8.90    48.3   
1        0.0    60 YRS  10.1MMOL  127/74MMHG  0.79    8.90    62.7   
2        0.0    50 YRS   7.3MM0L   90/60MMHG  0.87    8.90    68.9   
3        0.0      1982   6.9MMOL  123/79MMGH  0.75    8.90    59.8   
4        0.0      1998   4.4MMOL  104/80MMHG  0.91    8.90    72.4   
...      ...       ...       ...         ...   ...     ...     ...   
2242     0.0      1982   6.3MMOL  155/92MMHG  0.16   18.93    57.9   
2243     1.0      1967   6.3MMOL  117/79MMHG  0.29   17.26    85.4   
2244     1.0      1984   5.5MMOL  115/84MMHG  0.37   15.43    87.8   
2245     1.0      1974   7.2MMOL  140/67MMHG  0.29   15.56    70.3   
2246     0.0      1961   7.4MMOL  136/86MMHG  0.25   18.98    90.3   

      YOB_cleaned  Age  
0          1954.0   70  
1          1964.0   60  
2          1974.0   50  
3          1982.0   42  
4          1998.0   26  
...      

### Code to Clean Glucose Column

In [105]:
# First, clean the 'glucose' column
df['GLUCOSE '] = df['GLUCOSE '].str.replace('MMOL', '').str.strip()  # Remove 'MMOL' and any extra spaces
df['GLUCOSE '] = pd.to_numeric(df['GLUCOSE '], errors='coerce')      # Convert to numeric, set invalid parsing as NaN

# Fill missing values with the mean (you could also use median or another method)
df['GLUCOSE '].fillna(df['GLUCOSE '].mean(), inplace=True)

# Round the 'glucose' values to 2 decimal places
df['GLUCOSE '] = df['GLUCOSE '].round(2)

# Display the cleaned and rounded 'glucose' column
print(df['GLUCOSE '])

0        8.20
1       10.10
2        6.18
3        6.90
4        4.40
        ...  
2242     6.30
2243     6.30
2244     5.50
2245     7.20
2246     7.40
Name: GLUCOSE , Length: 2247, dtype: float64


### Code to Clean Blood Pressure Column

In [106]:
# Remove 'MMHG' and any extra spaces
df['BLOOD P.'] = df['BLOOD P.'].str.replace('MMHG', '').str.strip()

# Split the blood pressure values into systolic and diastolic columns
df[['Systolic', 'Diastolic']] = df['BLOOD P.'].str.split('/', expand=True)

# Convert both columns to numeric
df['Systolic'] = pd.to_numeric(df['Systolic'], errors='coerce')
df['Diastolic'] = pd.to_numeric(df['Diastolic'], errors='coerce')

# Optional: Round the values to 2 decimal places (if necessary)
df['Systolic'] = df['Systolic'].round(2)
df['Diastolic'] = df['Diastolic'].round(2)

# Optionally, you can drop the original 'BLOOD P.' column
df.drop(columns=['BLOOD P.'], inplace=True)

# Display the cleaned data
print(df[['Systolic', 'Diastolic']])


      Systolic  Diastolic
0        104.0       67.0
1        127.0       74.0
2         90.0       60.0
3        123.0        NaN
4        104.0       80.0
...        ...        ...
2242     155.0       92.0
2243     117.0       79.0
2244     115.0       84.0
2245     140.0       67.0
2246     136.0       86.0

[2247 rows x 2 columns]


In [107]:
df.columns

Index(['GENDER', 'YOB ', 'GLUCOSE ', 'BMI', 'HEIGHT', 'WEIGHT', 'YOB_cleaned',
       'Age', 'Systolic', 'Diastolic'],
      dtype='object')

### FINAL ORDERED COLUMNS

In [108]:
# Remove unnecessary columns
df.drop(columns=['YOB ', 'YOB_cleaned'], inplace=True)

# Define the new column order
new_order = ['GENDER', 'Age', 'GLUCOSE ', 'Systolic', 'Diastolic', 'HEIGHT', 'WEIGHT', 'BMI']

# Rearrange the columns
df = df[new_order]

# Display the updated DataFrame
print(df.head())


   GENDER  Age  GLUCOSE   Systolic  Diastolic  HEIGHT  WEIGHT   BMI
0     0.0   70      8.20     104.0       67.0     8.9    48.3  0.61
1     0.0   60     10.10     127.0       74.0     8.9    62.7  0.79
2     0.0   50      6.18      90.0       60.0     8.9    68.9  0.87
3     0.0   42      6.90     123.0        NaN     8.9    59.8  0.75
4     0.0   26      4.40     104.0       80.0     8.9    72.4  0.91


### Creating column for Diabetes Status

Use Other Health Indicators to Create a Diabetes Indicator
* BMI: Obesity (BMI > 30) is a major risk factor for type 2 diabetes. You could classify patients as potentially diabetic based on their BMI.
* Blood Pressure: Hypertension (high blood pressure) is often associated with diabetes. You could use the Systolic and Diastolic values to classify patients.
* Age: Older age is a risk factor for diabetes, so you might use age as a feature.
* Multiple Criteria: You can create a composite rule that combines multiple factors, such as glucose, BMI, and blood pressure, to classify a person as diabetic or not.

For example, you might define a "high risk for diabetes" based on:

* Glucose level: even if not over 126 mg/dL, you could consider levels approaching high-normal ranges, e.g., between 100-125 mg/dL, as indicative of prediabetes.
* BMI: BMI greater than 30 can indicate obesity, which is linked with a higher risk of diabetes.
* Blood Pressure: High systolic and diastolic values can be considered risk factors.
* Age: Diabetics tend to be older, especially those with type 2 diabetes.

In [109]:
df['Diabetes'] = ((df['BMI'] > 30) | 
                  (df['Systolic'] > 140) | 
                  (df['Diastolic'] > 90) | 
                  (df['Age'] > 45)).astype(int)
df['Diabetes'] = df['Diabetes'].astype(int)  # Convert to 0 (no diabetes) and 1 (diabetes)

In [110]:
df.head(200)

Unnamed: 0,GENDER,Age,GLUCOSE,Systolic,Diastolic,HEIGHT,WEIGHT,BMI,Diabetes
0,0.0,70,8.20,104.0,67.0,8.90,48.3,0.61,1
1,0.0,60,10.10,127.0,74.0,8.90,62.7,0.79,1
2,0.0,50,6.18,90.0,60.0,8.90,68.9,0.87,1
3,0.0,42,6.90,123.0,,8.90,59.8,0.75,0
4,0.0,26,4.40,104.0,80.0,8.90,72.4,0.91,0
...,...,...,...,...,...,...,...,...,...
195,1.0,45,5.40,129.0,80.0,1.60,99.9,39.02,1
196,1.0,28,4.80,87.0,76.0,1.67,68.4,24.53,0
197,0.0,51,5.90,105.0,83.0,1.73,80.5,26.90,1
198,1.0,38,5.80,129.0,81.0,1.65,94.3,34.64,1


In [111]:
df.columns

Index(['GENDER', 'Age', 'GLUCOSE ', 'Systolic', 'Diastolic', 'HEIGHT',
       'WEIGHT', 'BMI', 'Diabetes'],
      dtype='object')

In [112]:
def bp_category(row):
    if row['Systolic'] < 120 and row['Diastolic'] < 80:
        return 'Normal'
    elif 120 <= row['Systolic'] <= 129 and row['Diastolic'] < 80:
        return 'Elevated'
    elif 130 <= row['Systolic'] <= 139 or 80 <= row['Diastolic'] <= 89:
        return 'Hypertension Stage 1'
    else:
        return 'Hypertension Stage 2'

df['BP_Category'] = df.apply(bp_category, axis=1)


In [113]:
def bmi_category(bmi):
    if bmi < 18.5:
        return 'Underweight'
    elif 18.5 <= bmi < 24.9:
        return 'Normal'
    elif 25 <= bmi < 29.9:
        return 'Overweight'
    else:
        return 'Obese'

df['BMI_Category'] = df['BMI'].apply(bmi_category)


In [114]:
bins = [0, 20, 40, 60, 80, 100]
labels = ['0-20', '21-40', '41-60', '61-80', '81+']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)


In [115]:
df['Height_Weight_Ratio'] = df['HEIGHT'] / df['WEIGHT']


In [75]:
df['Age_BMI_Interaction'] = df['Age'] * df['BMI']


In [76]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_cols = ['GLUCOSE ', 'Systolic', 'Diastolic', 'HEIGHT', 'WEIGHT']
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])


In [116]:
df = pd.get_dummies(df, columns=['BP_Category', 'BMI_Category'], drop_first=True)


In [123]:
df.head()

Unnamed: 0,GENDER,Age,GLUCOSE,Systolic,Diastolic,HEIGHT,WEIGHT,BMI,Diabetes,Age_Group,Height_Weight_Ratio,BP_Category_Hypertension Stage 1,BP_Category_Hypertension Stage 2,BP_Category_Normal,BMI_Category_Obese,BMI_Category_Overweight,BMI_Category_Underweight
0,0.0,70,8.2,104.0,67.0,8.9,48.3,0.61,1,61-80,0.184265,0,0,1,0,0,1
1,0.0,60,10.1,127.0,74.0,8.9,62.7,0.79,1,41-60,0.141946,0,0,0,0,0,1
2,0.0,50,6.18,90.0,60.0,8.9,68.9,0.87,1,41-60,0.129173,0,0,1,0,0,1
3,0.0,42,6.9,123.0,,8.9,59.8,0.75,0,41-60,0.148829,0,1,0,0,0,1
4,0.0,26,4.4,104.0,80.0,8.9,72.4,0.91,0,21-40,0.122928,1,0,0,0,0,1


In [118]:
# Correlation with target
correlation_matrix = df.corr()
target_correlation = correlation_matrix['Diabetes'].drop('Diabetes')
selected_features = target_correlation[abs(target_correlation) > 0.1].index

print("Selected Features Based on Correlation:")
print(selected_features)


Selected Features Based on Correlation:
Index(['Age', 'Systolic', 'Diastolic', 'HEIGHT', 'WEIGHT', 'BMI',
       'Height_Weight_Ratio', 'BP_Category_Hypertension Stage 2',
       'BP_Category_Normal', 'BMI_Category_Obese', 'BMI_Category_Underweight'],
      dtype='object')


In [124]:
X = df[['Age', 'Systolic', 'Diastolic', 'HEIGHT', 'WEIGHT', 'BMI',
       'Height_Weight_Ratio', 'BP_Category_Hypertension Stage 2',
       'BP_Category_Normal', 'BMI_Category_Obese', 'BMI_Category_Underweight']]
Y = df['Diabetes']


In [125]:
print(X.isnull().sum())

Age                                 0
Systolic                            5
Diastolic                           6
HEIGHT                              0
WEIGHT                              0
BMI                                 0
Height_Weight_Ratio                 0
BP_Category_Hypertension Stage 2    0
BP_Category_Normal                  0
BMI_Category_Obese                  0
BMI_Category_Underweight            0
dtype: int64


In [126]:
from sklearn.impute import SimpleImputer

# Create an imputer to fill missing values with the mean
imputer = SimpleImputer(strategy='mean')

# Impute the missing values in X
X_imputed = imputer.fit_transform(X)


In [127]:
# Check again for missing values in X
print(pd.DataFrame(X_imputed).isnull().sum())


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
dtype: int64


### TRAIN TEST SPLIT AND MODEL TRAINING WITH ACCURACY CHECK

In [128]:
# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_imputed, Y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, Y_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
Y_pred_lr = model.predict(X_test_scaled)

accuracy_lr = accuracy_score(Y_test, Y_pred_lr) * 100
print(f'Logistic Regression Accuracy: {accuracy_lr:.2f}%')
print(f'Classification Report (Logistic Regression):\n{classification_report(Y_test, Y_pred_lr)}')


Logistic Regression Accuracy: 94.00%
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.90      0.84      0.87       104
           1       0.95      0.97      0.96       346

    accuracy                           0.94       450
   macro avg       0.92      0.90      0.91       450
weighted avg       0.94      0.94      0.94       450



In [129]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_rf = accuracy_score(Y_test, Y_pred_rf) * 100
print(f'Random Forest Accuracy: {accuracy_rf:.2f}%')

# Classification report
print(f'Classification Report (Random Forest):\n{classification_report(Y_test, Y_pred_rf)}')


Random Forest Accuracy: 99.78%
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       104
           1       1.00      1.00      1.00       346

    accuracy                           1.00       450
   macro avg       1.00      1.00      1.00       450
weighted avg       1.00      1.00      1.00       450



In [130]:
from sklearn.svm import SVC

# Train an SVM model
svm_model = SVC(random_state=42)
svm_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_svm = svm_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_svm = accuracy_score(Y_test, Y_pred_svm) * 100
print(f'SVM Accuracy: {accuracy_svm:.2f}%')

# Classification report
print(f'Classification Report (SVM):\n{classification_report(Y_test, Y_pred_svm)}')


SVM Accuracy: 94.89%
Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.94      0.84      0.88       104
           1       0.95      0.98      0.97       346

    accuracy                           0.95       450
   macro avg       0.94      0.91      0.93       450
weighted avg       0.95      0.95      0.95       450



In [131]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_gb = gb_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_gb = accuracy_score(Y_test, Y_pred_gb) * 100
print(f'Gradient Boosting Accuracy: {accuracy_gb:.2f}%')

# Classification report
print(f'Classification Report (Gradient Boosting):\n{classification_report(Y_test, Y_pred_gb)}')


Gradient Boosting Accuracy: 100.00%
Classification Report (Gradient Boosting):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       104
           1       1.00      1.00      1.00       346

    accuracy                           1.00       450
   macro avg       1.00      1.00      1.00       450
weighted avg       1.00      1.00      1.00       450



In [132]:
from sklearn.neighbors import KNeighborsClassifier

# Train a KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_knn = knn_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_knn = accuracy_score(Y_test, Y_pred_knn) * 100
print(f'KNN Accuracy: {accuracy_knn:.2f}%')

# Classification report
print(f'Classification Report (KNN):\n{classification_report(Y_test, Y_pred_knn)}')


KNN Accuracy: 88.44%
Classification Report (KNN):
              precision    recall  f1-score   support

           0       0.80      0.67      0.73       104
           1       0.91      0.95      0.93       346

    accuracy                           0.88       450
   macro avg       0.85      0.81      0.83       450
weighted avg       0.88      0.88      0.88       450



In [133]:
from sklearn.neural_network import MLPClassifier

# Train an MLPClassifier (Neural Network)
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
mlp_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_mlp = mlp_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_mlp = accuracy_score(Y_test, Y_pred_mlp) * 100
print(f'MLP Accuracy: {accuracy_mlp:.2f}%')

# Classification report
print(f'Classification Report (MLP):\n{classification_report(Y_test, Y_pred_mlp)}')


MLP Accuracy: 98.00%
Classification Report (MLP):
              precision    recall  f1-score   support

           0       0.95      0.96      0.96       104
           1       0.99      0.99      0.99       346

    accuracy                           0.98       450
   macro avg       0.97      0.97      0.97       450
weighted avg       0.98      0.98      0.98       450



In [135]:
# Create a dictionary to store model names and accuracies
model_accuracies = {}

print("List of used Models in order of accuracy\n")

# Example for Gradient Boosting Model
model_accuracies["Gradient Boosting"] = accuracy_gb
# Example for Random Forest Model
model_accuracies["Random Forest"] = accuracy_rf
# Example for MLP (Neural Network) Model
model_accuracies["MLP"] = accuracy_mlp
# Example for Support Vector Machine Model
model_accuracies["SVM"] = accuracy_svm
# Example for Logistic Regression Model
model_accuracies["LR"] = accuracy_lr
# Example for KNN Model
model_accuracies["KNN"] = accuracy_knn



# Display the model accuracies
for model, accuracy in model_accuracies.items():
    print(f'{model}: {accuracy:.2f}%')


List of used Models in order of accuracy

Gradient Boosting: 100.00%
Random Forest: 99.78%
MLP: 98.00%
SVM: 94.89%
LR: 94.00%
KNN: 88.44%
