In [1]:
# Libraries for data loading, data manipulation and data visulisation
import pandas as pd  # Data processing and manipulation 
import matplotlib.pyplot as plt  # Data visualization
import seaborn as sns  # Statistical data visualization
import numpy as np  # Numerical operations and linear algebra 

# Libraries for data preparation and model building
from scipy.stats import norm  # Statistical functions
from sklearn.preprocessing import StandardScaler  # Data preprocessing
import warnings  # Warning handling
warnings.filterwarnings('ignore')  # Ignore warnings

# LOADING DATA

In [240]:
# load the data, used the pandas read_csv() to import our dataset 
data = pd.read_excel('DiabetesDB.xlsx', sheet_name=None)
#df.head() #Preview the dataset

In [241]:
# Cocantenation

df = pd.concat(data.values(), ignore_index=True)


In [242]:
df.head(227)

Unnamed: 0,NAME,SURNAME,GENDER,YOB,PHONE,EMAIL,STATE OF ORIGIN,PHONE CALL,NAME.1,CONTACT,RELATIONSHIP,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT,FILE LINK,HEIGHT (cm)
0,ALICE,JOSEPH,,70 YEARS,9021451814,NILL,FCT,Yet to Call,ALICE,9.021452e+09,DAUGTHER,8.2MMOL,104/67MMHG,,,48.3,,
1,AMOS,KPAKACHI,,60 YRS,9076753570,NILL,FCT,Yet to Call,MUSA,9.072494e+09,BROTHER,10.1MMOL,127/74MMHG,,,62.7,,
2,AYUBA,BARDE,,50 YRS,8076708464,NILL,FCT,Yet to Call,IKO AYUBA,9.012614e+09,SON,7.3MM0L,90/60MMHG,,,68.9,,
3,AYUBA,GWAZAWA,,1982,8074457724,AYUBAGWAZAWA@GMAIL.COM,FCT,Yet to Call,ESTHER AYUBA,7.014740e+09,WIFE,6.9MMOL,123/79MMGH,,,59.8,,
4,BEAUTY,SUNDAY,,1998,7011418074,NILL,FCT,Yet to Call,MATHEW SUNDAY,7.087563e+09,BROTHER,4.4MMOL,104/80MMHG,,,72.4,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
222,SIMI,STEPHEN,FEMALE,2000.0,9121036617,SIMISTEPHEN66@GMAIL.COM,Plateau,Yet to Call,STEPHEN,9.121037e+09,FATHER,8.2MMOL,85/64MMHG,,149CM,55.5KG,,
223,ALIEGBA,SAMUEL,FEMALE,2002.0,7019270203,SCUPTERSEER@GMAIL.COM,Benue,Yet to Call,ALIEGBA SAMUEL,7.019270e+09,,4.8MMOL,113/70MMHG,,174CM,78.8KG,,
224,HOPE,DANIEL,FEMALE,1987.0,8178057634,NILL,Kaduna,Yet to Call,AYUBA BABA,8.032538e+08,BROTHER,8.3MMOL,115/94MMHG,,15.9CM,90.4KG,,
225,DAVID,MOSES,MALE,2001.0,81044755452,NILL,Anambra,Yet to Call,,,,5.2MMOL,116/75MMHG,,15.8CM,58.0KG,,


In [243]:
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   NAME             151 non-null    object 
 1   SURNAME          149 non-null    object 
 2   GENDER           91 non-null     object 
 3   YOB              145 non-null    object 
 4   PHONE            146 non-null    object 
 5   EMAIL            131 non-null    object 
 6   STATE OF ORIGIN  149 non-null    object 
 7   PHONE CALL       151 non-null    object 
 8   NAME             127 non-null    object 
 9   CONTACT          126 non-null    float64
 10  RELATIONSHIP     128 non-null    object 
 11  GLUCOSE          147 non-null    object 
 12  BLOOD P.         148 non-null    object 
 13  BMI              35 non-null     float64
 14  HEIGHT           40 non-null     object 
 15  WEIGHT           145 non-null    object 
 16  FILE  LINK       0 non-null      float64
 17  HEIGHT (cm)     

Unnamed: 0,CONTACT,BMI,FILE LINK
count,126.0,35.0,0.0
mean,10106300000.0,26.670571,
std,13052820000.0,4.933087,
min,70867280.0,19.3,
25%,7078705000.0,22.465,
50%,8075620000.0,26.36,
75%,9024573000.0,29.63,
max,91579320000.0,39.0,


## DATA PREPROCESSING

In [244]:
# Step 1: Create 'Full_Name' column
df['FULL_NAME'] = df['NAME '].fillna('') + ' ' + df['SURNAME'].fillna('')

# Step 2: Reorder columns to place 'Full_Name' first
columns = ['FULL_NAME'] + [col for col in df.columns if col not in ['FULL_NAME']]
df = df[columns]


In [245]:
print(df.columns)

Index(['FULL_NAME', 'NAME ', 'SURNAME', 'GENDER', 'YOB ', 'PHONE ', 'EMAIL',
       'STATE OF ORIGIN', 'PHONE CALL ', 'NAME', 'CONTACT', 'RELATIONSHIP',
       'GLUCOSE ', 'BLOOD P.', 'BMI', 'HEIGHT', 'WEIGHT', 'FILE  LINK',
       'HEIGHT (cm)'],
      dtype='object')


In [246]:
df = df.drop(['NAME ', 'SURNAME', 'PHONE ', 'EMAIL','STATE OF ORIGIN', 'NAME', 'PHONE CALL ', 'CONTACT', 'RELATIONSHIP', 'FILE  LINK'], axis=1)

In [247]:
print(df.columns)

Index(['FULL_NAME', 'GENDER', 'YOB ', 'GLUCOSE ', 'BLOOD P.', 'BMI', 'HEIGHT',
       'WEIGHT', 'HEIGHT (cm)'],
      dtype='object')


### Code to Clean BMI Column

In [248]:
import re

def clean_and_convert_to_meters(value):
    # Remove non-numeric characters and decimal points
    cleaned_value = re.sub(r'\D', '', str(value))
    
    if cleaned_value:
        # Convert to float and treat it as centimeters
        height_in_cm = float(cleaned_value)
        # Convert cm to meters
        return height_in_cm / 100
    else:
        return None  # Return None if value is empty or non-numeric

# Apply the function to the 'height' column
df['HEIGHT'] = df['HEIGHT'].apply(clean_and_convert_to_meters)


# Apply the function to the 'height(cm)' column
df['HEIGHT (cm)'] = df['HEIGHT (cm)'].apply(clean_and_convert_to_meters)



In [249]:
# Create a new 'height' column
df['HEIGHT'] = df['HEIGHT'].fillna(df['HEIGHT (cm)'])

# Drop the 'height (cm)' column now that data is merged
df.drop(columns=['HEIGHT (cm)'], inplace=True)

# Calculate the mean height from available values and round to 2 decimal places
mean_height = round(df['HEIGHT'].mean(), 2)

# Fill missing values in 'height' with the rounded mean
df['HEIGHT'].fillna(mean_height, inplace=True)

In [250]:
# Function to remove non-numeric characters from weight column
def clean_weight(value):
    # Remove all non-numeric characters, keep only digits and decimal points
    cleaned_value = re.sub(r'[^0-9.]', '', str(value))
    return float(cleaned_value) if cleaned_value else None

# Apply the function to the 'weight' column
df['WEIGHT'] = df['WEIGHT'].apply(clean_weight)

In [251]:
# Calculate BMI and add it as a new column in the DataFrame
df['BMI'] = df['WEIGHT'] / (df['HEIGHT'] ** 2)

# Round BMI to 2 decimal places
df['BMI'] = df['BMI'].round(2)

In [252]:
df.head(227)

Unnamed: 0,FULL_NAME,GENDER,YOB,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT
0,ALICE JOSEPH,,70 YEARS,8.2MMOL,104/67MMHG,8.53,2.38,48.3
1,AMOS KPAKACHI,,60 YRS,10.1MMOL,127/74MMHG,11.07,2.38,62.7
2,AYUBA BARDE,,50 YRS,7.3MM0L,90/60MMHG,12.16,2.38,68.9
3,AYUBA GWAZAWA,,1982,6.9MMOL,123/79MMGH,10.56,2.38,59.8
4,BEAUTY SUNDAY,,1998,4.4MMOL,104/80MMHG,12.78,2.38,72.4
...,...,...,...,...,...,...,...,...
222,SIMI STEPHEN,FEMALE,2000.0,8.2MMOL,85/64MMHG,25.00,1.49,55.5
223,ALIEGBA SAMUEL,FEMALE,2002.0,4.8MMOL,113/70MMHG,26.03,1.74,78.8
224,HOPE DANIEL,FEMALE,1987.0,8.3MMOL,115/94MMHG,35.76,1.59,90.4
225,DAVID MOSES,MALE,2001.0,5.2MMOL,116/75MMHG,23.23,1.58,58.0


In [253]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   FULL_NAME  227 non-null    object 
 1   GENDER     91 non-null     object 
 2   YOB        145 non-null    object 
 3   GLUCOSE    147 non-null    object 
 4   BLOOD P.   148 non-null    object 
 5   BMI        145 non-null    float64
 6   HEIGHT     227 non-null    float64
 7   WEIGHT     145 non-null    float64
dtypes: float64(3), object(5)
memory usage: 14.3+ KB


Unnamed: 0,BMI,HEIGHT,WEIGHT
count,145.0,227.0,145.0
mean,18.576207,2.381189,68.006897
std,9.882518,1.857289,18.493365
min,0.21,1.49,22.2
25%,10.1,1.74,55.3
50%,18.78,2.38,63.4
75%,26.03,2.38,78.8
max,49.65,16.48,141.8


### Code to Clean Gender Column

In [254]:
# Fill missing values in 'Gender' with 'Unknown'
df['GENDER'].fillna('Unknown', inplace=True)

# Define mapping for 'Male' and 'Female', with 'Unknown' as a separate category if needed
gender_mapping = {'MALE': 0, 'FEMALE': 1, 'Unknown': 2}
df['GENDER'] = df['GENDER'].map(gender_mapping)


In [255]:
df.head(227)

Unnamed: 0,FULL_NAME,GENDER,YOB,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT
0,ALICE JOSEPH,2.0,70 YEARS,8.2MMOL,104/67MMHG,8.53,2.38,48.3
1,AMOS KPAKACHI,2.0,60 YRS,10.1MMOL,127/74MMHG,11.07,2.38,62.7
2,AYUBA BARDE,2.0,50 YRS,7.3MM0L,90/60MMHG,12.16,2.38,68.9
3,AYUBA GWAZAWA,2.0,1982,6.9MMOL,123/79MMGH,10.56,2.38,59.8
4,BEAUTY SUNDAY,2.0,1998,4.4MMOL,104/80MMHG,12.78,2.38,72.4
...,...,...,...,...,...,...,...,...
222,SIMI STEPHEN,1.0,2000.0,8.2MMOL,85/64MMHG,25.00,1.49,55.5
223,ALIEGBA SAMUEL,1.0,2002.0,4.8MMOL,113/70MMHG,26.03,1.74,78.8
224,HOPE DANIEL,1.0,1987.0,8.3MMOL,115/94MMHG,35.76,1.59,90.4
225,DAVID MOSES,0.0,2001.0,5.2MMOL,116/75MMHG,23.23,1.58,58.0


### Code to Clean Age Column

In [256]:
import pandas as pd
from datetime import datetime

# Sample data with 'YOB' column (containing both year of birth and age)
#data = {
    #'YOB ': ['1995', '74 years', '1985', 'N/A', '30', '2000', '45 years', '1982', '56', 'unknown']
#}
#df = pd.DataFrame(data)

# Get the current year
current_year = datetime.now().year

# Function to clean and convert the YOB data
def clean_yob(value):
    if isinstance(value, str):  # Check if the value is a string
        # Try to extract the numeric part from the string
        digits = ''.join([ch for ch in value if ch.isdigit()])
        if len(digits) == 4:  # If it's a 4-digit number, assume it's a year of birth
            return int(digits)
        elif len(digits) > 0 and len(digits) < 3:  # If it's age (usually 2 digits), calculate YOB
            try:
                return current_year - int(digits)
            except ValueError:
                return None
    elif isinstance(value, (int, float)):  # If the value is already numeric
        if value < current_year:  # If the number is less than the current year, assume it's a year of birth
            return int(value)
    return None  # For any invalid cases (e.g., 'N/A', 'unknown', etc.)

# Apply the function to clean the 'YOB' column
df['YOB_cleaned'] = df['YOB '].apply(clean_yob)

# Display the cleaned DataFrame
print(df)


           FULL_NAME  GENDER      YOB   GLUCOSE     BLOOD P.    BMI  HEIGHT  \
0       ALICE JOSEPH     2.0  70 YEARS   8.2MMOL  104/67MMHG   8.53    2.38   
1      AMOS KPAKACHI     2.0    60 YRS  10.1MMOL  127/74MMHG  11.07    2.38   
2        AYUBA BARDE     2.0    50 YRS   7.3MM0L   90/60MMHG  12.16    2.38   
3      AYUBA GWAZAWA     2.0      1982   6.9MMOL  123/79MMGH  10.56    2.38   
4     BEAUTY  SUNDAY     2.0      1998   4.4MMOL  104/80MMHG  12.78    2.38   
..               ...     ...       ...       ...         ...    ...     ...   
222     SIMI STEPHEN     1.0    2000.0   8.2MMOL   85/64MMHG  25.00    1.49   
223  ALIEGBA  SAMUEL     1.0    2002.0   4.8MMOL  113/70MMHG  26.03    1.74   
224      HOPE DANIEL     1.0    1987.0   8.3MMOL  115/94MMHG  35.76    1.59   
225     DAVID  MOSES     0.0    2001.0   5.2MMOL  116/75MMHG  23.23    1.58   
226      NJOKU ISSAC     0.0    1994.0   5.8MMOL  105/66MMHG   0.22   16.45   

     WEIGHT  YOB_cleaned  
0      48.3       1954.0

In [257]:
# Assuming `df` is the DataFrame with the cleaned 'YOB_cleaned' column
current_year = datetime.now().year

# Calculate age by subtracting the year of birth from the current year
df['Age'] = current_year - df['YOB_cleaned']

# Convert the 'Age' column to integer, handling missing values
df['Age'] = df['Age'].fillna(df['Age'].median()).astype(int)

# Display the DataFrame with the new 'Age' column
print(df)


           FULL_NAME  GENDER      YOB   GLUCOSE     BLOOD P.    BMI  HEIGHT  \
0       ALICE JOSEPH     2.0  70 YEARS   8.2MMOL  104/67MMHG   8.53    2.38   
1      AMOS KPAKACHI     2.0    60 YRS  10.1MMOL  127/74MMHG  11.07    2.38   
2        AYUBA BARDE     2.0    50 YRS   7.3MM0L   90/60MMHG  12.16    2.38   
3      AYUBA GWAZAWA     2.0      1982   6.9MMOL  123/79MMGH  10.56    2.38   
4     BEAUTY  SUNDAY     2.0      1998   4.4MMOL  104/80MMHG  12.78    2.38   
..               ...     ...       ...       ...         ...    ...     ...   
222     SIMI STEPHEN     1.0    2000.0   8.2MMOL   85/64MMHG  25.00    1.49   
223  ALIEGBA  SAMUEL     1.0    2002.0   4.8MMOL  113/70MMHG  26.03    1.74   
224      HOPE DANIEL     1.0    1987.0   8.3MMOL  115/94MMHG  35.76    1.59   
225     DAVID  MOSES     0.0    2001.0   5.2MMOL  116/75MMHG  23.23    1.58   
226      NJOKU ISSAC     0.0    1994.0   5.8MMOL  105/66MMHG   0.22   16.45   

     WEIGHT  YOB_cleaned  Age  
0      48.3       1

### Code to Clean Glucose Column

In [260]:
# First, clean the 'glucose' column
df['GLUCOSE '] = df['GLUCOSE '].str.replace('MMOL', '').str.strip()  # Remove 'MMOL' and any extra spaces
df['GLUCOSE '] = pd.to_numeric(df['GLUCOSE '], errors='coerce')      # Convert to numeric, set invalid parsing as NaN

# Fill missing values with the mean (you could also use median or another method)
df['GLUCOSE '].fillna(df['GLUCOSE '].mean(), inplace=True)

# Round the 'glucose' values to 2 decimal places
df['GLUCOSE '] = df['GLUCOSE '].round(2)

# Display the cleaned and rounded 'glucose' column
print(df['GLUCOSE '])

0       8.20
1      10.10
2       6.97
3       6.90
4       4.40
       ...  
222     8.20
223     4.80
224     8.30
225     5.20
226     5.80
Name: GLUCOSE , Length: 227, dtype: float64


### Code to Clean Blood Pressure Column

In [261]:
# Remove 'MMHG' and any extra spaces
df['BLOOD P.'] = df['BLOOD P.'].str.replace('MMHG', '').str.strip()

# Split the blood pressure values into systolic and diastolic columns
df[['Systolic', 'Diastolic']] = df['BLOOD P.'].str.split('/', expand=True)

# Convert both columns to numeric
df['Systolic'] = pd.to_numeric(df['Systolic'], errors='coerce')
df['Diastolic'] = pd.to_numeric(df['Diastolic'], errors='coerce')

# Optional: Round the values to 2 decimal places (if necessary)
df['Systolic'] = df['Systolic'].round(2)
df['Diastolic'] = df['Diastolic'].round(2)

# Optionally, you can drop the original 'BLOOD P.' column
df.drop(columns=['BLOOD P.'], inplace=True)

# Display the cleaned data
print(df[['Systolic', 'Diastolic']])


     Systolic  Diastolic
0       104.0       67.0
1       127.0       74.0
2        90.0       60.0
3       123.0        NaN
4       104.0       80.0
..        ...        ...
222      85.0       64.0
223     113.0       70.0
224     115.0       94.0
225     116.0       75.0
226     105.0       66.0

[227 rows x 2 columns]


In [262]:
df.columns

Index(['FULL_NAME', 'GENDER', 'YOB ', 'GLUCOSE ', 'BMI', 'HEIGHT', 'WEIGHT',
       'YOB_cleaned', 'Age', 'Systolic', 'Diastolic'],
      dtype='object')

### FINAL ORDERED COLUMNS

In [263]:
# Remove unnecessary columns
df.drop(columns=['YOB ', 'YOB_cleaned'], inplace=True)

# Define the new column order
new_order = ['FULL_NAME', 'GENDER', 'Age', 'GLUCOSE ', 'Systolic', 'Diastolic', 'HEIGHT', 'WEIGHT', 'BMI']

# Rearrange the columns
df = df[new_order]

# Display the updated DataFrame
print(df.head())


        FULL_NAME  GENDER  Age  GLUCOSE   Systolic  Diastolic  HEIGHT  WEIGHT  \
0    ALICE JOSEPH     2.0   70      8.20     104.0       67.0    2.38    48.3   
1   AMOS KPAKACHI     2.0   60     10.10     127.0       74.0    2.38    62.7   
2     AYUBA BARDE     2.0   50      6.97      90.0       60.0    2.38    68.9   
3   AYUBA GWAZAWA     2.0   42      6.90     123.0        NaN    2.38    59.8   
4  BEAUTY  SUNDAY     2.0   26      4.40     104.0       80.0    2.38    72.4   

     BMI  
0   8.53  
1  11.07  
2  12.16  
3  10.56  
4  12.78  


### Creating column for Diabetes Status

Use Other Health Indicators to Create a Diabetes Indicator
* BMI: Obesity (BMI > 30) is a major risk factor for type 2 diabetes. You could classify patients as potentially diabetic based on their BMI.
* Blood Pressure: Hypertension (high blood pressure) is often associated with diabetes. You could use the Systolic and Diastolic values to classify patients.
* Age: Older age is a risk factor for diabetes, so you might use age as a feature.
* Multiple Criteria: You can create a composite rule that combines multiple factors, such as glucose, BMI, and blood pressure, to classify a person as diabetic or not.

For example, you might define a "high risk for diabetes" based on:

* Glucose level: even if not over 126 mg/dL, you could consider levels approaching high-normal ranges, e.g., between 100-125 mg/dL, as indicative of prediabetes.
* BMI: BMI greater than 30 can indicate obesity, which is linked with a higher risk of diabetes.
* Blood Pressure: High systolic and diastolic values can be considered risk factors.
* Age: Diabetics tend to be older, especially those with type 2 diabetes.

In [268]:
df['Diabetes'] = ((df['BMI'] > 30) | 
                  (df['Systolic'] > 140) | 
                  (df['Diastolic'] > 90) | 
                  (df['Age'] > 45)).astype(int)
df['Diabetes'] = df['Diabetes'].astype(int)  # Convert to 0 (no diabetes) and 1 (diabetes)

In [272]:
df.head(200)

Unnamed: 0,FULL_NAME,GENDER,Age,GLUCOSE,Systolic,Diastolic,HEIGHT,WEIGHT,BMI,Diabetes
0,ALICE JOSEPH,2.0,70,8.20,104.0,67.0,2.38,48.3,8.53,1
1,AMOS KPAKACHI,2.0,60,10.10,127.0,74.0,2.38,62.7,11.07,1
2,AYUBA BARDE,2.0,50,6.97,90.0,60.0,2.38,68.9,12.16,1
3,AYUBA GWAZAWA,2.0,42,6.90,123.0,,2.38,59.8,10.56,0
4,BEAUTY SUNDAY,2.0,26,4.40,104.0,80.0,2.38,72.4,12.78,0
...,...,...,...,...,...,...,...,...,...,...
195,STEPHEN SILAS,0.0,27,6.60,146.0,50.0,1.69,57.1,19.99,1
196,LUKA EMMAUENEL,0.0,41,4.80,130.0,89.0,16.48,69.0,0.25,0
197,IORVAA JOSEPH,0.0,36,6.10,140.0,100.0,1.64,68.3,25.39,1
198,AARON DAMILOLA,,21,5.60,108.0,85.0,1.68,59.8,21.19,0


In [274]:
X = df[['GENDER', 'Age', 'GLUCOSE ', 'Systolic', 'Diastolic', 'HEIGHT', 'WEIGHT', 'BMI']]
Y = df['Diabetes']


In [276]:
print(X.isnull().sum())

GENDER        7
Age           0
GLUCOSE       0
Systolic     79
Diastolic    80
HEIGHT        0
WEIGHT       82
BMI          82
dtype: int64


In [277]:
from sklearn.impute import SimpleImputer

# Create an imputer to fill missing values with the mean
imputer = SimpleImputer(strategy='mean')

# Impute the missing values in X
X_imputed = imputer.fit_transform(X)


In [278]:
# Check again for missing values in X
print(pd.DataFrame(X_imputed).isnull().sum())


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
dtype: int64


### TRAIN TEST SPLIT AND MODEL TRAINING WITH ACCURACY CHECK

In [286]:
# Train a Logistic Regression model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the data into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X_imputed, Y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, Y_train)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report
Y_pred_lr = model.predict(X_test_scaled)

accuracy_lr = accuracy_score(Y_test, Y_pred_lr) * 100
print(f'Logistic Regression Accuracy: {accuracy_lr:.2f}%')
print(f'Classification Report (Logistic Regression):\n{classification_report(Y_test, Y_pred)}')


Logistic Regression Accuracy: 84.78%
Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       0.86      0.94      0.90        33
           1       0.80      0.62      0.70        13

    accuracy                           0.85        46
   macro avg       0.83      0.78      0.80        46
weighted avg       0.84      0.85      0.84        46



In [285]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train a Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_rf = accuracy_score(Y_test, Y_pred_rf) * 100
print(f'Random Forest Accuracy: {accuracy_rf:.2f}%')

# Classification report
print(f'Classification Report (Random Forest):\n{classification_report(Y_test, Y_pred_rf)}')


Random Forest Accuracy: 97.83%
Classification Report (Random Forest):
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       0.93      1.00      0.96        13

    accuracy                           0.98        46
   macro avg       0.96      0.98      0.97        46
weighted avg       0.98      0.98      0.98        46



In [287]:
from sklearn.svm import SVC

# Train an SVM model
svm_model = SVC(random_state=42)
svm_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_svm = svm_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_svm = accuracy_score(Y_test, Y_pred_svm) * 100
print(f'SVM Accuracy: {accuracy_svm:.2f}%')

# Classification report
print(f'Classification Report (SVM):\n{classification_report(Y_test, Y_pred_svm)}')


SVM Accuracy: 91.30%
Classification Report (SVM):
              precision    recall  f1-score   support

           0       0.91      0.97      0.94        33
           1       0.91      0.77      0.83        13

    accuracy                           0.91        46
   macro avg       0.91      0.87      0.89        46
weighted avg       0.91      0.91      0.91        46



In [288]:
from sklearn.ensemble import GradientBoostingClassifier

# Train a Gradient Boosting model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_gb = gb_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_gb = accuracy_score(Y_test, Y_pred_gb) * 100
print(f'Gradient Boosting Accuracy: {accuracy_gb:.2f}%')

# Classification report
print(f'Classification Report (Gradient Boosting):\n{classification_report(Y_test, Y_pred_gb)}')


Gradient Boosting Accuracy: 100.00%
Classification Report (Gradient Boosting):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       1.00      1.00      1.00        13

    accuracy                           1.00        46
   macro avg       1.00      1.00      1.00        46
weighted avg       1.00      1.00      1.00        46



In [290]:
from sklearn.neighbors import KNeighborsClassifier

# Train a KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_knn = knn_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_knn = accuracy_score(Y_test, Y_pred_knn) * 100
print(f'KNN Accuracy: {accuracy_knn:.2f}%')

# Classification report
print(f'Classification Report (KNN):\n{classification_report(Y_test, Y_pred_knn)}')


KNN Accuracy: 91.30%
Classification Report (KNN):
              precision    recall  f1-score   support

           0       0.91      0.97      0.94        33
           1       0.91      0.77      0.83        13

    accuracy                           0.91        46
   macro avg       0.91      0.87      0.89        46
weighted avg       0.91      0.91      0.91        46



In [291]:
from sklearn.neural_network import MLPClassifier

# Train an MLPClassifier (Neural Network)
mlp_model = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
mlp_model.fit(X_train_scaled, Y_train)

# Make predictions
Y_pred_mlp = mlp_model.predict(X_test_scaled)

# Calculate and display accuracy as a percentage
accuracy_mlp = accuracy_score(Y_test, Y_pred_mlp) * 100
print(f'MLP Accuracy: {accuracy_mlp:.2f}%')

# Classification report
print(f'Classification Report (MLP):\n{classification_report(Y_test, Y_pred_mlp)}')


MLP Accuracy: 91.30%
Classification Report (MLP):
              precision    recall  f1-score   support

           0       0.91      0.97      0.94        33
           1       0.91      0.77      0.83        13

    accuracy                           0.91        46
   macro avg       0.91      0.87      0.89        46
weighted avg       0.91      0.91      0.91        46



In [303]:
# Create a dictionary to store model names and accuracies
model_accuracies = {}

print("List of used Models in order of accuracy\n")

# Example for Gradient Boosting Model
model_accuracies["Gradient Boosting"] = accuracy_gb
# Example for Random Forest Model
model_accuracies["Random Forest"] = accuracy_rf
# Example for Support Vector Machine Model
model_accuracies["SVM"] = accuracy_svm
# Example for KNN Model
model_accuracies["KNN"] = accuracy_knn
# Example for MLP (Neural Network) Model
model_accuracies["MLP"] = accuracy_mlp

# Display the model accuracies
for model, accuracy in model_accuracies.items():
    print(f'{model}: {accuracy:.2f}%')


List of used Models in order of accuracy

Gradient Boosting: 100.00%
Random Forest: 97.83%
SVM: 91.30%
KNN: 91.30%
MLP: 91.30%
