In [1]:
# Libraries for data loading, data manipulation and data visulisation
import pandas as pd  # Data processing and manipulation 
import matplotlib.pyplot as plt  # Data visualization
import seaborn as sns  # Statistical data visualization
import numpy as np  # Numerical operations and linear algebra 

# Libraries for data preparation and model building
from scipy.stats import norm  # Statistical functions
from sklearn.preprocessing import StandardScaler  # Data preprocessing
import warnings  # Warning handling
warnings.filterwarnings('ignore')  # Ignore warnings

# LOADING DATA

In [2]:
# load the data, used the pandas read_csv() to import our dataset 
data = pd.read_excel('DiabetesDB.xlsx', sheet_name=None)
#df.head() #Preview the dataset

In [3]:
# Cocantenation

df = pd.concat(data.values(), ignore_index=True)


In [4]:
df.head()

Unnamed: 0,NAME,SURNAME,GENDER,YOB,PHONE,EMAIL,STATE OF ORIGIN,PHONE CALL,NAME.1,CONTACT,RELATIONSHIP,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT,FILE LINK,HEIGHT (cm)
0,ALICE,JOSEPH,,70 YEARS,9021451814,NILL,FCT,Yet to Call,ALICE,9021452000.0,DAUGTHER,8.2MMOL,104/67MMHG,,,48.3,,
1,AMOS,KPAKACHI,,60 YRS,9076753570,NILL,FCT,Yet to Call,MUSA,9072494000.0,BROTHER,10.1MMOL,127/74MMHG,,,62.7,,
2,AYUBA,BARDE,,50 YRS,8076708464,NILL,FCT,Yet to Call,IKO AYUBA,9012614000.0,SON,7.3MM0L,90/60MMHG,,,68.9,,
3,AYUBA,GWAZAWA,,1982,8074457724,AYUBAGWAZAWA@GMAIL.COM,FCT,Yet to Call,ESTHER AYUBA,7014740000.0,WIFE,6.9MMOL,123/79MMGH,,,59.8,,
4,BEAUTY,SUNDAY,,1998,7011418074,NILL,FCT,Yet to Call,MATHEW SUNDAY,7087563000.0,BROTHER,4.4MMOL,104/80MMHG,,,72.4,,


In [5]:
df.info()
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   NAME             151 non-null    object 
 1   SURNAME          149 non-null    object 
 2   GENDER           91 non-null     object 
 3   YOB              145 non-null    object 
 4   PHONE            146 non-null    object 
 5   EMAIL            131 non-null    object 
 6   STATE OF ORIGIN  149 non-null    object 
 7   PHONE CALL       151 non-null    object 
 8   NAME             127 non-null    object 
 9   CONTACT          126 non-null    float64
 10  RELATIONSHIP     128 non-null    object 
 11  GLUCOSE          147 non-null    object 
 12  BLOOD P.         148 non-null    object 
 13  BMI              35 non-null     float64
 14  HEIGHT           40 non-null     object 
 15  WEIGHT           145 non-null    object 
 16  FILE  LINK       0 non-null      float64
 17  HEIGHT (cm)     

Unnamed: 0,CONTACT,BMI,FILE LINK
count,126.0,35.0,0.0
mean,10106300000.0,26.670571,
std,13052820000.0,4.933087,
min,70867280.0,19.3,
25%,7078705000.0,22.465,
50%,8075620000.0,26.36,
75%,9024573000.0,29.63,
max,91579320000.0,39.0,


## DATA PREPROCESSING

In [6]:
# Step 1: Create 'Full_Name' column
df['FULL_NAME'] = df['NAME '].fillna('') + ' ' + df['SURNAME'].fillna('')

# Step 2: Reorder columns to place 'Full_Name' first
columns = ['FULL_NAME'] + [col for col in df.columns if col not in ['FULL_NAME']]
df = df[columns]


In [7]:
print(df.columns)

Index(['FULL_NAME', 'NAME ', 'SURNAME', 'GENDER', 'YOB ', 'PHONE ', 'EMAIL',
       'STATE OF ORIGIN', 'PHONE CALL ', 'NAME', 'CONTACT', 'RELATIONSHIP',
       'GLUCOSE ', 'BLOOD P.', 'BMI', 'HEIGHT', 'WEIGHT', 'FILE  LINK',
       'HEIGHT (cm)'],
      dtype='object')


In [8]:
df = df.drop(['NAME ', 'SURNAME', 'PHONE ', 'EMAIL','STATE OF ORIGIN', 'NAME', 'PHONE CALL ', 'CONTACT', 'RELATIONSHIP', 'FILE  LINK'], axis=1)

In [9]:
print(df.columns)

Index(['FULL_NAME', 'GENDER', 'YOB ', 'GLUCOSE ', 'BLOOD P.', 'BMI', 'HEIGHT',
       'WEIGHT', 'HEIGHT (cm)'],
      dtype='object')


### Code to Clean BMI Column

In [10]:
import re

def clean_and_convert_to_meters(value):
    # Remove non-numeric characters and decimal points
    cleaned_value = re.sub(r'\D', '', str(value))
    
    if cleaned_value:
        # Convert to float and treat it as centimeters
        height_in_cm = float(cleaned_value)
        # Convert cm to meters
        return height_in_cm / 100
    else:
        return None  # Return None if value is empty or non-numeric

# Apply the function to the 'height' column
df['HEIGHT'] = df['HEIGHT'].apply(clean_and_convert_to_meters)


# Apply the function to the 'height(cm)' column
df['HEIGHT (cm)'] = df['HEIGHT (cm)'].apply(clean_and_convert_to_meters)



In [11]:
# Create a new 'height' column
df['HEIGHT'] = df['HEIGHT'].fillna(df['HEIGHT (cm)'])

# Drop the 'height (cm)' column now that data is merged
df.drop(columns=['HEIGHT (cm)'], inplace=True)

# Calculate the mean height from available values and round to 2 decimal places
mean_height = round(df['HEIGHT'].mean(), 2)

# Fill missing values in 'height' with the rounded mean
df['HEIGHT'].fillna(mean_height, inplace=True)

In [12]:
# Function to remove non-numeric characters from weight column
def clean_weight(value):
    # Remove all non-numeric characters, keep only digits and decimal points
    cleaned_value = re.sub(r'[^0-9.]', '', str(value))
    return float(cleaned_value) if cleaned_value else None

# Apply the function to the 'weight' column
df['WEIGHT'] = df['WEIGHT'].apply(clean_weight)

In [13]:
# Calculate BMI and add it as a new column in the DataFrame
df['BMI'] = df['WEIGHT'] / (df['HEIGHT'] ** 2)

# Round BMI to 2 decimal places
df['BMI'] = df['BMI'].round(2)

In [14]:
df.head(227)

Unnamed: 0,FULL_NAME,GENDER,YOB,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT
0,ALICE JOSEPH,,70 YEARS,8.2MMOL,104/67MMHG,8.53,2.38,48.3
1,AMOS KPAKACHI,,60 YRS,10.1MMOL,127/74MMHG,11.07,2.38,62.7
2,AYUBA BARDE,,50 YRS,7.3MM0L,90/60MMHG,12.16,2.38,68.9
3,AYUBA GWAZAWA,,1982,6.9MMOL,123/79MMGH,10.56,2.38,59.8
4,BEAUTY SUNDAY,,1998,4.4MMOL,104/80MMHG,12.78,2.38,72.4
...,...,...,...,...,...,...,...,...
146,SIMI STEPHEN,FEMALE,2000.0,8.2MMOL,85/64MMHG,25.00,1.49,55.5
147,ALIEGBA SAMUEL,FEMALE,2002.0,4.8MMOL,113/70MMHG,26.03,1.74,78.8
148,HOPE DANIEL,FEMALE,1987.0,8.3MMOL,115/94MMHG,35.76,1.59,90.4
149,DAVID MOSES,MALE,2001.0,5.2MMOL,116/75MMHG,23.23,1.58,58.0


In [15]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   FULL_NAME  151 non-null    object 
 1   GENDER     91 non-null     object 
 2   YOB        145 non-null    object 
 3   GLUCOSE    147 non-null    object 
 4   BLOOD P.   148 non-null    object 
 5   BMI        145 non-null    float64
 6   HEIGHT     151 non-null    float64
 7   WEIGHT     145 non-null    float64
dtypes: float64(3), object(5)
memory usage: 9.6+ KB


Unnamed: 0,BMI,HEIGHT,WEIGHT
count,145.0,151.0,145.0
mean,18.576207,2.381788,68.006897
std,9.882518,2.279754,18.493365
min,0.21,1.49,22.2
25%,10.1,1.685,55.3
50%,18.78,1.84,63.4
75%,26.03,2.38,78.8
max,49.65,16.48,141.8


### Code to Clean Gender Column

In [16]:
# Fill missing values in 'Gender' with 'Unknown'
df['GENDER'].fillna('Unknown', inplace=True)

# Define mapping for 'Male' and 'Female', with 'Unknown' as a separate category if needed
gender_mapping = {'MALE': 0, 'FEMALE': 1, 'Unknown': 2}
df['GENDER'] = df['GENDER'].map(gender_mapping)


In [17]:
df.head(151)

Unnamed: 0,FULL_NAME,GENDER,YOB,GLUCOSE,BLOOD P.,BMI,HEIGHT,WEIGHT
0,ALICE JOSEPH,2.0,70 YEARS,8.2MMOL,104/67MMHG,8.53,2.38,48.3
1,AMOS KPAKACHI,2.0,60 YRS,10.1MMOL,127/74MMHG,11.07,2.38,62.7
2,AYUBA BARDE,2.0,50 YRS,7.3MM0L,90/60MMHG,12.16,2.38,68.9
3,AYUBA GWAZAWA,2.0,1982,6.9MMOL,123/79MMGH,10.56,2.38,59.8
4,BEAUTY SUNDAY,2.0,1998,4.4MMOL,104/80MMHG,12.78,2.38,72.4
...,...,...,...,...,...,...,...,...
146,SIMI STEPHEN,1.0,2000.0,8.2MMOL,85/64MMHG,25.00,1.49,55.5
147,ALIEGBA SAMUEL,1.0,2002.0,4.8MMOL,113/70MMHG,26.03,1.74,78.8
148,HOPE DANIEL,1.0,1987.0,8.3MMOL,115/94MMHG,35.76,1.59,90.4
149,DAVID MOSES,0.0,2001.0,5.2MMOL,116/75MMHG,23.23,1.58,58.0


### Code to Clean Age Column

In [18]:
import pandas as pd
from datetime import datetime

# Sample data with 'YOB' column (containing both year of birth and age)
#data = {
    #'YOB ': ['1995', '74 years', '1985', 'N/A', '30', '2000', '45 years', '1982', '56', 'unknown']
#}
#df = pd.DataFrame(data)

# Get the current year
current_year = datetime.now().year

# Function to clean and convert the YOB data
def clean_yob(value):
    if isinstance(value, str):  # Check if the value is a string
        # Try to extract the numeric part from the string
        digits = ''.join([ch for ch in value if ch.isdigit()])
        if len(digits) == 4:  # If it's a 4-digit number, assume it's a year of birth
            return int(digits)
        elif len(digits) > 0 and len(digits) < 3:  # If it's age (usually 2 digits), calculate YOB
            try:
                return current_year - int(digits)
            except ValueError:
                return None
    elif isinstance(value, (int, float)):  # If the value is already numeric
        if value < current_year:  # If the number is less than the current year, assume it's a year of birth
            return int(value)
    return None  # For any invalid cases (e.g., 'N/A', 'unknown', etc.)

# Apply the function to clean the 'YOB' column
df['YOB_cleaned'] = df['YOB '].apply(clean_yob)

# Display the cleaned DataFrame
print(df)


           FULL_NAME  GENDER      YOB   GLUCOSE     BLOOD P.    BMI  HEIGHT  \
0       ALICE JOSEPH     2.0  70 YEARS   8.2MMOL  104/67MMHG   8.53    2.38   
1      AMOS KPAKACHI     2.0    60 YRS  10.1MMOL  127/74MMHG  11.07    2.38   
2        AYUBA BARDE     2.0    50 YRS   7.3MM0L   90/60MMHG  12.16    2.38   
3      AYUBA GWAZAWA     2.0      1982   6.9MMOL  123/79MMGH  10.56    2.38   
4     BEAUTY  SUNDAY     2.0      1998   4.4MMOL  104/80MMHG  12.78    2.38   
..               ...     ...       ...       ...         ...    ...     ...   
146     SIMI STEPHEN     1.0    2000.0   8.2MMOL   85/64MMHG  25.00    1.49   
147  ALIEGBA  SAMUEL     1.0    2002.0   4.8MMOL  113/70MMHG  26.03    1.74   
148      HOPE DANIEL     1.0    1987.0   8.3MMOL  115/94MMHG  35.76    1.59   
149     DAVID  MOSES     0.0    2001.0   5.2MMOL  116/75MMHG  23.23    1.58   
150      NJOKU ISSAC     0.0    1994.0   5.8MMOL  105/66MMHG   0.22   16.45   

     WEIGHT  YOB_cleaned  
0      48.3       1954.0

In [19]:
# Assuming `df` is the DataFrame with the cleaned 'YOB_cleaned' column
current_year = datetime.now().year

# Calculate age by subtracting the year of birth from the current year
df['Age'] = current_year - df['YOB_cleaned']

# Convert the 'Age' column to integer, handling missing values
df['Age'] = df['Age'].fillna(df['Age'].median()).astype(int)

# Display the DataFrame with the new 'Age' column
print(df)


           FULL_NAME  GENDER      YOB   GLUCOSE     BLOOD P.    BMI  HEIGHT  \
0       ALICE JOSEPH     2.0  70 YEARS   8.2MMOL  104/67MMHG   8.53    2.38   
1      AMOS KPAKACHI     2.0    60 YRS  10.1MMOL  127/74MMHG  11.07    2.38   
2        AYUBA BARDE     2.0    50 YRS   7.3MM0L   90/60MMHG  12.16    2.38   
3      AYUBA GWAZAWA     2.0      1982   6.9MMOL  123/79MMGH  10.56    2.38   
4     BEAUTY  SUNDAY     2.0      1998   4.4MMOL  104/80MMHG  12.78    2.38   
..               ...     ...       ...       ...         ...    ...     ...   
146     SIMI STEPHEN     1.0    2000.0   8.2MMOL   85/64MMHG  25.00    1.49   
147  ALIEGBA  SAMUEL     1.0    2002.0   4.8MMOL  113/70MMHG  26.03    1.74   
148      HOPE DANIEL     1.0    1987.0   8.3MMOL  115/94MMHG  35.76    1.59   
149     DAVID  MOSES     0.0    2001.0   5.2MMOL  116/75MMHG  23.23    1.58   
150      NJOKU ISSAC     0.0    1994.0   5.8MMOL  105/66MMHG   0.22   16.45   

     WEIGHT  YOB_cleaned  Age  
0      48.3       1

### Code to Clean Glucose Column

In [20]:
# First, clean the 'glucose' column
df['GLUCOSE '] = df['GLUCOSE '].str.replace('MMOL', '').str.strip()  # Remove 'MMOL' and any extra spaces
df['GLUCOSE '] = pd.to_numeric(df['GLUCOSE '], errors='coerce')      # Convert to numeric, set invalid parsing as NaN

# Fill missing values with the mean (you could also use median or another method)
df['GLUCOSE '].fillna(df['GLUCOSE '].mean(), inplace=True)

# Round the 'glucose' values to 2 decimal places
df['GLUCOSE '] = df['GLUCOSE '].round(2)

# Display the cleaned and rounded 'glucose' column
print(df['GLUCOSE '])

0       8.20
1      10.10
2       6.97
3       6.90
4       4.40
       ...  
146     8.20
147     4.80
148     8.30
149     5.20
150     5.80
Name: GLUCOSE , Length: 151, dtype: float64


### Code to Clean Blood Pressure Column

In [21]:
# Remove 'MMHG' and any extra spaces
df['BLOOD P.'] = df['BLOOD P.'].str.replace('MMHG', '').str.strip()

# Split the blood pressure values into systolic and diastolic columns
df[['Systolic', 'Diastolic']] = df['BLOOD P.'].str.split('/', expand=True)

# Convert both columns to numeric
df['Systolic'] = pd.to_numeric(df['Systolic'], errors='coerce')
df['Diastolic'] = pd.to_numeric(df['Diastolic'], errors='coerce')

# Optional: Round the values to 2 decimal places (if necessary)
df['Systolic'] = df['Systolic'].round(2)
df['Diastolic'] = df['Diastolic'].round(2)

# Optionally, you can drop the original 'BLOOD P.' column
df.drop(columns=['BLOOD P.'], inplace=True)

# Display the cleaned data
print(df[['Systolic', 'Diastolic']])


     Systolic  Diastolic
0       104.0       67.0
1       127.0       74.0
2        90.0       60.0
3       123.0        NaN
4       104.0       80.0
..        ...        ...
146      85.0       64.0
147     113.0       70.0
148     115.0       94.0
149     116.0       75.0
150     105.0       66.0

[151 rows x 2 columns]


In [22]:
df.columns

Index(['FULL_NAME', 'GENDER', 'YOB ', 'GLUCOSE ', 'BMI', 'HEIGHT', 'WEIGHT',
       'YOB_cleaned', 'Age', 'Systolic', 'Diastolic'],
      dtype='object')

### FINAL ORDERED COLUMNS

In [23]:
# Remove unnecessary columns
df.drop(columns=['YOB ', 'YOB_cleaned'], inplace=True)

# Define the new column order
new_order = ['FULL_NAME', 'GENDER', 'Age', 'GLUCOSE ', 'Systolic', 'Diastolic', 'HEIGHT', 'WEIGHT', 'BMI']

# Rearrange the columns
df = df[new_order]

# Display the updated DataFrame
print(df.head(151))


           FULL_NAME  GENDER  Age  GLUCOSE   Systolic  Diastolic  HEIGHT  \
0       ALICE JOSEPH     2.0   70      8.20     104.0       67.0    2.38   
1      AMOS KPAKACHI     2.0   60     10.10     127.0       74.0    2.38   
2        AYUBA BARDE     2.0   50      6.97      90.0       60.0    2.38   
3      AYUBA GWAZAWA     2.0   42      6.90     123.0        NaN    2.38   
4     BEAUTY  SUNDAY     2.0   26      4.40     104.0       80.0    2.38   
..               ...     ...  ...       ...       ...        ...     ...   
146     SIMI STEPHEN     1.0   24      8.20      85.0       64.0    1.49   
147  ALIEGBA  SAMUEL     1.0   22      4.80     113.0       70.0    1.74   
148      HOPE DANIEL     1.0   37      8.30     115.0       94.0    1.59   
149     DAVID  MOSES     0.0   23      5.20     116.0       75.0    1.58   
150      NJOKU ISSAC     0.0   30      5.80     105.0       66.0   16.45   

     WEIGHT    BMI  
0      48.3   8.53  
1      62.7  11.07  
2      68.9  12.16  
3  