In [1]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler




# NHanes data from 2017 - 2020

In [2]:
# LOADING DATASETS

df_diabetes = pd.read_sas("P_DIQ_Diabetes.XPT")

df_diet = pd.read_sas("P_DBQ_Diet.XPT")

df_demo = pd.read_sas("P_DEMO_Demographics.XPT")

df_activity = pd.read_sas("P_PAQ_Activity.XPT")

df_body = pd.read_sas("P_BMX_BodyMes.XPT")

df_alc = pd.read_sas("P_ALQ_Alcohol.XPT")

# Renaming column names

In [3]:
#  DIABETES
new_diabetes_column_names = {
    'SEQN': 'Respondent_ID',
    'DIQ010': 'Doctor_Told_Diabetes',
    'DID040': 'Age_When_First_Told_Diabetes',
    'DIQ159': 'Check_Item_1',
    'DIQ160': 'Ever_Told_Prediabetes',
    'DIQ180': 'Had_Blood_Test_Past_Three_Years',
    'DIQ050': 'Taking_Insulin_Now',
    'DID060': 'How_Long_Taking_Insulin',
    'DIQ060U': 'Insulin_Duration_Unit',
    'DIQ065': 'Check_Item_2',
    'DIQ070': 'Take_Diabetic_Pills',
    'DIQ229': 'Check_Item_3',
    'DIQ230': 'Time_Since_Saw_Diabetes_Specialist',
    'DIQ240': 'One_Doctor_For_Diabetes',
    'DID250': 'Times_Seen_Doctor_Past_Year',
    'DID260': 'How_Often_Check_Blood_Glucose',
    'DIQ260U': 'Blood_Glucose_Check_Unit',
    'DIQ275': 'Dr_Checked_A1C_Past_Year',
    'DIQ280': 'Last_A1C_Level',
    'DIQ291': 'Dr_Recommended_A1C',
    'DIQ295': 'Check_Item_4',
    'DIQ300S': 'Recent_SBP',
    'DIQ300D': 'Recent_DBP',
    'DID310S': 'Dr_Recommended_SBP',
    'DID310D': 'Dr_Recommended_DBP',
    'DID320': 'Most_Recent_LDL_Number',
    'DID330': 'Dr_Recommended_LDL',
    'DID341': 'Times_Dr_Checked_Feet_For_Sores',
    'DID350': 'How_Often_Check_Feet',
    'DIQ350U': 'Feet_Check_Unit',
    'DIQ360': 'Last_Pupil_Dilation_Date',
    'DIQ080': 'Diabetes_Affected_Eyes_Retinopathy'
}

# Renaming the columns
df_diabetes.rename(columns=new_diabetes_column_names, inplace=True)

In [4]:
# DIET

new_diet_column_names= {
    'SEQN': 'Respondent_ID',
    'DBQ010': 'Ever_Breastfed_or_Fed_Breastmilk',
    'DBD030': 'Age_Stopped_Breastfeeding_Days',
    'DBD041': 'Age_First_Fed_Formula_Days',
    'DBD050': 'Age_Stopped_Receiving_Formula_Days',
    'DBD055': 'Age_Started_Other_Food_Beverage',
    'DBD061': 'Age_First_Fed_Milk_Days',
    'DBQ073A': 'Type_of_Milk_First_Fed_Whole_Milk',
    'DBQ073B': 'Type_of_Milk_First_Fed_2_Percent_Milk',
    'DBQ073C': 'Type_of_Milk_First_Fed_1_Percent_Milk',
    'DBQ073D': 'Type_of_Milk_First_Fed_Fat_Free_Milk',
    'DBQ073E': 'Type_of_Milk_First_Fed_Soy_Milk',
    'DBQ073U': 'Type_of_Milk_First_Fed_Other',
    'DBD085': 'Check_Item_1',
    'DBQ700': 'How_Healthy_is_the_Diet',
    'DBQ197': 'Past_30_Day_Milk_Product_Consumption',
    'DBQ223A': 'You_Drink_Whole_or_Regular_Milk',
    'DBQ223B': 'You_Drink_2_Percent_Fat_Milk',
    'DBQ223C': 'You_Drink_1_Percent_Fat_Milk',
    'DBQ223D': 'You_Drink_Fat_Free_Skim_Milk',
    'DBQ223E': 'You_Drink_Soy_Milk',
    'DBQ223U': 'You_Drink_Another_Type_of_Milk',
    'DBD225': 'Check_Item_2',
    'DBQ229': 'Regular_Milk_Use_5_Times_Per_Week',
    'DBQ235A': 'How_Often_Drank_Milk_Age_5_12',
    'DBQ235B': 'How_Often_Drank_Milk_Age_13_17',
    'DBQ235C': 'How_Often_Drank_Milk_Age_18_35',
    'DBD265a': 'Check_Item_3',
    'DBQ301': 'Community_Government_Meals_Delivered',
    'DBQ330': 'Eat_Meals_at_Community_Senior_Center',
    'DBD355': 'Check_Item_4',
    'DBQ360': 'Attend_Kindergarten_Thru_High_School',
    'DBQ370': 'School_Serves_School_Lunches',
    'DBD381': 'Number_of_Times_Per_Week_Get_School_Lunch',
    'DBQ390': 'School_Lunch_Free_Reduced_Full_Price',
    'DBQ400': 'School_Serve_Complete_Breakfast_Each_Day',
    'DBD411': 'Number_of_Times_Per_Week_Get_School_Breakfast',
    'DBQ421': 'School_Breakfast_Free_Reduced_Full_Price',
    'DBQ422': 'Check_Item_5',
    'DBQ424': 'Summer_Program_Meal_Free_Reduced_Price',
    'DBD895': 'Number_of_Meals_Not_Home_Prepared',
    'DBD900': 'Number_of_Meals_From_Fast_Food_or_Pizza_Place',
    'DBD905': 'Number_of_Ready_to_Eat_Foods_in_Past_30_Days',
    'DBD910': 'Number_of_Frozen_Meals_Pizza_in_Past_30_Days',
    'DBQ715a': 'Check_Item_6',
    'CBQ596': 'Heard_of_My_Plate',
    'CBQ606': 'Looked_Up_My_Plate_on_Internet',
    'CBQ611': 'Tried_My_Plate_Plan',
    'DBQ930': 'Main_Meal_Planner_Preparer',
    'DBQ935': 'Shared_Meal_Planning_Preparing_Duty',
    'DBQ940': 'Main_Food_Shopper',
    'DBQ945': 'Shared_Food_Shopping_Duty'
    }

df_diet.rename(columns=new_diet_column_names, inplace=True)


In [5]:
# DEMOGRAPHICS
new_demo_column_names = {
    'SEQN': 'Respondent_ID',
    'SDDSRVYR': 'Data_Release_Cycle',
    'RIDSTATR': 'Interview_Examination_Status',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age_Years_Screening',
    'RIDAGEMN': 'Age_Months_Screening_0_24_Mos',
    'RIDRETH1': 'Race_Hispanic_Origin',
    'RIDRETH3': 'Race_Hispanic_Origin_with_NH_Asian',
    'RIDEXMON': 'Six_Month_Time_Period',
    'DMDBORN4': 'Country_of_Birth',
    'DMDYRUSZ': 'Length_of_Time_in_US',
    'DMDEDUC2': 'Education_Level_Adults_20',
    'DMDMARTZ': 'Marital_Status',
    'RIDEXPRG': 'Pregnancy_Status_at_Exam',
    'SIALANG': 'Language_of_SP_Interview',
    'SIAPROXY': 'Proxy_Used_in_SP_Interview',
    'SIAINTRP': 'Interpreter_Used_in_SP_Interview',
    'FIALANG': 'Language_of_Family_Interview',
    'FIAPROXY': 'Proxy_Used_in_Family_Interview',
    'FIAINTRP': 'Interpreter_Used_in_Family_Interview',
    'MIALANG': 'Language_of_MEC_Interview',
    'MIAPROXY': 'Proxy_Used_in_MEC_Interview',
    'MIAINTRP': 'Interpreter_Used_in_MEC_Interview',
    'AIALANGA': 'Language_of_ACASI_Interview',
    'WTINTPRP': 'Full_Sample_Interview_Weight',
    'WTMECPRP': 'Full_Sample_MEC_Exam_Weight',
    'SDMVPSU': 'Masked_Variance_Pseudo_PSU',
    'SDMVSTRA': 'Masked_Variance_Pseudo_Stratum'
}


df_demo.rename(columns = new_demo_column_names, inplace=True)

In [6]:
# ACTIVITY

new_activity_column_names = {
'SEQN': 'Respondent_ID',
'PAQ605': 'Vigorous work activity',
'PAQ610': 'Number of days vigorous work',
'PAD615': 'Minutes vigorous-intensity work',
'PAQ620': 'Moderate work activity',
'PAQ625': 'Number of days moderate work',
'PAD630': 'Minutes moderate-intensity work',
'PAQ635': 'Walk or bicycle',
'PAQ640': 'Number of days walk or bicycle',
'PAD645': 'Minutes walk/bicycle for transportation',
'PAQ650': 'Vigorous recreational activities',
'PAQ655': 'Days vigorous recreational activities',
'PAD660': 'Minutes vigorous recreational activities',
'PAQ665': 'Moderate recreational activities',
'PAQ670': 'Days moderate recreational activities',
'PAD675': 'Minutes moderate recreational activities',
'PAD680': 'Minutes sedentary activity'
}


df_activity.rename(columns = new_activity_column_names, inplace=True)


In [7]:
# BODY MEASUREMENTS

new_body_column_names = {
    'SEQN': 'Respondent_ID',
    'BMDSTATS': 'Body Measures Component Status Code',
    'BMXWT': 'Weight (kg)',
    'BMIWT': 'Weight Comment',
    'BMXRECUM': 'Recumbent Length (cm)',
    'BMIRECUM': 'Recumbent Length Comment',
    'BMXHEAD': 'Head Circumference (cm)',
    'BMIHEAD': 'Head Circumference Comment',
    'BMXHT': 'Standing Height (cm)',
    'BMIHT': 'Standing Height Comment',
    'BMXBMI': 'Body Mass Index (kg/m**2)',
    'BMDBMIC': 'BMI Category - Children/Youth',
    'BMXLEG': 'Upper Leg Length (cm)',
    'BMILEG': 'Upper Leg Length Comment',
    'BMXARML': 'Upper Arm Length (cm)',
    'BMIARML': 'Upper Arm Length Comment',
    'BMXARMC': 'Arm Circumference (cm)',
    'BMIARMC': 'Arm Circumference Comment',
    'BMXWAIST': 'Waist Circumference (cm)',
    'BMIWAIST': 'Waist Circumference Comment',
    'BMXHIP': 'Hip Circumference (cm)',
    'BMIHIP': 'Hip Circumference Comment'
}


df_body.rename(columns=new_body_column_names, inplace=True)


In [8]:
# ALCOHOL

new_alc_column_names = {
    'SEQN': 'Respondent_ID',
    'ALQ111': 'Ever Had a Drink of Any Kind of Alcohol',
    'ALQ121': 'Past 12 Months Frequency of Drinking Alcoholic Beverages',
    'ALQ130': 'Average Number of Alcoholic Drinks/Day in Past 12 Months',
    'ALQ142': 'Number of Days with 4 or 5 Drinks in Past 12 Months',
    'ALQ270': 'Number of Times 4-5 Drinks in 2 Hours in Past 12 Months',
    'ALQ280': 'Number of Times 8+ Drinks in 1 Day in Past 12 Months',
    'ALQ290': 'Number of Times 12+ Drinks in 1 Day in Past 12 Months',
    'ALQ151': 'Ever Had 4/5 or More Drinks Every Day',
    'ALQ170CK': 'CHECK ITEM',
    'ALQ170': 'Past 30 Days Number of Times 4-5 Drinks on an Occasion'
}

df_alc.rename(columns=new_alc_column_names, inplace=True)


# Dropping columns

In [9]:
df_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14986 entries, 0 to 14985
Data columns (total 28 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Respondent_ID                       14986 non-null  float64
 1   Doctor_Told_Diabetes                14986 non-null  float64
 2   Age_When_First_Told_Diabetes        1443 non-null   float64
 3   Ever_Told_Prediabetes               9516 non-null   float64
 4   Had_Blood_Test_Past_Three_Years     9796 non-null   float64
 5   Taking_Insulin_Now                  1445 non-null   float64
 6   How_Long_Taking_Insulin             427 non-null    float64
 7   Insulin_Duration_Unit               418 non-null    float64
 8   Take_Diabetic_Pills                 2679 non-null   float64
 9   Time_Since_Saw_Diabetes_Specialist  1443 non-null   float64
 10  One_Doctor_For_Diabetes             1443 non-null   float64
 11  Times_Seen_Doctor_Past_Year         1107 

In [10]:
# List of columns to keep
columns_to_keep_diabetes = ['Respondent_ID', 'Doctor_Told_Diabetes', 'Age_When_First_Told_Diabetes']

# Drop columns not in the list
df_diabetes = df_diabetes[columns_to_keep_diabetes]


In [11]:
# DEMO

columns_to_keep_activity = ['Respondent_ID', 'Gender', 'Age_Years_Screening', 'Race_Hispanic_Origin_with_NH_Asian', 'Country_of_Birth']

df_demo = df_demo[columns_to_keep_activity]

# Merging the datasets

In [12]:
# Merge df_diabetes and df_diet on 'Respondent_ID'
df_merged = pd.merge(df_diabetes, df_diet, on='Respondent_ID')

# Merge df_demo on 'Respondent_ID' with the already merged DataFrame
df_merged = pd.merge(df_merged, df_demo, on='Respondent_ID')

# Merge 'df_activity' with the already merged DataFrame on 'Respondent_ID'
df_merged = pd.merge(df_merged, df_activity, on='Respondent_ID')

# Merge 'df_body' with the already merged DataFrame on 'Respondent_ID'
df_merged = pd.merge(df_merged, df_body, on='Respondent_ID')

# Merge 'df_alc' with the already merged DataFrame on 'Respondent_ID'
df_merged = pd.merge(df_merged, df_alc, on='Respondent_ID')


In [13]:
df_merged.round(0)

Unnamed: 0,Respondent_ID,Doctor_Told_Diabetes,Age_When_First_Told_Diabetes,Ever_Breastfed_or_Fed_Breastmilk,Age_Stopped_Breastfeeding_Days,Age_First_Fed_Formula_Days,Age_Stopped_Receiving_Formula_Days,Age_Started_Other_Food_Beverage,Age_First_Fed_Milk_Days,Type_of_Milk_First_Fed_Whole_Milk,...,Hip Circumference Comment,Ever Had a Drink of Any Kind of Alcohol,Past 12 Months Frequency of Drinking Alcoholic Beverages,Average Number of Alcoholic Drinks/Day in Past 12 Months,Number of Days with 4 or 5 Drinks in Past 12 Months,Number of Times 4-5 Drinks in 2 Hours in Past 12 Months,Number of Times 8+ Drinks in 1 Day in Past 12 Months,Number of Times 12+ Drinks in 1 Day in Past 12 Months,Ever Had 4/5 or More Drinks Every Day,Past 30 Days Number of Times 4-5 Drinks on an Occasion
0,109266.0,2.0,,,,,,,,,...,,1.0,10.0,1.0,0.0,,,,2.0,0.0
1,109271.0,2.0,,,,,,,,,...,,1.0,0.0,,,,,,1.0,
2,109273.0,2.0,,,,,,,,,...,,1.0,0.0,,,,,,2.0,
3,109274.0,1.0,54.0,,,,,,,,...,,1.0,4.0,2.0,5.0,7.0,0.0,,2.0,0.0
4,109282.0,2.0,,,,,,,,,...,1.0,1.0,0.0,,,,,,2.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8960,124815.0,2.0,,,,,,,,,...,,1.0,3.0,1.0,0.0,,,,2.0,0.0
8961,124817.0,1.0,67.0,,,,,,,,...,,1.0,3.0,2.0,0.0,,,,2.0,0.0
8962,124818.0,2.0,,,,,,,,,...,,1.0,9.0,2.0,0.0,,,,2.0,0.0
8963,124821.0,3.0,,,,,,,,,...,,1.0,5.0,5.0,7.0,0.0,0.0,,2.0,0.0


In [14]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8965 entries, 0 to 8964
Data columns (total 98 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Respondent_ID                                             8965 non-null   float64
 1   Doctor_Told_Diabetes                                      8965 non-null   float64
 2   Age_When_First_Told_Diabetes                              1326 non-null   float64
 3   Ever_Breastfed_or_Fed_Breastmilk                          0 non-null      float64
 4   Age_Stopped_Breastfeeding_Days                            0 non-null      float64
 5   Age_First_Fed_Formula_Days                                0 non-null      float64
 6   Age_Stopped_Receiving_Formula_Days                        0 non-null      float64
 7   Age_Started_Other_Food_Beverage                           0 non-null      float64
 8   Age_First_Fed_Milk

In [15]:
df_merged.describe()

Unnamed: 0,Respondent_ID,Doctor_Told_Diabetes,Age_When_First_Told_Diabetes,Ever_Breastfed_or_Fed_Breastmilk,Age_Stopped_Breastfeeding_Days,Age_First_Fed_Formula_Days,Age_Stopped_Receiving_Formula_Days,Age_Started_Other_Food_Beverage,Age_First_Fed_Milk_Days,Type_of_Milk_First_Fed_Whole_Milk,...,Hip Circumference Comment,Ever Had a Drink of Any Kind of Alcohol,Past 12 Months Frequency of Drinking Alcoholic Beverages,Average Number of Alcoholic Drinks/Day in Past 12 Months,Number of Days with 4 or 5 Drinks in Past 12 Months,Number of Times 4-5 Drinks in 2 Hours in Past 12 Months,Number of Times 8+ Drinks in 1 Day in Past 12 Months,Number of Times 12+ Drinks in 1 Day in Past 12 Months,Ever Had 4/5 or More Drinks Every Day,Past 30 Days Number of Times 4-5 Drinks on an Occasion
count,8965.0,8965.0,1326.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,350.0,8370.0,7503.0,5863.0,5863.0,2471.0,2471.0,933.0,7501.0,5849.0
mean,117107.849637,1.883882,59.315988,,,,,,,,...,1.0,1.103584,4.945622,4.187958,3.390585,4.541076,3.290571,4.37299,1.859085,14.96512
std,4501.048325,0.435757,98.07012,,,,,,,,...,0.0,0.304739,3.976266,40.354534,6.646142,11.28773,7.357537,10.0207,0.43849,111.2895
min,109266.0,1.0,1.0,,,,,,,,...,1.0,1.0,5.397605e-79,1.0,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,1.0,5.397605e-79
25%,113211.0,2.0,40.0,,,,,,,,...,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,2.0,5.397605e-79
50%,117091.0,2.0,50.0,,,,,,,,...,1.0,1.0,5.0,2.0,5.397605e-79,5.397605e-79,5.397605e-79,5.397605e-79,2.0,5.397605e-79
75%,121022.0,2.0,59.0,,,,,,,,...,1.0,1.0,8.0,3.0,7.0,8.0,7.0,8.0,2.0,1.0
max,124822.0,9.0,999.0,,,,,,,,...,1.0,2.0,99.0,999.0,99.0,99.0,99.0,99.0,9.0,999.0


In [16]:
# List of columns for logistic regression
columns_for_log_reg = ["Doctor_Told_Diabetes", "Gender", "Race_Hispanic_Origin_with_NH_Asian", 
                "Country_of_Birth", "How_Healthy_is_the_Diet", "Number_of_Meals_Not_Home_Prepared", 
                "Vigorous work activity", "Moderate work activity", "Walk or bicycle", "Weight (kg)", 
                "Body Mass Index (kg/m**2)", "Arm Circumference (cm)", "Waist Circumference (cm)", 
                "Hip Circumference (cm)", "Past 12 Months Frequency of Drinking Alcoholic Beverages", ]


df_10 = df_merged[columns_for_log_reg]

In [17]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8965 entries, 0 to 8964
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Doctor_Told_Diabetes                                      8965 non-null   float64
 1   Gender                                                    8965 non-null   float64
 2   Race_Hispanic_Origin_with_NH_Asian                        8965 non-null   float64
 3   Country_of_Birth                                          8965 non-null   float64
 4   How_Healthy_is_the_Diet                                   8965 non-null   float64
 5   Number_of_Meals_Not_Home_Prepared                         8965 non-null   float64
 6   Vigorous work activity                                    8965 non-null   float64
 7   Moderate work activity                                    8965 non-null   float64
 8   Walk or bicycle   

In [18]:
# Removing 3 and 9 from predictor variable
df_10 = df_10[df_10["Doctor_Told_Diabetes"].isin([1.0, 2.0])]

# Fill NAs with mean
df_10 = df_10.fillna(df_10.mean())

df_10 = df_10[df_10['Past 12 Months Frequency of Drinking Alcoholic Beverages'].isin([77, 99]) == False]


In [19]:
df_10.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8705 entries, 0 to 8964
Data columns (total 15 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Doctor_Told_Diabetes                                      8705 non-null   float64
 1   Gender                                                    8705 non-null   float64
 2   Race_Hispanic_Origin_with_NH_Asian                        8705 non-null   float64
 3   Country_of_Birth                                          8705 non-null   float64
 4   How_Healthy_is_the_Diet                                   8705 non-null   float64
 5   Number_of_Meals_Not_Home_Prepared                         8705 non-null   float64
 6   Vigorous work activity                                    8705 non-null   float64
 7   Moderate work activity                                    8705 non-null   float64
 8   Walk or bicycle        

In [20]:
df_10["Walk or bicycle"].unique()

array([2., 1., 9.])

In [21]:
# Split the data into X (features) and y (target)
X = df_10.drop('Doctor_Told_Diabetes', axis=1)
y = df_10['Doctor_Told_Diabetes']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize logistic regression model
logreg = LogisticRegression(max_iter=10000)

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Predict on test data
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

coefficients = logreg.coef_[0]
features = X.columns

coeff_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})

sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)


print(sorted_coeff_df)

Accuracy: 0.8575531303848363
              precision    recall  f1-score   support

         1.0       0.45      0.04      0.07       246
         2.0       0.86      0.99      0.92      1495

    accuracy                           0.86      1741
   macro avg       0.66      0.52      0.50      1741
weighted avg       0.81      0.86      0.80      1741

                                              Feature  Coefficient
0                                              Gender     0.181965
12                             Hip Circumference (cm)     0.061216
10                             Arm Circumference (cm)     0.056485
13  Past 12 Months Frequency of Drinking Alcoholic...     0.044565
3                             How_Healthy_is_the_Diet     0.028913
8                                         Weight (kg)     0.024123
2                                    Country_of_Birth     0.003554
4                   Number_of_Meals_Not_Home_Prepared    -0.000047
1                  Race_Hispanic_Origin_w

## 👨‍💻 ML MODEL FOR FIRST STREAMLIT VERSION

Below I will try to make a dumb machine learning model with only gender, hip circumference and moderate work activity as predictors.

Why? 
To make a basic Streamlit model that we can work more on when we have decided on the predictor values we will ask the user.
- this includes taking a string input and converting it into one of the numbers our model us using


In [22]:
columns_for_dumb_model = ["Doctor_Told_Diabetes", "Gender", "Hip Circumference (cm)", "Moderate work activity"]
df_3 = df_merged[columns_for_dumb_model]


# Removing 3 and 9 from predictor variable
df_3 = df_3[df_3["Doctor_Told_Diabetes"].isin([1.0, 2.0])]

df_3 = df_3[df_3["Moderate work activity"].isin([1.0, 2.0])]

# Fill NAs with mean
df_3 = df_3.fillna(df_3.mean())

df_3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8705 entries, 0 to 8964
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Doctor_Told_Diabetes    8705 non-null   float64
 1   Gender                  8705 non-null   float64
 2   Hip Circumference (cm)  8705 non-null   float64
 3   Moderate work activity  8705 non-null   float64
dtypes: float64(4)
memory usage: 340.0 KB


In [23]:
for column in df_3.columns:
    print(f"Unique values in {column}:")
    print(df_3[column].unique())
    print("-" * 50)  # Just to separate the output for better visibility


Unique values in Doctor_Told_Diabetes:
[2. 1.]
--------------------------------------------------
Unique values in Gender:
[2. 1.]
--------------------------------------------------
Unique values in Hip Circumference (cm):
[126.1        108.2         94.5        107.8        107.17033836
 125.5        106.5        106.2        106.4        120.2
 101.          92.5        110.2        106.7         82.
  88.9         93.4         96.3         96.4        107.7
 102.         121.3        110.8        114.4        102.3
  94.8         97.5         97.          95.5        119.5
  96.          99.5        129.5        103.8        122.3
 134.3        119.3        150.4         90.5        120.4
 129.          95.          85.7        114.         102.5
  99.3        103.5        107.9        104.8        105.1
 115.5         93.8        129.7        120.9         86.1
  95.8        106.          88.7         87.5        148.3
 111.         103.         130.5         97.3         99.2
 112

In [24]:
# Split the data into X (features) and y (target)
X = df_3.drop('Doctor_Told_Diabetes', axis=1)
y = df_3['Doctor_Told_Diabetes']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize logistic regression model
logreg = LogisticRegression(max_iter=10000)

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Predict on test data
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

coefficients = logreg.coef_[0]
features = X.columns

coeff_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})

sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)


print(sorted_coeff_df)

Accuracy: 0.8477886272257323
              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00       265
         2.0       0.85      1.00      0.92      1476

    accuracy                           0.85      1741
   macro avg       0.42      0.50      0.46      1741
weighted avg       0.72      0.85      0.78      1741

                  Feature  Coefficient
0                  Gender     0.437383
1  Hip Circumference (cm)    -0.024556
2  Moderate work activity    -0.435457


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
X

Unnamed: 0,Gender,Hip Circumference (cm),Moderate work activity
0,2.0,126.100000,2.0
1,1.0,108.200000,1.0
2,1.0,94.500000,2.0
3,1.0,107.800000,1.0
4,1.0,107.170338,2.0
...,...,...,...
8959,1.0,117.400000,2.0
8960,1.0,105.300000,2.0
8961,2.0,121.400000,2.0
8962,1.0,118.000000,2.0


In [26]:
X1 = np.array([[2., 172.8, 2.]])
X1

array([[  2. , 172.8,   2. ]])

In [27]:
y_pred = logreg.predict(X1)
y_pred



array([2.])

In [28]:
from joblib import dump

# Save the logistic regression model and scaler
dump(logreg, 'logreg_model.joblib')
dump(scaler, 'scaler.joblib')

['scaler.joblib']