In [1]:
# IMPORTING LIBRARIES

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler




# NHanes data from 2017 - 2020

In [2]:
# LOADING DATASETS

df_diabetes = pd.read_sas("data_files/P_DIQ_Diabetes.XPT")

df_diet = pd.read_sas("data_files/P_DBQ_Diet.XPT")

df_demo = pd.read_sas("data_files/P_DEMO_Demographics.XPT")

df_activity = pd.read_sas("data_files/P_PAQ_Activity.XPT")

df_body = pd.read_sas("data_files/P_BMX_BodyMes.XPT")

df_alc = pd.read_sas("data_files/P_ALQ_Alcohol.XPT")


# Renaming column names

In [3]:
#  DIABETES
new_diabetes_column_names = {
    'SEQN': 'Respondent_ID',
    'DIQ010': 'Doctor_Told_Diabetes',
    'DID040': 'Age_When_First_Told_Diabetes',
    'DIQ159': 'Check_Item_1',
    'DIQ160': 'Ever_Told_Prediabetes',
    'DIQ180': 'Had_Blood_Test_Past_Three_Years',
    'DIQ050': 'Taking_Insulin_Now',
    'DID060': 'How_Long_Taking_Insulin',
    'DIQ060U': 'Insulin_Duration_Unit',
    'DIQ065': 'Check_Item_2',
    'DIQ070': 'Take_Diabetic_Pills',
    'DIQ229': 'Check_Item_3',
    'DIQ230': 'Time_Since_Saw_Diabetes_Specialist',
    'DIQ240': 'One_Doctor_For_Diabetes',
    'DID250': 'Times_Seen_Doctor_Past_Year',
    'DID260': 'How_Often_Check_Blood_Glucose',
    'DIQ260U': 'Blood_Glucose_Check_Unit',
    'DIQ275': 'Dr_Checked_A1C_Past_Year',
    'DIQ280': 'Last_A1C_Level',
    'DIQ291': 'Dr_Recommended_A1C',
    'DIQ295': 'Check_Item_4',
    'DIQ300S': 'Recent_SBP',
    'DIQ300D': 'Recent_DBP',
    'DID310S': 'Dr_Recommended_SBP',
    'DID310D': 'Dr_Recommended_DBP',
    'DID320': 'Most_Recent_LDL_Number',
    'DID330': 'Dr_Recommended_LDL',
    'DID341': 'Times_Dr_Checked_Feet_For_Sores',
    'DID350': 'How_Often_Check_Feet',
    'DIQ350U': 'Feet_Check_Unit',
    'DIQ360': 'Last_Pupil_Dilation_Date',
    'DIQ080': 'Diabetes_Affected_Eyes_Retinopathy'
}

# Renaming the columns
df_diabetes.rename(columns=new_diabetes_column_names, inplace=True)

In [4]:
# DIET

new_diet_column_names= {
    'SEQN': 'Respondent_ID',
    'DBQ010': 'Ever_Breastfed_or_Fed_Breastmilk',
    'DBD030': 'Age_Stopped_Breastfeeding_Days',
    'DBD041': 'Age_First_Fed_Formula_Days',
    'DBD050': 'Age_Stopped_Receiving_Formula_Days',
    'DBD055': 'Age_Started_Other_Food_Beverage',
    'DBD061': 'Age_First_Fed_Milk_Days',
    'DBQ073A': 'Type_of_Milk_First_Fed_Whole_Milk',
    'DBQ073B': 'Type_of_Milk_First_Fed_2_Percent_Milk',
    'DBQ073C': 'Type_of_Milk_First_Fed_1_Percent_Milk',
    'DBQ073D': 'Type_of_Milk_First_Fed_Fat_Free_Milk',
    'DBQ073E': 'Type_of_Milk_First_Fed_Soy_Milk',
    'DBQ073U': 'Type_of_Milk_First_Fed_Other',
    'DBD085': 'Check_Item_1',
    'DBQ700': 'How_Healthy_is_the_Diet',
    'DBQ197': 'Past_30_Day_Milk_Product_Consumption',
    'DBQ223A': 'You_Drink_Whole_or_Regular_Milk',
    'DBQ223B': 'You_Drink_2_Percent_Fat_Milk',
    'DBQ223C': 'You_Drink_1_Percent_Fat_Milk',
    'DBQ223D': 'You_Drink_Fat_Free_Skim_Milk',
    'DBQ223E': 'You_Drink_Soy_Milk',
    'DBQ223U': 'You_Drink_Another_Type_of_Milk',
    'DBD225': 'Check_Item_2',
    'DBQ229': 'Regular_Milk_Use_5_Times_Per_Week',
    'DBQ235A': 'How_Often_Drank_Milk_Age_5_12',
    'DBQ235B': 'How_Often_Drank_Milk_Age_13_17',
    'DBQ235C': 'How_Often_Drank_Milk_Age_18_35',
    'DBD265a': 'Check_Item_3',
    'DBQ301': 'Community_Government_Meals_Delivered',
    'DBQ330': 'Eat_Meals_at_Community_Senior_Center',
    'DBD355': 'Check_Item_4',
    'DBQ360': 'Attend_Kindergarten_Thru_High_School',
    'DBQ370': 'School_Serves_School_Lunches',
    'DBD381': 'Number_of_Times_Per_Week_Get_School_Lunch',
    'DBQ390': 'School_Lunch_Free_Reduced_Full_Price',
    'DBQ400': 'School_Serve_Complete_Breakfast_Each_Day',
    'DBD411': 'Number_of_Times_Per_Week_Get_School_Breakfast',
    'DBQ421': 'School_Breakfast_Free_Reduced_Full_Price',
    'DBQ422': 'Check_Item_5',
    'DBQ424': 'Summer_Program_Meal_Free_Reduced_Price',
    'DBD895': 'Number_of_Meals_Not_Home_Prepared',
    'DBD900': 'Number_of_Meals_From_Fast_Food_or_Pizza_Place',
    'DBD905': 'Number_of_Ready_to_Eat_Foods_in_Past_30_Days',
    'DBD910': 'Number_of_Frozen_Meals_Pizza_in_Past_30_Days',
    'DBQ715a': 'Check_Item_6',
    'CBQ596': 'Heard_of_My_Plate',
    'CBQ606': 'Looked_Up_My_Plate_on_Internet',
    'CBQ611': 'Tried_My_Plate_Plan',
    'DBQ930': 'Main_Meal_Planner_Preparer',
    'DBQ935': 'Shared_Meal_Planning_Preparing_Duty',
    'DBQ940': 'Main_Food_Shopper',
    'DBQ945': 'Shared_Food_Shopping_Duty'
    }

df_diet.rename(columns=new_diet_column_names, inplace=True)


In [5]:
# DEMOGRAPHICS
new_demo_column_names = {
    'SEQN': 'Respondent_ID',
    'SDDSRVYR': 'Data_Release_Cycle',
    'RIDSTATR': 'Interview_Examination_Status',
    'RIAGENDR': 'Gender',
    'RIDAGEYR': 'Age_Years',
    'RIDAGEMN': 'Age_Months_Screening_0_24_Mos',
    'RIDRETH1': 'Race_Hispanic_Origin',
    'RIDRETH3': 'Race_Hispanic_Origin_with_NH_Asian',
    'RIDEXMON': 'Six_Month_Time_Period',
    'DMDBORN4': 'Country_of_Birth',
    'DMDYRUSZ': 'Length_of_Time_in_US',
    'DMDEDUC2': 'Education_Level_Adults_20',
    'DMDMARTZ': 'Marital_Status',
    'RIDEXPRG': 'Pregnancy_Status_at_Exam',
    'SIALANG': 'Language_of_SP_Interview',
    'SIAPROXY': 'Proxy_Used_in_SP_Interview',
    'SIAINTRP': 'Interpreter_Used_in_SP_Interview',
    'FIALANG': 'Language_of_Family_Interview',
    'FIAPROXY': 'Proxy_Used_in_Family_Interview',
    'FIAINTRP': 'Interpreter_Used_in_Family_Interview',
    'MIALANG': 'Language_of_MEC_Interview',
    'MIAPROXY': 'Proxy_Used_in_MEC_Interview',
    'MIAINTRP': 'Interpreter_Used_in_MEC_Interview',
    'AIALANGA': 'Language_of_ACASI_Interview',
    'WTINTPRP': 'Full_Sample_Interview_Weight',
    'WTMECPRP': 'Full_Sample_MEC_Exam_Weight',
    'SDMVPSU': 'Masked_Variance_Pseudo_PSU',
    'SDMVSTRA': 'Masked_Variance_Pseudo_Stratum'
}


df_demo.rename(columns = new_demo_column_names, inplace=True)

In [6]:
# ACTIVITY

new_activity_column_names = {
'SEQN': 'Respondent_ID',
'PAQ605': 'Vigorous work activity',
'PAQ610': 'Number of days vigorous work',
'PAD615': 'Minutes vigorous-intensity work',
'PAQ620': 'Moderate work activity',
'PAQ625': 'Number of days moderate work',
'PAD630': 'Minutes moderate-intensity work',
'PAQ635': 'Walk or bicycle',
'PAQ640': 'Number of days walk or bicycle',
'PAD645': 'Minutes walk/bicycle for transportation',
'PAQ650': 'Vigorous recreational activities',
'PAQ655': 'Days vigorous recreational activities',
'PAD660': 'Minutes vigorous recreational activities',
'PAQ665': 'Moderate recreational activities',
'PAQ670': 'Days moderate recreational activities',
'PAD675': 'Minutes moderate recreational activities',
'PAD680': 'Minutes sedentary activity'
}


df_activity.rename(columns = new_activity_column_names, inplace=True)


In [7]:
# BODY MEASUREMENTS

new_body_column_names = {
    'SEQN': 'Respondent_ID',
    'BMDSTATS': 'Body Measures Component Status Code',
    'BMXWT': 'Weight (kg)',
    'BMIWT': 'Weight Comment',
    'BMXRECUM': 'Recumbent Length (cm)',
    'BMIRECUM': 'Recumbent Length Comment',
    'BMXHEAD': 'Head Circumference (cm)',
    'BMIHEAD': 'Head Circumference Comment',
    'BMXHT': 'Standing Height (cm)',
    'BMIHT': 'Standing Height Comment',
    'BMXBMI': 'Body Mass Index (kg/m**2)',
    'BMDBMIC': 'BMI Category - Children/Youth',
    'BMXLEG': 'Upper Leg Length (cm)',
    'BMILEG': 'Upper Leg Length Comment',
    'BMXARML': 'Upper Arm Length (cm)',
    'BMIARML': 'Upper Arm Length Comment',
    'BMXARMC': 'Arm Circumference (cm)',
    'BMIARMC': 'Arm Circumference Comment',
    'BMXWAIST': 'Waist Circumference (cm)',
    'BMIWAIST': 'Waist Circumference Comment',
    'BMXHIP': 'Hip Circumference (cm)',
    'BMIHIP': 'Hip Circumference Comment'
}


df_body.rename(columns=new_body_column_names, inplace=True)


In [8]:
# ALCOHOL

new_alc_column_names = {
    'SEQN': 'Respondent_ID',
    'ALQ111': 'Ever Had a Drink of Any Kind of Alcohol',
    'ALQ121': 'Past 12 Months Frequency of Drinking Alcoholic Beverages',
    'ALQ130': 'Average Number of Alcoholic Drinks/Day in Past 12 Months',
    'ALQ142': 'Number of Days with 4 or 5 Drinks in Past 12 Months',
    'ALQ270': 'Number of Times 4-5 Drinks in 2 Hours in Past 12 Months',
    'ALQ280': 'Number of Times 8+ Drinks in 1 Day in Past 12 Months',
    'ALQ290': 'Number of Times 12+ Drinks in 1 Day in Past 12 Months',
    'ALQ151': 'Ever Had 4/5 or More Drinks Every Day',
    'ALQ170CK': 'CHECK ITEM',
    'ALQ170': 'Past 30 Days Number of Times 4-5 Drinks on an Occasion'
}

df_alc.rename(columns=new_alc_column_names, inplace=True)


# Dropping columns & Cleaning 

In [9]:
# DIABETES DROPPING
# List of columns to keep
columns_to_keep_diabetes = ['Respondent_ID', 
                            'Doctor_Told_Diabetes']

# Drop columns not in the list
df_diabetes = df_diabetes[columns_to_keep_diabetes]

df_diabetes['Doctor_Told_Diabetes'] = df_diabetes['Doctor_Told_Diabetes'].replace({2.0: 0, 9.0: 0, 1.0: 1, 3.0: 1})

In [10]:
# DIET DROPPING
columns_to_keep_diet = ["Respondent_ID",
                        "Number_of_Meals_Not_Home_Prepared",
                        "Number_of_Meals_From_Fast_Food_or_Pizza_Place", 
                        "Number_of_Ready_to_Eat_Foods_in_Past_30_Days",
                        "Number_of_Frozen_Meals_Pizza_in_Past_30_Days"]

df_diet = df_diet[columns_to_keep_diet]

In [11]:
# DEMO DROPPING

columns_to_keep_demo = ['Respondent_ID', 
                            'Gender', 
                            'Age_Years', 
                            'Race_Hispanic_Origin_with_NH_Asian']

df_demo = df_demo[columns_to_keep_demo]

In [12]:
# ACTIVITY DROPPING

columns_to_keep_activity = ["Respondent_ID",
                            'Vigorous work activity',
                            'Walk or bicycle',
                            'Vigorous recreational activities']

df_activity = df_activity[columns_to_keep_activity]

In [13]:
# BODY MEASUREMENTS DROPPING

columns_to_keep_body = ["Respondent_ID",
                        'Body Mass Index (kg/m**2)',
                        'Waist Circumference (cm)',
                        'Hip Circumference (cm)']

df_body = df_body[columns_to_keep_body]

In [14]:
columns_to_keep_alc = ["Respondent_ID",
                       'Past 12 Months Frequency of Drinking Alcoholic Beverages']

df_alc = df_alc[columns_to_keep_alc]

df_alc = df_alc[df_alc['Past 12 Months Frequency of Drinking Alcoholic Beverages'].isin([77, 99]) == False]

# Merging the datasets

In [15]:
# Merge df_diabetes and df_diet on 'Respondent_ID'
df_merged = pd.merge(df_diabetes, df_diet, on='Respondent_ID')

# Merge df_demo on 'Respondent_ID' with the already merged DataFrame
df_merged = pd.merge(df_merged, df_demo, on='Respondent_ID')

# Merge 'df_activity' with the already merged DataFrame on 'Respondent_ID'
df_merged = pd.merge(df_merged, df_activity, on='Respondent_ID')

# Merge 'df_body' with the already merged DataFrame on 'Respondent_ID'
df_merged = pd.merge(df_merged, df_body, on='Respondent_ID')

# Merge 'df_alc' with the already merged DataFrame on 'Respondent_ID'
df_merged = pd.merge(df_merged, df_alc, on='Respondent_ID')


In [16]:
df_merged.round(0)

Unnamed: 0,Respondent_ID,Doctor_Told_Diabetes,Number_of_Meals_Not_Home_Prepared,Number_of_Meals_From_Fast_Food_or_Pizza_Place,Number_of_Ready_to_Eat_Foods_in_Past_30_Days,Number_of_Frozen_Meals_Pizza_in_Past_30_Days,Gender,Age_Years,Race_Hispanic_Origin_with_NH_Asian,Vigorous work activity,Walk or bicycle,Vigorous recreational activities,Body Mass Index (kg/m**2),Waist Circumference (cm),Hip Circumference (cm),Past 12 Months Frequency of Drinking Alcoholic Beverages
0,109266.0,0.0,7.0,0.0,0.0,5.0,2.0,29.0,6.0,2.0,2.0,1.0,38.0,118.0,126.0,10.0
1,109271.0,0.0,2.0,2.0,0.0,0.0,1.0,49.0,3.0,2.0,2.0,2.0,30.0,120.0,108.0,0.0
2,109273.0,0.0,2.0,2.0,0.0,7.0,1.0,36.0,3.0,1.0,2.0,2.0,22.0,87.0,94.0,0.0
3,109274.0,1.0,0.0,,0.0,0.0,1.0,68.0,7.0,1.0,1.0,2.0,30.0,110.0,108.0,4.0
4,109282.0,0.0,1.0,1.0,1.0,8.0,1.0,76.0,3.0,2.0,2.0,2.0,27.0,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8956,124815.0,0.0,1.0,1.0,0.0,0.0,1.0,52.0,4.0,1.0,1.0,1.0,30.0,99.0,105.0,3.0
8957,124817.0,1.0,4.0,1.0,0.0,0.0,2.0,67.0,1.0,1.0,2.0,2.0,38.0,110.0,121.0,3.0
8958,124818.0,0.0,4.0,4.0,5.0,0.0,1.0,40.0,4.0,2.0,2.0,2.0,38.0,115.0,118.0,9.0
8959,124821.0,1.0,0.0,,0.0,0.0,1.0,63.0,4.0,1.0,2.0,2.0,26.0,97.0,100.0,5.0


In [17]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8961 entries, 0 to 8960
Data columns (total 16 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Respondent_ID                                             8961 non-null   float64
 1   Doctor_Told_Diabetes                                      8961 non-null   float64
 2   Number_of_Meals_Not_Home_Prepared                         8961 non-null   float64
 3   Number_of_Meals_From_Fast_Food_or_Pizza_Place             7068 non-null   float64
 4   Number_of_Ready_to_Eat_Foods_in_Past_30_Days              8960 non-null   float64
 5   Number_of_Frozen_Meals_Pizza_in_Past_30_Days              8958 non-null   float64
 6   Gender                                                    8961 non-null   float64
 7   Age_Years                                                 8961 non-null   float64
 8   Race_Hispanic_Orig

In [18]:
df_merged.head()

Unnamed: 0,Respondent_ID,Doctor_Told_Diabetes,Number_of_Meals_Not_Home_Prepared,Number_of_Meals_From_Fast_Food_or_Pizza_Place,Number_of_Ready_to_Eat_Foods_in_Past_30_Days,Number_of_Frozen_Meals_Pizza_in_Past_30_Days,Gender,Age_Years,Race_Hispanic_Origin_with_NH_Asian,Vigorous work activity,Walk or bicycle,Vigorous recreational activities,Body Mass Index (kg/m**2),Waist Circumference (cm),Hip Circumference (cm),Past 12 Months Frequency of Drinking Alcoholic Beverages
0,109266.0,0.0,7.0,5.397605e-79,5.397605e-79,5.0,2.0,29.0,6.0,2.0,2.0,1.0,37.8,117.9,126.1,10.0
1,109271.0,0.0,2.0,2.0,5.397605e-79,5.397605e-79,1.0,49.0,3.0,2.0,2.0,2.0,29.7,120.4,108.2,5.397605e-79
2,109273.0,0.0,2.0,2.0,5.397605e-79,7.0,1.0,36.0,3.0,1.0,2.0,2.0,21.9,86.8,94.5,5.397605e-79
3,109274.0,1.0,5.397605e-79,,5.397605e-79,5.397605e-79,1.0,68.0,7.0,1.0,1.0,2.0,30.2,109.6,107.8,4.0
4,109282.0,0.0,1.0,1.0,1.0,8.0,1.0,76.0,3.0,2.0,2.0,2.0,26.6,,,5.397605e-79


In [19]:
# Fill NAs with mean
df_merged = df_merged.fillna(df_merged.mean())

df_merged.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8961 entries, 0 to 8960
Data columns (total 16 columns):
 #   Column                                                    Non-Null Count  Dtype  
---  ------                                                    --------------  -----  
 0   Respondent_ID                                             8961 non-null   float64
 1   Doctor_Told_Diabetes                                      8961 non-null   float64
 2   Number_of_Meals_Not_Home_Prepared                         8961 non-null   float64
 3   Number_of_Meals_From_Fast_Food_or_Pizza_Place             8961 non-null   float64
 4   Number_of_Ready_to_Eat_Foods_in_Past_30_Days              8961 non-null   float64
 5   Number_of_Frozen_Meals_Pizza_in_Past_30_Days              8961 non-null   float64
 6   Gender                                                    8961 non-null   float64
 7   Age_Years                                                 8961 non-null   float64
 8   Race_Hispanic_Orig

In [20]:
# Split the data into X (features) and y (target)
X = df_merged.drop('Doctor_Told_Diabetes', axis=1)
y = df_merged['Doctor_Told_Diabetes']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize logistic regression model
logreg = LogisticRegression(max_iter=10000)

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Predict on test data
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

coefficients = logreg.coef_[0]
features = X.columns

coeff_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})

sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)


print(sorted_coeff_df)

Accuracy: 0.8131622978248745
              precision    recall  f1-score   support

         0.0       0.82      0.98      0.90      1465
         1.0       0.41      0.05      0.09       328

    accuracy                           0.81      1793
   macro avg       0.62      0.52      0.49      1793
weighted avg       0.75      0.81      0.75      1793

                                              Feature  Coefficient
6                                           Age_Years     0.051223
12                           Waist Circumference (cm)     0.023786
11                          Body Mass Index (kg/m**2)     0.006340
10                   Vigorous recreational activities     0.000611
7                  Race_Hispanic_Origin_with_NH_Asian     0.000275
8                              Vigorous work activity     0.000230
2       Number_of_Meals_From_Fast_Food_or_Pizza_Place     0.000202
1                   Number_of_Meals_Not_Home_Prepared     0.000031
9                                     Wal

## 👨‍💻 ML MODEL FOR FIRST STREAMLIT VERSION

Below I will try to make a dumb machine learning model with only gender, hip circumference and moderate work activity as predictors.

Why? 
To make a basic Streamlit model that we can work more on when we have decided on the predictor values we will ask the user.
- this includes taking a string input and converting it into one of the numbers our model us using


In [21]:
columns_for_dumb_model = ["Doctor_Told_Diabetes", "Gender", "Hip Circumference (cm)", "Vigorous recreational activities"]
df_3 = df_merged[columns_for_dumb_model]

df_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8961 entries, 0 to 8960
Data columns (total 4 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Doctor_Told_Diabetes              8961 non-null   float64
 1   Gender                            8961 non-null   float64
 2   Hip Circumference (cm)            8961 non-null   float64
 3   Vigorous recreational activities  8961 non-null   float64
dtypes: float64(4)
memory usage: 280.2 KB


In [22]:
for column in df_3.columns:
    print(f"Unique values in {column}:")
    print(df_3[column].unique())
    print("-" * 50)  # Just to separate the output for better visibility


Unique values in Doctor_Told_Diabetes:
[0. 1.]
--------------------------------------------------
Unique values in Gender:
[2. 1.]
--------------------------------------------------
Unique values in Hip Circumference (cm):
[126.1        108.2         94.5        107.8        107.23156028
 125.5        106.5        106.2        106.4        120.2
 101.          92.5        110.2        106.7         82.
  88.9         93.4         96.3         96.4        107.7
 102.         121.3        110.8        114.4        102.3
  94.8         97.5         97.          95.5        119.5
  96.          99.5        129.5        103.8        122.3
 134.3        119.3        150.4         90.5        120.4
 129.          95.          85.7        114.         102.5
  99.3        103.5        107.9        104.8        105.1
 116.5        115.5         93.8        129.7        120.9
  86.1         95.8        106.          88.7         87.5
 148.3        111.         103.         130.5         97.3
  99

In [23]:
# Split the data into X (features) and y (target)
X = df_3.drop('Doctor_Told_Diabetes', axis=1)
y = df_3['Doctor_Told_Diabetes']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize logistic regression model
logreg = LogisticRegression(max_iter=10000)

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Predict on test data
y_pred = logreg.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

coefficients = logreg.coef_[0]
features = X.columns

coeff_df = pd.DataFrame({'Feature': features, 'Coefficient': coefficients})

sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)


print(sorted_coeff_df)

Accuracy: 0.8170663692136084
              precision    recall  f1-score   support

         0.0       0.82      1.00      0.90      1465
         1.0       0.00      0.00      0.00       328

    accuracy                           0.82      1793
   macro avg       0.41      0.50      0.45      1793
weighted avg       0.67      0.82      0.73      1793

                            Feature  Coefficient
2  Vigorous recreational activities     1.106592
1            Hip Circumference (cm)     0.020510
0                            Gender    -0.470707


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
X

Unnamed: 0,Gender,Hip Circumference (cm),Vigorous recreational activities
0,2.0,126.10000,1.0
1,1.0,108.20000,2.0
2,1.0,94.50000,2.0
3,1.0,107.80000,2.0
4,1.0,107.23156,2.0
...,...,...,...
8956,1.0,105.30000,1.0
8957,2.0,121.40000,2.0
8958,1.0,118.00000,2.0
8959,1.0,99.80000,2.0


In [25]:
X1 = np.array([[2., 172.8, 2.]])
X1

array([[  2. , 172.8,   2. ]])

In [26]:
y_pred = logreg.predict(X1)
y_pred



array([0.])

In [27]:
from joblib import dump

# Save the logistic regression model and scaler
dump(logreg, 'logreg_model.joblib')
dump(scaler, 'scaler.joblib')

['scaler.joblib']