In [89]:
import pandas as pd

df = pd.read_csv('diabetes_binary_5050split_health_indicators_BRFSS2015.csv')

print(df.shape)
df.head()

(70692, 22)


Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


In [90]:
# Check for missing values
print(df.isnull().sum())

# Check the data types of each column
print(df.info())

Diabetes_binary         0
HighBP                  0
HighChol                0
CholCheck               0
BMI                     0
Smoker                  0
Stroke                  0
HeartDiseaseorAttack    0
PhysActivity            0
Fruits                  0
Veggies                 0
HvyAlcoholConsump       0
AnyHealthcare           0
NoDocbcCost             0
GenHlth                 0
MentHlth                0
PhysHlth                0
DiffWalk                0
Sex                     0
Age                     0
Education               0
Income                  0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70692 entries, 0 to 70691
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Diabetes_binary       70692 non-null  float64
 1   HighBP                70692 non-null  float64
 2   HighChol              70692 non-null  float64
 3   CholCheck             70692 non-null  float64


##### The apriori algorithm is a data mining technique used to identify frequet itemsets and generate association rules from a dataset. It is mostly used for market basket analysis but we'll try and use it to find associations between different categories.

##### Since the algorithm is designed to identify patterns based on the presence or absence of items then all category values need to be binary. We'll transform the categories below

In [91]:
# Transform BMI
df['BMI_Category'] = pd.cut(
    df['BMI'], 
    bins=[0, 18.5, 24.9, 29.9, 100],  # Define meaningful ranges
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'], 
    right=True
)

# Convert BMI categories to binary columns
df = pd.get_dummies(df, columns=['BMI_Category'], prefix='BMI')

# Transform GenHlth
df['Health_Category'] = pd.cut(
    df['GenHlth'], 
    bins=[1, 2, 3, 4, 5, 6], 
    labels=['Excellent', 'Very good', 'Good', 'Fair', 'Poor'], 
    right=True
)

# Transform MentHlth
df['MentHlth_Category'] = pd.cut(
    df['MentHlth'], 
    bins=[0, 1, 7, 30],  # No poor days, 1-7 poor days (some), 8-30 poor days (chronic)
    labels=['No poor days', 'Some poor days', 'Chronic issues'], 
    right=True
)

# Transform PhysHlth
df['PhysHlth_Category'] = pd.cut(
    df['PhysHlth'], 
    bins=[0, 1, 7, 30],  # Same logic as MentHlth
    labels=['No poor days', 'Some poor days', 'Chronic issues'], 
    right=True
)

# Transform Age
age_mapping = {
    1: "18-24", 2: "25-29", 3: "30-34", 4: "35-39",
    5: "40-44", 6: "45-49", 7: "50-54", 8: "55-59",
    9: "60-64", 10: "65-69", 11: "70-74", 12: "75-79", 
    13: "80 or older"
}
df['Age_Category'] = df['Age'].map(age_mapping)

# Transform Education
education_mapping = {
    1: "Never attended school or kindergarten", 
    2: "Grades 1-8 (Elementary)", 
    3: "Grades 9-11 (Some high school)", 
    4: "Grade 12 or GED (High school graduate)", 
    5: "College 1-3 years (Some college or technical school)", 
    6: "College 4+ years (College graduate)"
}
df['Education_Level'] = df['Education'].map(education_mapping)

# Transform Income
income_mapping = {
    1: "<$10,000", 2: "$10,000-$15,000", 3: "$15,000-$20,000", 
    4: "$20,000-$25,000", 5: "$25,000-$35,000", 
    6: "$35,000-$50,000", 7: "$50,000-$75,000", 
    8: ">$75,000"
}
df['Income_Category'] = df['Income'].map(income_mapping)

# Verify transformations
print(df[['MentHlth', 'MentHlth_Category']].head())
print(df[['PhysHlth', 'PhysHlth_Category']].head())
print(df[['Age', 'Age_Category']].head())
print(df[['Education', 'Education_Level']].head())
print(df[['Income', 'Income_Category']].head())


   MentHlth MentHlth_Category
0       5.0    Some poor days
1       0.0               NaN
2       0.0               NaN
3       0.0               NaN
4       0.0               NaN
   PhysHlth PhysHlth_Category
0      30.0    Chronic issues
1       0.0               NaN
2      10.0    Chronic issues
3       3.0    Some poor days
4       0.0               NaN
    Age Age_Category
0   4.0        35-39
1  12.0        75-79
2  13.0  80 or older
3  11.0        70-74
4   8.0        55-59
   Education                                    Education_Level
0        6.0                College 4+ years (College graduate)
1        6.0                College 4+ years (College graduate)
2        6.0                College 4+ years (College graduate)
3        6.0                College 4+ years (College graduate)
4        5.0  College 1-3 years (Some college or technical s...
   Income Income_Category
0     8.0        >$75,000
1     8.0        >$75,000
2     8.0        >$75,000
3     8.0        >$75,000


In [92]:
# Drop original integer columns
df_transformed = df.drop(columns=['BMI', 'MentHlth', 'PhysHlth', 'Age', 'Education', 'Income'])

# Convert categorical columns to binary
df_transactions = pd.get_dummies(df_transformed)

print(df_transactions.head())  # Preview the transactional dataset


   Diabetes_binary  HighBP  HighChol  CholCheck  Smoker  Stroke  \
0              0.0     1.0       0.0        1.0     0.0     0.0   
1              0.0     1.0       1.0        1.0     1.0     1.0   
2              0.0     0.0       0.0        1.0     0.0     0.0   
3              0.0     1.0       1.0        1.0     1.0     0.0   
4              0.0     0.0       0.0        1.0     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  \
0                   0.0           1.0     0.0      1.0                0.0   
1                   0.0           0.0     1.0      0.0                0.0   
2                   0.0           1.0     1.0      1.0                0.0   
3                   0.0           1.0     1.0      1.0                0.0   
4                   0.0           1.0     1.0      1.0                0.0   

   AnyHealthcare  NoDocbcCost  GenHlth  DiffWalk  Sex  BMI_Underweight  \
0            1.0          0.0      3.0       0.0  1.0       

In [93]:
# Set display options
pd.set_option('display.max_columns', None)

# View the DataFrame
print(df_transactions.head())

   Diabetes_binary  HighBP  HighChol  CholCheck  Smoker  Stroke  \
0              0.0     1.0       0.0        1.0     0.0     0.0   
1              0.0     1.0       1.0        1.0     1.0     1.0   
2              0.0     0.0       0.0        1.0     0.0     0.0   
3              0.0     1.0       1.0        1.0     1.0     0.0   
4              0.0     0.0       0.0        1.0     1.0     0.0   

   HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  \
0                   0.0           1.0     0.0      1.0                0.0   
1                   0.0           0.0     1.0      0.0                0.0   
2                   0.0           1.0     1.0      1.0                0.0   
3                   0.0           1.0     1.0      1.0                0.0   
4                   0.0           1.0     1.0      1.0                0.0   

   AnyHealthcare  NoDocbcCost  GenHlth  DiffWalk  Sex  BMI_Underweight  \
0            1.0          0.0      3.0       0.0  1.0       

In [94]:
df_transactions

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,DiffWalk,Sex,BMI_Underweight,BMI_Normal,BMI_Overweight,BMI_Obese,Health_Category_Excellent,Health_Category_Very good,Health_Category_Good,Health_Category_Fair,Health_Category_Poor,MentHlth_Category_No poor days,MentHlth_Category_Some poor days,MentHlth_Category_Chronic issues,PhysHlth_Category_No poor days,PhysHlth_Category_Some poor days,PhysHlth_Category_Chronic issues,Age_Category_18-24,Age_Category_25-29,Age_Category_30-34,Age_Category_35-39,Age_Category_40-44,Age_Category_45-49,Age_Category_50-54,Age_Category_55-59,Age_Category_60-64,Age_Category_65-69,Age_Category_70-74,Age_Category_75-79,Age_Category_80 or older,Education_Level_College 1-3 years (Some college or technical school),Education_Level_College 4+ years (College graduate),Education_Level_Grade 12 or GED (High school graduate),Education_Level_Grades 1-8 (Elementary),Education_Level_Grades 9-11 (Some high school),Education_Level_Never attended school or kindergarten,"Income_Category_$10,000-$15,000","Income_Category_$15,000-$20,000","Income_Category_$20,000-$25,000","Income_Category_$25,000-$35,000","Income_Category_$35,000-$50,000","Income_Category_$50,000-$75,000","Income_Category_<$10,000","Income_Category_>$75,000"
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,1.0,False,False,True,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
1,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,0.0,1.0,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True
3,0.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,1.0,False,False,True,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True
4,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70687,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0,0.0,0.0,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False
70688,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,1.0,1.0,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False
70689,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,5.0,1.0,0.0,False,False,True,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,True,False,False,False,False,False
70690,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,0.0,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False


In [95]:
# Drop the target column
X = df_transactions.drop(columns=['Diabetes_binary'])

# Ensure all values in X are binary (True/False)
transactions = X.map(lambda x: True if x == 1 else False)

In [96]:
# pip install mlxtend

from mlxtend.frequent_patterns import apriori, association_rules

# Run Apriori algorithm
frequent_itemsets = apriori(transactions, min_support=0.05, use_colnames=True)

# Display frequent itemsets
frequent_itemsets[frequent_itemsets['itemsets'].apply(len) > 2]

Unnamed: 0,support,itemsets
420,0.371640,"(HighBP, HighChol, CholCheck)"
421,0.200744,"(HighBP, Smoker, HighChol)"
422,0.094806,"(HighBP, HighChol, HeartDiseaseorAttack)"
423,0.235967,"(HighBP, PhysActivity, HighChol)"
424,0.218313,"(HighBP, Fruits, HighChol)"
...,...,...
6119,0.052665,"(HighBP, Smoker, Sex, Veggies, HighChol, AnyHe..."
6120,0.057347,"(HighBP, Sex, Fruits, Veggies, HighChol, AnyHe..."
6121,0.061294,"(BMI_Obese, HighBP, Fruits, Veggies, HighChol,..."
6122,0.050119,"(HighBP, Education_Level_College 4+ years (Col..."


##### Above is a list categories that commonly appear together

In [97]:
# Generate association rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Display association rules
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(HighBP),(HighChol),0.563458,0.525703,0.374597,0.664817,1.264625,0.078385,1.415040,0.479340
1,(HighChol),(HighBP),0.525703,0.563458,0.374597,0.712564,1.264625,0.078385,1.518741,0.441183
2,(HighBP),(CholCheck),0.563458,0.975259,0.557475,0.989380,1.014480,0.007957,2.329761,0.032696
3,(CholCheck),(HighBP),0.975259,0.563458,0.557475,0.571617,1.014480,0.007957,1.019045,0.576898
4,(HighBP),(Smoker),0.563458,0.475273,0.289453,0.513708,1.080868,0.021656,1.079036,0.171388
...,...,...,...,...,...,...,...,...,...,...
38984,"(Veggies, Sex, Income_Category_>$75,000, Fruits)","(PhysActivity, AnyHealthcare, Education_Level_...",0.087634,0.283639,0.051123,0.583374,2.056748,0.026267,1.719433,0.563146
38985,"(Sex, Income_Category_>$75,000, Fruits, AnyHea...","(PhysActivity, Veggies, Education_Level_Colleg...",0.095485,0.254201,0.051123,0.535407,2.106234,0.026851,1.605275,0.580663
38986,"(Sex, Income_Category_>$75,000, Fruits, PhysAc...","(Veggies, AnyHealthcare, Education_Level_Colle...",0.082810,0.301293,0.051123,0.617356,2.049021,0.026173,1.825996,0.558185
38987,"(Sex, Income_Category_>$75,000, Education_Leve...","(CholCheck, Fruits, Veggies, AnyHealthcare, Ph...",0.100238,0.385334,0.051123,0.510020,1.323580,0.012498,1.254472,0.271709


##### The association rules are patterns discovered from frequent itemsets that describe relationships between items in the set. They show how the presence of one set of items (antecedents) is associated with another set (consequents), often quantified using metrics like support, confidence, and lift. These rules highlight co-occurence patterns such as "If a person has high blood pressure, they likely have high cholesterol." 

##### These rules can help cluster data points by grouping instances that share common attributes. For example, in our dataset, patients with similar symptoms and behaviours can form clusters based on frequent itemsets, allowing models to leverage these groups for prediction or segmentation tasks. This aids in identifying distinct patterns within data for improved model performance.