# Download Dataset for "Diabetes Health Indicators"

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mohankrishnathalla/diabetes-health-indicators-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/codespace/.cache/kagglehub/datasets/mohankrishnathalla/diabetes-health-indicators-dataset/versions/1


# Process Dataset

In [3]:
import pandas as pd

# Load the dataset
data = pd.read_csv(f"{path}/diabetes_dataset.csv")

# Age (18â€“90)
data['Age_Group'] = pd.cut(data['age'], 
                           bins=[18, 30, 40, 50, 60, 70, 80, 90],
                           labels=['18-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-90'])

# Alcohol consumption per week
data['Alcohol_Group'] = pd.cut(data['alcohol_consumption_per_week'],
                               bins=[0, 1, 3, 7, 14, 21, 100],
                               labels=['None', 'Low', 'Moderate', 'Frequent', 'Heavy', 'Extreme'])

# Physical activity per week
data['Activity_Group'] = pd.cut(data['physical_activity_minutes_per_week'],
                                bins=[0, 60, 120, 180, 300, 600, 1000],
                                labels=['Sedentary', 'Low', 'Moderate', 'Active', 'VeryActive', 'Athlete'])

# Diet score
data['Diet_Group'] = pd.cut(data['diet_score'], 
                            bins=[0, 3, 5, 7, 8.5, 10],
                            labels=['Poor', 'Fair', 'Good', 'VeryGood', 'Excellent'])

# One-hot encode categorical features
categorical_features = [
    'gender', 'ethnicity', 'education_level', 'income_level', 
    'employment_status', 'smoking_status',
    'Age_Group', 'Alcohol_Group', 'Activity_Group', 'Diet_Group'
]

df = pd.get_dummies(data[categorical_features])

# Convert to boolean
df = df.astype(bool)

print(df)


       gender_Female  gender_Male  gender_Other  ethnicity_Asian  \
0              False         True         False             True   
1               True        False         False            False   
2              False         True         False            False   
3               True        False         False            False   
4              False         True         False            False   
...              ...          ...           ...              ...   
99995          False         True         False            False   
99996           True        False         False            False   
99997           True        False         False            False   
99998           True        False         False            False   
99999           True        False         False            False   

       ethnicity_Black  ethnicity_Hispanic  ethnicity_Other  ethnicity_White  \
0                False               False            False            False   
1                False 

# Mine Frequent Itemsets with Apriori & Associative Rule Mining

In [4]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine Frequent Itemsets with Apriori
frequent_itemsets = apriori(df, min_support =0.25, use_colnames =True)
print(frequent_itemsets)

# Generate Associative Rules
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
print(rules)

    support                                           itemsets
0   0.50216                                    (gender_Female)
1   0.47771                                      (gender_Male)
2   0.44997                                  (ethnicity_White)
3   0.35037                         (education_level_Graduate)
4   0.44891                       (education_level_Highschool)
5   0.25150                        (income_level_Lower-Middle)
6   0.35152                              (income_level_Middle)
7   0.60175                       (employment_status_Employed)
8   0.59813                             (smoking_status_Never)
9   0.26995                               (Alcohol_Group_None)
10  0.45015                                (Alcohol_Group_Low)
11  0.27391                         (Activity_Group_Sedentary)
12  0.32639                               (Activity_Group_Low)
13  0.42037                                  (Diet_Group_Good)
14  0.30214        (employment_status_Employed, gender_

# Calculate Rule Weights

In [5]:
import math
def compute_rule_weights(rules):
    # Calculate weights for each rule based on support and confidence
    weights = []
    for _, row in rules.iterrows():
        weight = row['confidence'] * math.log(1 + row['consequent support'])
        weights.append(weight)
    rules['weight'] = weights
    return rules

# Calculate Rule Weights
weighted_rules = compute_rule_weights(rules)
print(weighted_rules)

                     antecedents                   consequents  \
0   (employment_status_Employed)               (gender_Female)   
1                (gender_Female)  (employment_status_Employed)   
2         (smoking_status_Never)               (gender_Female)   
3                (gender_Female)        (smoking_status_Never)   
4   (employment_status_Employed)                 (gender_Male)   
5                  (gender_Male)  (employment_status_Employed)   
6         (smoking_status_Never)                 (gender_Male)   
7                  (gender_Male)        (smoking_status_Never)   
8   (employment_status_Employed)             (ethnicity_White)   
9              (ethnicity_White)  (employment_status_Employed)   
10        (smoking_status_Never)             (ethnicity_White)   
11             (ethnicity_White)        (smoking_status_Never)   
12  (employment_status_Employed)  (education_level_Highschool)   
13  (education_level_Highschool)  (employment_status_Employed)   
14        

# Find Rules Applicable to Match Record

In [6]:
def apply_rules_to_record(record, rules):
    applicable_rules = []
    for _, rule in rules.iterrows():
        antecedents = set(rule['antecedents'])
        if antecedents.issubset(set(record[record == True].index)):
            applicable_rules.append(rule)
    return applicable_rules

# Find Rules Applicable to Record
for index, row in df.iterrows():
    applicable_rules = apply_rules_to_record(row, weighted_rules)
    print(f"Record {index} applicable rules:")
    total_weight = 0
    for rule in applicable_rules:
        total_weight += rule['weight']
    df['Total Rule Weight'] = total_weight
print(df.head())

Record 0 applicable rules:
Record 1 applicable rules:
Record 2 applicable rules:
Record 3 applicable rules:
Record 4 applicable rules:
Record 5 applicable rules:
Record 6 applicable rules:
Record 7 applicable rules:
Record 8 applicable rules:
Record 9 applicable rules:
Record 10 applicable rules:
Record 11 applicable rules:
Record 12 applicable rules:
Record 13 applicable rules:
Record 14 applicable rules:
Record 15 applicable rules:
Record 16 applicable rules:
Record 17 applicable rules:
Record 18 applicable rules:
Record 19 applicable rules:
Record 20 applicable rules:
Record 21 applicable rules:
Record 22 applicable rules:
Record 23 applicable rules:
Record 24 applicable rules:
Record 25 applicable rules:
Record 26 applicable rules:
Record 27 applicable rules:
Record 28 applicable rules:
Record 29 applicable rules:
Record 30 applicable rules:
Record 31 applicable rules:
Record 32 applicable rules:
Record 33 applicable rules:
Record 34 applicable rules:
Record 35 applicable rules:
Re

KeyboardInterrupt: 

# Apply Normalization

In [None]:
def normalize_weights(record, applicable_rules):
    total_weight = record['Total Rule Weight']
    normalized_r_weight = []
    for rule in applicable_rules:
        normalized_weight = rule['weight'] / total_weight if total_weight > 0 else 0
        normalized_r_weight.append((normalized_weight, rule))
    return normalized_r_weight

# Apply Normalization
for index, row in df.iterrows():
    applicable_rules = apply_rules_to_record(row, weighted_rules)
    normalized_weights = normalize_weights(row, applicable_rules)
    print(f"Record {index} normalized rule weights:")
    for norm_weight, rule in normalized_weights:
        print(f"Rule: {rule['antecedents']} -> {rule['consequents']}, Normalized Weight: {norm_weight}")

Record 0 normalized rule weights:
Rule: frozenset({'employment_status_Employed'}) -> frozenset({'gender_Female'}), Normalized Weight: 0.05419095348353323
Rule: frozenset({'smoking_status_Never'}) -> frozenset({'gender_Female'}), Normalized Weight: 0.05448825251370005
Rule: frozenset({'employment_status_Employed'}) -> frozenset({'gender_Male'}), Normalized Weight: 0.0494545552686225
Rule: frozenset({'gender_Male'}) -> frozenset({'employment_status_Employed'}), Normalized Weight: 0.07515437935553321
Rule: frozenset({'smoking_status_Never'}) -> frozenset({'gender_Male'}), Normalized Weight: 0.04918241832663514
Rule: frozenset({'gender_Male'}) -> frozenset({'smoking_status_Never'}), Normalized Weight: 0.07393439124881605
Rule: frozenset({'employment_status_Employed'}) -> frozenset({'ethnicity_White'}), Normalized Weight: 0.04440143585605146
Rule: frozenset({'smoking_status_Never'}) -> frozenset({'ethnicity_White'}), Normalized Weight: 0.04449880999630681
Rule: frozenset({'education_level_H

KeyboardInterrupt: 

# Prepare Data for Nerual Network Classification

In [10]:
import pandas as pd
import numpy as np

# Collect all unique rule identifiers
rule_ids = [f"Rule_{i}" for i in range(len(weighted_rules))]

rule_matrix = pd.DataFrame(0.0, index=df.index, columns=rule_ids)

# Fill in normalized weights per record
for idx, row in df.iterrows():
    applicable_rules = apply_rules_to_record(row, weighted_rules)
    normalized_weights = normalize_weights(row, applicable_rules)
    for norm_weight, rule in normalized_weights:
        rule_id = f"Rule_{rule['id']}"  # ensure every rule dict has an 'id'
        rule_matrix.loc[idx, rule_id] = norm_weight


NameError: name 'normalize_weights' is not defined

# Train Neural Network

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

X_train, X_test, y_train, y_test = train_test_split(X_augmented, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = Sequential([
    Dense(128, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_scaled, y_train, epochs=25, batch_size=32, validation_split=0.2)


2025-11-11 05:33:53.266401: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-11 05:33:54.971045: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-11 05:34:02.553817: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


NameError: name 'X_augmented' is not defined