# Diabetes Detection Concept
This notebook is not functional and by no means efficient, but it illustrates a processing pipeline for predicting classes based on association rules and their weighted contributions to the parent class to which they belong.

## Download Dataset for "Diabetes Health Indicators"

In [None]:
import kagglehub

# Download latest version of the dataset via kagglehub.
# The value returned by dataset_download is kept in the global `path`;
# preprocess_data below takes a `path` argument and reads
# "<path>/diabetes_dataset.csv" from it.
path = kagglehub.dataset_download("mohankrishnathalla/diabetes-health-indicators-dataset")

# Show where the files ended up so the location can be checked by hand.
print("Path to dataset files:", path)

## Process Dataset

In [None]:
import pandas as pd

# (source column, output column, bin edges, labels) for every numeric feature
# that preprocess_data discretises. When source and output names are equal the
# numeric column is replaced by its categorical version.
_BINNING_SPECS = [
    # --- Lifestyle & body measurements ---
    ('age', 'age_group',
     [18, 30, 40, 50, 60, 70, 80, 90],
     ['18-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-90']),
    ('alcohol_consumption_per_week', 'alcohol_group',
     [0, 1, 3, 7, 14, 21, 100],
     ['None', 'Low', 'Moderate', 'Frequent', 'Heavy', 'Extreme']),
    ('physical_activity_minutes_per_week', 'activity_group',
     [0, 60, 120, 180, 300, 600, 1000],
     ['Sedentary', 'Low', 'Moderate', 'Active', 'VeryActive', 'Athlete']),
    ('diet_score', 'diet_group',
     [0, 3, 5, 7, 8.5, 10],
     ['Poor', 'Fair', 'Good', 'VeryGood', 'Excellent']),
    ('sleep_hours_per_day', 'sleep_group',
     [3, 6.3, 7, 7.7, 10],
     ['Deprivation', 'Poor', 'Healthy', 'Oversleeping']),
    ('bmi', 'bmi_group',
     [15, 23.2, 25.6, 28, 39.2],
     ['Underweight', 'Healthy', 'Overweight', 'Obesity']),
    ('waist_to_hip_ratio', 'waist_ratio',
     [0.67, 0.82, 0.86, 0.89, 1.06],
     ['Low', 'Moderate', 'High', 'VeryHigh']),
    # --- Cardiovascular indicators ---
    ('systolic_bp', 'systolic_bp',
     [80, 100, 120, 130, 140, 160, 200],
     ['Low', 'Ideal', 'Elevated', 'Hypertension_Stage1', 'Hypertension_Stage2', 'Crisis']),
    ('diastolic_bp', 'diastolic_bp',
     [40, 60, 80, 90, 100, 120],
     ['Low', 'Ideal', 'Elevated', 'Stage1', 'Stage2']),
    ('heart_rate', 'heart_rate',
     [40, 60, 80, 100, 120, 200],
     ['Bradycardia', 'Normal', 'Elevated', 'Tachycardia', 'Extreme']),
    # --- Lipid profile ---
    ('cholesterol_total', 'cholesterol_level',
     [100, 160, 200, 240, 300, 400],
     ['Low', 'Desirable', 'Borderline', 'High', 'VeryHigh']),
    ('hdl_cholesterol', 'hdl_level',
     [10, 40, 60, 100],
     ['Low', 'Normal', 'High']),
    ('ldl_cholesterol', 'ldl_level',
     [30, 100, 130, 160, 190, 300],
     ['Optimal', 'NearOptimal', 'Borderline', 'High', 'VeryHigh']),
    ('triglycerides', 'triglycerides_level',
     [20, 150, 200, 500, 1000],
     ['Normal', 'Borderline', 'High', 'VeryHigh']),
    # --- Glucose metabolism ---
    ('glucose_fasting', 'glucose_fasting',
     [50, 100, 126, 200, 400],
     ['Normal', 'Prediabetic', 'Diabetic', 'Severe']),
    ('glucose_postprandial', 'glucose_postprandial',
     [50, 140, 200, 400],
     ['Normal', 'Prediabetic', 'Diabetic']),
    ('insulin_level', 'insulin_level',
     [1, 5, 20, 40, 100],
     ['Low', 'Normal', 'Elevated', 'Severe']),
    ('hba1c', 'hba1c_level',
     [3.5, 5.7, 6.5, 8, 10, 14],
     ['Normal', 'Prediabetic', 'Diabetic', 'PoorControl', 'Severe']),
    # --- Risk & stage ---
    ('diabetes_risk_score', 'diabetes_risk_score',
     [0, 25, 50, 75, 90, 100],
     ['Low', 'Mild', 'Moderate', 'High', 'Severe']),
]

# Columns handed to pd.get_dummies, in output order.
_CATEGORICAL_FEATURES = [
    'gender', 'ethnicity', 'education_level', 'income_level',
    'employment_status', 'smoking_status',
    'age_group', 'alcohol_group', 'activity_group', 'diet_group',
    'sleep_group', 'family_history_diabetes', 'hypertension_history',
    'cardiovascular_history', 'bmi_group', 'waist_ratio',
    'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_level',
    'hdl_level', 'ldl_level', 'triglycerides_level', 'glucose_fasting',
    'glucose_postprandial', 'insulin_level', 'hba1c_level', 'diabetes_risk_score',
    'diabetes_stage', 'diagnosed_diabetes',
]


def preprocess_data(path):
    """Load the diabetes CSV and turn it into a boolean one-hot table.

    Reads ``<path>/diabetes_dataset.csv``, discretises each numeric feature
    listed in ``_BINNING_SPECS`` with ``pd.cut``, one-hot encodes the columns
    in ``_CATEGORICAL_FEATURES`` with ``pd.get_dummies`` and casts the result
    to ``bool``.

    Args:
        path: Directory that contains ``diabetes_dataset.csv``.

    Returns:
        The boolean DataFrame. It is also printed, as before.
    """
    # Load the dataset
    data = pd.read_csv(f"{path}/diabetes_dataset.csv")

    for source_col, target_col, bins, labels in _BINNING_SPECS:
        # include_lowest=True closes the first interval on the left. Without
        # it a value sitting exactly on the lowest edge (age 18, 0 drinks per
        # week, 0 activity minutes, risk score 0, ...) falls outside every bin
        # and becomes NaN, i.e. an all-False row after one-hot encoding.
        # NOTE(review): bins stay right-closed, so e.g. age 30 is labelled
        # '18-29'; confirm whether right=False was intended for the age bins.
        data[target_col] = pd.cut(
            data[source_col],
            bins=bins,
            labels=labels,
            include_lowest=True,
        )

    # One-hot encode categorical features. Numeric 0/1 columns in the list are
    # passed through by get_dummies and become booleans in the cast below.
    processed_data = pd.get_dummies(data[_CATEGORICAL_FEATURES])

    # Convert to boolean
    finalized_data = processed_data.astype(bool)

    print(finalized_data)

    # Hand the frame back so callers can actually use it.
    return finalized_data

## Mine Frequent Itemsets with Apriori & Association Rule Mining

In [None]:
from mlxtend.frequent_patterns import apriori, association_rules

# Mine frequent itemsets with Apriori.
# NOTE(review): `df` is not assigned anywhere in the visible notebook;
# presumably it should be the boolean one-hot frame built in
# preprocess_data -- confirm and bind it before running this cell.
# Itemsets need a support of at least 0.25; use_colnames=True reports them by
# column name rather than by column index.
frequent_itemsets = apriori(df, min_support =0.25, use_colnames =True)
print(frequent_itemsets)

# Generate association rules, keeping those with confidence >= 0.3.
# `rules` is consumed by compute_rule_weights in the next cell.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.3)
print(rules)

## Calculate Rule Weights

In [None]:
import math
def compute_rule_weights(rules):
    """Attach a ``weight`` column to a table of association rules.

    A rule's weight is ``confidence * ln(1 + consequent support)``.

    Args:
        rules: DataFrame with ``confidence`` and ``consequent support``
            columns.

    Returns:
        The same ``rules`` object; the column is added in place.
    """
    rules['weight'] = [
        entry['confidence'] * math.log(1 + entry['consequent support'])
        for _, entry in rules.iterrows()
    ]
    return rules

# Calculate rule weights.
# compute_rule_weights adds the 'weight' column to `rules` in place and
# returns that same object, so `weighted_rules` and `rules` refer to one
# DataFrame.
weighted_rules = compute_rule_weights(rules)
print(weighted_rules)

## Find Rules Applicable to Match Record

In [None]:
def apply_rules_to_record(record, rules):
    """Return the rules whose antecedents are all present in ``record``.

    Args:
        record: Series indexed by item name; an item counts as present when
            its value compares equal to ``True``.
        rules: DataFrame with an ``antecedents`` column holding iterables of
            item names.

    Returns:
        A list with one row Series per matching rule, in ``rules`` order.
    """
    # The record's active items do not depend on the rule, so build the set
    # once rather than once per rule.
    active_items = set(record[record == True].index)  # noqa: E712 (element-wise compare)
    return [
        rule
        for _, rule in rules.iterrows()
        if set(rule['antecedents']).issubset(active_items)
    ]

# Find the rules applicable to each record and total their weights.
# NOTE(review): `df` is not assigned anywhere in the visible notebook;
# presumably it is the boolean one-hot frame -- confirm before running.
total_weights = []
for index, row in df.iterrows():
    applicable_rules = apply_rules_to_record(row, weighted_rules)
    print(f"Record {index} applicable rules:")
    total_weight = sum(rule['weight'] for rule in applicable_rules)
    total_weights.append(total_weight)

# Write the column once, after the loop, with one value per record. Assigning
# the scalar inside the loop overwrote the whole column on every pass, so all
# records ended up with the last record's total, and it also changed `df`
# while it was being iterated.
df['Total Rule Weight'] = total_weights
print(df.head())

## Apply Normalization

In [None]:
def normalize_weights(record, applicable_rules):
    """Express each applicable rule's weight as a share of the record total.

    Args:
        record: Mapping-like row providing ``'Total Rule Weight'``.
        applicable_rules: Iterable of rules, each providing ``'weight'``.

    Returns:
        A list of ``(normalized_weight, rule)`` tuples in input order. The
        normalized weight is ``0`` whenever the total is not greater than 0.
    """
    denominator = record['Total Rule Weight']

    def share_of_total(rule):
        # A non-positive total would make the ratio meaningless; report 0.
        if denominator > 0:
            return rule['weight'] / denominator
        return 0

    return [(share_of_total(rule), rule) for rule in applicable_rules]

# Apply normalization: for every record, look up its applicable rules again
# and print each rule's weight as a fraction of the record's
# 'Total Rule Weight'.
for index, row in df.iterrows():
    # The rule match is recomputed here; the list built in the earlier cell
    # is not kept per record.
    applicable_rules = apply_rules_to_record(row, weighted_rules)
    # normalize_weights returns (normalized_weight, rule) tuples.
    normalized_weights = normalize_weights(row, applicable_rules)
    print(f"Record {index} normalized rule weights:")
    for norm_weight, rule in normalized_weights:
        print(f"Rule: {rule['antecedents']} -> {rule['consequents']}, Normalized Weight: {norm_weight}")

## Calculate Risk Score

In [None]:
def compute_risk_score(applicable_rules, normalized_weights):
    """
    Compute a rule-based risk score for diabetes.
    Score = sum(normalized_weight * rule_confidence for all applicable rules)

    Args:
        applicable_rules: Rules that matched the record; each provides
            ``"confidence"``.
        normalized_weights: ``(normalized_weight, rule)`` tuples as returned
            by ``normalize_weights``, aligned with ``applicable_rules``.

    Returns:
        The score as a float; ``0.0`` when no rule applies.
    """
    if not applicable_rules:
        return 0.0
    score = 0.0
    # Each entry is (normalized_weight, rule): the weight comes first. The
    # previous unpacking took the second element as the weight, which bound
    # the rule object itself and multiplied that by the confidence.
    for (norm_weight, _), rule in zip(normalized_weights, applicable_rules):
        score += norm_weight * rule["confidence"]
    return score


## Train Neural Network