In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# --- Adjusting display options ---
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000) 
pd.set_option('display.max_colwidth', None) 
pd.set_option('display.max_rows', None)      

df = pd.read_csv('dashboard/final_cleaned.csv')

# --- 1. Create a new, binned DataFrame for mining ---
df_binned = pd.DataFrame()

# --- 2. Discretize Numerical Columns ---

# Bin age (using 4 quantile bins: 0-25%, 25-50%, 50-75%, 75-100%)
df_binned['age'] = pd.qcut(df['age_years'], 4, labels=['Age_Q1', 'Age_Q2', 'Age_Q3', 'Age_Q4'])

# Bin lead levels
df_binned['lead'] = pd.qcut(df['lead_µg/dL'], 3, labels=['Lead_Low', 'Lead_Medium', 'Lead_High'])

# Bin cadmium levels
df_binned['cadmium'] = pd.qcut(df['cadmium_µg/L'], 3, labels=['Cadmium_Low', 'Cadmium_Medium', 'Cadmium_High'])

# Bin estradiol levels
df_binned['estradiol'] = pd.qcut(df['estradiol'], 3, labels=['Estradiol_Low', 'Estradiol_Medium', 'Estradiol_High'])

# --- 3. Convert Key Categorical Outcomes ---
df_binned['infertility'] = df['infertility_1yr'].map({1: 'Infertility_Yes', 2: 'Infertility_No'})
df_binned['regular_periods'] = df['regular_periods'].map({1.0: 'Periods_Regular', 2.0: 'Periods_Irregular'})

# Drop any rows that have missing values after binning
df_binned.dropna(inplace=True)

print("--- Binned Data for Mining ---")
print(df_binned)

--- Binned Data for Mining ---
         age         lead         cadmium         estradiol      infertility    regular_periods
7     Age_Q3     Lead_Low  Cadmium_Medium    Estradiol_High   Infertility_No    Periods_Regular
15    Age_Q4    Lead_High  Cadmium_Medium     Estradiol_Low   Infertility_No  Periods_Irregular
18    Age_Q3    Lead_High    Cadmium_High    Estradiol_High   Infertility_No    Periods_Regular
46    Age_Q3     Lead_Low     Cadmium_Low    Estradiol_High   Infertility_No    Periods_Regular
56    Age_Q3     Lead_Low    Cadmium_High    Estradiol_High   Infertility_No    Periods_Regular
63    Age_Q4    Lead_High  Cadmium_Medium     Estradiol_Low   Infertility_No  Periods_Irregular
65    Age_Q3  Lead_Medium    Cadmium_High    Estradiol_High   Infertility_No    Periods_Regular
71    Age_Q3     Lead_Low    Cadmium_High    Estradiol_High   Infertility_No  Periods_Irregular
72    Age_Q3     Lead_Low     Cadmium_Low  Estradiol_Medium   Infertility_No  Periods_Irregular
75    Age

In [2]:
# Convert the DataFrame into a one-hot encoded format
df_onehot = pd.get_dummies(df_binned)

print("\n--- One-Hot Encoded Data ---")
print(df_onehot.head())


--- One-Hot Encoded Data ---
    age_Age_Q1  age_Age_Q2  age_Age_Q3  age_Age_Q4  lead_Lead_Low  lead_Lead_Medium  lead_Lead_High  cadmium_Cadmium_Low  cadmium_Cadmium_Medium  cadmium_Cadmium_High  estradiol_Estradiol_Low  estradiol_Estradiol_Medium  estradiol_Estradiol_High  infertility_Infertility_No  infertility_Infertility_Yes  regular_periods_Periods_Irregular  regular_periods_Periods_Regular
7        False       False        True       False           True             False           False                False                    True                 False                    False                       False                      True                        True                        False                              False                             True
15       False       False       False        True          False             False            True                False                    True                 False                     True                       False         

In [3]:
# --- 4. Find frequent itemsets with Apriori ---

# min_support=0.05 - itemsets that appear in at least 5% of the rows
frequent_itemsets = apriori(df_onehot, min_support=0.05, use_colnames=True)

# --- 5. Generate the association rules ---
# rules with a 'lift' > 1.2
# Lift > 1 - items are more likely to co-occur than by random chance
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.2)

# --- 6. View the Results ---
# Sort the rules by 'lift' to see the strongest associations first
rules_sorted = rules.sort_values(by="lift", ascending=False)

print("\n--- Top Association Rules ---")
print(rules_sorted[['antecedents', 'consequents', 'support', 'confidence', 'lift']])


--- Top Association Rules ---
                                                                                                                            antecedents                                                                                                                        consequents   support  confidence      lift
1703                                  (infertility_Infertility_No, age_Age_Q4, cadmium_Cadmium_High, regular_periods_Periods_Irregular)                                                                                          (lead_Lead_High, estradiol_Estradiol_Low)  0.064792    0.500000  5.049383
1724                                                                                          (lead_Lead_High, estradiol_Estradiol_Low)                                  (infertility_Infertility_No, age_Age_Q4, cadmium_Cadmium_High, regular_periods_Periods_Irregular)  0.064792    0.654321  5.049383
1704                                                              (infer