# Apriori Algorithms

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from imblearn.under_sampling import RandomUnderSampler

# ignore warning
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/clean/heart/heart_all.csv')
df = df.drop(columns=['Race', 'SleepTime_6_8'])
df.shape

(319795, 27)

### Balance Data

In [3]:
y = df['HeartDisease'].to_numpy()
undersample = RandomUnderSampler(sampling_strategy='majority')
df_sample, y_sample = undersample.fit_resample(df, y)
print(df_sample.shape)
df_sample.head()

(54746, 27)


Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,...,isFemale,Age_18_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80,SleepTime_0_6,SleepTime_8
0,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [4]:
df_apriori = apriori(df_sample, min_support=0.2, use_colnames=True)
df_apriori.to_csv('../data/clean/heart/heart_apriori.csv', index=False)
df_apriori.head(20)

Unnamed: 0,support,itemsets
0,0.5,(HeartDisease)
1,0.492639,(Smoking)
2,0.242538,(DiffWalking)
3,0.224948,(Diabetic)
4,0.287053,(PhysicalActivity)
5,0.203613,(BMI_18.5_24)
6,0.429639,(BMI_24_30)
7,0.351752,(BMI_30_100)
8,0.372813,(PhysicalNotHealth)
9,0.347952,(MentalNotHealth)


In [5]:
df_association= association_rules(df_apriori, metric="confidence", min_threshold=0.4)
df_association.to_csv('../data/clean/association_rules.csv', index=False)

df_association.sort_values(by='confidence', ascending=False, inplace=True)
df_association.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
4,(PhysicalNotHealth),(HeartDisease),0.372813,0.5,0.236565,0.634542,1.269084,0.050159,1.368146,0.338065
0,(Smoking),(HeartDisease),0.492639,0.5,0.292935,0.594624,1.189247,0.046615,1.233422,0.313646
6,(HeartDisease),(isMale),0.5,0.527801,0.294798,0.589596,1.117079,0.030897,1.15057,0.209616
1,(HeartDisease),(Smoking),0.5,0.492639,0.292935,0.585869,1.189247,0.046615,1.225124,0.318264
14,(Smoking),(isMale),0.492639,0.527801,0.288459,0.585539,1.109394,0.028444,1.13931,0.194353
19,(BMI_24_30),(isMale),0.429639,0.527801,0.25116,0.584584,1.107584,0.024396,1.136689,0.170302
20,(MentalNotHealth),(isFemale),0.347952,0.472199,0.202755,0.582708,1.23403,0.038452,1.264824,0.290849
13,(PhysicalNotHealth),(Smoking),0.372813,0.492639,0.2086,0.55953,1.135781,0.024938,1.151862,0.19061
7,(isMale),(HeartDisease),0.527801,0.5,0.294798,0.55854,1.117079,0.030897,1.132604,0.221958
15,(isMale),(Smoking),0.527801,0.492639,0.288459,0.546531,1.109394,0.028444,1.118843,0.208825


## Separate data into three hospitals

In [6]:
df_1 = pd.read_csv('../data/clean/heart/heart_part1.csv')
df_1 = df_1.drop(columns=['SleepTime_6_8'])
y_1 = df_1['HeartDisease'].to_numpy()
df_sample_1, y_sample = undersample.fit_resample(df_1, y_1)
df_apriori_1 = apriori(df_sample_1, min_support=0.2, use_colnames=True)
df_association_1 = association_rules(df_apriori_1, metric="confidence", min_threshold=0.45)

# sort by confidence
df_association_1.sort_values(by='confidence', ascending=False, inplace=True)
df_association_1.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5,(Smoking),(isMale),0.351504,0.595865,0.25188,0.716578,1.202584,0.042431,1.425911,0.259766
7,(BMI_24_30),(isMale),0.434211,0.595865,0.302632,0.69697,1.169678,0.043901,1.333647,0.256392
3,(HeartDisease),(isMale),0.5,0.595865,0.336466,0.672932,1.129338,0.038534,1.235632,0.22905
0,(Smoking),(HeartDisease),0.351504,0.5,0.223684,0.636364,1.272727,0.047932,1.375,0.330435
4,(isMale),(HeartDisease),0.595865,0.5,0.336466,0.564669,1.129338,0.038534,1.148551,0.283383
2,(BMI_24_30),(HeartDisease),0.434211,0.5,0.229323,0.528139,1.056277,0.012218,1.059633,0.094167
6,(isMale),(BMI_24_30),0.595865,0.434211,0.302632,0.507886,1.169678,0.043901,1.149713,0.358948
1,(HeartDisease),(BMI_24_30),0.5,0.434211,0.229323,0.458647,1.056277,0.012218,1.045139,0.106557


In [8]:
df_2 = pd.read_csv('../data/clean/heart/heart_part1.csv')
df_2 = df_2.drop(columns=['SleepTime_6_8'])
y_2 = df_2['HeartDisease'].to_numpy()
df_sample_2, y_sample = undersample.fit_resample(df_2, y_2)
df_apriori_2 = apriori(df_sample_2, min_support=0.2, use_colnames=True)
df_association_2 = association_rules(df_apriori_2, metric="confidence", min_threshold=0.45)

# sort by confidence
df_association_2.sort_values(by='confidence', ascending=False, inplace=True)
df_association_2.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5,(Smoking),(isMale),0.332707,0.593985,0.234962,0.706215,1.188944,0.03734,1.382013,0.238152
7,(BMI_24_30),(isMale),0.43609,0.593985,0.302632,0.693966,1.168322,0.043601,1.326697,0.255487
3,(HeartDisease),(isMale),0.5,0.593985,0.336466,0.672932,1.132911,0.039474,1.241379,0.234637
0,(Smoking),(HeartDisease),0.332707,0.5,0.223684,0.672316,1.344633,0.057331,1.525862,0.384093
4,(isMale),(HeartDisease),0.593985,0.5,0.336466,0.566456,1.132911,0.039474,1.153285,0.288951
2,(BMI_24_30),(HeartDisease),0.43609,0.5,0.229323,0.525862,1.051724,0.011278,1.054545,0.087213
6,(isMale),(BMI_24_30),0.593985,0.43609,0.302632,0.509494,1.168322,0.043601,1.149648,0.354842
1,(HeartDisease),(BMI_24_30),0.5,0.43609,0.229323,0.458647,1.051724,0.011278,1.041667,0.098361


In [7]:
df_3 = pd.read_csv('../data/clean/heart/heart_part1.csv')
df_3 = df_3.drop(columns=['SleepTime_6_8'])
y_3 = df_3['HeartDisease'].to_numpy()
df_sample_3, y_sample = undersample.fit_resample(df_3, y_3)
df_apriori_3 = apriori(df_sample_3, min_support=0.2, use_colnames=True)
df_association_3 = association_rules(df_apriori_3, metric="confidence", min_threshold=0.45)

# sort by confidence
df_association_3.sort_values(by='confidence', ascending=False, inplace=True)
df_association_3.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
5,(Smoking),(isMale),0.332707,0.584586,0.238722,0.717514,1.227388,0.044226,1.470564,0.277631
7,(BMI_24_30),(isMale),0.43609,0.584586,0.295113,0.676724,1.157612,0.04018,1.285013,0.241444
3,(HeartDisease),(isMale),0.5,0.584586,0.336466,0.672932,1.151125,0.044173,1.270115,0.26257
0,(Smoking),(HeartDisease),0.332707,0.5,0.223684,0.672316,1.344633,0.057331,1.525862,0.384093
4,(isMale),(HeartDisease),0.584586,0.5,0.336466,0.575563,1.151125,0.044173,1.17803,0.316034
2,(BMI_24_30),(HeartDisease),0.43609,0.5,0.229323,0.525862,1.051724,0.011278,1.054545,0.087213
6,(isMale),(BMI_24_30),0.584586,0.43609,0.295113,0.504823,1.157612,0.04018,1.138805,0.327752
1,(HeartDisease),(BMI_24_30),0.5,0.43609,0.229323,0.458647,1.051724,0.011278,1.041667,0.098361
