# Apriori Algorithms

In [1]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
from imblearn.under_sampling import RandomUnderSampler

# ignore warning
import warnings
warnings.filterwarnings('ignore')

In [8]:
df = pd.read_csv('../data/clean/heart/heart_all.csv')
df = df.drop(columns=['Race', 'SleepTime_6_8'])
df.shape

(319795, 27)

### Balance Data

In [9]:
y = df['HeartDisease'].to_numpy()
undersample = RandomUnderSampler(sampling_strategy='majority')
df_sample, y_sample = undersample.fit_resample(df, y)
print(df_sample.shape)
df_sample.head()

(54746, 27)


Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,...,isFemale,Age_18_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80,SleepTime_0_6,SleepTime_8
0,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
3,0,1,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,0,1,1,0,0,0,1,0,0,1,...,1,0,0,0,1,0,0,0,0,0


In [10]:
df_apriori = apriori(df_sample, min_support=0.2, use_colnames=True)
df_apriori.to_csv('../data/clean/heart/heart_apriori.csv', index=False)
df_apriori.head(20)

Unnamed: 0,support,itemsets
0,0.5,(HeartDisease)
1,0.489844,(Smoking)
2,0.240986,(DiffWalking)
3,0.222957,(Diabetic)
4,0.284496,(PhysicalActivity)
5,0.203686,(BMI_18.5_24)
6,0.427922,(BMI_24_30)
7,0.352793,(BMI_30_100)
8,0.372904,(PhysicalNotHealth)
9,0.346235,(MentalNotHealth)


In [11]:
df_association= association_rules(df_apriori, metric="confidence", min_threshold=0.5)
df_association.to_csv('../data/clean/association_rules.csv', index=False)

df_association.sort_values(by='confidence', ascending=False, inplace=True)
df_association.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
2,(PhysicalNotHealth),(HeartDisease),0.372904,0.5,0.236565,0.634386,1.268773,0.050113,1.367564,0.337806
0,(Smoking),(HeartDisease),0.489844,0.5,0.292935,0.598016,1.196032,0.048013,1.243831,0.321279
3,(HeartDisease),(isMale),0.5,0.529683,0.294798,0.589596,1.113111,0.029957,1.145985,0.203234
1,(HeartDisease),(Smoking),0.5,0.489844,0.292935,0.585869,1.196032,0.048013,1.231872,0.327804
8,(BMI_24_30),(isMale),0.427922,0.529683,0.250703,0.585862,1.106063,0.024041,1.135655,0.167622
6,(Smoking),(isMale),0.489844,0.529683,0.286742,0.585375,1.105143,0.027281,1.13432,0.186491
9,(MentalNotHealth),(isFemale),0.346235,0.470317,0.200325,0.578581,1.230192,0.037485,1.256902,0.286217
4,(isMale),(HeartDisease),0.529683,0.5,0.294798,0.556556,1.113111,0.029957,1.127537,0.216061
5,(PhysicalNotHealth),(Smoking),0.372904,0.489844,0.206499,0.553759,1.130481,0.023834,1.143231,0.184056
7,(isMale),(Smoking),0.529683,0.489844,0.286742,0.541348,1.105143,0.027281,1.112293,0.202288


## Separate data into three hospitals

In [18]:
df_1 = pd.read_csv('../data/clean/heart/heart_part1.csv')
df_1 = df_1.drop(columns=['SleepTime_6_8'])
y_1 = df_1['HeartDisease'].to_numpy()
df_sample_1, y_sample = undersample.fit_resample(df_1, y_1)
df_apriori_1 = apriori(df_sample_1, min_support=0.2, use_colnames=True)
df_association_1 = association_rules(df_apriori_1, metric="confidence", min_threshold=0.5)

# sort by confidence
df_association_1.sort_values(by='confidence', ascending=False, inplace=True)
df_association_1.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
4,(Smoking),(isMale),0.31203,0.599624,0.227444,0.728916,1.215621,0.040343,1.476942,0.257824
0,(Smoking),(HeartDisease),0.31203,0.5,0.223684,0.716867,1.433735,0.067669,1.765957,0.43973
5,(BMI_24_30),(isMale),0.421053,0.599624,0.300752,0.714286,1.191223,0.048279,1.401316,0.277273
2,(HeartDisease),(isMale),0.5,0.599624,0.336466,0.672932,1.122257,0.036654,1.224138,0.217877
3,(isMale),(HeartDisease),0.599624,0.5,0.336466,0.561129,1.122257,0.036654,1.139286,0.272091
1,(BMI_24_30),(HeartDisease),0.421053,0.5,0.229323,0.544643,1.089286,0.018797,1.098039,0.14158
6,(isMale),(BMI_24_30),0.599624,0.421053,0.300752,0.501567,1.191223,0.048279,1.161536,0.400939


In [15]:
df_2 = pd.read_csv('../data/clean/heart/heart_part1.csv')
df_2 = df_2.drop(columns=['SleepTime_6_8'])
y_2 = df_2['HeartDisease'].to_numpy()
df_sample_2, y_sample = undersample.fit_resample(df_2, y_2)
df_apriori_2 = apriori(df_sample_2, min_support=0.2, use_colnames=True)
df_association_2 = association_rules(df_apriori_2, metric="confidence", min_threshold=0.5)

# sort by confidence
df_association_2.sort_values(by='confidence', ascending=False, inplace=True)
df_association_2.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
4,(Smoking),(isMale),0.328947,0.578947,0.236842,0.72,1.243636,0.046399,1.503759,0.291939
0,(Smoking),(HeartDisease),0.328947,0.5,0.223684,0.68,1.36,0.059211,1.5625,0.394464
5,(BMI_24_30),(isMale),0.428571,0.578947,0.289474,0.675439,1.166667,0.041353,1.297297,0.25
2,(HeartDisease),(isMale),0.5,0.578947,0.336466,0.672932,1.162338,0.046992,1.287356,0.27933
3,(isMale),(HeartDisease),0.578947,0.5,0.336466,0.581169,1.162338,0.046992,1.193798,0.331704
1,(BMI_24_30),(HeartDisease),0.428571,0.5,0.229323,0.535088,1.070175,0.015038,1.075472,0.114754
6,(isMale),(BMI_24_30),0.578947,0.428571,0.289474,0.5,1.166667,0.041353,1.142857,0.339286


In [16]:
df_3 = pd.read_csv('../data/clean/heart/heart_part1.csv')
df_3 = df_3.drop(columns=['SleepTime_6_8'])
y_3 = df_3['HeartDisease'].to_numpy()
df_sample_3, y_sample = undersample.fit_resample(df_3, y_3)
df_apriori_3 = apriori(df_sample_3, min_support=0.2, use_colnames=True)
df_association_3 = association_rules(df_apriori_3, metric="confidence", min_threshold=0.5)

# sort by confidence
df_association_3.sort_values(by='confidence', ascending=False, inplace=True)
df_association_3.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
4,(Smoking),(isMale),0.345865,0.629699,0.253759,0.733696,1.165152,0.035969,1.390517,0.216688
5,(BMI_24_30),(isMale),0.488722,0.629699,0.343985,0.703846,1.11775,0.036237,1.250366,0.206043
1,(HeartDisease),(isMale),0.5,0.629699,0.336466,0.672932,1.068657,0.021617,1.132184,0.128492
0,(Smoking),(HeartDisease),0.345865,0.5,0.223684,0.646739,1.293478,0.050752,1.415385,0.346856
3,(Smoking),(BMI_24_30),0.345865,0.488722,0.204887,0.592391,1.212124,0.035856,1.254336,0.267531
6,(isMale),(BMI_24_30),0.629699,0.488722,0.343985,0.546269,1.11775,0.036237,1.12683,0.284486
2,(isMale),(HeartDisease),0.629699,0.5,0.336466,0.534328,1.068657,0.021617,1.073718,0.173496
