# Association Rule Mining

In [2]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

# ignore warning
import warnings
warnings.filterwarnings('ignore')

## Dataset

### Read Dataset

In [3]:
df = pd.read_csv('../data/clean/heart/heart_all.csv')
df = df.drop(columns=['Race', 'SleepTime_6_8'])
df.shape

(319795, 27)

### Balance Dataset

In [4]:
y = df['HeartDisease'].to_numpy()
undersample = RandomUnderSampler(sampling_strategy='majority')
df_sample, y_sample = undersample.fit_resample(df, y)
print(df_sample.shape)
df_sample.head()

(54746, 27)


Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,...,isFemale,Age_18_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80,SleepTime_0_6,SleepTime_8
0,0,1,0,0,0,1,0,0,1,1,...,1,0,0,0,0,0,1,0,0,0
1,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
2,0,1,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0


In [5]:
def apriori(df: pd.DataFrame, min_support: float, min_confidence: float):
    '''
    Association Rule Mining with Apriori Algorithm
    Without using any library
    '''
    import time
    start_time = time.time()
    # get all unique items
    unique_items = set(df.columns)

    # calculate support
    support_candidate = []
    support_remaining = []
    remain_items = []
    for item in unique_items:
        # Tag: apply smc to get support
        support = df[item].sum() / df.shape[0]
        if support >= min_support:
            support_candidate.append([set([item]), support])
            support_remaining.append([set([item]), support])
            remain_items.append(item)
    

    
    cnt = 0
    while len(support_candidate) != 0:
        cur_subset = support_candidate.pop(0)[0]
        for item in remain_items:
            if item not in cur_subset:
                cnt += 1
                new_subset = cur_subset | set([item])
                # Tag: apply smc to get support
                support = df[list(new_subset)].all(axis=1).sum() / df.shape[0]
                if support >= min_support:
                    support_candidate.append([new_subset, support])
                    support_remaining.append([new_subset, support])
    
    print(cnt)
    print(support_remaining)

    # calculate confidence
    # result format: [antecedent, consequent, support, confidence]
    result = []
    for antecedent in support_remaining:
        for consequent in support_remaining:
            antecedent_set, support_antecedent = antecedent[0], antecedent[1]
            consequent_set = consequent[0]

            if antecedent_set.issubset(consequent_set):
                continue
            if consequent_set.issubset(antecedent_set):
                continue
            
            union_set = antecedent_set | consequent_set
            # Tag: apply smc to get support
            support_union = df[list(union_set)].all(axis=1).sum() / df.shape[0]

            if support_union < min_support:
                continue
            
            confidence = support_union / support_antecedent

            if confidence >= min_confidence:
                result.append([antecedent_set, consequent_set, support_union, confidence])
    
    # print(result)
    print('Elapsed Time:', time.time() - start_time)

    df_result = pd.DataFrame(result, columns=['antecedent', 'consequent', 'support', 'confidence'])
    return df_result

In [6]:
df_result = apriori(df_sample, 0.2, 0.5)
df_result.head(20)

446
[[{'Diabetic'}, 0.22321265480583055], [{'MentalNotHealth'}, 0.34744090892485296], [{'BMI_24_30'}, 0.4270996967815], [{'Smoking'}, 0.4921638110546889], [{'BMI_18.5_24'}, 0.2055492638731597], [{'PhysicalNotHealth'}, 0.3727212947064626], [{'BMI_30_100'}, 0.35182479085229973], [{'isFemale'}, 0.47561465677857745], [{'Age_60_69'}, 0.23842837832901034], [{'Age_70_79'}, 0.2353779271544953], [{'isMale'}, 0.5243853432214226], [{'HeartDisease'}, 0.5], [{'PhysicalActivity'}, 0.28582910166952835], [{'DiffWalking'}, 0.24091257808789684], [{'isFemale', 'MentalNotHealth'}, 0.20184123040952764], [{'Smoking', 'BMI_24_30'}, 0.21154056917400357], [{'isMale', 'BMI_24_30'}, 0.24794505534650935], [{'HeartDisease', 'BMI_24_30'}, 0.21334892046907536], [{'Smoking', 'BMI_24_30'}, 0.21154056917400357], [{'Smoking', 'PhysicalNotHealth'}, 0.2072114857706499], [{'Smoking', 'isFemale'}, 0.2065904358309283], [{'Smoking', 'isMale'}, 0.28557337522376064], [{'Smoking', 'HeartDisease'}, 0.29293464362693167], [{'Smokin

Unnamed: 0,antecedent,consequent,support,confidence
0,{MentalNotHealth},{isFemale},0.201841,0.580937
1,{BMI_24_30},{isMale},0.247945,0.580532
2,{Smoking},{isMale},0.285573,0.58024
3,{Smoking},{HeartDisease},0.292935,0.595197
4,{PhysicalNotHealth},{Smoking},0.207211,0.555942
5,{PhysicalNotHealth},{HeartDisease},0.236565,0.634697
6,{isMale},{Smoking},0.285573,0.544587
7,{isMale},{HeartDisease},0.294798,0.562178
8,{HeartDisease},{Smoking},0.292935,0.585869
9,{HeartDisease},{isMale},0.294798,0.589596
