# Association Rule Mining

In [1]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

# ignore warning
import warnings
warnings.filterwarnings('ignore')

## Dataset

### Read Dataset

In [2]:
df = pd.read_csv('../data/clean/heart/heart_all.csv')
df = df.drop(columns=['Race', 'SleepTime_6_8'])
df.shape

(319795, 27)

### Balance Dataset

In [3]:
y = df['HeartDisease'].to_numpy()
undersample = RandomUnderSampler(sampling_strategy='majority')
df_sample, y_sample = undersample.fit_resample(df, y)
print(df_sample.shape)
df_sample.head()

(54746, 27)


Unnamed: 0,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Diabetic,PhysicalActivity,Asthma,KidneyDisease,SkinCancer,...,isFemale,Age_18_29,Age_30_39,Age_40_49,Age_50_59,Age_60_69,Age_70_79,Age_80,SleepTime_0_6,SleepTime_8
0,0,1,0,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,1,...,1,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,1,0,0,0,0,0
4,0,1,0,0,0,0,0,1,0,0,...,1,0,1,0,0,0,0,0,0,0


In [15]:
def apriori(df: pd.DataFrame, min_support: float, min_confidence: float):
    '''
    Association Rule Mining with Apriori Algorithm
    Without using any library
    '''

    # get all unique items
    unique_items = set(df.columns)

    # calculate support
    support_candidate = []
    support_remaining = []
    for item in unique_items:
        # Tag: apply smc to get support
        support = df[item].sum() / df.shape[0]
        if support >= min_support:
            support_candidate.append([set([item]), support])
            support_remaining.append([set([item]), support])
    
    while len(support_candidate) != 0:
        cur_subset = support_candidate.pop(0)[0]
        for item in unique_items:
            if item not in cur_subset:
                new_subset = cur_subset | set([item])
                # Tag: apply smc to get support
                support = df[list(new_subset)].all(axis=1).sum() / df.shape[0]
                if support >= min_support:
                    support_candidate.append([new_subset, support])
                    support_remaining.append([new_subset, support])
    
    # print(support_remaining)

    # calculate confidence
    # result format: [antecedent, consequent, support, confidence]
    result = []
    for antecedent in support_remaining:
        for consequent in support_remaining:
            antecedent_set = antecedent[0]
            consequent_set = consequent[0]

            if antecedent_set.issubset(consequent_set):
                continue
            if consequent_set.issubset(antecedent_set):
                continue
            
            union_set = antecedent_set | consequent_set
            # Tag: apply smc to get support
            support_union = df[list(union_set)].all(axis=1).sum() / df.shape[0]

            if support_union < min_support:
                continue

            # Tag: apply smc to get support
            support_antecedent = df[list(antecedent_set)].all(axis=1).sum() / df.shape[0]
            
            confidence = support_union / support_antecedent

            if confidence >= min_confidence:
                result.append([antecedent_set, consequent_set, support_union, confidence])
    
    # print(result)

    df_result = pd.DataFrame(result, columns=['antecedent', 'consequent', 'support', 'confidence'])
    return df_result

In [16]:
df_result = apriori(df_sample, 0.2, 0.5)
df_result.head(20)

Unnamed: 0,antecedent,consequent,support,confidence
0,{HeartDisease},{Smoking},0.292935,0.585869
1,{HeartDisease},{isMale},0.294798,0.589596
2,{BMI_24_30},{isMale},0.247854,0.579278
3,{Smoking},{HeartDisease},0.292935,0.598083
4,{Smoking},{isMale},0.286633,0.585217
5,{PhysicalNotHealth},{HeartDisease},0.236565,0.635913
6,{PhysicalNotHealth},{Smoking},0.206536,0.55519
7,{isMale},{HeartDisease},0.294798,0.560693
8,{isMale},{Smoking},0.286633,0.545164
