# Assignment - Association Rule Mining

In [1]:
import json
import time
from functools import reduce
import pandas as pd
from itertools import combinations
from operator import itemgetter
from itertools import islice
from itertools import permutations

# Creating the updated CSV File

In [2]:
print("Creating the updated data set and the CSV file ... ")

# Read 2020 and 2021 VAERS Data csv and combine them
a = pd.read_csv("2020VAERSDATA.csv", encoding="ISO-8859-1", engine='python')
b = pd.read_csv("2021VAERSDATA.csv", encoding="ISO-8859-1", engine='python')
VAERSData = pd.concat([a, b], sort=False).drop_duplicates().reset_index(drop=True)

# Read 2020 and 2021 VAERS Vax csv and combine them
a = pd.read_csv("2020VAERSVAX.csv", encoding="ISO-8859-1", engine='python')
b = pd.read_csv("2021VAERSVAX.csv", encoding="ISO-8859-1", engine='python')
VAERSVax = pd.concat([a, b], sort=False).drop_duplicates(subset='VAERS_ID').reset_index(drop=True)
# Remove all non Covid rows
VAERSVax = VAERSVax[VAERSVax.VAX_TYPE.eq("COVID19")]

# Read 2020 and 2021 VAERS Symptoms csv and combine them
a = pd.read_csv("2020VAERSSYMPTOMS.csv", encoding="ISO-8859-1", engine='python')
b = pd.read_csv("2021VAERSSYMPTOMS.csv", encoding="ISO-8859-1", engine='python')
VAERSSymptoms = pd.concat([a, b], sort=False)

# Combine all 3 datasets
updatedData = reduce(lambda x, y: pd.merge(x, y, on='VAERS_ID', how='outer', sort=False),
                   [VAERSData, VAERSVax, VAERSSymptoms])
# Remove all non Covid rows again
updatedData = updatedData[updatedData.VAX_TYPE.eq("COVID19")]

# Create a CSV file of the updated dataset
updatedData.to_csv('VAERSDataNov15_21.csv', index=False)

Creating the updated data set and the CSV file ... 


In [3]:
# Convert Dataframe to JSON
updatedDataJSON = json.loads(updatedData.to_json(orient="records"))

# Data Formatting and Processing

## Ensure no duplicate VAERS_ID

In [4]:
# Dictionary(HashMap) to store data
hashMap = {}

# Loop through the data and store it in a hashmap
for row in updatedDataJSON:
    vaersId = row["VAERS_ID"]
    # If ID is already in the HashMap, just update the symptoms for it
    if vaersId in hashMap.keys():
        # Update the existing object with the additional symptoms
        obj = hashMap[vaersId]
        finalSymptoms = obj["Symptoms"]
        # Go through all symptoms and create a new symptoms array
        newSymptoms = []
        for x in range(0, 5):
            if row["SYMPTOM%d" % (x + 1)] is not None:
                newSymptoms.append(json.loads(
                    '{ "SymptomName": "%s", "SymptomVersion": "%s"}' % (
                        row["SYMPTOM%d" % (x + 1)], row["SYMPTOMVERSION%d" % (x + 1)])))
        # Append the arrays and update the hashmap
        finalSymptoms.extend(newSymptoms)
        obj["Symptoms"] = finalSymptoms
        hashMap[vaersId] = obj
    # If ID is not in Hashmap, create a new entry
    else:
        # Create an array of symptoms by going through all symptoms row
        newSymptoms = []
        for x in range(0, 5):
            if row["SYMPTOM%d" % (x + 1)] is not None:
                newSymptoms.append(json.loads(
                    '{ "SymptomName": "%s", "SymptomVersion": "%s"}' % (
                        row["SYMPTOM%d" % (x + 1)], row["SYMPTOMVERSION%d" % (x + 1)])))
        # Create a Covid Object with all the data
        for x in range(0, 5):
            del row['SYMPTOM%d' % (x + 1)]
            del row['SYMPTOMVERSION%d' % (x + 1)]
        obj = row
        obj["Symptoms"] = newSymptoms
        hashMap[vaersId] = obj

## Only use relevant fields

In [5]:
jsonForTask1 = []
for key in hashMap:
    # For every JSON object, we want to loop through the symptoms and create a more flattened JSON
    obj = hashMap[key]
    newObj = {}
    newObj["VAERS_ID"] = obj["VAERS_ID"]
    newObj["VAX_MANU"] = obj["VAX_MANU"]
    newObj["RECVDATE"] = obj["RECVDATE"]
    
    # Age is a range - to make it easier when trying to find association
    age = obj["AGE_YRS"]
    newAge = ""
    if age is None:
        newAge = "N/A"
    elif age <= 1:
        newAge = "<1"
    elif age < 3:
        newAge = "1-3"
    elif age < 12:
        newAge = "4-11"
    elif age < 19:
        newAge = "12-18"
    elif age < 31:
        newAge = "19-30"
    elif age < 41:
        newAge = "31-40"
    elif age < 51:
        newAge = "41-50"
    elif age < 61:
        newAge = "51-60"
    elif age < 71:
        newAge = "61-70"
    elif age < 80:
        newAge = "71-80"
    else:
        newAge = ">80"
        
    newObj["AGE_YRS"] = newAge
    newObj["SEX"] = obj["SEX"]
    newObj["DIED"] = obj["DIED"]
    newObj["DATEDIED"] = obj["DATEDIED"]
    newObj["VAX_DATE"] = obj["VAX_DATE"]
    newObj["no_of_symptoms"] = len(obj["Symptoms"])
    for count, symptom in enumerate(obj["Symptoms"], start=1):
        newObj["symptom_%d" % (count)] = symptom['SymptomName']
    newJson = json.loads(json.dumps(newObj))
    jsonForTask1.append(newJson)

# Convert this JSON into a Dataset
VAERSData_ML = pd.DataFrame(jsonForTask1)

## Transactional Dataset

In [6]:
# TRANSACTION DATASET WITH JUST THE SYMPTOMS
SymptomsDataset = []
for jsonObj in jsonForTask1:
    singleSet = []
    for key, value in jsonObj.items():
        if key.startswith("symptom"):
            singleSet.append(str(value))
        else:
            pass
    SymptomsDataset.append(singleSet)

In [7]:
allGenders = ['F', 'M', 'U']
allAgeGroups = ['N/A', '<1', '1-3', '4-11', '12-18', '19-30', '31-40', '41-50', '51-60', '61-70', '71-80', '>80']
allVaccines = ['PFIZER\\BIONTECH', 'MODERNA', 'JANSSEN', 'UNKNOWN MANUFACTURER']

# Apriori Algorithm

In [8]:
# Apriori Algorithm
    
def getFrequentItems(transactions, minAmount):
    # Count all items first
    itemCount = {}
    for transaction in transactions:
        for item in transaction:
            if item in itemCount:
                itemCount[item] = itemCount[item] + 1
            else:
                itemCount[item] = 1
    
    # Only grab items that are above the minimum required amount
    itemSetCountDescendindOrder = dict(sorted(itemCount.items(), key=itemgetter(1),reverse=True))
    indexToSlice = len(itemCount)
    for index, (key, value) in enumerate(itemSetCountDescendindOrder.items()):
        if value < minAmount:
            indexToSlice = index
            break
    newItemSet = dict(islice(itemSetCountDescendindOrder.items(), indexToSlice))
    return set(newItemSet)


def createC1(transactions, minAmount):
    # Only use the items that show up more than the minimum amount to reduce computational power needed
    frequentItems = getFrequentItems(transactions, minAmount)
    C1 = []
    for item in frequentItems:
        C1.append([item])
    # Frozenset because we will use them as keys in a dcit
    return list(map(frozenset, C1))

    
def candidateGen(Lk, k):
    # Generate candidate itemsets
    Ck = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i+1, lenLk): 
            L1 = list(Lk[i])[:k-2] 
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            # If the first k-2 elements are equal, then we do union (Fk−1 × Fk−1 Method)
            if L1 == L2:
                Ck.append(Lk[i] | Lk[j])
    return Ck


def scanAndPrune(Dataset, Ck, minSupport):
    # Count the item (to calculate support)
    frequency = {}
    for transaction in Dataset:
        for item in Ck:
            if item.issubset(transaction):
                if item not in frequency: 
                    frequency[item] = 1
                else: 
                    frequency[item] += 1
                    
    # Total number of transactions
    N = float(len(Dataset))
    Lk = []
    
    # Track support - going to return it
    supportData = {}
    for key in frequency:
        # Calculate support --> frequency(A,B)/N
        support = frequency[key]/N
        # Prune this candidate if it doesn't passes the minimum support
        if support >= minSupport:
            Lk.insert(0,key)
        supportData[key] = support
    return Lk, supportData
       
    
def apriori(transactions, minSupport, minAmount = 1000):        
        # Get C1 - Only items that show up more than min amount - in order to reduce computational power needed
        C1 = createC1(transactions, minAmount)
        
        # This is the dataset in the setform
        Dataset = list(map(set,transactions))
    
        # Get L1 using C1
        # Track support
        L1, support = scanAndPrune(Dataset, C1, minSupport)
        
        L = [L1]
        k = 2
        
        # We need to keep generating and pruning till we can't create subsets anymore
        while (len(L[k-2]) > 0):
            # Generate candidate itemsets
            Ck = candidateGen(L[k-2], k)
            # Prune candidate itemsets
            Lk, supportK = scanAndPrune(Dataset, Ck, minSupport)
            support.update(supportK)
            L.append(Lk)
            k += 1
            
        result = pd.DataFrame(list(support.items()), columns = ["Items","Support"])
        return(L, support)
    

def mineRules(support, minConfidence = 0.5):
    # Make sure frozenset is transformed into tuples
    support =  {tuple(k): v for k, v in support.items()}
    data = []
    L = list(support.keys())
    
    # Generate the rules using permutation
    # Permutation because order matters A --> B is different from B --> A
    ruleList = list(permutations(L, 2))
    
    # Iterating through each rule
    for rule in ruleList:
        # If the left hand side (antecedent) of rule is subset of the right hand side (consequent) then it is a valid rule.
        if set(rule[0]).issubset(rule[1]):
            # Calculate Confidence --> frequency(A,B)/frequency(A)
            confidence = support[rule[1]]/support[rule[0]]
            
            # If confidence is greater than the minimum required, then we can use it
            if confidence > minConfidence:
                consequent = rule[1][not rule[1].index(rule[0][0])]
                # Calculate lift --> Support/Support(A) x Support(B)
                lift = support[rule[1]]/(support[rule[0]]* support[(consequent,)])
                data.append([rule[0], (consequent,), support[rule[0]], support[(consequent,)], support[rule[1]], confidence, lift])

    # Put result in a dataframe - visualization
    result = pd.DataFrame(data, columns = ["Antecedents", "Consequents", "Antecedent support", "Consequent Support", "Support", "Confidence", "Lift"])
    return(result)

# Using the Apriori Algorithm on Symptoms

## Get Frequent itemsets

In [12]:
tic = time.perf_counter()
L, support = apriori(SymptomsDataset, minSupport = 0.01, minAmount = 500)
frequentItemsetsTable = pd.DataFrame(list(support.items()), columns = ["Items","Support"])
toc = time.perf_counter()
print(f"Frequent itemsets generation in {toc - tic:0.4f} seconds")
frequentItemsetsTable

Frequent itemsets generation in 199.8817 seconds


Unnamed: 0,Items,Support
0,(Hypoaesthesia),0.031785
1,(Injection site hypoaesthesia),0.001023
2,(Headache),0.181453
3,(Feeling hot),0.017603
4,(Flushing),0.013290
...,...,...
3915,"(Fatigue, Arthralgia, Headache, Pain)",0.003406
3916,"(Fatigue, Arthralgia, Headache, Dizziness)",0.002002
3917,"(Chills, Fatigue, Headache, Nausea, Pain)",0.004174
3918,"(Chills, Fatigue, Headache, Pyrexia, Pain)",0.007414


## Mine frequent itemsets for rules

In [20]:
tic = time.perf_counter()
generatedRules = mineRules(support, minConfidence = 0.6)
toc = time.perf_counter()
print(f"Rules generated in {toc - tic:0.4f} seconds")
with pd.option_context('display.max_rows', None, 'display.max_columns', None, 'display.precision', 4,):display(generatedRules.sort_values(by='Support', ascending=False))

Rules generated in 6.7708 seconds


Unnamed: 0,Antecedents,Consequents,Antecedent support,Consequent Support,Support,Confidence,Lift
9,"(Chills, Pain)","(Pyrexia,)",0.0518,0.1527,0.0324,0.6266,4.1023
3,"(SARS-CoV-2 test positive,)","(COVID-19,)",0.028,0.0379,0.0236,0.8416,22.1782
4,"(COVID-19,)","(SARS-CoV-2 test positive,)",0.0379,0.028,0.0236,0.6218,22.1782
6,"(Chills, Nausea)","(Headache,)",0.0355,0.1815,0.0215,0.606,3.3397
0,"(Injection site swelling,)","(Injection site erythema,)",0.0336,0.0413,0.0207,0.6148,14.8975
1,"(Injection site pruritus,)","(Injection site erythema,)",0.0289,0.0413,0.0189,0.6535,15.8361
33,"(Headache, Pyrexia, Pain)","(Chills,)",0.0296,0.1326,0.018,0.608,4.5851
32,"(Chills, Headache, Pain)","(Pyrexia,)",0.0284,0.1527,0.018,0.6337,4.1491
11,"(Nausea, Pain)","(Headache,)",0.0262,0.1815,0.016,0.6083,3.3526
17,"(Myalgia, Chills)","(Chills,)",0.025,0.1326,0.0153,0.6097,4.5983


In [11]:
print("Combined dataset size: " + str(len(updatedDataJSON)))
print("Symptoms transactional dataset with no duplicates size: " + str(len(SymptomsDataset)))

Combined dataset size: 865056
Symptoms transactional dataset with no duplicates size: 643942
