In [127]:
import os
from typing import List, Tuple

import pandas as pd
from apyori import apriori

from helper_functions import load_dataset

### Loading the cleaned dataset

In [128]:
# Stays None when the spreadsheet cannot be found, so later cells can detect the failed load.
transactions: List[List] = None
try:
    data: pd.DataFrame = load_dataset('../data/assignment1_income_levels_cleaned.xlsx')
except FileNotFoundError:
    print('File not found')
else:
    # Every cell value is cast to str and each row becomes one transaction (a list of strings).
    transactions = data.astype(str).values.tolist()

## 2) Search for Association Rules 

### Function to run the apriori algorithm

In [129]:
def run_apriori(transactions: List[List], min_support: float, min_confidence: float, min_length: int=2, sorted_by_support: bool=True, consequence:List[str]=None):
    """
    Runs the apriori algorithm and writes the resulting rules to a txt file.

    The output file is '../output/output_<f>_<min_support>_<min_confidence>_<min_length>.txt',
    where <f> is 1 if a consequence filter was given and 0 otherwise. The output directory
    is created if it does not exist yet. For every itemset one block is written: an
    "Items: (...)" header followed by one line per rule (antecedent => consequent with
    support, confidence and lift) and a blank line.

    :param transactions: dataset as a list of lists
    :param min_support: minimum support of relations
    :param min_confidence: minimum confidence of relations
    :param min_length: minimum number of items in an itemset; values below 2 are treated as 2,
        because a rule needs a non-empty antecedent and a non-empty consequent
    :param sorted_by_support: True if the apriori results should be sorted by support (descending), False otherwise
    :param consequence: list of strings; if given, only rules whose consequent (right side)
        equals one of these strings are written, and itemsets without such a rule are omitted
    :return: None
    :raises ValueError: if transactions is None (e.g. the dataset could not be loaded)
    """
    if transactions is None:
        raise ValueError('transactions is None; was the dataset loaded successfully?')

    file_name: str = f'../output/output_{str(int(consequence is not None))}_{min_support}_{min_confidence}_{min_length}.txt'
    results: List[Tuple] = list(apriori(transactions, min_support=min_support, min_confidence=min_confidence, min_length=min_length))
    if sorted_by_support:
        results = sorted(results, key=lambda x: x.support, reverse=True) # this means our final output will be sorted by support too

    # NOTE(review): apyori documents only `max_length`; the `min_length` keyword passed above
    # appears to be ignored by the library, so the minimum itemset size is enforced here.
    required_items: int = max(min_length, 2)

    try:
        os.makedirs(os.path.dirname(file_name), exist_ok=True)
        with open(file_name, 'w') as file:
            for rule in results:
                if len(rule.items) < required_items:
                    continue

                rule_lines: List[str] = [] # one formatted line per rule that passes the filters
                for statistic in rule.ordered_statistics:
                    antecedent: str = ', '.join(statistic.items_base) # "left side" of the rule
                    consequent: str = ', '.join(statistic.items_add) # "right side" of the rule

                    # skip rules with empty antecedent or consequent
                    if len(antecedent) == 0 or len(consequent) == 0:
                        continue

                    # if we are looking for a specific consequence, we skip the rules that do not have it
                    if consequence is not None and consequent not in consequence:
                        continue

                    # floats are converted to strings and cut to at most 7 characters
                    support: str = str(rule.support)[:7]
                    confidence: str = str(statistic.confidence)[:7]
                    lift: str = str(statistic.lift)[:7]

                    rule_lines.append(f"\t{antecedent} => {consequent} (support: {support}, confidence: {confidence}, lift: {lift})\n")

                # with a consequence filter, itemsets without any matching rule are left out entirely
                if consequence is not None and not rule_lines:
                    continue

                file.write(f"Items: ({', '.join(rule.items)})\n" + ''.join(rule_lines) + '\n')
    except OSError as e:
        # only I/O problems are reported here; programming errors are allowed to propagate
        print(f'Could not write {file_name}: {e}')

### a) Running the apriori algorithm

Here, we experiment with the algorithm by running it with several combinations of the "minimum support" and "minimum confidence" thresholds.

In [130]:
# Settings shared by all runs in this notebook
min_length: int = 2
sorted_by_support: bool = True

# (min_support, min_confidence) pairs to try
supp_conf_values: List[Tuple] = [
    (0.2, 0.95),
    (0.6, 0.95),
    (0.8, 0.9),
    (0.8, 0.2),
    (0.6, 0.5),
    (0.2, 0.2),
]

# One output file is produced per threshold pair
for min_support, min_confidence in supp_conf_values:
    run_apriori(
        transactions,
        min_support,
        min_confidence,
        min_length=min_length,
        sorted_by_support=sorted_by_support,
    )

### b) Running the apriori algorithm

Here, we extract only the rules whose consequent (right-hand side) is “sex = Male” or “sex = Female”.

In [131]:
# (min_support, min_confidence) pairs to try for the filtered runs
supp_conf_values = [
    (0.2, 0.8),
    (0.8, 0.95),
    (0.9, 0.95),
    (0.2, 0.1),
]
# Passed to run_apriori as the consequence filter
consequence: List[str] = ["Male", "Female"]

for min_support, min_confidence in supp_conf_values:
    run_apriori(
        transactions,
        min_support,
        min_confidence,
        min_length=min_length,
        sorted_by_support=sorted_by_support,
        consequence=consequence,
    )