# Homework 2 - association rules

In this homework we will mine association rules between the variables of the obesity dataset. For this notebook we will use the following formulas:

$$
\begin{align*}
    support(X) &= P(X) \\
            &= \frac{\text{number of instances containing X in the dataset}}{\text{total number of instances in the dataset}} \\
    \\
    support(X \rightarrow Y) &= P(X \cap Y) \\
                            &= support(X \cup Y) \\
                            &= \frac{\text{number of instances containing both X and Y in the dataset}}{\text{total number of instances in the dataset}} \\
    \\
    confidence(X \rightarrow Y) &= P(Y|X) = \frac{P(X \cap Y)}{P(X)} \\
                                &= \frac{support(X \cup Y)}{support(X)} \\
    \\
    lift(X \rightarrow Y) &= \frac{P(X \cap Y)}{P(X) \cdot P(Y)} \\
                          &= \frac{support(X \cup Y)}{support(X) \cdot support(Y)} \\
    \\
    conviction(X \rightarrow Y) &= \frac{1 - P(Y)}{1 - P(Y|X)} = \frac{1 - P(Y)}{1 - \frac{P(X \cap Y)}{P(X)}} \\
                                &= \frac{1 - support(Y)}{1 - confidence(X \rightarrow Y)} \\
\end{align*}
$$

We will use the Apriori and ECLAT algorithms.

## Conclusions

TBD, need to update.

## Dependencies

### General dependencies

Imports for Python

In [None]:
import typing as t
import csv
import numpy as np
import numpy.typing as npt
import pandas as pd
import matplotlib.pyplot as plt
import arulespy.arules as arules
import arulespy.arulesViz as arulesviz
from IPython.display import HTML

import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

### Dataset-specific dependencies

Dataset manager, known labels, known outputs for the dataset.

In [None]:
# Column names of the obesity dataset, grouped by kind.
NUMERICAL_VARIABLES = ["Age", "Height", "Weight", "FCVC", "NCP", "CH2O", "FAF", "TUE"]

# Target column; kept apart so feature-only lists can leave it out.
LABEL_VARIABLE = "NObeyesdad"

CATEGORICAL_VARIABLES_NO_LABEL = [
    "FAVC",
    "CAEC",
    "CALC",
    "SCC",
    "MTRANS",
    "Gender",
    "family_history_with_overweight",
    "SMOKE",
]
# Derived instead of retyped so the two categorical lists cannot drift apart.
CATEGORICAL_VARIABLES = [*CATEGORICAL_VARIABLES_NO_LABEL, LABEL_VARIABLE]

ALL_VARIABLES = [*NUMERICAL_VARIABLES, *CATEGORICAL_VARIABLES]
ALL_VARIABLES_NO_LABEL = [*NUMERICAL_VARIABLES, *CATEGORICAL_VARIABLES_NO_LABEL]

# Human-readable axis labels, keyed by column name.
# NOTE(review): the units in parentheses are kept as originally written;
# confirm them against the dataset documentation.
LABEL_DICTIONARY = {
    "Age": "Age",
    "Height": "Height (cm)",
    "Weight": "Weight (kg)",
    "FCVC": "Frequency of consumption of vegetables (times per day)",
    "NCP": "Number of main meals",
    "CH2O": "Consumption of water daily (Liters)",
    "FAF": "Physical activity frequency (times per day)",
    "TUE": "Time using technology devices (hours)",
    "FAVC": "Frequent consumption of high caloric food",
    "CAEC": "Consumption of food between meals",
    "CALC": "Consumption of alcohol",
    "SCC": "Calories consumption monitoring",
    "MTRANS": "Transportation used",
    "Gender": "Gender",
    "family_history_with_overweight": "Family member suffered or suffers from overweight",
    "SMOKE": "Smoker or not",
    "NObeyesdad": "Obesity level",
}

# Generic type variable for typed helpers.
T = t.TypeVar("T")


class Person:
    """One record of the obesity dataset, with one attribute per column.

    Values are stored exactly as passed to the constructor; nothing is
    converted or validated. The annotations describe the intended type of
    each column. Instances built from ``csv.DictReader`` rows (as
    ``DatasetManager.load_as_obj_list`` does) therefore hold ``str`` values
    in every attribute.
    """

    Gender: str
    Age: int
    Height: float
    Weight: float
    family_history_with_overweight: str
    FAVC: str
    FCVC: int
    NCP: int
    CAEC: str
    SMOKE: str
    CH2O: int
    SCC: str
    FAF: int  # matches the ``__init__`` annotation
    TUE: int
    CALC: str
    MTRANS: str
    NObeyesdad: str

    def __init__(
        self,
        Gender: str,
        Age: int,
        Height: float,
        Weight: float,
        family_history_with_overweight: str,
        FAVC: str,
        FCVC: int,
        NCP: int,
        CAEC: str,
        SMOKE: str,
        CH2O: int,
        SCC: str,
        FAF: int,
        TUE: int,
        CALC: str,
        MTRANS: str,
        NObeyesdad: str,
    ):
        """Store every column value on the instance under its column name."""
        self.Gender = Gender
        self.Age = Age
        self.Height = Height
        self.Weight = Weight
        self.family_history_with_overweight = family_history_with_overweight
        self.FAVC = FAVC
        self.FCVC = FCVC
        self.NCP = NCP
        self.CAEC = CAEC
        self.SMOKE = SMOKE
        self.CH2O = CH2O
        self.SCC = SCC
        self.FAF = FAF
        self.TUE = TUE
        self.CALC = CALC
        self.MTRANS = MTRANS
        self.NObeyesdad = NObeyesdad

    def __str__(self):
        """Return the attribute dictionary rendered as text."""
        # __str__ must return a str; returning the dict itself raises TypeError.
        return str(vars(self))

    def __len__(self):
        """Return the number of attributes stored on the instance."""
        return len(vars(self))

    def __repr__(self):
        """Return a constructor-like representation, e.g. ``Person(Gender='Male', ...)``."""
        # __repr__ must return a str as well; fields appear in assignment order.
        fields = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({fields})"


class DatasetManager:
    """Loads the dataset stored in a CSV file."""

    def __init__(self, path_to_csv: str):
        """Remember the path of the CSV file; the file is not opened here."""
        self.path_to_csv = path_to_csv

    def load_as_obj_list(self) -> list[Person]:
        """Read the whole CSV file and return one ``Person`` per data row.

        Each row is passed to ``Person`` as keyword arguments, so the CSV
        header must match the ``Person.__init__`` parameter names exactly.
        ``csv.DictReader`` yields strings, so every attribute of the
        resulting objects is a ``str``.
        """
        # newline="" lets the csv module handle line endings itself, as its
        # documentation requires for files passed to a reader.
        with open(self.path_to_csv, newline="") as csv_file:
            csv_reader = csv.DictReader(csv_file)
            return [Person(**row) for row in csv_reader]

In [None]:
# Load the raw CSV once, then expose it both as objects and as a dataframe.
dataset_manager = DatasetManager("data/ObesityDataSet.csv")
dataset_obj_list = dataset_manager.load_as_obj_list()
# vars() turns each Person into a {column: value} record for pandas.
dataset_dataframe = pd.DataFrame.from_records(data=list(map(vars, dataset_obj_list)))

### Categorical data utility functions

Here we add utility functions (if any) for the categorical data types.

In [None]:
# TODO: add if any are found

### Continuous data utility functions

Here we add utility functions (if any) for the continuous data types.

In [None]:
def bin_by_frequency(data: npt.NDArray[np.float32], bins: int = 30):
    """Split ``data`` into quantile-based bins using ``pandas.qcut``.

    Args:
        data: Numeric values to discretise.
        bins: Number of quantiles requested (``q`` of ``pandas.qcut``).

    Returns:
        Whatever ``pandas.qcut`` returns for ``data``: the bin assigned to
        each value. Duplicate bin edges are dropped, so fewer than ``bins``
        distinct bins may come back.
    """
    return pd.qcut(data, q=bins, duplicates="drop")

## Preliminary data analysis

Here we build plots for the data. Mostly for debugging purposes.

In [None]:
# One histogram per categorical feature (the target label is left out).
for label in CATEGORICAL_VARIABLES_NO_LABEL:
    pretty_label = LABEL_DICTIONARY[label]
    data = dataset_dataframe[label].astype(str)

    # Explicit Axes interface instead of the implicit pyplot state machine.
    fig, ax = plt.subplots()
    ax.hist(data)
    ax.set_xlabel(pretty_label)
    ax.set_ylabel("Count")
    plt.show()

In [None]:
# One bar chart per numerical feature, showing how many rows fall into each
# quantile bin produced by bin_by_frequency.
for label in NUMERICAL_VARIABLES:
    pretty_label = LABEL_DICTIONARY[label]
    data = dataset_dataframe[label].astype(np.float32)
    binned_data = bin_by_frequency(data=data, bins=10)

    fig, ax = plt.subplots()
    # value_counts() orders by frequency; sort_index() restores bin order so
    # the bars read left to right from the lowest to the highest interval.
    binned_data.value_counts().sort_index().plot(kind="bar", ax=ax, rot=90)
    ax.set_xlabel(pretty_label)
    ax.set_ylabel("Count")
    plt.show()

## Finding associations

Here we run the Apriori algorithm (through `arulespy`) with several minimum support / confidence thresholds to get the association rules.

In [None]:
# (minimum support, minimum confidence) pairs; one Apriori run per pair.
parameters = [
    (0.7, 0.4),
    (0.8, 0.8),
    (0.9, 0.8),
    (0.9, 0.9),
    (0.95, 0.9),
    (0.95, 0.95),
]

# Build the transactions once; every run below reuses them.
transactions = arules.Transactions.from_df(dataset_dataframe)

# Collect the HTML fragments and join them once instead of growing a string.
html_parts = []

for support, confidence in parameters:
    rules = arules.apriori(
        transactions,
        parameter=arules.parameters({"supp": support, "conf": confidence}),
        control=arules.parameters({"verbose": False}),
    )
    rules_dataframe = rules.as_df()

    html_parts.append(
        f"""<p>Result for Apriori run with parameters: support {support}, confidence: {confidence}</p>"""
    )
    html_parts.append(rules_dataframe.to_html())
    # "<br/>" is the valid line-break tag; the previous "</br>" was malformed.
    html_parts.append("<br/>")

result = "".join(html_parts)

HTML(result)
