# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

## Część 1. - przygotowanie danych

In [62]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset
from collections import defaultdict

In [63]:
# definiujemy stale

PATH = "./basket.csv"
EPSILON = 0.001
K = 4

In [64]:
# wczytujemy dane o koszykach


def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [
        set([y.lower() for y in x.split(",") if y]) for x in raw.split("\n")[1:] if x
    ]
    return baskets


def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))


baskets = read_baskets(PATH)
products = unique_products(baskets)

## Część 2. - obliczanie wskaźników

In [65]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`


def stringify_tuple(tpl: tuple[str]):
    return ",".join(sorted(list(tpl)))


def get_supports(
    baskets: list[tuple[str]], all_products: list[str], epsilon: float
):
    product_baskets_map = defaultdict(list)
    supports_dict = {}
    memo = {}

    for basket in baskets:
        for item in basket:
            product_baskets_map[item].append(basket)

    for cur_basket in baskets:
        for possible_basket in powerset(cur_basket):
            key = stringify_tuple(possible_basket)
            if key not in memo:
                occurrences = sum(
                    set(possible_basket).issubset(set(basket))
                    for product in possible_basket
                    for basket in product_baskets_map[product]
                )
                support = occurrences / len(baskets)
                memo[key] = support
                if support >= epsilon:
                    supports_dict[key] = support
    return supports_dict


supports = get_supports(baskets, products, EPSILON)
# supports

In [66]:
# definiujemy funkcje obliczajace support, confidence i lift


def support(supports, products: tuple[str], baskets: list[tuple[str]] = baskets) -> float:
    products_key = stringify_tuple(products)
    
    if products_key not in supports:
        products_set = set(products)
        occurences = sum(products_set.issubset(basket) for basket in baskets)
        supports[products_key] = occurences / len(baskets)
    
    return supports[products_key]


def confidence(
    supports, prior_products: tuple[str], following_products: tuple[str]
) -> float:
    new_products = prior_products + following_products
    return support(supports, new_products) / support(supports, prior_products)


def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    new_products = prior_products + following_products
    return support(supports, new_products) / (
        support(supports, prior_products) * support(supports, following_products)
    )

In [67]:
print(support(supports, ("whole milk", "rolls/buns")))
print(confidence(supports, ("whole milk", "rolls/buns"), ("yogurt",)))
print(lift(supports, ("whole milk", "rolls/buns"), ("yogurt",)))

0.02793557441689501
0.14354066985645933
1.6714389440172768


## Część 3. - generowanie rekomendacji

In [68]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i jak najwyzszy confidence


def generate_basic_candidates(
    basket: tuple[str], products: list[str], supports
) -> list[tuple[str, tuple[str], float, float]]:
    result = []
    basket_tuple = tuple(basket)
    for product in products:
        if product in basket:
            continue
        if lift(supports, basket_tuple, (product,)) > 1:
            result.append(
                (
                    product,
                    confidence(supports, basket_tuple, (product,)),
                    lift(supports, basket_tuple, (product,)),
                )
            )

    return list(
        map(lambda x: x + (x[1],), sorted(result, key=lambda x: x[1], reverse=True))
    )

In [69]:
# zaproponuj drugi, bardziej zaawansowany algorytm, np.:
# - jesli produkt X wystepuje w liscie kandydatow kilkukrotnie, oblicz srednia lub iloczyn confidence
# - posortuj kandydatow po iloczynie configence i lift


def generate_advanced_candidates(
    basket: tuple[str], products: list[str], supports
) -> list[tuple[str, float, float, float]]:
    basket_tuple = tuple(basket)
    prod_dict = {}

    for product in products:
        if product in basket:
            continue

        if lift(supports, basket_tuple, (product,)) > 1:
            if product not in prod_dict:
                prod_dict[product] = (
                    confidence(supports, basket_tuple, (product,)),
                    lift(supports, basket_tuple, (product,)),
                )
            else:
                old_res = prod_dict[product]
                prod_dict[product] = (
                    confidence(supports, basket_tuple, (product,)) * old_res[0],
                    lift(supports, basket_tuple, (product,)),
                )

    res = []
    for key in prod_dict:
        val = prod_dict[key]
        res.append((key, val[0] * val[1], val[0], val[1]))

    return sorted(res, key=lambda x: x[1], reverse=True)

In [70]:
print(baskets[1])
basic = generate_basic_candidates(baskets[1], products, supports)
adv = generate_advanced_candidates(baskets[1], products, supports)

print(basic)
print(adv)

{'whole milk', 'sausage', 'yogurt', 'semi-finished bread'}
[('other vegetables', 0.5, 4.09496442255063, 0.5)]
[('other vegetables', 2.047482211275315, 0.5, 4.09496442255063)]


In [71]:
print(baskets[33])
basic = generate_basic_candidates(baskets[33], products, supports)
adv = generate_advanced_candidates(baskets[33], products, supports)

print(basic)
print(adv)

{'domestic eggs', 'white wine', 'tropical fruit', 'yogurt', 'photo/film', 'soda', 'root vegetables'}
[]
[]


In [72]:
def pretty_print(chosen_basket, candidates, metrics_name):
    print(", ".join(chosen_basket))

    print("".join(["-" for _ in range(110)]))
    print(
        "| {:24} | {:24} | {:24} | {:24} |".format(
            *["products", f"metrics({metrics_name})", "confidence", "lift"]
        )
    )

    print("".join(["-" for _ in range(110)]))
    for candidate in candidates[:5]:
        print("| {:24} | {:24} | {:24} | {:24} |".format(*candidate))
    print()

In [73]:
default_baskets = [
    {"white wine", "domestic eggs"},
    {"root vegetables", "yogurt"},
    {"whole milk", "coffee"},
]

In [74]:
for chosen_basket in default_baskets:
    pretty_print(
        chosen_basket,
        generate_basic_candidates(chosen_basket, products, supports),
        "lift",
    )

domestic eggs, white wine
--------------------------------------------------------------------------------------------------------------
| products                 | metrics(lift)            | confidence               | lift                     |
--------------------------------------------------------------------------------------------------------------
| candy                    |                     0.25 |       17.398837209302325 |                     0.25 |
| canned beer              |                     0.25 |        5.328703703703704 |                     0.25 |
| photo/film               |                     0.25 |        47.35126582278481 |                     0.25 |
| root vegetables          |                     0.25 |        3.593419788664745 |                     0.25 |
| soda                     |                     0.25 |       2.5745010323468684 |                     0.25 |

root vegetables, yogurt
-------------------------------------------------------------------

In [75]:
for chosen_basket in default_baskets:
    pretty_print(
        chosen_basket,
        generate_advanced_candidates(chosen_basket, products, supports),
        "lift*confidence",
    )

domestic eggs, white wine
--------------------------------------------------------------------------------------------------------------
| products                 | metrics(lift*confidence) | confidence               | lift                     |
--------------------------------------------------------------------------------------------------------------
| photo/film               |       11.837816455696203 |                     0.25 |        47.35126582278481 |
| candy                    |        4.349709302325581 |                     0.25 |       17.398837209302325 |
| waffles                  |       3.3761281588447654 |                     0.25 |       13.504512635379061 |
| canned beer              |        1.332175925925926 |                     0.25 |        5.328703703703704 |
| tropical fruit           |       0.9222756410256411 |                     0.25 |       3.6891025641025643 |

root vegetables, yogurt
-------------------------------------------------------------------