# Laboratorium 1 - analiza koszykowa

## Przygotowanie

- pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
  - alternatywnie, pobierz plik `basket.csv` z Teamsów
- [opcjonalnie] utwórz i aktywuj wirtualne środowisko
  `python3 -m venv ./recsyslab1`
  `source ./recsyslab1/bin/activate`
- zainstaluj potrzebne biblioteki:
  `pip install more-itertools`


## Część 1. - przygotowanie danych


In [1]:
# importujemy wszystkie potrzebne pakiety
from more_itertools import powerset

In [2]:
# define the constants

PATH = "./basket.csv"  # CSV file with the baskets, passed to read_baskets
EPSILON = 0.001  # passed to get_supports as the floor for stored support values
K = 4  # NOTE(review): not referenced in the visible cells - presumably the number of recommendations to keep; confirm

In [3]:
# wczytujemy dane o koszykach


def read_baskets(path: str) -> list[set[str]]:
    """Load shopping baskets from a comma-separated text file.

    The first line of the file is treated as a header and skipped. Every
    remaining non-empty line becomes one basket: the set of its non-empty
    comma-separated fields, lower-cased.

    Args:
        path: Location of the CSV file.

    Returns:
        One set of product names per non-empty data line, in file order.
        A line that consists only of commas yields an empty set.
    """
    # Explicit encoding so parsing does not depend on the machine's locale.
    with open(path, encoding="utf-8") as f:
        data_lines = f.read().split("\n")[1:]

    return [
        {field.lower() for field in line.split(",") if field}
        for line in data_lines
        if line
    ]


def unique_products(baskets: list[tuple[str]]) -> list[str]:
    """Return every distinct product found in *baskets*, sorted ascending."""
    seen = {product for basket in baskets for product in basket}
    return sorted(seen)


# Parse the CSV into baskets, then collect the sorted list of distinct products.
baskets = read_baskets(PATH)
products = unique_products(baskets)

In [4]:
# Show the parsed baskets and, on a separate line, the product list.
print(baskets, products, sep="\n")

[{'pastry', 'whole milk', 'salty snack'}, {'sausage', 'whole milk', 'semi-finished bread', 'yogurt'}, {'soda', 'pickled vegetables'}, {'misc. beverages', 'canned beer'}, {'hygiene articles', 'sausage'}, {'sausage', 'whole milk', 'rolls/buns'}, {'whole milk', 'soda'}, {'frankfurter', 'soda', 'whipped/sour cream'}, {'frankfurter', 'curd'}, {'white bread', 'beef'}, {'whole milk', 'butter'}, {'other vegetables', 'frozen vegetables'}, {'sugar', 'tropical fruit'}, {'specialty chocolate', 'butter milk'}, {'frozen meals', 'dental care'}, {'rolls/buns'}, {'detergent', 'root vegetables'}, {'sausage', 'rolls/buns'}, {'dish cleaner', 'cling film/bags'}, {'frozen fish', 'canned beer'}, {'whole milk', 'pip fruit', 'tropical fruit'}, {'pastry', 'whole milk', 'root vegetables'}, {'red/blush wine', 'chocolate', 'rolls/buns'}, {'shopping bags', 'other vegetables'}, {'whole milk', 'packaged fruit/vegetables', 'chocolate', 'rolls/buns'}, {'hygiene articles', 'other vegetables'}, {'margarine', 'whipped/sou

## Część 2. - obliczanie wskaźników


In [16]:
# obliczamy strukture danych (np. slownik) przechowujaca wszystkie interesujace wartosci `support`


from collections import defaultdict


def get_supports(
    baskets: list[set[str]],
    all_products: list[str],
    epsilon: float,
    depth: int = 2,
) -> dict[tuple[str, ...], float]:
    """Build the support table for every itemset of at most *depth* products.

    The support of an itemset is the fraction of baskets that contain all of
    its products. Values that are not greater than *epsilon* are stored as
    *epsilon*, so every tabulated itemset has an entry.

    Args:
        baskets: The observed baskets; each one is an iterable of product names.
        all_products: The products to tabulate, expected to be unique. Keys of
            the result list their products in the order used here.
        epsilon: Floor applied to the stored support values.
        depth: Largest itemset size to tabulate.

    Returns:
        A dict mapping each itemset (a tuple of product names, including the
        empty tuple when ``depth >= 0``) to its floored support. Entries are
        inserted by increasing itemset size.

    Raises:
        ZeroDivisionError: If *baskets* is empty and ``depth >= 0``.
    """
    from collections import Counter
    from itertools import combinations

    total = len(baskets)
    position = {product: index for index, product in enumerate(all_products)}

    # Count itemsets by walking each basket once instead of rescanning every
    # basket for every candidate itemset.
    counts = Counter()
    for basket in baskets:
        # Order the basket like all_products so the generated tuples match
        # the keys produced below; products outside all_products are ignored.
        known = sorted(
            (product for product in set(basket) if product in position),
            key=position.__getitem__,
        )
        for size in range(depth + 1):
            counts.update(combinations(known, size))

    supports = {}
    for size in range(depth + 1):
        for itemset in combinations(all_products, size):
            value = counts[itemset] / total
            supports[itemset] = value if value > epsilon else epsilon
    return supports



# Build the support table (default depth of 2, EPSILON as the floor) and display it.
supports = get_supports(baskets, products, EPSILON)
supports

{(): 1.0,
 ('abrasive cleaner',): 0.0014702933903628951,
 ('artif. sweetener',): 0.0019381140145692708,
 ('baby cosmetics',): 0.001,
 ('bags',): 0.001,
 ('baking powder',): 0.008086613646995923,
 ('bathroom cleaner',): 0.0011361358016440553,
 ('beef',): 0.03395041101383412,
 ('berries',): 0.021787074784468355,
 ('beverages',): 0.016574216400454454,
 ('bottled beer',): 0.04531176903027468,
 ('bottled water',): 0.06068301811134131,
 ('brandy',): 0.0025395976742631824,
 ('brown bread',): 0.03762614448974136,
 ('butter',): 0.03522020985096572,
 ('butter milk',): 0.017576689166610975,
 ('cake bar',): 0.006148499632426653,
 ('candles',): 0.004410880171088686,
 ('candy',): 0.014368776314910112,
 ('canned beer',): 0.04691572545612511,
 ('canned fish',): 0.007685624540533315,
 ('canned fruit',): 0.001403461872619127,
 ('canned vegetables',): 0.005480184454988973,
 ('cat food',): 0.011829178640646929,
 ('cereals',): 0.002806923745238254,
 ('chewing gum',): 0.012029673193878232,
 ('chicken',): 0.

In [7]:
# define the functions computing support, confidence and lift


def support_value(supports, products: tuple[str]) -> float:
    """Look up the support of the itemset *products* in the support table.

    Args:
        supports: Mapping from itemset tuples to support values. Keys are
            assumed to list their products in ascending order, which is what
            get_supports produces when it is given a sorted product list.
        products: Any iterable of product names (a set or a tuple); order and
            duplicates are irrelevant.

    Returns:
        The stored support of the itemset.

    Raises:
        KeyError: If the itemset is not in the table, e.g. because it is
            larger than the depth the table was built with or contains an
            unknown product.
    """
    key = tuple(sorted(set(products)))
    try:
        return supports[key]
    except KeyError:
        raise KeyError(
            f"no support recorded for itemset {key!r}; "
            "it may exceed the depth of the support table"
        ) from None


def confidence(
    supports, prior_products: tuple[str], following_products: tuple[str]
) -> float:
    """Return the confidence of the rule ``prior_products -> following_products``.

    Confidence is ``support(prior + following) / support(prior)``: the share
    of baskets containing the prior products that also contain the following
    ones.

    Raises:
        KeyError: If either required itemset is missing from *supports*.
    """
    joint = set(prior_products) | set(following_products)
    return support_value(supports, joint) / support_value(supports, prior_products)


def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    """Return the lift of the rule ``prior_products -> following_products``.

    Lift is ``confidence(prior -> following) / support(following)``. A value
    above 1 means the following products occur with the prior ones more often
    than they would if the two itemsets were independent.

    Raises:
        KeyError: If any required itemset is missing from *supports*.
    """
    rule_confidence = confidence(supports, prior_products, following_products)
    return rule_confidence / support_value(supports, following_products)

In [8]:
# Try the three metrics on sample itemsets; the itemsets are passed as sets.
# NOTE(review): a 2-item prior with a 1-item consequent presumably needs the
# support of a 3-item set, while the table above is built with the default
# depth of 2 - confirm.
print(support_value(supports, {"whole milk", "rolls/buns"}))
print(confidence(supports, {"whole milk", "rolls/buns"}, {"yogurt"}))
print(lift(supports, {"whole milk", "rolls/buns"}, {"yogurt"}))

NameError: name 'supports' is not defined

## Część 3. - generowanie rekomendacji


In [47]:
# build the list of potential recommendations
# recommended products should have lift > 1 and the highest possible confidence


def generate_basic_candidates(
    basket: tuple[str], products: list[str], supports: dict[tuple[str, ...], float]
) -> list[tuple[str, tuple[str, ...], float, float]]:
    """Rank the products that could be recommended for *basket*.

    Each product not already in the basket is scored with the rule
    ``basket -> product``. Products whose lift is greater than 1 are kept.

    Args:
        basket: Products already in the basket.
        products: All products that may be recommended.
        supports: Support table passed through to ``confidence`` and ``lift``.

    Returns:
        A list of ``(product, itemset, confidence, lift)`` tuples sorted by
        descending confidence, where ``itemset`` is the basket plus the
        product as a tuple in ascending order. Empty when nothing qualifies.
    """
    # The basket does not change inside the loop, so convert it once.
    prior_products = set(basket)
    candidates = []

    for product in products:
        if product in prior_products:
            continue

        following_products = {product}
        conf = confidence(supports, prior_products, following_products)
        lf = lift(supports, prior_products, following_products)

        if lf > 1:
            # Sort the itemset so the tuple is deterministic across runs.
            itemset = tuple(sorted(prior_products | following_products))
            candidates.append((product, itemset, conf, lf))

    # list.sort() sorts in place and returns None; sorted() returns the list.
    return sorted(candidates, key=lambda candidate: candidate[2], reverse=True)

In [46]:
# propose a second, more advanced algorithm, e.g.:
# - if product X appears several times in the candidate list, compute the mean or the product of its confidence values
# - sort the candidates by the product of confidence and lift


def generate_advanced_candidates(
    basket: tuple[str], products: list[str], supports
) -> list[tuple[str, tuple[str], float, float]]:
    """Placeholder for the advanced recommendation algorithm.

    Not implemented yet: every call raises ``NotImplementedError``.
    """
    raise NotImplementedError()

In [None]:
# Show one sample basket and the basic recommendations generated for it.
print(baskets[1])
generate_basic_candidates(baskets[1], products, supports)
# generate_advanced_candidates(baskets[1], products, supports)

In [None]:
# Compare both generators on another basket.
# NOTE(review): generate_advanced_candidates currently raises
# NotImplementedError, and the value of the middle expression is not assigned
# or printed - confirm that this is intended.
print(baskets[33])
generate_basic_candidates(baskets[33], products, supports)
generate_advanced_candidates(baskets[33], products, supports)