# Laboratorium 1 - analiza koszykowa

## Przygotowanie

 * pobierz i wypakuj dataset: https://kaggle.com/datasets/rashikrahmanpritom/groceries-dataset-for-market-basket-analysismba?resource=download&select=basket.csv
   * alternatywnie, pobierz plik `basket.csv` z Teamsów
 * [opcjonalnie] Utwórz wirtualne środowisko
 `python3 -m venv ./recsyslab1`
 * zainstaluj potrzebne biblioteki:
 `pip install more-itertools`

In [1]:
!pip install more-itertools


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Część 1. - przygotowanie danych

In [2]:
# importujemy wszystkie potrzebne pakiety

from more_itertools import powerset

In [3]:
# definiujemy stale

PATH = './basket.csv'
EPSILON = 0.001

In [4]:
# wczytujemy dane o koszykach

def read_baskets(path: str) -> list[tuple[str]]:
    with open(path) as f:
        raw = f.read()
    baskets = [set([y.lower() for y in x.split(',') if y]) for x in raw.split('\n')[1:] if x]
    return baskets

def unique_products(baskets: list[tuple[str]]) -> list[str]:
    products = set()
    for basket in baskets:
        products.update(basket)
    return sorted(list(products))

baskets = read_baskets(PATH)
products = unique_products(baskets)

In [5]:
baskets

[{'pastry', 'salty snack', 'whole milk'},
 {'sausage', 'semi-finished bread', 'whole milk', 'yogurt'},
 {'pickled vegetables', 'soda'},
 {'canned beer', 'misc. beverages'},
 {'hygiene articles', 'sausage'},
 {'rolls/buns', 'sausage', 'whole milk'},
 {'soda', 'whole milk'},
 {'frankfurter', 'soda', 'whipped/sour cream'},
 {'curd', 'frankfurter'},
 {'beef', 'white bread'},
 {'butter', 'whole milk'},
 {'frozen vegetables', 'other vegetables'},
 {'sugar', 'tropical fruit'},
 {'butter milk', 'specialty chocolate'},
 {'dental care', 'frozen meals'},
 {'rolls/buns'},
 {'detergent', 'root vegetables'},
 {'rolls/buns', 'sausage'},
 {'cling film/bags', 'dish cleaner'},
 {'canned beer', 'frozen fish'},
 {'pip fruit', 'tropical fruit', 'whole milk'},
 {'pastry', 'root vegetables', 'whole milk'},
 {'chocolate', 'red/blush wine', 'rolls/buns'},
 {'other vegetables', 'shopping bags'},
 {'chocolate', 'packaged fruit/vegetables', 'rolls/buns', 'whole milk'},
 {'hygiene articles', 'other vegetables'},
 

## Część 2. - obliczanie wskaźników

In [37]:
# obliczamy strukture danych (np. slownik albo graf) przechowujaca wszystkie interesujace wartosci `support`
from itertools import combinations
from collections import deque

def get_supports(baskets: list[tuple[str]], all_products: list[str], epsilon: float):
    supports = {}
    
    # Sort products to optimize baskets creation
    sorted_products = sorted(all_products)
    # Initialize baskets with all products (every basket contains a single product), 
    # their indexes in the sorted products list (to prevent creating the same basket twice)
    # and possible superset baskets (to prevent checking unnecessary baskets)
    baskets_queue = deque([(frozenset([product]), i, baskets) for i, product in enumerate(sorted_products)])
    
    while baskets_queue:
        basket, index, baskets_to_check = baskets_queue.popleft()
        
        # Calculate the support and store superset baskets for the next iteration
        superset_baskets = [b for b in baskets_to_check if b.issuperset(basket)]
        support = len(superset_baskets) / len(baskets)
        
        # Skip if the support is too low
        if support < epsilon:
            supports[basket] = 0
            continue
        
        # Otherwise store the support
        supports[basket] = support
        
        # Create new baskets by adding the next product to the current basket
        for i in range(index + 1, len(sorted_products)):
            baskets_queue.append((basket.union([sorted_products[i]]), i, superset_baskets))
                
    return supports
    
supports = get_supports(baskets, products, EPSILON)
supports

{frozenset({'abrasive cleaner'}): 0.0014702933903628951,
 frozenset({'artif. sweetener'}): 0.0019381140145692708,
 frozenset({'baby cosmetics'}): 0.001,
 frozenset({'bags'}): 0.001,
 frozenset({'baking powder'}): 0.008086613646995923,
 frozenset({'bathroom cleaner'}): 0.0011361358016440553,
 frozenset({'beef'}): 0.03395041101383412,
 frozenset({'berries'}): 0.021787074784468355,
 frozenset({'beverages'}): 0.016574216400454454,
 frozenset({'bottled beer'}): 0.04531176903027468,
 frozenset({'bottled water'}): 0.06068301811134131,
 frozenset({'brandy'}): 0.0025395976742631824,
 frozenset({'brown bread'}): 0.03762614448974136,
 frozenset({'butter'}): 0.03522020985096572,
 frozenset({'butter milk'}): 0.017576689166610975,
 frozenset({'cake bar'}): 0.006148499632426653,
 frozenset({'candles'}): 0.004410880171088686,
 frozenset({'candy'}): 0.014368776314910112,
 frozenset({'canned beer'}): 0.04691572545612511,
 frozenset({'canned fish'}): 0.007685624540533315,
 frozenset({'canned fruit'}): 0.

In [40]:
# definiujemy funkcje obliczajace support, confidence i lift

def support(supports, products: set[str]) -> float:
    return supports.get(frozenset(products), 0)

def confidence(supports, prior_products: set[str], following_products: set[str]) -> float:
    return support(supports, prior_products | following_products) / support(supports, prior_products)
    
def lift(supports, prior_products: tuple[str], following_products: tuple[str]) -> float:
    return confidence(supports, prior_products, following_products) / support(supports, following_products)

In [41]:
print(support(supports, {'whole milk', 'rolls/buns'}))
print(confidence(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))
print(lift(supports, {'whole milk', 'rolls/buns'}, {'yogurt'}))

0.013967787208447505
0.09569377990430622
1.1142926293448514


## Część 3. - generowanie rekomendacji

In [15]:
# wyznaczamy liste potencjalnych rekomendacji
# rekomendowane artykuly powinny miec lift > 1 i mozliwie wysokie confidence

def generate_next_product_candidates(basket, products, supports):
    # return [(item, subbasket, confidence, lift)]
    recommendation = []
    for subset in powerset(basket):
        if len(subset) == 0:
            continue
        
        subbasket = set(subset)
        
        for item in products:
            if item in basket:
                continue
            
            confidence_value = confidence(supports, subbasket, set([item]))
            lift_value = lift(supports, subbasket, set([item]))
            
            if lift_value > 1:
                recommendation.append((item, subbasket, confidence_value, lift_value))
                
    # sort po confidence i n najlepszych wez
    return sorted(recommendation, key=lambda x: x[2], reverse=True)

In [16]:
print(baskets[1])
generate_next_product_candidates(baskets[1], products, supports)

{'semi-finished bread', 'sausage', 'yogurt', 'whole milk'}


[('abrasive cleaner', {'whole milk'}, 0.006332204824375794, 4.306762763051592),
 ('artif. sweetener', {'whole milk'}, 0.006332204824375794, 3.267199337487414),
 ('baby cosmetics', {'whole milk'}, 0.006332204824375794, 6.332204824375794),
 ('bags', {'whole milk'}, 0.006332204824375794, 6.332204824375794),
 ('bathroom cleaner', {'whole milk'}, 0.006332204824375794, 5.573457693360883),
 ('brandy', {'whole milk'}, 0.006332204824375794, 2.4933889680825003),
 ('cake bar', {'whole milk'}, 0.006332204824375794, 1.0298780520340762),
 ('candles', {'whole milk'}, 0.006332204824375794, 1.4355875876838637),
 ('canned fruit', {'whole milk'}, 0.006332204824375794, 4.511846704149287),
 ('canned vegetables',
  {'whole milk'},
  0.006332204824375794,
  1.1554729364284757),
 ('cereals', {'whole milk'}, 0.006332204824375794, 2.2559233520746433),
 ('chocolate marshmallow',
  {'whole milk'},
  0.006332204824375794,
  1.5791463464522502),
 ('cleaner', {'whole milk'}, 0.006332204824375794, 3.0564122834559684)

In [14]:
print(baskets[33])
generate_next_product_candidates(baskets[33], products, supports)

{'white wine', 'tropical fruit', 'domestic eggs', 'soda', 'root vegetables', 'yogurt', 'photo/film'}


[('abrasive cleaner',
  {'tropical fruit', 'white wine'},
  1.0,
  680.1363636363636),
 ('artif. sweetener',
  {'tropical fruit', 'white wine'},
  1.0,
  515.9655172413793),
 ('baby cosmetics', {'tropical fruit', 'white wine'}, 1.0, 1000.0),
 ('bags', {'tropical fruit', 'white wine'}, 1.0, 1000.0),
 ('baking powder', {'tropical fruit', 'white wine'}, 1.0, 123.66115702479338),
 ('bathroom cleaner',
  {'tropical fruit', 'white wine'},
  1.0,
  880.1764705882354),
 ('beef', {'tropical fruit', 'white wine'}, 1.0, 29.454724409448822),
 ('berries', {'tropical fruit', 'white wine'}, 1.0, 45.898773006134974),
 ('beverages', {'tropical fruit', 'white wine'}, 1.0, 60.33467741935484),
 ('bottled beer', {'tropical fruit', 'white wine'}, 1.0, 22.069321533923304),
 ('bottled water', {'tropical fruit', 'white wine'}, 1.0, 16.479074889867842),
 ('brandy', {'tropical fruit', 'white wine'}, 1.0, 393.7631578947369),
 ('brown bread', {'tropical fruit', 'white wine'}, 1.0, 26.577264653641212),
 ('butter', 