In [1]:
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tecd_retail_recsys.data import DataPreprocessor

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)


In [2]:
%load_ext autoreload
%autoreload 2

### Исходные данные

In [3]:
# # core data

dp = DataPreprocessor(day_begin=1082, day_end=1308, val_days=20, test_days=20, min_user_interactions=1, min_item_interactions=20)
train_df, val_df, test_df = dp.preprocess()

train_df['item_category'] = train_df['item_category'].fillna('Unknown category')
train_df['item_subcategory'] = train_df['item_subcategory'].fillna('Unknown subcategory')

del val_df, test_df

Starting data preprocessing...
Loading events from t_ecd_small_partial/dataset/small/retail/events
Loaded 236,479,226 total events
Loading items data from t_ecd_small_partial/dataset/small/retail/items.pq
Loaded 250,171 items with features: ['item_id', 'item_brand_id', 'item_category', 'item_subcategory', 'item_price', 'item_embedding']
Merged item features. Data shape: (236479226, 12)
Filtered to 3,758,762 events with action_type='added-to-cart'
After filtering (min_user_interactions=1, min_item_interactions=20): 3,249,972 events, 84,944 users, 30,954 items
Created mappings: 84944 users, 30954 items
Temporal split - Train: days < 1269 (902,543 events), Val: days 1269-1288 (228,339 events), Test: days >= 1289 (223,395 events)
Users in each part (train, val, test) - 7425


# features plan


1. **user features**: ✅
- socdem_cluster
- region
- дней с последнего заказа
- дней с первого заказа
- lifetime (days)
- какая ОС
- количество заказов
- средний чек
- суммарная выручка
- основной источник покупок (каталог / поиск / ..) 
- кол-во уник брендов в истории
- кол-во уник категорий в истории
- кол-во уник товаров в истории
- вариативность цен (std)
- *прикрутить RFM-кластеризацию
- *временные паттерны (день недели, час покупок)

2. **item features**: ✅
- brand embedding
- item_category
- item_subcategory
- item_price
- item_embedding
- кол-во уник пользователей за все время
- общее кол-во просмотров, кликов, добавлений, CTR
- популярность по окнам (3, 7, 30 дней)
- относительная цена в категории (price / avg_category_price)
- отклонение от средней цены категории

3. **user-brand features**: ✅
- количество заказов этого бренда
- доля бренда в общих покупках юзера, его ранг
- дней с первой покупки бренда
- дней с последней покупки бренда
- средний, мин, макс чек для бренда

4. **user-category features**: ✅
- Количество покупок в категории
- Доля категории в покупках юзера
- Средний чек в категории
- Дней с последней покупки в категории


5. **user-item features**: ✅
- кол-во ивентов
- кол-во кликов, просмотров, добавлений в корзину
- CTR (из клика в корзину, из просмотра в корзину)
- основной источник покупок (каталог / поиск / ..) 
- дней с последней покупки
- дней с первой покупки
- *кол-во заказов этой item_subcategory ранее


6. **brand features**: ✅
- средний, мин, макс рейтинг из reviews
- попробовать усредний эмбеддинг из reviews
- *уник пользователей, заказов, ср чеки



# Негативные семплы

In [4]:
from tecd_retail_recsys.data.negative_sampler import NegativeSampler

In [5]:
dp = DataPreprocessor(
    day_begin=1082,
    day_end=1308,
    val_days=20,
    test_days=20,
    min_user_interactions=1,
    min_item_interactions=20
)
train_df, val_df, test_df = dp.preprocess()

print(f"Loaded positive samples:")
print(f"  Train: {len(train_df):,} purchases")
print(f"  Val:   {len(val_df):,} purchases")
print(f"  Test:  {len(test_df):,} purchases")


Starting data preprocessing...
Loading events from t_ecd_small_partial/dataset/small/retail/events
Loaded 236,479,226 total events
Loading items data from t_ecd_small_partial/dataset/small/retail/items.pq
Loaded 250,171 items with features: ['item_id', 'item_brand_id', 'item_category', 'item_subcategory', 'item_price', 'item_embedding']
Merged item features. Data shape: (236479226, 12)
Filtered to 3,758,762 events with action_type='added-to-cart'
After filtering (min_user_interactions=1, min_item_interactions=20): 3,249,972 events, 84,944 users, 30,954 items
Created mappings: 84944 users, 30954 items
Temporal split - Train: days < 1269 (902,543 events), Val: days 1269-1288 (228,339 events), Test: days >= 1289 (223,395 events)
Users in each part (train, val, test) - 7425
Loaded positive samples:
  Train: 902,543 purchases
  Val:   228,339 purchases
  Test:  223,395 purchases


In [None]:
def load_or_create_events(
    events_file: str,
    day_begin: int,
    day_end: int,
    events_dir: str = 't_ecd_small_partial/dataset/small/retail/events'
) -> pd.DataFrame:
    if os.path.exists(events_file):
        print(f"Loading existing events from {events_file}...")
        events = pd.read_csv(events_file)
        print(f"✓ Loaded {len(events):,} events")
        return events
    
    print(f"{events_file} not found. Creating from raw data...")
    print(f"Loading events from {events_dir} for days {day_begin}-{day_end-1}")
    print("This may take several minutes...")
    
    all_events = []
    for day in range(day_begin, day_end):
        file_path = os.path.join(events_dir, f'0{day}.pq')
        if os.path.exists(file_path):
            day_events = pd.read_parquet(file_path, engine='fastparquet')
            day_events['day'] = day
            all_events.append(day_events)
            
            if day % 20 == 0:
                print(f"  Loaded day {day}...")
    
    data = pd.concat(all_events, ignore_index=True)
    print(f"Loaded {len(data):,} total raw events")
    
    print("Aggregating events (grouping by day, user, item, action)...")
    events = data.groupby(
        ['day', 'user_id', 'item_id', 'subdomain', 'os', 'action_type'],
        as_index=False
    ).size().rename(columns={'size': 'cnt'})
    
    print(f"✓ Created {len(events):,} aggregated events")
    
    events['item_id'] = events['item_id'].apply(lambda x: dp.item_to_idx[x] if x in dp.item_to_idx else None)
    events = events.dropna(subset=['item_id'])
    events['item_id'] = events['item_id'].astype(np.int64)
    events['user_id'] = events['user_id'].apply(lambda x: dp.user_to_idx[x] if x in dp.user_to_idx else None)
    events = events.dropna(subset=['user_id'])
    events['user_id'] = events['user_id'].astype(np.int64)

    print(f"Saving to {events_file}...")
    events.to_csv(events_file, index=False)
    print(f"✓ Saved events to {events_file}")
    
    del data, all_events
    
    return events



# ==========================================
# Step 2: Load events (views, clicks, purchases)
# ==========================================
print("\nStep 2: Loading events for negative sampling...")


# Load or create aggregated events
# Train period: days 1082 to 1268 (before validation at day 1269)
train_events = load_or_create_events(
    events_file='train_retail_events.csv',
    day_begin=1082,
    day_end=1269  # exclusive (validation starts at 1269)
)

print(f"Train events shape: {train_events.shape}")
print(f"Action types: {train_events['action_type'].unique()}")



Step 2: Loading events for negative sampling...
Loading existing events from train_retail_events.csv...
✓ Loaded 28,015,049 events
Train events shape: (28015049, 7)
Action types: ['click' 'view' 'added-to-cart']


In [7]:
# ==========================================
# Step 3: Load item metadata
# ==========================================
print("\nStep 3: Loading item metadata...")

items_df = pd.read_parquet(
    't_ecd_small_partial/dataset/small/retail/items.pq',
    engine='fastparquet'
)

# Select required columns
items_df = items_df[['item_id', 'category', 'brand_id']].drop_duplicates('item_id')
items_df['item_id'] = items_df['item_id'].apply(lambda x: dp.item_to_idx[x] if x in dp.item_to_idx else None)
items_df = items_df.dropna(subset=['item_id'])
items_df = items_df.rename(columns={'category': 'item_category', 'brand_id': 'item_brand_id'})
items_df['item_category'] = items_df['item_category'].fillna('Unknown category')

print(f"Loaded {len(items_df):,} items in {items_df['item_category'].nunique()} categories")


Step 3: Loading item metadata...
Loaded 30,954 items in 20 categories


In [8]:

# ==========================================
# Step 4: Initialize NegativeSampler
# ==========================================
print("\nStep 4: Initializing NegativeSampler...")

sampler = NegativeSampler(
        negative_ratio=10,        # 10 negatives per 1 positive (use 5 for faster experiments)
        viewed_ratio=0.5,         # 50% viewed-but-not-purchased
        popular_ratio=0.3,        # 30% popular items
        category_ratio=0.2,       # 20% category-aware
        view_window_days=2,      # 14-day window for views
        random_seed=42,
        n_jobs=10                 # Use all CPU cores (set to 1 to disable parallelization)
    )

# Print sampling configuration
print("\nSampling strategy:")
print(f"  Negative ratio: 1:{sampler.negative_ratio}")
print(f"  Per positive sample:")
print(f"    - Viewed but not purchased: {sampler.n_viewed}")
print(f"    - Popular items (hard):     {sampler.n_popular}")
print(f"    - Category-aware random:    {sampler.n_category}")




Step 4: Initializing NegativeSampler...

Sampling strategy:
  Negative ratio: 1:10
  Per positive sample:
    - Viewed but not purchased: 5
    - Popular items (hard):     3
    - Category-aware random:    2


In [9]:

# ==========================================
# Step 5: Fit sampler on training data
# ==========================================
print("\nStep 5: Fitting sampler on training data...")

sampler.fit(
    events_df=train_events,
    items_df=items_df,
    positive_df=train_df
)

# Print statistics
stats = sampler.get_sampling_stats()
print("\nSampler statistics:")
for key, value in stats.items():
    print(f"  {key}: {value:.2f}" if isinstance(value, float) else f"  {key}: {value:,}")




Step 5: Fitting sampler on training data...
Fitting negative sampler...
Loaded 30,954 items in 20 categories
Calculated popularity for 30,954 items
  Popularity stats: min=0, max=17940, mean=688.3, items_with_0_pop=101
Built history for 7,425 users
Fitted on 30,954 items, 7,425 users
Average views per user: 313.7
Average purchases per user: 84.9

Sampler statistics:
  negative_ratio: 10
  n_viewed: 5
  n_popular: 3
  n_category: 2
  n_jobs: 10
  total_items: 30,954
  total_users: 7,425
  total_categories: 20
  avg_user_views: 313.75
  avg_user_purchases: 84.95


In [10]:

# ==========================================
# Step 6: Sample negatives for train
# ==========================================
print("\n" + "="*70)
print("Step 6: Sampling negatives for TRAIN set")
print("="*70)

train_samples = sampler.sample_negatives(train_df, batch_size=10_000)

print(f"\nTrain samples created:")
print(f"  Total samples: {len(train_samples):,}")
print(f"  Positives: {(train_samples['label'] == 1).sum():,}")
print(f"  Negatives: {(train_samples['label'] == 0).sum():,}")
print(f"\nNegative breakdown:")
print(train_samples[train_samples['label'] == 0]['negative_type'].value_counts())   


Step 6: Sampling negatives for TRAIN set
Sampling negatives with ratio 1:10
  - Viewed but not purchased: 5 per positive
  - Popular items: 3 per positive
  - Category-aware: 2 per positive
  - Parallel jobs: 10
  - Batch size: 10,000
Processing 91 batches...
Processed 10,000 / 902,543 positives
Processed 20,000 / 902,543 positives
Processed 30,000 / 902,543 positives
Processed 40,000 / 902,543 positives
Processed 50,000 / 902,543 positives
Processed 60,000 / 902,543 positives
Processed 70,000 / 902,543 positives
Processed 80,000 / 902,543 positives
Processed 90,000 / 902,543 positives
Processed 100,000 / 902,543 positives
Processed 110,000 / 902,543 positives
Processed 120,000 / 902,543 positives
Processed 130,000 / 902,543 positives
Processed 140,000 / 902,543 positives
Processed 150,000 / 902,543 positives
Processed 160,000 / 902,543 positives
Processed 170,000 / 902,543 positives
Processed 180,000 / 902,543 positives
Processed 190,000 / 902,543 positives
Processed 200,000 / 902,54

In [None]:
# train_samples.to_parquet('processed_data/train_samples_with_negatives_9kk.parquet', index=False)

In [6]:
train_samples = pd.read_parquet('processed_data/train_samples_with_negatives_9kk.parquet')
neg_train_samples = train_samples[train_samples['label'] == 0][['user_id', 'item_id', 'day', 'label', 'negative_type']].sample(frac=0.7, random_state=42)
neg_train_samples['item_id'] = neg_train_samples['item_id'].astype(np.int64)
print(neg_train_samples.shape)
del train_samples

(6317801, 5)


In [7]:
# neg features
from tecd_retail_recsys.data.features import add_features_to_samples

neg_features = add_features_to_samples(
    samples_df=neg_train_samples,
    train_df=train_df,
    dp=dp
)

neg_features.head()

user_features: (7425, 14), nans: 0
item_features: (30954, 19), nans: 0
brand_features: (5413, 6), nans: 0
user_item_features: (630733, 9), nans: 0
user_brand_features: (33989, 12), nans: 0
user_category_features: (27837, 12), nans: 0
done! nans:  9998121


Unnamed: 0,user_id,item_id,item_brand_id,user_os,user_orders,user_avg_check,user_total_revenue,user_main_subdomain,user_brands,user_categories,...,user_category_orders,user_category_avg_price,user_category_min_price,user_category_max_price,user_category_first_order_day,user_category_last_order_day,user_category_days_since_first_order,user_category_days_since_last_order,user_category_lifetime,user_category_orders_share
0,25537,18662,37799,ios,83,-3.818024,-316.896,catalog,3,3,...,75.0,-3.878307,-6.332,-1.336,1206.0,1260.0,62.0,8.0,54.0,0.903614
1,70941,29248,60434,ios,359,-3.78373,-1358.359,search,6,7,...,344.0,-3.74718,-6.308,0.0,1117.0,1267.0,151.0,1.0,150.0,0.958217
2,18102,2512,146468,ios,80,-3.855488,-308.439,search,6,2,...,,,,,,,9999.0,9999.0,,
3,63867,16668,60434,ios,266,-3.716447,-988.575,search,6,5,...,257.0,-3.706016,-5.847,0.612,1152.0,1267.0,116.0,1.0,115.0,0.966165
4,74657,9486,146468,ios,223,-4.303475,-959.675,catalog,6,5,...,6.0,-4.182167,-4.835,-3.162,1204.0,1241.0,64.0,27.0,37.0,0.026906


In [9]:
assert neg_features[neg_features['user_item_added_to_cart_total'] > 0].shape[0] == 0

In [None]:
# neg_features.to_parquet('neg_features_6kk.parquet', index=False)

In [14]:
from tecd_retail_recsys.data.features import collect_all_features

pos_features = collect_all_features(train_df, dp)
pos_features.head()

user_features: (7425, 14), nans: 0
item_features: (30954, 19), nans: 0
brand_features: (5413, 6), nans: 0
user_item_features: (630733, 9), nans: 0
user_brand_features: (33989, 12), nans: 0
user_category_features: (32249, 12), nans: 0
done! nans:  0


Unnamed: 0,user_id,item_id,item_brand_id,user_os,user_orders,user_avg_check,user_total_revenue,user_main_subdomain,user_brands,user_categories,...,user_category_orders,user_category_avg_price,user_category_min_price,user_category_max_price,user_category_first_order_day,user_category_last_order_day,user_category_days_since_first_order,user_category_days_since_last_order,user_category_lifetime,user_category_orders_share
0,79038,20358,65693,ios,183,-4.030437,-737.57,catalog,5,5,...,167,-4.075545,-5.938,0.0,1082,1261,186,7,179,0.912568
1,44584,23489,60434,android,248,-3.53398,-876.427,item,6,7,...,222,-3.529495,-6.409,0.0,1082,1255,186,13,173,0.895161
2,12869,2908,240838,android,71,-4.132817,-293.43,item,5,5,...,61,-4.265295,-6.451,-1.339,1082,1201,186,67,119,0.859155
3,42145,18904,240838,ios,349,-3.341175,-1166.07,search,6,9,...,318,-3.261318,-6.453,0.623,1082,1256,186,12,174,0.911175
4,15304,14462,146468,android,265,-4.070279,-1078.624,search,6,4,...,257,-4.073226,-5.774,0.0,1082,1251,186,17,169,0.969811


In [23]:
# pos_features.to_parquet('pos_features.parquet', index=False)