In [1]:
# ====================================
# Notebook 2: Constructing User-Item Matrix and Item Embeddings
# Description:
# This notebook loads filtered transactional data and constructs
# binary and quantity-based user-item matrices, splits data into train/test,
# and generates item embeddings using two transformer models (MiniLM and SBERT).
# ====================================

In [2]:
import os

# === Clone GitHub repository ===
repo_dir = "My-BS-Thesis"

if os.path.exists(repo_dir):
    print(f"{repo_dir} already exists. Removing it...\n")
    !rm -r {repo_dir}

!git clone https://github.com/Goshmar/My-BS-Thesis

Cloning into 'My-BS-Thesis'...
remote: Enumerating objects: 125, done.[K
remote: Counting objects: 100% (125/125), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 125 (delta 41), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (125/125), 200.97 MiB | 12.80 MiB/s, done.
Resolving deltas: 100% (41/41), done.
Updating files: 100% (20/20), done.


In [7]:
# === Install dependencies from requirements.txt ===
!pip install -r My-BS-Thesis/requirements.txt -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.7/557.7 MB[0m [31m54.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.2/160.2 kB[0m [31m97.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.3/27.3 MB[0m [31m122.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import json
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from scipy.sparse import csr_matrix, save_npz
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
import zipfile

In [None]:
# === Unzip filtered order data ===
interim_data_dir = os.path.join(repo_dir, 'data', 'interim')
zip_path = os.path.join(interim_data_dir, 'filtered_orders.zip')

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(interim_data_dir)

In [None]:
# === Paths ===
filtered_orders_path = os.path.join(interim_data_dir, 'filtered_orders.csv')

processed_data_dir = os.path.join(repo_dir, 'data', 'processed')
artifacts_dir = os.path.join(repo_dir, 'artifacts')
data_raw = os.path.join(repo_dir, 'data', 'raw')
os.makedirs(processed_data_dir, exist_ok=True)
os.makedirs(artifacts_dir, exist_ok=True)

binary_matrix_path = os.path.join(processed_data_dir, 'user_item_binary_matrix.npz')
quantity_matrix_path = os.path.join(processed_data_dir, 'user_item_quantity_matrix.npz')

map_path = os.path.join(artifacts_dir, 'mapping.pkl')
embedding_pkl_path = os.path.join(artifacts_dir, 'item_embeddings.pkl')
embedding_zip_path = os.path.join(artifacts_dir, 'item_embeddings.zip')

train_csv_path = os.path.join(processed_data_dir, 'train_df.csv')
test_csv_path = os.path.join(processed_data_dir, 'test_df.csv')
train_zip_path = os.path.join(processed_data_dir, 'train_df.zip')
test_zip_path = os.path.join(processed_data_dir, 'test_df.zip')

id_map_path = os.path.join(data_raw, 'id_mapping.json')

# === Load filtered dataset ===
df = pd.read_csv(filtered_orders_path)

In [None]:
# === Split into train/test (fixed test size) ===
test_size = 80000
test_fraction = test_size / len(df)
train_df, test_df = train_test_split(df, test_size=test_fraction, random_state=42)

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_df["shard"] = train_df.index % 10
test_df["shard"] = test_df.index % 10

In [None]:
# Download NDA-dict for decoding
with open(id_map_path, "r", encoding="utf-8-sig") as file:
    id_mappings = json.load(file)

name_dict = {v: k for k, v in id_mappings["name_dict"].items()}

In [None]:
# === Load transformer models ===
model_minilm = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
model_sbert = SentenceTransformer('sberbank-ai/sbert_large_nlu_ru')
print("✅ Both transformer models loaded.")

# Creating embeddings for all products
item_embeddings = {}
for item_id, item_name in tqdm(name_dict.items(), desc="Generating item embeddings"):
    embedding_minilm = model_minilm.encode(item_name)
    embedding_sbert = model_sbert.encode(item_name)
    item_embeddings[item_id] = {
        "minilm": embedding_minilm,
        "sbert": embedding_sbert
    }

# Normalize embeddings
item_embeddings = {
    k: {
        "minilm": normalize(v["minilm"].reshape(1, -1))[0],
        "sbert": normalize(v["sbert"].reshape(1, -1))[0]
    }
    for k, v in item_embeddings.items()
}

✅ Both transformer models loaded.


Generating item embeddings: 100%|██████████| 12881/12881 [06:15<00:00, 34.27it/s]


In [None]:
# === Create user/item mappings from train set ===
order_ids = train_df["id"].unique()
user_map = {order_id: idx for idx, order_id in enumerate(order_ids)}
item_map = {item: idx for idx, item in enumerate(item_embeddings.keys())}
reverse_item_map = {idx: item for item, idx in item_map.items()}

# === Construct binary and quantity user-item matrices ===
rows, cols, binary_data, quantity_data = [], [], [], []

for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="🔄 Building user-item matrices"):
    order_id = row["id"]
    if order_id not in user_map:
        continue

    user_idx = user_map[order_id]
    try:
        products = eval(row["products"])
        if not isinstance(products, dict):
            continue
    except:
        continue

    for item, value in products.items():
        if item in item_map:
            quantity = value[0] if isinstance(value, list) and value else 1.0
            if not isinstance(quantity, (int, float)):
                continue

            rows.append(user_idx)
            cols.append(item_map[item])
            binary_data.append(1)
            quantity_data.append(quantity)

rows = np.array(rows, dtype=np.int32)
cols = np.array(cols, dtype=np.int32)
binary_data = np.array(binary_data, dtype=np.float32)
quantity_data = np.array(quantity_data, dtype=np.float32)

user_item_binary_matrix = csr_matrix((binary_data, (rows, cols)), shape=(len(user_map), len(item_map)))
user_item_quantity_matrix = csr_matrix((quantity_data, (rows, cols)), shape=(len(user_map), len(item_map)))

print("✅ User-item matrices created.")

🔄 Building user-item matrices: 100%|██████████| 412999/412999 [00:40<00:00, 10078.80it/s]


✅ User-item matrices created.


In [None]:
# Создаём разреженные user-item матрицы
user_item_binary_matrix = csr_matrix((binary_data, (rows, cols)), shape=(len(user_map), len(item_map)))
user_item_quantity_matrix = csr_matrix((quantity_data, (rows, cols)), shape=(len(user_map), len(item_map)))
print("✅ User-item матрицы созданы.")

✅ User-item матрицы созданы.


In [None]:
# === Save outputs ===
save_npz(binary_matrix_path, user_item_binary_matrix)
save_npz(quantity_matrix_path, user_item_quantity_matrix)
print(f"✅ Binary matrix saved to: {binary_matrix_path}")
print(f"✅ Quantity matrix saved to: {quantity_matrix_path}")

with open(map_path, 'wb') as f:
    pickle.dump({'user_map': user_map, 'item_map': item_map, 'reverse_item_map': reverse_item_map}, f)
print(f"✅ Mapping saved to: {map_path}")

✅ Binary matrix saved to: My-BS-Thesis/data/processed/user_item_binary_matrix.npz
✅ Quantity matrix saved to: My-BS-Thesis/data/processed/user_item_quantity_matrix.npz
✅ Mapping saved to: My-BS-Thesis/artifacts/mapping.pkl


In [None]:
# === Split and save item_embeddings in 4 parts (2 MiniLM + 2 SBERT) ===
minilm_embeddings = {k: v["minilm"] for k, v in item_embeddings.items()}
sbert_embeddings = {k: v["sbert"] for k, v in item_embeddings.items()}

def split_dict(d, n_parts):
    items = list(d.items())
    chunk_size = len(items) // n_parts
    return [dict(items[i * chunk_size : (i + 1) * chunk_size]) for i in range(n_parts - 1)] + [dict(items[(n_parts - 1) * chunk_size:])]

# Split into 2 parts each
minilm_parts = split_dict(minilm_embeddings, 2)
sbert_parts = split_dict(sbert_embeddings, 2)

# Save and zip each part
for idx, part in enumerate(minilm_parts, 1):
    pkl_path = os.path.join(artifacts_dir, f'item_embeddings_minilm_part{idx}.pkl')
    zip_path = os.path.join(artifacts_dir, f'item_embeddings_minilm_part{idx}.zip')

    with open(pkl_path, 'wb') as f:
        pickle.dump(part, f)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(pkl_path, arcname=os.path.basename(pkl_path))
    os.remove(pkl_path)
    print(f"✅ MiniLM part {idx} saved to {zip_path}")

for idx, part in enumerate(sbert_parts, 1):
    pkl_path = os.path.join(artifacts_dir, f'item_embeddings_sbert_part{idx}.pkl')
    zip_path = os.path.join(artifacts_dir, f'item_embeddings_sbert_part{idx}.zip')

    with open(pkl_path, 'wb') as f:
        pickle.dump(part, f)
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(pkl_path, arcname=os.path.basename(pkl_path))
    os.remove(pkl_path)
    print(f"✅ SBERT part {idx} saved to {zip_path}")

print(f"✅ Item embeddings (MiniLM + SBERT) saved to: {embedding_zip_path}")

✅ MiniLM part 1 saved to My-BS-Thesis/artifacts/item_embeddings_minilm_part1.zip
✅ MiniLM part 2 saved to My-BS-Thesis/artifacts/item_embeddings_minilm_part2.zip
✅ SBERT part 1 saved to My-BS-Thesis/artifacts/item_embeddings_sbert_part1.zip
✅ SBERT part 2 saved to My-BS-Thesis/artifacts/item_embeddings_sbert_part2.zip
✅ Item embeddings (MiniLM + SBERT) saved to: My-BS-Thesis/artifacts/item_embeddings.zip


In [None]:
train_df.to_csv(train_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

with zipfile.ZipFile(train_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(train_csv_path, arcname=os.path.basename(train_csv_path))
with zipfile.ZipFile(test_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write(test_csv_path, arcname=os.path.basename(test_csv_path))

# Remove original CSVs
os.remove(train_csv_path)
os.remove(test_csv_path)

print(f"✅ Train data saved to: {train_zip_path}")
print(f"✅ Test data saved to: {test_zip_path}")

✅ Train data saved to: My-BS-Thesis/data/processed/train_df.zip
✅ Test data saved to: My-BS-Thesis/data/processed/test_df.zip
