# 03 – Embeddingi

Zamiana tekstów na wektory za pomocą biblioteki sentence-transformers oraz zapis embeddingów do pliku, żeby nie liczyć ich od nowa przy każdym treningu.

## 1. Ścieżka projektu i ładowanie danych

In [1]:
import sys
from pathlib import Path

# Resolve the project root: use the current working directory when it holds a
# `src` directory, otherwise fall back to its parent. The root is put at the
# front of sys.path so that `src.*` modules can be imported from this notebook.
notebook_dir = Path.cwd()
root = notebook_dir if (notebook_dir / "src").is_dir() else notebook_dir.parent
sys.path.insert(0, str(root))

import numpy as np
import pandas as pd

# Both loading paths below expose the cleaned text under this column name.
text_col = "text_clean"

# Locations of the cleaned CSVs (written after notebook 02).
processed_dir = root / "data" / "processed"
train_path = processed_dir / "train_clean.csv"
val_path = processed_dir / "val_clean.csv"

if not (train_path.exists() and val_path.exists()):
    # Option B: the cleaned CSVs are missing (notebook 02 was not run), so
    # load the dataset and preprocess the raw text here.
    from datasets import load_dataset
    from src.preprocessing import PolishTextPreprocessor

    dataset = load_dataset("allegro/klej-allegro-reviews")
    train_df = dataset["train"].to_pandas()
    val_df = dataset["validation"].to_pandas()
    preprocessor = PolishTextPreprocessor(remove_emoji=True, remove_stopwords=False)
    train_df[text_col] = preprocessor.preprocess_series(train_df["text"])
    val_df[text_col] = preprocessor.preprocess_series(val_df["text"])
    print("Wczytano dataset i wykonano preprocessing")
else:
    # Option A: read the saved CSVs, forcing the text column to be parsed as str.
    csv_dtypes = {"text_clean": str}
    train_df = pd.read_csv(train_path, dtype=csv_dtypes)
    val_df = pd.read_csv(val_path, dtype=csv_dtypes)
    print("Wczytano z data/processed/")

def to_string_list(ser):
    """Convert a pandas Series into a plain list of Python strings.

    The result is meant to be passed to ``model.encode``, which (per the
    original note in this notebook) does not accept float/NaN entries.

    Conversion rules, applied per element of ``ser.values``:
      * ``str`` values are kept unchanged;
      * missing values become the empty string -- this covers ``None`` and
        float NaN as well as other scalar sentinels recognised by
        ``pd.isna`` (e.g. ``pd.NA`` from the nullable ``string`` dtype or
        ``NaT``), which would otherwise be stringified into literal text
        such as ``"<NA>"``;
      * anything else is converted with ``str()``.

    Parameters
    ----------
    ser : pandas.Series
        Series whose values should be turned into strings.

    Returns
    -------
    list of str
        One string per element of ``ser``, in the same order.
    """

    def _as_text(value):
        if isinstance(value, str):
            return value
        # pd.isna returns an array for list-like input, so only ask it about
        # scalars; non-scalar objects fall through to str() as before.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ""
        return str(value)

    return [_as_text(value) for value in ser.values]

# Cleaned texts for each split, converted to plain lists of strings.
train_texts, val_texts = (to_string_list(frame[text_col]) for frame in (train_df, val_df))
# Labels taken from the `rating` column of each split, cast to int.
y_train, y_val = (frame["rating"].values.astype(int) for frame in (train_df, val_df))

# Sanity check: every entry of both text lists must be a str.
assert not any(not isinstance(t, str) for t in train_texts), "train_texts zawiera nie-string"
assert not any(not isinstance(t, str) for t in val_texts), "val_texts zawiera nie-string"
print(f"Train: {len(train_texts)} | Val: {len(val_texts)}")

Wczytano z data/processed/
Train: 9577 | Val: 1002


## 2. Model sentence-transformers i encoding

In [2]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers model used to embed the texts.
model_name = "paraphrase-multilingual-MiniLM-L12-v2"
model = SentenceTransformer(model_name)
print(f"Model: {model_name}")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Model: paraphrase-multilingual-MiniLM-L12-v2


In [3]:
# Encode both splits with identical settings so the outputs are comparable.
batch_size = 64
encode_kwargs = {"batch_size": batch_size, "show_progress_bar": True}
X_train = model.encode(train_texts, **encode_kwargs)
X_val = model.encode(val_texts, **encode_kwargs)
print(f"X_train: {X_train.shape} | X_val: {X_val.shape}")

Batches:   0%|          | 0/150 [00:00<?, ?it/s]

Batches:   0%|          | 0/16 [00:00<?, ?it/s]

X_train: (9577, 384) | X_val: (1002, 384)


## 3. Zapis embeddingów do data/processed

In [4]:
# Make sure the output directory exists, then write each array to
# <name>.npy inside it so later notebooks can load them without re-encoding.
processed_dir.mkdir(parents=True, exist_ok=True)
arrays_to_save = {"X_train": X_train, "y_train": y_train, "X_val": X_val, "y_val": y_val}
for stem, array in arrays_to_save.items():
    np.save(processed_dir / f"{stem}.npy", array)
print("Zapisano X_train.npy, y_train.npy, X_val.npy, y_val.npy w data/processed/")

Zapisano X_train.npy, y_train.npy, X_val.npy, y_val.npy w data/processed/
