In [None]:
# ====================================================
# DSAN 6700 Group Project - Embedding Generation Sample

# ========== 1. Setup ==========
import os
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# ========== 2. Paths ==========
# Project root. The default is the original author's local checkout; set the
# DSAN6700_PROJECT_ROOT environment variable to run this from any other
# location without editing the file.
BASE = Path(
    os.environ.get(
        "DSAN6700_PROJECT_ROOT",
        "/Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project",
    )
).expanduser()

# Directories checked, in this order, when looking for an input CSV.
CANDIDATE_DATA_DIRS = [
    BASE / "clean_data",
    BASE / "data" / "clean_data",
    BASE / "data_clean",
    BASE / "data" / "processed",
]

def resolve_csv(filename: str) -> Path:
    """Locate *filename* inside the project and return its path.

    Each directory in ``CANDIDATE_DATA_DIRS`` is checked in order; if none of
    them contains the file, the whole tree under ``BASE`` is searched
    recursively. A ``[PATH]`` line is printed for the path that is returned.

    Args:
        filename: Bare file name to look for, e.g. ``"movie_sample_50.csv"``.

    Returns:
        Path of the first regular file found.

    Raises:
        FileNotFoundError: If no regular file with that name exists in the
            candidate directories or anywhere under ``BASE``.
    """
    tried = []
    for data_dir in CANDIDATE_DATA_DIRS:
        candidate = data_dir / filename
        tried.append(str(candidate))
        # is_file() rather than exists(): a directory that happens to carry
        # the same name is not a usable CSV.
        if candidate.is_file():
            print(f"[PATH] Found: {candidate}")
            return candidate

    # rglob() yields matches in filesystem order, which is not stable across
    # machines; sort so the same file is picked every time several copies exist.
    hits = sorted(p for p in BASE.rglob(filename) if p.is_file())
    if hits:
        print(f"[PATH] Found by search: {hits[0]}")
        return hits[0]

    raise FileNotFoundError(
        f"❌ Could not find {filename}. Tried:\n"
        + "\n".join(tried)
        + f"\n(also searched recursively under {BASE})"
    )

# Embedding arrays are written under <BASE>/data/embeddings; create the
# directory (and any missing parents) up front.
OUTPUT_DIR = BASE.joinpath("data", "embeddings")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Locate the four input CSVs. The generator is consumed left to right, so the
# files are resolved (and their [PATH] lines printed) in this order.
movie_csv, music_csv, book_csv, dest_csv = (
    resolve_csv(csv_name)
    for csv_name in (
        "movie_sample_50.csv",
        "music_sample_100.csv",
        "book_sample_100.csv",
        "destination_sample_wikipedia.csv",
    )
)

# One .npy output file per domain, all inside OUTPUT_DIR.
movie_out, music_out, book_out, dest_out = (
    OUTPUT_DIR.joinpath(npy_name)
    for npy_name in (
        "movie_embeddings.npy",
        "music_embeddings.npy",
        "book_embeddings.npy",
        "destination_embeddings.npy",
    )
)

print("\n[SUMMARY] Output dir ->", OUTPUT_DIR)

# ========== 3. Load model ==========
# A single SentenceTransformer instance is created here and shared by every
# generate_embeddings() call below.
print("\n[INFO] Loading model: sentence-transformers/all-MiniLM-L6-v2")
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ========== 4.  embedding function ==========
def generate_embeddings(input_path, text_columns, output_path, sample_size=None, *, encoder=None):
    """Embed the rows of a CSV file and save the vectors with ``np.save``.

    The selected text columns of every row are joined with single spaces into
    an ``embedding_text`` column, which is then passed to the encoder.

    Args:
        input_path: CSV file to read with ``pd.read_csv``.
        text_columns: Column name, or list of column names, whose values make
            up the text to embed (joined in the given order).
        output_path: Destination handed to ``np.save`` for the embedding array.
        sample_size: If truthy, embed only a random sample of this many rows
            (fixed ``random_state=42``); otherwise every row is used.
        encoder: Object exposing ``encode(texts, batch_size=...,
            normalize_embeddings=..., show_progress_bar=...)``. Defaults to
            the module-level ``model``.

    Returns:
        ``(df, embeddings)``: the (possibly sampled) DataFrame including the
        new ``embedding_text`` column, and whatever ``encode`` returned for
        those rows, in the same row order.

    Raises:
        KeyError: If any of ``text_columns`` is not a column of the CSV.
    """
    source_name = os.path.basename(input_path)
    print(f"\n[START] Processing: {source_name}")
    df = pd.read_csv(input_path)

    # Accept a single column name as well as a list of names.
    if isinstance(text_columns, str):
        text_columns = [text_columns]
    else:
        text_columns = list(text_columns)

    # Fail early with a message that names the file and the missing columns.
    missing = [col for col in text_columns if col not in df.columns]
    if missing:
        raise KeyError(f"{source_name} is missing text column(s): {missing}")

    if sample_size:
        df = df.sample(sample_size, random_state=42)

    # astype(str) alone would turn missing cells into the literal text "nan",
    # which would then be embedded as if it were real content. Blank those
    # cells instead and leave them out of the joined text.
    fields = df[text_columns]
    as_text = fields.astype(str).where(fields.notna(), "")
    df["embedding_text"] = [
        " ".join(part for part in row if part)
        for row in as_text.itertuples(index=False, name=None)
    ]

    print(f"[INFO] Generating embeddings for {len(df)} samples...")
    # `model` is only looked up when no encoder was supplied.
    active_encoder = model if encoder is None else encoder
    embeddings = active_encoder.encode(
        df["embedding_text"].tolist(),
        batch_size=32,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

    np.save(output_path, embeddings)
    print(f"[OK] Saved: {output_path} (shape={embeddings.shape})")
    return df, embeddings

# ========== 5. Generate embeddings ==========
# One call per domain; each call reads its CSV, embeds the listed text
# columns and writes the matching .npy file.
movie_df, movie_emb = generate_embeddings(
    input_path=movie_csv,
    text_columns=["title", "overview", "genres", "keywords"],
    output_path=movie_out,
)
music_df, music_emb = generate_embeddings(
    input_path=music_csv,
    text_columns=["artists", "track_name", "track_genre"],
    output_path=music_out,
)
book_df, book_emb = generate_embeddings(
    input_path=book_csv,
    text_columns=["title", "authors", "tag_list"],
    output_path=book_out,
)
dest_df, dest_emb = generate_embeddings(
    input_path=dest_csv,
    text_columns=["name", "country", "description"],
    output_path=dest_out,
)

# ========== 6. Validation ==========
# Print the shape of every embedding array as a quick sanity check.
print("\n[INFO] Verification Summary:")
embeddings_by_domain = {
    "movie": movie_emb,
    "music": music_emb,
    "book": book_emb,
    "destination": dest_emb,
}
for name, arr in embeddings_by_domain.items():
    print(f"  {name:12s} → shape: {arr.shape}")

# Show the first 10 components of the first vector in each array.
print("\n[Sample vector preview]")
for label, vectors in (
    ("Movie:", movie_emb),
    ("Music:", music_emb),
    ("Book:", book_emb),
    ("Destination:", dest_emb),
):
    print(label, vectors[0][:10])

print("\n✅ All embeddings generated successfully!")

[PATH] Found: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/clean_data/movie_sample_50.csv
[PATH] Found: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/clean_data/music_sample_100.csv
[PATH] Found: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/clean_data/book_sample_100.csv
[PATH] Found: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/clean_data/destination_sample_wikipedia.csv

[SUMMARY] Output dir -> /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/embeddings

[INFO] Loading model: sentence-transformers/all-MiniLM-L6-v2

[START] Processing: movie_sample_50.csv
[INFO] Generating embeddings for 50 samples...


Batches: 100%|██████████| 2/2 [00:02<00:00,  1.29s/it]


[OK] Saved: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/embeddings/movie_embeddings.npy (shape=(50, 384))

[START] Processing: music_sample_100.csv
[INFO] Generating embeddings for 100 samples...


Batches: 100%|██████████| 4/4 [00:01<00:00,  2.83it/s]


[OK] Saved: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/embeddings/music_embeddings.npy (shape=(100, 384))

[START] Processing: book_sample_100.csv
[INFO] Generating embeddings for 100 samples...


Batches: 100%|██████████| 4/4 [00:00<00:00,  6.80it/s]


[OK] Saved: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/embeddings/book_embeddings.npy (shape=(100, 384))

[START] Processing: destination_sample_wikipedia.csv
[INFO] Generating embeddings for 150 samples...


Batches: 100%|██████████| 5/5 [00:01<00:00,  4.80it/s]

[OK] Saved: /Users/zhouyiqin/Desktop/25fall/6700/group_project/DSAN6700-group6-project/data/embeddings/destination_embeddings.npy (shape=(150, 384))

[INFO] Verification Summary:
  movie        → shape: (50, 384)
  music        → shape: (100, 384)
  book         → shape: (100, 384)
  destination  → shape: (150, 384)

[Sample vector preview]
Movie: [-0.0249922   0.04050739 -0.0530851  -0.00941723 -0.01514957  0.03045774
  0.08655921 -0.05231513  0.05098666 -0.06734758]
Music: [-0.09648349 -0.00883941  0.01210818  0.00486399 -0.05340661 -0.01599385
  0.02953386 -0.18408775 -0.02750859 -0.03005303]
Book: [-0.01953777 -0.07866155 -0.02123779  0.04082649 -0.0789073   0.04712661
  0.08464538  0.00539581 -0.02346932 -0.01278798]
Destination: [ 0.11104134 -0.03947651  0.02970299  0.05724914 -0.01541783  0.00310293
  0.06320502  0.07225557 -0.07223704  0.00226288]

✅ All embeddings generated successfully!



