In [1]:
import os
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
import pickle

In [2]:
# Number of documents encoded and added to the index per loop iteration.
BATCH_SIZE = 1_000
# Input dataset; load_data reads its "docs" and "id" columns.
CSV_PATH = "./../cleaned_database/cleaned_final_dataset3.csv"
# Folder that receives the FAISS index, the pickled ids and the metadata CSV.
OUTPUT_DIR = "./../faiss_embeddings3"

In [3]:
def load_data(csv_path):
    """Read the dataset CSV and split it into texts, ids and metadata.

    Args:
        csv_path: Location of a CSV file that contains at least the
            ``docs`` and ``id`` columns.

    Returns:
        A ``(docs, ids, metadata)`` tuple:
        ``docs`` is the list of ``docs`` values with missing entries
        replaced by ``""``; ``ids`` is the list of ``id`` values;
        ``metadata`` is a DataFrame with every column except ``docs``.
        All three follow the row order of the CSV.
    """
    frame = pd.read_csv(csv_path)
    # Missing texts become empty strings so every row still yields a document.
    texts = frame["docs"].fillna("").to_list()
    identifiers = frame["id"].to_list()
    remaining_columns = frame.drop("docs", axis="columns")
    return texts, identifiers, remaining_columns

In [4]:
def load_model(model_name='sentence-transformers/all-distilroberta-v1'):
    """Instantiate the sentence-embedding model.

    Args:
        model_name: Identifier handed to ``SentenceTransformer``. Defaults to
            the checkpoint this notebook was written for, so calling
            ``load_model()`` with no arguments behaves as before.

    Returns:
        The ``SentenceTransformer`` instance built from ``model_name``.
    """
    return SentenceTransformer(model_name)

In [5]:
def create_index(model):
    """Build an empty ``faiss.IndexFlatIP`` sized for the model's embeddings.

    Args:
        model: Object exposing ``get_sentence_embedding_dimension()``.

    Returns:
        A new ``faiss.IndexFlatIP`` whose dimensionality is the value
        reported by the model.
    """
    return faiss.IndexFlatIP(model.get_sentence_embedding_dimension())

In [6]:
def process_batches(docs, ids, model, index, batch_size):
    """Encode the documents batch by batch and add the vectors to the index.

    Args:
        docs: Sequence of texts to embed.
        ids: Sequence of identifiers, one per entry of ``docs`` and in the
            same order.
        model: Object exposing ``encode(texts, normalize_embeddings=True)``
            that returns an array supporting ``.astype``.
        index: Object exposing ``add(embeddings)``.
        batch_size: Number of documents encoded per iteration; must be
            greater than zero.

    Returns:
        A list with the ids of all documents added to the index, in the
        order their vectors were added.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if ``docs`` and
            ``ids`` differ in length.
    """
    # A negative step makes range() empty, which would silently index nothing.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    # Position i in the index must correspond to ids[i]; a length mismatch
    # would silently break that mapping.
    if len(docs) != len(ids):
        raise ValueError(
            f"docs and ids must have the same length, got {len(docs)} and {len(ids)}"
        )

    all_ids = []
    for start in range(0, len(docs), batch_size):
        end = start + batch_size
        batch_docs = docs[start:end]
        batch_ids = ids[start:end]

        print(f"Processing batch {start // batch_size + 1} | Size: {len(batch_docs)}")
        # Vectors are normalized by the model and cast to float32 before being added.
        embeddings = model.encode(batch_docs, normalize_embeddings=True).astype("float32")
        index.add(embeddings)
        all_ids.extend(batch_ids)
    return all_ids


In [7]:
def save_outputs(index, ids, metadata, output_dir):
    """Write the index, the id list and the metadata table to ``output_dir``.

    The directory is created when it does not exist. Three files are
    written inside it: ``movie_index.faiss`` (via ``faiss.write_index``),
    ``movie_ids.pkl`` (pickled ``ids``) and ``movie_metadata.csv``
    (``metadata`` without its DataFrame index).

    Args:
        index: Index object passed to ``faiss.write_index``.
        ids: Object to pickle alongside the index.
        metadata: Object exposing ``to_csv(path, index=False)``.
        output_dir: Destination directory.
    """
    os.makedirs(output_dir, exist_ok=True)

    index_file = os.path.join(output_dir, "movie_index.faiss")
    ids_file = os.path.join(output_dir, "movie_ids.pkl")
    metadata_file = os.path.join(output_dir, "movie_metadata.csv")

    faiss.write_index(index, index_file)
    with open(ids_file, "wb") as handle:
        pickle.dump(ids, handle)
    metadata.to_csv(metadata_file, index=False)

In [8]:
def run_embedding_pipeline():
    """Run the whole embedding job using the notebook's configuration.

    Steps, in order: read the dataset at ``CSV_PATH``, load the embedding
    model, create an index for it, encode and index the documents in
    batches of ``BATCH_SIZE``, then write the results to ``OUTPUT_DIR``.
    """
    corpus, row_ids, metadata_frame = load_data(CSV_PATH)
    encoder = load_model()
    vector_index = create_index(encoder)
    indexed_ids = process_batches(corpus, row_ids, encoder, vector_index, BATCH_SIZE)
    save_outputs(vector_index, indexed_ids, metadata_frame, OUTPUT_DIR)

In [9]:
# Execute the pipeline end to end: load the CSV, embed the docs in batches,
# and write the index, ids and metadata to OUTPUT_DIR.
run_embedding_pipeline()


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing batch 1 | Size: 1000
Processing batch 2 | Size: 1000
Processing batch 3 | Size: 1000
Processing batch 4 | Size: 1000
Processing batch 5 | Size: 1000
Processing batch 6 | Size: 1000
Processing batch 7 | Size: 1000
Processing batch 8 | Size: 1000
Processing batch 9 | Size: 1000
Processing batch 10 | Size: 1000
Processing batch 11 | Size: 1000
Processing batch 12 | Size: 1000
Processing batch 13 | Size: 1000
Processing batch 14 | Size: 1000
Processing batch 15 | Size: 1000
Processing batch 16 | Size: 1000
Processing batch 17 | Size: 1000
Processing batch 18 | Size: 1000
Processing batch 19 | Size: 1000
Processing batch 20 | Size: 1000
Processing batch 21 | Size: 1000
Processing batch 22 | Size: 1000
Processing batch 23 | Size: 1000
Processing batch 24 | Size: 1000
Processing batch 25 | Size: 1000
Processing batch 26 | Size: 1000
Processing batch 27 | Size: 1000
Processing batch 28 | Size: 1000
Processing batch 29 | Size: 1000
Processing batch 30 | Size: 1000
Processing batch 31