In [None]:
%pip install pandas tqdm faiss-cpu langchain langchain-community langchain-huggingface


In [None]:
import os
import pandas as pd
from tqdm import tqdm
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS


In [None]:
# --- Notebook configuration -------------------------------------------------
# Sentence-embedding model name handed to HuggingFaceEmbeddings.
EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Directory the FAISS index is saved to and later loaded from.
INDEX_DIR = "tmdb_index"

# Input parquet file read by build_index (upload this file to Colab).
SOURCE_PATH = "tmdb_labeled.parquet"

# Columns rendered into each text block, in output order.
# Kept at module level so the list is built once instead of once per row.
_ROW_BLOCK_COLUMNS = (
    'id', 'name', 'number_of_seasons', 'number_of_episodes', 'original_language',
    'vote_count', 'vote_average', 'overview', 'adult', 'backdrop_path', 'first_air_date',
    'last_air_date', 'homepage', 'in_production', 'original_name', 'popularity',
    'poster_path', 'type', 'status', 'tagline', 'genres', 'created_by', 'languages',
    'networks', 'origin_country', 'spoken_languages', 'production_companies',
    'production_countries', 'episode_run_time', 'is_popular', 'is_long_running',
)


def row_to_block(row: pd.Series) -> str:
    """Render one DataFrame row as a newline-separated ``column: value`` block.

    Only the columns in ``_ROW_BLOCK_COLUMNS`` are considered, in that order.
    A column is skipped when its label is absent from ``row`` or when its
    value is a null scalar (``None``, ``NaN``, ...), so the text that gets
    embedded does not contain noise lines such as ``tagline: nan``.

    Args:
        row: A single record, e.g. as yielded by ``DataFrame.iterrows()``.

    Returns:
        The rendered block; an empty string if no column qualifies.
    """
    lines = []
    for col in _ROW_BLOCK_COLUMNS:
        # Membership on a Series tests its index labels, not its values.
        if col not in row:
            continue
        value = row[col]
        # Only scalars are tested for null-ness: pd.isna on a list/array cell
        # is element-wise and has no single truth value, so such cells are kept.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        lines.append(f"{col}: {value}")
    return "\n".join(lines)

def build_index(force_rebuild=False, batch_size=256):
    """Build the FAISS index for ``SOURCE_PATH`` and save it under ``INDEX_DIR``.

    Each row of the parquet file is rendered with ``row_to_block``, split into
    chunks, embedded with ``EMBED_MODEL`` and stored in a FAISS vector store
    that is written to ``INDEX_DIR``.

    Args:
        force_rebuild: When False (default), an index already present in
            ``INDEX_DIR`` is reused and nothing is rebuilt.
        batch_size: Number of chunks embedded per step; it only controls how
            often the progress bar advances. Values below 1 are treated as 1.

    Returns:
        None. The result is the index written to disk.

    Raises:
        ValueError: If the source file yields no text chunks to index.
    """
    os.makedirs(INDEX_DIR, exist_ok=True)

    # save_local writes index.faiss *and* index.pkl, and load_local needs both,
    # so a lone index.faiss (e.g. from an interrupted save) is not a cache hit.
    index_files = [os.path.join(INDEX_DIR, name) for name in ("index.faiss", "index.pkl")]
    if not force_rebuild and all(os.path.exists(path) for path in index_files):
        print(f"✅ Using existing FAISS index at {INDEX_DIR}")
        return

    print("⚙️ Building new FAISS index...")
    df = pd.read_parquet(SOURCE_PATH)

    docs_text = [row_to_block(r) for _, r in tqdm(df.iterrows(), total=len(df), desc="📄 Converting rows")]
    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    docs = splitter.create_documents(docs_text)
    if not docs:
        # Fail with a clear message instead of an opaque error inside FAISS.
        raise ValueError(f"No text chunks were produced from {SOURCE_PATH}; nothing to index.")

    print("🔢 Generating embeddings...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

    # Embed in batches so the bar tracks the embedding work itself. Wrapping
    # `docs` in tqdm and passing it to from_documents only tracks the quick
    # pass that reads the text out of the documents, before embedding starts.
    step = max(1, int(batch_size))
    vs = None
    with tqdm(total=len(docs), desc="⚡ Embedding docs") as progress:
        for start in range(0, len(docs), step):
            batch = docs[start:start + step]
            if vs is None:
                vs = FAISS.from_documents(batch, embeddings)
            else:
                vs.add_documents(batch)
            progress.update(len(batch))

    vs.save_local(INDEX_DIR)

    print(f"✅ FAISS index saved to {INDEX_DIR} (chunks: {len(docs)})")


In [None]:
# True re-embeds the whole dataset on every run; set to False to reuse an
# index already saved in INDEX_DIR.
FORCE_REBUILD = True
build_index(force_rebuild=FORCE_REBUILD)


In [None]:
from langchain_community.vectorstores import FAISS

TOP_K = 3            # number of nearest chunks to retrieve
PREVIEW_CHARS = 500  # maximum characters of each chunk to print

# Same model name that build_index embeds with, so query vectors live in the
# same space as the stored chunk vectors.
embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)

# SECURITY: load_local unpickles index.pkl, and unpickling can execute
# arbitrary code. Keep allow_dangerous_deserialization=True only for an index
# produced by this notebook; never point INDEX_DIR at files from an untrusted source.
vs = FAISS.load_local(INDEX_DIR, embeddings, allow_dangerous_deserialization=True)

query = "Which shows are long running and popular?"
results = vs.similarity_search(query, k=TOP_K)

for i, res in enumerate(results, 1):
    text = res.page_content
    # Append the ellipsis only when the preview is actually cut short.
    preview = text[:PREVIEW_CHARS] + ("..." if len(text) > PREVIEW_CHARS else "")
    print(f"\nResult {i}:\n{preview}\n")
