# 🔎 Semantic Search Indexer
**Build a neural search index from your CSV data.**

1. **Initialize** the environment.
2. **Upload** your `final_api_data.csv`.
3. **Process** the data to generate embeddings.
4. **Download** the resulting index.

In [None]:
# @title 1. Initialize Environment
# @markdown Run this cell first to install necessary libraries.

%%capture
!pip install sentence-transformers pandas numpy tqdm

import os
import re
import shutil
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from google.colab import files
from tqdm.notebook import tqdm
from IPython.display import display, Markdown

# Enable progress bars for pandas
tqdm.pandas()

display(Markdown("‚úÖ **Libraries installed & Environment ready!**"))

In [None]:
# @title 2. Upload Data
# @markdown Upload your CSV file. The script will automatically look for a `.csv` file and rename it for processing.

print("‚¨ÜÔ∏è Please upload your CSV file:")
uploaded = files.upload()

# Take the first uploaded file with a CSV extension and give it the fixed name
# that Step 3 reads. The extension check is case-insensitive so that files
# such as `DATA.CSV` are accepted as well.
found_file = False
for filename in uploaded:
    if filename.lower().endswith('.csv'):
        # The rename relies on the upload having been saved in the working directory.
        os.rename(filename, 'final_api_data.csv')
        display(Markdown(f"‚úÖ **File loaded successfully:** `{filename}` renamed to `final_api_data.csv`"))
        found_file = True
        break

if not found_file:
    display(Markdown("‚ùå **Error:** No CSV file found in upload. Please try again."))

In [None]:
# @title 3. Process & Embed
# @markdown This step cleans the text, formats it for the model, and generates vectors.

# --- CONFIGURATION ---
# Model identifier passed to SentenceTransformer in the execution section below.
# Names containing "e5" also make create_rich_context add the "passage: " prefix.
MODEL_NAME = "intfloat/multilingual-e5-large" # @param ["intfloat/multilingual-e5-large", "sentence-transformers/all-MiniLM-L6-v2", "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"]
# Input CSV; Step 2 renames the uploaded file to this name.
CSV_FILE = "final_api_data.csv"
# Output: the array returned by model.encode, written with np.save.
# Step 4 checks for this exact filename before zipping.
EMBEDDING_FILE = "search_embeddings.npy"
# Output: the filtered DataFrame (including the `search_text` column), written with to_pickle.
METADATA_FILE = "search_metadata.pkl"

# --- HELPER FUNCTIONS ---
# Patterns are compiled once here because clean_text runs for several fields of every row.
# The negative lookahead keeps the extension match from eating the start of a
# longer token (e.g. ".iso" inside "dr.isolde").
_EXTENSION_RE = re.compile(r"\.(?:mp4|wmv|avi|mkv|iso)(?![a-z0-9])")
_SQUARE_BRACKET_RE = re.compile(r"\[.*?\]")
_PARENTHESES_RE = re.compile(r"\(.*?\)")
_NOISE_WORDS = ("fhd", "hd", "sd", "1080p", "4k", "vr", "uncensored", "leaked")
_NOISE_RE = re.compile(r"\b(?:" + "|".join(_NOISE_WORDS) + r")\b")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalise a raw title/tag string for embedding.

    The text is lower-cased, then stripped of video file extensions,
    ``[...]`` and ``(...)`` groups, and stand-alone noise keywords
    (resolution/quality markers and similar). Whitespace runs are collapsed
    to single spaces and the result is trimmed.

    Args:
        text: The raw value. Anything that is not a ``str`` (for example
            ``None`` or a float NaN from pandas) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is not a string.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # Remove extensions
    text = _EXTENSION_RE.sub("", text)
    # Remove brackets (replaced by a space so neighbouring words stay separated)
    text = _SQUARE_BRACKET_RE.sub(" ", text)
    text = _PARENTHESES_RE.sub(" ", text)
    # Remove common noise keywords (whole words only)
    text = _NOISE_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

def create_rich_context(row):
    """Build the single text passage that gets embedded for one CSV row.

    Reads the ``title``, ``jpTitle``, ``actress``, ``maker`` and ``tags``
    fields from ``row`` (missing fields count as empty), cleans each with
    ``clean_text`` and joins the non-empty pieces with spaces in the order:
    title, Japanese title (only if it differs from the title),
    "starring <actress>", "studio <maker>", "genres <tags>".

    Args:
        row: A mapping-like record supporting ``.get`` (a DataFrame row when
            used with ``apply(axis=1)``).

    Returns:
        The combined text, prefixed with ``"passage: "`` when ``MODEL_NAME``
        contains ``"e5"``.
    """
    def cleaned(column):
        # A missing column falls back to "", which clean_text maps to "".
        return clean_text(row.get(column, ""))

    main_title = cleaned("title")
    alt_title = cleaned("jpTitle")

    segments = [main_title] if main_title else []
    if alt_title and alt_title != main_title:
        segments.append(alt_title)

    labelled_columns = (("starring", "actress"), ("studio", "maker"), ("genres", "tags"))
    for label, column in labelled_columns:
        value = cleaned(column)
        if value:
            segments.append(f"{label} {value}")

    combined = " ".join(segments)
    # E5 models require "passage: " prefix for documents
    if "e5" in MODEL_NAME:
        return "passage: " + combined
    return combined

# --- EXECUTION ---
# Pipeline: read the CSV, build one search passage per row, drop rows with too
# little text, embed the remaining passages and save embeddings + metadata.
if not os.path.exists(CSV_FILE):
    print("‚ùå CSV file not found. Please run Step 2 first.")
else:
    print("‚è≥ Reading CSV...")
    df = pd.read_csv(CSV_FILE)

    print("üßπ Cleaning data & creating rich context...")
    # Using progress_apply for visualization
    df["search_text"] = df.progress_apply(create_rich_context, axis=1)

    # Filter empty or too short rows.
    # NOTE(review): the measured length includes the "passage: " prefix
    # (9 characters) for e5 models, so the effective minimum content length
    # differs between models - confirm which threshold is intended.
    initial_len = len(df)
    # reset_index keeps the metadata's index equal to the row position, so
    # label i and embedding row i refer to the same item after filtering.
    df = df[df["search_text"].str.len() > 10].reset_index(drop=True)
    print(f"   üìâ Filtered: {initial_len} -> {len(df)} items (removed empty/short)")

    if df.empty:
        # Nothing to embed: stop here instead of loading the model and
        # writing an empty index.
        print("No rows left after filtering - nothing to embed. Check the CSV columns and contents.")
    else:
        print(f"üß† Loading Model: {MODEL_NAME}...")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"   üî• Computing on: {device.upper()}")
        model = SentenceTransformer(MODEL_NAME, device=device)

        print("üöÄ Generating Embeddings...")
        sentences = df["search_text"].tolist()

        # Generate embeddings; normalize_embeddings=True yields unit-length
        # vectors, so a dot product can serve as cosine similarity.
        embeddings = model.encode(
            sentences,
            show_progress_bar=True,
            batch_size=32,
            normalize_embeddings=True
        )

        print("üíæ Saving temporary files...")
        np.save(EMBEDDING_FILE, embeddings)
        df.to_pickle(METADATA_FILE)

        display(Markdown("‚úÖ **Indexing Complete! Proceed to Step 4.**"))

In [None]:
# @title 4. Download Results
# @markdown Zips the embeddings and metadata, then triggers a download.

import zipfile

OUTPUT_ZIP = "search_engine_data"
# The two artifacts written by Step 3.
INDEX_FILES = ["search_embeddings.npy", "search_metadata.pkl"]

missing_files = [name for name in INDEX_FILES if not os.path.exists(name)]

if not missing_files:
    print("üì¶ Zipping files...")
    # Add only the index artifacts. Archiving the whole working directory
    # would also pack the uploaded CSV, unrelated files and the archive that
    # is being written.
    with zipfile.ZipFile(f'{OUTPUT_ZIP}.zip', 'w', compression=zipfile.ZIP_DEFLATED) as archive:
        for name in INDEX_FILES:
            archive.write(name, arcname=name)

    print("‚¨áÔ∏è Downloading...")
    files.download(f'{OUTPUT_ZIP}.zip')
else:
    display(Markdown("‚ùå **Files not found.** Please run Step 3 successfully first."))