# 🔎 Semantic Search Indexer
**Build a neural search index from your CSV data.**

1. **Initialize** the environment.
2. **Upload** your `final_api_data.csv`.
3. **Process** the data to generate embeddings.
4. **Download** the resulting index.

In [None]:
# @title 1. Initialize Environment
# @markdown Run this cell first to install necessary libraries.

%%capture
!pip install sentence-transformers pandas numpy tqdm

import os
import re
import shutil
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from google.colab import files
from tqdm.notebook import tqdm
from IPython.display import display, Markdown

# Enable progress bars for pandas
tqdm.pandas()

display(Markdown("‚úÖ **Libraries installed & Environment ready!**"))

In [None]:
# @title 2. Upload Data
# @markdown Upload your CSV file. The script will automatically look for a `.csv` file and rename it for processing.

print("‚¨ÜÔ∏è Please upload your CSV file:")
uploaded = files.upload()

# Use the first uploaded file that has a CSV extension and store it under the
# fixed name the processing step reads.
found_file = False
for filename in uploaded:
    # Compare case-insensitively so that e.g. `DATA.CSV` is accepted too.
    if filename.lower().endswith('.csv'):
        # os.replace overwrites a file left over from an earlier upload on
        # every platform (os.rename only guarantees that on POSIX).
        os.replace(filename, 'final_api_data.csv')
        display(Markdown(f"‚úÖ **File loaded successfully:** `{filename}` renamed to `final_api_data.csv`"))
        found_file = True
        break

if not found_file:
    display(Markdown("‚ùå **Error:** No CSV file found in upload. Please try again."))

In [None]:
# @title 3. Process & Index into LanceDB (Disk-based)
# @markdown This creates a database on disk so you don't need huge RAM later.

# --- INSTALL DATABASE ---
!pip install lancedb

import lancedb
import pandas as pd
import numpy as np
import os
import re
import shutil
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# --- CONFIG ---
# Model identifier passed to SentenceTransformer below. create_rich_context
# adds the "passage: " prefix whenever this name contains "e5".
MODEL_NAME = "intfloat/multilingual-e5-large" 
# Input file read with pd.read_csv; this is the name the upload step renames
# the uploaded CSV to.
CSV_FILE = "final_api_data.csv"
# Directory handed to lancedb.connect. It is deleted and recreated every time
# this cell runs.
DB_FOLDER = "jav_search_index"
# Name of the table created inside the database (created with mode="overwrite").
TABLE_NAME = "videos"
# Number of rows sliced from the DataFrame, embedded and written per loop
# iteration.
BATCH_SIZE = 50000  # Process 50k items at a time to save Colab RAM

# --- HELPER FUNCTIONS ---
# Patterns are compiled once here instead of being rebuilt on every call.
# The extension pattern refuses to match when another letter or digit follows,
# so ".iso" in "the.isolated" is left alone while "clip.iso" is still stripped.
_EXTENSION_RE = re.compile(r"\.(?:mp4|wmv|avi|mkv|iso)(?![a-z0-9])")
_SQUARE_BRACKETS_RE = re.compile(r"\[.*?\]")
_ROUND_BRACKETS_RE = re.compile(r"\(.*?\)")
_NOISE_RE = re.compile(r"\b(?:fhd|hd|sd|1080p|4k|vr|uncensored|leaked)\b")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a raw title before it is embedded.

    The text is lower-cased, then video file extensions, ``[...]`` and
    ``(...)`` segments, and quality/noise tags (fhd, hd, sd, 1080p, 4k, vr,
    uncensored, leaked) are removed. Runs of whitespace are collapsed to a
    single space and the result is stripped.

    Args:
        text: The raw value; anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _EXTENSION_RE.sub("", text)
    # Bracketed segments are replaced by a space so neighbouring words do not
    # fuse together.
    text = _SQUARE_BRACKETS_RE.sub(" ", text)
    text = _ROUND_BRACKETS_RE.sub(" ", text)
    text = _NOISE_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

def create_rich_context(row):
    """Build the text that is embedded for one record.

    The cleaned title, the cleaned Japanese title (only when it differs from
    the title) and the DVD ID are joined with single spaces. For e5 models the
    result is prefixed with ``"passage: "``.

    Args:
        row: A mapping-like record supporting ``.get`` (a DataFrame row or a
            dict) with optional ``title``, ``jpTitle`` and ``dvdId`` entries.

    Returns:
        The prefixed search text; just the prefix when every part is empty.
    """
    title = clean_text(row.get("title", ""))
    jp_title = clean_text(row.get("jpTitle", ""))

    raw_id = row.get("dvdId", "")
    # A missing ID arrives as None or float NaN; str() would turn those into
    # the literal text "None"/"nan" and embed it as if it were a real ID.
    if raw_id is None or (isinstance(raw_id, float) and raw_id != raw_id):
        dvd_id = ""
    else:
        dvd_id = str(raw_id).strip()

    text_parts = []
    if title:
        text_parts.append(title)
    if jp_title and jp_title != title:
        text_parts.append(jp_title)
    if dvd_id:
        text_parts.append(dvd_id)  # Important for search matches

    # Prefix for e5 models
    prefix = "passage: " if "e5" in MODEL_NAME else ""
    return prefix + " ".join(text_parts)

# --- EXECUTION ---
# This cell re-imports everything else it uses; torch is needed for the device
# check below, so import it here too. Without it the cell raises NameError
# when it is run in a session where the first cell's imports are gone
# (e.g. after a runtime restart).
import torch

# Start from an empty database directory on every run.
if os.path.exists(DB_FOLDER):
    shutil.rmtree(DB_FOLDER) # Reset DB if exists
os.makedirs(DB_FOLDER, exist_ok=True)

# Initialize DB
db = lancedb.connect(DB_FOLDER)

print(f"üß† Loading Model: {MODEL_NAME}...")
model = SentenceTransformer(MODEL_NAME)
# Use the GPU when one is available, otherwise stay on the CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")

print("‚è≥ Reading CSV...")
# The whole CSV is loaded at once; only the embedding step further down is
# batched. Every column is read as text and empty cells stay empty strings, so
# IDs keep their leading zeros, numbers are not reformatted as floats, and
# literal values such as "NA" or "null" are not mistaken for missing data.
df_full = pd.read_csv(CSV_FILE, dtype=str, keep_default_na=False)
df_full = df_full.fillna("") # Fill NAs

print("üßπ Preparing text...")
df_full["search_text"] = df_full.apply(create_rich_context, axis=1)

# create_rich_context prepends "passage: " for e5 models. That prefix must be
# discounted here: on its own it is already longer than 5 characters, so
# without the adjustment even a completely empty row passes the filter.
prefix_len = len("passage: ") if "e5" in MODEL_NAME else 0
df_full = df_full[df_full["search_text"].str.len() > 5 + prefix_len] # Filter garbage

print(f"üöÄ Indexing {len(df_full)} items into Vector DB...")

# Columns stored next to each vector, in the order they appear in a record:
# [vector, dvdId, title, jpTitle, releaseDate, image, generated_url]
stored_columns = ["dvdId", "title", "jpTitle", "releaseDate", "image", "generated_url"]

# Fail before the expensive embedding step rather than after the first batch
# has already been encoded.
missing_columns = [col for col in stored_columns if col not in df_full.columns]
if missing_columns:
    raise KeyError(f"CSV is missing required column(s): {missing_columns}")

# With no rows the loop below never runs and `table` would be undefined when
# the summary is printed, so stop here with a clear message instead.
if df_full.empty:
    raise ValueError(f"No rows left to index after filtering '{CSV_FILE}'.")

table = None
for i in tqdm(range(0, len(df_full), BATCH_SIZE), desc="Processing Batches"):
    batch = df_full.iloc[i : i + BATCH_SIZE]

    # Encode
    sentences = batch["search_text"].tolist()
    embeddings = model.encode(sentences, normalize_embeddings=True, show_progress_bar=False)

    # Prepare batch for DB: one record per row, pairing each embedding with
    # the row it was computed from. All stored fields are written as strings.
    chunk_data = [
        {"vector": vector, **{col: str(row[col]) for col in stored_columns}}
        for vector, row in zip(embeddings, batch.to_dict("records"))
    ]

    # Add to DB: the first batch creates the table (its records define the
    # schema), later batches are appended to it.
    if table is None:
        table = db.create_table(TABLE_NAME, data=chunk_data, mode="overwrite")
    else:
        table.add(chunk_data)

print(f"‚úÖ Indexing complete. Total items in DB: {len(table)}")

# Build a vector index (announced below as IVF-PQ) so searches stay fast on
# large tables. Tables with 10,000 rows or fewer are left unindexed.
if len(table) > 10_000:
    print("‚öôÔ∏è Building optimized index (IVF-PQ)... this makes search fast on laptops.")
    table.create_index(vector_column_name="vector", metric="cosine")
    print("‚úÖ Index built.")

In [None]:
# @title 4. Compress & Download (With Progress Bar)
# @markdown Zips the database folder manually so you can see progress, then triggers download.

import zipfile
import os
from tqdm.notebook import tqdm
from google.colab import files

# Folder to compress; the same folder name Step 3 uses for its database.
SOURCE_FOLDER = "jav_search_index"
# Name of the zip archive that is written and then offered for download.
OUTPUT_FILENAME = "jav_search_index.zip"

def zipdir_with_progress(path, ziph):
    """Add every file below ``path`` to an open zip archive, showing progress.

    The tree is walked twice: once to count the files for the progress bar and
    once to write them. Archive names are relative to the parent of ``path``,
    so the top-level folder name is kept inside the zip.

    Args:
        path: Directory whose files are added.
        ziph: An open, writable ``zipfile.ZipFile``.
    """
    # Pass 1: count the files so the progress bar has a total.
    print("üìä Calculating total files to zip...")
    total_files = 0
    for _folder, _subdirs, names in os.walk(path):
        total_files += len(names)
    print(f"   Found {total_files} files.")

    # Pass 2: write each file, naming it relative to the parent of `path`.
    archive_base = os.path.join(path, '..')
    with tqdm(total=total_files, unit="file", desc="üì¶ Zipping") as pbar:
        for folder, _subdirs, names in os.walk(path):
            for name in names:
                source_path = os.path.join(folder, name)
                ziph.write(source_path, os.path.relpath(source_path, archive_base))
                pbar.update(1)

# --- EXECUTION ---
# Nothing to compress unless the database folder exists.
if not os.path.exists(SOURCE_FOLDER):
    print(f"‚ùå Error: Folder '{SOURCE_FOLDER}' not found. Did Step 3 finish?")
else:
    print(f"üöÄ Starting compression of '{SOURCE_FOLDER}'...")

    # Write the archive with DEFLATE compression.
    with zipfile.ZipFile(OUTPUT_FILENAME, mode='w', compression=zipfile.ZIP_DEFLATED) as zipf:
        zipdir_with_progress(SOURCE_FOLDER, zipf)

    # Report the archive size in MB.
    size_mb = os.path.getsize(OUTPUT_FILENAME) / (1024 ** 2)
    print(f"‚úÖ Compression Complete! File size: {size_mb:.2f} MB")

    # Hand the archive to the browser.
    print("‚¨áÔ∏è Triggering Download (Check your browser downloads)...")
    files.download(OUTPUT_FILENAME)