# 🔎 Semantic Search Indexer (With Google Drive Support)
**Build a neural search index from your CSV data.**

1. **Initialize** (Optionally connect Google Drive).
2. **Load Data** (Checks Drive folder `JSearchEngine` first, then falls back to upload).
3. **Process** the data to generate embeddings.
4. **Export** (Saves to Drive if connected, otherwise downloads via browser).

In [None]:
# @title 1. Initialize Environment & Connect Drive
# @markdown Install libraries and optionally mount Google Drive.
CONNECT_GOOGLE_DRIVE = True # @param {type:"boolean"}

%%capture
!pip install sentence-transformers pandas numpy tqdm

import os
import re
import shutil
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from google.colab import files
from google.colab import drive
from tqdm.notebook import tqdm
from IPython.display import display, Markdown

# Enable progress bars for pandas
tqdm.pandas()

# Working folder on Google Drive and a flag the later steps consult to decide
# between Drive and local/browser I/O.
DRIVE_BASE = "/content/drive/MyDrive/JSearchEngine"
DRIVE_MOUNTED = False

if CONNECT_GOOGLE_DRIVE:
    # Test for the "MyDrive" directory rather than the bare mount point: the
    # mount point directory may exist without Drive actually being mounted,
    # in which case DRIVE_BASE would silently be created on local disk.
    my_drive_root = os.path.dirname(DRIVE_BASE)

    if not os.path.isdir(my_drive_root):
        try:
            drive.mount('/content/drive')
        except Exception as mount_error:
            # Cell-level boundary: report the problem and fall through to the
            # local-storage branch instead of aborting the whole cell.
            print(f"Drive mount error: {mount_error}")

    if os.path.isdir(my_drive_root):
        DRIVE_MOUNTED = True
        os.makedirs(DRIVE_BASE, exist_ok=True)
        display(Markdown(f"‚úÖ **Google Drive Mounted!** Working folder: `{DRIVE_BASE}`"))
    else:
        display(Markdown("‚ö†Ô∏è **Drive mount failed.** Proceeding with local storage."))
else:
    display(Markdown("‚úÖ **Libraries installed.** (Google Drive disabled)."))

In [None]:
# @title 2. Load Data
# @markdown The script looks for a CSV file in `JSearchEngine` on Drive. If not found, it asks for an upload.

# Local file name every later step reads; whichever CSV is found is copied or
# renamed to this name in the working directory.
TARGET_FILENAME = 'final_api_data.csv'
found_file = False

# 1. Check Google Drive first
if DRIVE_MOUNTED:
    drive_file_path = os.path.join(DRIVE_BASE, TARGET_FILENAME)

    if os.path.isfile(drive_file_path):
        # Preferred: the file with the expected name.
        print(f"üìÇ Found file in Drive: {drive_file_path}")
        shutil.copy(drive_file_path, TARGET_FILENAME)
        found_file = True
    else:
        # Otherwise take any CSV in the JSearchEngine folder. os.listdir()
        # returns entries in arbitrary order, so sort to make the choice
        # reproducible; match the extension case-insensitively and skip
        # directories that merely end in ".csv".
        csv_files = sorted(
            name
            for name in os.listdir(DRIVE_BASE)
            if name.lower().endswith('.csv')
            and os.path.isfile(os.path.join(DRIVE_BASE, name))
        )
        if csv_files:
            chosen_csv = csv_files[0]
            print(f"üìÇ Found CSV in Drive: {chosen_csv}")
            shutil.copy(os.path.join(DRIVE_BASE, chosen_csv), TARGET_FILENAME)
            found_file = True

# 2. Fallback to upload
if found_file:
    display(Markdown("‚úÖ **Data Loaded successfully from Drive.**"))
else:
    print("‚¨áÔ∏è No CSV found in Drive folder. Please upload manually:")
    uploaded = files.upload()
    # Use the first uploaded CSV (extension matched case-insensitively).
    for filename in uploaded:
        if filename.lower().endswith('.csv'):
            os.rename(filename, TARGET_FILENAME)
            display(Markdown(f"‚úÖ **File uploaded:** `{filename}` renamed to `{TARGET_FILENAME}`"))
            found_file = True
            break

    if not found_file:
        display(Markdown("‚ùå **Error:** No CSV file provided."))

In [None]:
# @title 3. Process & Index into LanceDB
# @markdown Indexes the data locally first for speed.

# --- INSTALL DATABASE ---
# lancedb is not among the packages installed in Step 1, so it is installed
# here, before the `import lancedb` that follows.
!pip install lancedb

import lancedb
import pandas as pd
import numpy as np
import os
import re
import shutil
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

# --- CONFIG ---
# Sentence-transformers model used to embed every row. Because the name
# contains "e5", create_rich_context prepends "passage: " to each text.
MODEL_NAME = "intfloat/multilingual-e5-large" 
# Input CSV, read from the working directory (same file name Step 2 writes).
CSV_FILE = "final_api_data.csv"
# Local LanceDB directory; the execution code below deletes any existing copy
# before re-indexing.
DB_FOLDER = "jav_search_index"
# Name of the LanceDB table that receives the rows.
TABLE_NAME = "videos"
# Number of rows encoded and written to the table per loop iteration.
BATCH_SIZE = 50000 

# --- HELPER FUNCTIONS ---
# Patterns are compiled once at definition time instead of on every call.
# The trailing \b stops the extension pattern from eating the start of a
# longer word (".isolde" must not lose ".iso").
_FILE_EXT_RE = re.compile(r"\.(mp4|wmv|avi|mkv|iso)\b")
_BRACKETED_RE = re.compile(r"\[.*?\]")
_PARENTHESIZED_RE = re.compile(r"\(.*?\)")
_NOISE_RE = re.compile(r"\b(fhd|hd|sd|1080p|4k|vr|uncensored|leaked)\b")
_WHITESPACE_RE = re.compile(r"\s+")


def clean_text(text):
    """Normalize a title for embedding.

    Lower-cases the text, removes video file extensions, drops anything in
    square brackets or parentheses, removes stand-alone quality/noise tags
    (fhd, hd, sd, 1080p, 4k, vr, uncensored, leaked) and collapses runs of
    whitespace.

    Args:
        text: The raw title. Any non-string value is treated as missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = _FILE_EXT_RE.sub("", text)
    # Bracketed/parenthesized spans become a space so neighbours don't fuse.
    text = _BRACKETED_RE.sub(" ", text)
    text = _PARENTHESIZED_RE.sub(" ", text)
    text = _NOISE_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", text).strip()

def create_rich_context(row):
    """Build the text that is embedded for one CSV row.

    Reads "title", "jpTitle", "dvdId" and "actress_names" from ``row`` via
    ``row.get`` (missing keys default to ""). The two titles go through
    ``clean_text``; commas in the actress list are replaced by spaces. The
    non-empty pieces are joined with single spaces in the order: starring
    line, title, Japanese title (only if it differs from the cleaned
    title), DVD id.

    Returns:
        The joined text, prefixed with "passage: " when MODEL_NAME contains
        "e5".
    """
    english = clean_text(row.get("title", ""))
    japanese = clean_text(row.get("jpTitle", ""))
    code = str(row.get("dvdId", "")).strip()
    cast = str(row.get("actress_names", "")).replace(",", " ")

    # (include?, text) pairs in output order.
    candidates = (
        (bool(cast), f"Starring: {cast}."),
        (bool(english), english),
        (bool(japanese) and japanese != english, japanese),
        (bool(code), code),
    )
    body = " ".join(piece for keep, piece in candidates if keep)

    if "e5" in MODEL_NAME:
        return "passage: " + body
    return body

# --- EXECUTION ---
# torch is used for device selection below; import it here so this cell does
# not depend on Step 1 having been run in the same session.
import torch

# Columns copied (as strings) into every table row, in schema order.
STORED_COLUMNS = [
    "dvdId",
    "title",
    "jpTitle",
    "actress_names",
    "releaseDate",
    "image",
    "generated_url",
]

if not os.path.exists(CSV_FILE):
    print("‚ùå CSV File not found! Run Step 2.")
else:
    # Wipe the previous index only once we know there is data to rebuild it
    # from; otherwise a missing CSV would destroy an existing index.
    if os.path.exists(DB_FOLDER):
        shutil.rmtree(DB_FOLDER)
    os.makedirs(DB_FOLDER, exist_ok=True)

    db = lancedb.connect(DB_FOLDER)

    print(f"üß† Loading Model: {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)
    model.to("cuda" if torch.cuda.is_available() else "cpu")

    print("‚è≥ Reading CSV...")
    df_full = pd.read_csv(CSV_FILE)
    df_full = df_full.fillna("")

    # Fail soft on absent columns now, instead of hitting a KeyError after a
    # whole batch has already been embedded.
    missing_columns = [col for col in STORED_COLUMNS if col not in df_full.columns]
    if missing_columns:
        print(f"Warning: CSV is missing columns {missing_columns}; storing empty strings for them.")
        for col in missing_columns:
            df_full[col] = ""

    print("üßπ Preparing text...")
    # Row-wise like DataFrame.apply(axis=1), but also well-defined for an
    # empty frame.
    df_full["search_text"] = [create_rich_context(row) for _, row in df_full.iterrows()]

    # Drop rows with (almost) no content. The model prefix added by
    # create_rich_context must not count towards the length, otherwise every
    # row passes the threshold.
    prefix_len = len("passage: ") if "e5" in MODEL_NAME else 0
    df_full = df_full[df_full["search_text"].map(len) > prefix_len + 5]

    print(f"üöÄ Indexing {len(df_full)} items into Vector DB...")

    table = None  # created by the first batch; stays None if nothing is indexed
    for start in tqdm(range(0, len(df_full), BATCH_SIZE), desc="Processing Batches"):
        batch = df_full.iloc[start : start + BATCH_SIZE]
        embeddings = model.encode(
            batch["search_text"].tolist(),
            normalize_embeddings=True,
            show_progress_bar=False,
        )

        # One record per row: its embedding plus the stored columns as strings.
        chunk_data = [
            {"vector": vector, **{col: str(record[col]) for col in STORED_COLUMNS}}
            for vector, record in zip(embeddings, batch.to_dict("records"))
        ]

        if table is None:
            table = db.create_table(TABLE_NAME, data=chunk_data, mode="overwrite")
        else:
            table.add(chunk_data)

    if table is None:
        print("No rows left to index after filtering; no table was created.")
    else:
        total_rows = len(table)
        print(f"‚úÖ Indexing complete. Total items in DB: {total_rows}")

        # Only build the vector index when the table exceeds 10000 rows.
        if total_rows > 10000:
            print("‚öôÔ∏è Building optimized index (IVF-PQ)...")
            table.create_index(metric="cosine", vector_column_name="vector")
            print("‚úÖ Index built.")

In [None]:
# @title 4. Compress & Export to Drive
# @markdown If Drive is connected, the zip file is saved to `JSearchEngine`. Otherwise, it downloads to your browser.

import zipfile
import os
import shutil
from tqdm.notebook import tqdm
from google.colab import files

# Folder to archive; same value as DB_FOLDER, the database folder Step 3 writes.
SOURCE_FOLDER = "jav_search_index"
# Archive created in the working directory before it is copied to Drive or
# downloaded.
OUTPUT_FILENAME = "jav_search_index.zip"

def zipdir_with_progress(path, ziph):
    """Add every file under ``path`` to an open zip archive with a progress bar.

    Args:
        path: Directory to archive (walked recursively).
        ziph: Writable ``zipfile.ZipFile`` handle the files are added to.

    Archive names are taken relative to the parent of ``path``, so entries
    keep the top-level folder name. Prints the total file count first.
    """
    archive_root = os.path.join(path, '..')
    pending = [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

    print(f"üìä Total files: {len(pending)}")
    with tqdm(total=len(pending), unit="file", desc="üì¶ Zipping") as pbar:
        for source in pending:
            ziph.write(source, os.path.relpath(source, archive_root))
            pbar.update(1)

# --- EXECUTION ---
# Zip the database folder, then either copy the archive to Drive (when Step 1
# mounted it and the working folder exists) or hand it to the browser.
if not os.path.exists(SOURCE_FOLDER):
    print(f"‚ùå Error: Folder '{SOURCE_FOLDER}' not found. Did Step 3 finish?")
else:
    print(f"üöÄ Zipping database...")
    with zipfile.ZipFile(OUTPUT_FILENAME, 'w', zipfile.ZIP_DEFLATED) as archive:
        zipdir_with_progress(SOURCE_FOLDER, archive)

    bytes_per_mb = 1024 * 1024
    size_mb = os.path.getsize(OUTPUT_FILENAME) / bytes_per_mb
    print(f"‚úÖ Compression Complete! Size: {size_mb:.2f} MB")

    # Check where to send it
    drive_available = DRIVE_MOUNTED and os.path.exists(DRIVE_BASE)
    if not drive_available:
        print("‚¨áÔ∏è Triggering Browser Download...")
        files.download(OUTPUT_FILENAME)
    else:
        dest_path = os.path.join(DRIVE_BASE, OUTPUT_FILENAME)
        print(f"‚òÅÔ∏è Copying to Google Drive ({DRIVE_BASE})...")
        shutil.copy(OUTPUT_FILENAME, dest_path)
        display(Markdown(f"‚úÖ **Saved to Drive:** `{dest_path}`"))