# DiscoverAI: Review-Aware Semantic Search

This notebook runs the complete DiscoverAI pipeline on Google Colab.

**Before running:**
1. Have `Health_and_Personal_Care.jsonl.gz` and `meta_Health_and_Personal_Care.jsonl.gz` ready on your machine; the upload cell in section 0.1 saves them to `src/io/input/raw/` inside the cloned repository.
2. Make sure the Colab runtime is set to **GPU** (Runtime > Change runtime type > T4 GPU)

## 0. Clone Repository and Setup Environment

In [None]:
import os

REPO_URL = "https://github.com/IIxoskeletonII/Deloitte-X-LUISS-Beta.git"
REPO_DIR = "/content/Deloitte-X-LUISS-Beta"

if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} {REPO_DIR}
    print(f"Cloned repo to {REPO_DIR}")
else:
    print(f"Repo already exists at {REPO_DIR}")

os.chdir(REPO_DIR)
print(f"Working directory: {os.getcwd()}")

In [None]:
!pip install -q pandas==2.1.4 numpy==1.26.2 pyarrow==14.0.2 sentence-transformers==2.2.2 transformers==4.36.2 faiss-cpu==1.7.2 gradio==4.12.0 tqdm==4.66.1

import subprocess
result = subprocess.run(["pip", "install", "-q", "faiss-gpu-cu12"], capture_output=True)
if result.returncode == 0:
    print("FAISS-GPU installed.")
else:
    print("FAISS-GPU not available, using CPU version.")

import torch
import platform

print(f"\nPython: {platform.python_version()}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_mem / 1e9:.1f} GB")

## 0.1 Upload Dataset Files
Run this cell, then use the file picker to upload both `.jsonl.gz` files. They will be placed in the correct directory automatically.

In [None]:
import os
from pathlib import Path
from google.colab import files

RAW_DIR = Path("src/io/input/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)

print("Upload both .jsonl.gz files now:")
uploaded = files.upload()

# The upload result is iterated as (file name, raw bytes) pairs; each payload
# is written unchanged into RAW_DIR under its original name.
for upload_name, payload in uploaded.items():
    target_path = RAW_DIR / upload_name
    target_path.write_bytes(payload)
    size_mb = len(payload) / 1e6
    print(f"  Saved: {target_path} ({size_mb:.1f} MB)")

# List the directory so the user can confirm both files are in place.
print("\nFiles in raw directory:")
for entry in RAW_DIR.iterdir():
    print(f"  {entry.name}")

## 1. Data Extraction
Decompress the raw `.jsonl.gz` files into readable JSONL format.

In [None]:
# Run the extraction stage via the `main` entry point of src/data_extraction,
# imported under a stage-specific alias so it does not clash with other `main`s.
from src.data_extraction import main as run_extraction
run_extraction()

## 2. Exploratory Data Analysis
Validate column names, check missing values, and confirm join keys.

In [None]:
from src.eda_preliminary import peek_data

# Inspect the product metadata file first; the returned object is kept in
# `meta_df` for further ad-hoc inspection.
print(">>> ANALYZING METADATA...")
meta_df = peek_data("meta_Health_and_Personal_Care.jsonl")

# Visual divider between the two inspections.
print(f"\n{'=' * 60}\n")

# Then inspect the reviews file; the result is kept in `reviews_df`.
print(">>> ANALYZING REVIEWS...")
reviews_df = peek_data("Health_and_Personal_Care.jsonl")

## 3. Preprocessing
Clean metadata, filter low-engagement reviews, merge datasets, rename columns.

In [None]:
# Run the preprocessing stage via the `main` entry point of src/preprocessing,
# imported under a stage-specific alias.
from src.preprocessing import main as run_preprocessing
run_preprocessing()

## 4. Product-Level Semantic Embeddings
Encode product metadata and reviews with `all-mpnet-base-v2`, fuse into product vectors, build FAISS index.

In [None]:
# Run the embedding/indexing stage via the `main` entry point of src/modelling.
from src.modelling import main as run_modelling
run_modelling()

# After the stage finishes, ask PyTorch to release its cached CUDA memory so
# the next stage starts with as much free GPU memory as possible.
# Skipped on CPU-only runtimes.
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## 5. Review Summarization
Generate human-readable summaries per product using `facebook/bart-large-cnn`.

In [None]:
# Run the summarization stage via the `main` entry point of src/summarization.
from src.summarization import main as run_summarization
run_summarization()

# After the stage finishes, ask PyTorch to release its cached CUDA memory
# before the next stage loads its own model. Skipped on CPU-only runtimes.
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## 6. Named Entity Recognition
Extract brand names, organizations, and other entities from reviews using `dslim/bert-base-NER`.

In [None]:
# Run the NER stage via the `main` entry point of src/entity_recognition.
from src.entity_recognition import main as run_entity_recognition
run_entity_recognition()

# After the stage finishes, ask PyTorch to release its cached CUDA memory
# before the search engine is loaded. Skipped on CPU-only runtimes.
import torch
if torch.cuda.is_available():
    torch.cuda.empty_cache()

## 7. Search Engine Validation
Load all artifacts and run sample queries to verify the system works end-to-end.

In [None]:
# Run the search-engine self-test via the `main` entry point of
# src/search_engine, imported under a stage-specific alias.
from src.search_engine import main as run_search_test
run_search_test()

## 7.1 Quality Report
Generates a human-readable quality report with pipeline statistics, sample summaries, sample entities, and sample search results. Saves to `quality_report.txt` for offline review.

In [None]:
# Build a plain-text quality report from the pipeline artifacts in OUTPUT_DIR
# and from a live SearchEngine instance, print it, and save it to REPORT_PATH.
import json
import numpy as np
import pandas as pd
from pathlib import Path
from src.search_engine import SearchEngine

OUTPUT_DIR = Path("src/io/output")
REPORT_PATH = "quality_report.txt"

engine = SearchEngine()
lines = []  # the report is accumulated line by line and joined once at the end

lines.append("=" * 70)
lines.append("DISCOVERAI - QUALITY REPORT")
lines.append("=" * 70)

# ---------------------------------------------------------------- statistics
lines.append("\n--- PIPELINE STATISTICS ---")

df_processed = pd.read_parquet(OUTPUT_DIR / "processed_data.parquet")
lines.append(f"Processed dataset shape: {df_processed.shape}")
lines.append(f"Columns: {list(df_processed.columns)}")
lines.append(f"Unique products: {df_processed['parent_asin'].nunique()}")
lines.append(f"Total reviews: {len(df_processed)}")

embeddings = np.load(OUTPUT_DIR / "product_embeddings.npy")
lines.append(f"\nEmbedding matrix shape: {embeddings.shape}")
lines.append(f"Any NaN in embeddings: {np.any(np.isnan(embeddings))}")
lines.append(f"FAISS index size: {engine.index.ntotal}")

df_summaries = pd.read_parquet(OUTPUT_DIR / "product_summaries.parquet")
# Computed once and reused for both the count and the sampling below.
summaries_nonempty = df_summaries[df_summaries["summary"].str.len() > 0]
lines.append(f"\nProducts summarized: {len(df_summaries)}")
lines.append(f"Non-empty summaries: {len(summaries_nonempty)}")

df_entities = pd.read_parquet(OUTPUT_DIR / "product_entities.parquet")
# The "entities" column is compared against the serialized empty object "{}".
non_empty_ents = sum(1 for e in df_entities["entities"] if e != "{}")
lines.append(f"Products with entities: {non_empty_ents} / {len(df_entities)}")

# ----------------------------------------------------------- sample summaries
lines.append("\n\n--- SAMPLE SUMMARIES (5 random products) ---")
# The sample size must be bounded by the frame that is actually sampled (the
# non-empty subset). Bounding it by len(df_summaries) raised ValueError when
# fewer than 5 summaries were non-empty.
samples = summaries_nonempty.sample(min(5, len(summaries_nonempty)), random_state=42)
for _, row in samples.iterrows():
    meta = engine.metadata_lookup.get(row["parent_asin"], {})
    title = meta.get("product_title", "Unknown")
    lines.append(f"\nProduct: {title}")
    lines.append(f"ASIN: {row['parent_asin']}")
    lines.append(f"Reviews used: {row['num_reviews_used']}")
    lines.append(f"Summary: {row['summary']}")

# ------------------------------------------------------------ sample entities
lines.append("\n\n--- SAMPLE ENTITIES (5 products with most entities) ---")
entity_counts = []
for _, row in df_entities.iterrows():
    try:
        ents = json.loads(row["entities"])
        entity_counts.append((row["parent_asin"], ents, len(ents)))
    except (json.JSONDecodeError, TypeError):
        # Rows whose entities cannot be decoded are left out of the ranking.
        continue
entity_counts.sort(key=lambda x: x[2], reverse=True)
for asin, ents, count in entity_counts[:5]:
    meta = engine.metadata_lookup.get(asin, {})
    title = meta.get("product_title", "Unknown")
    lines.append(f"\nProduct: {title}")
    lines.append(f"ASIN: {asin}")
    lines.append(f"Entities ({count}): {json.dumps(ents, indent=2)}")

# -------------------------------------------------------------- search checks
lines.append("\n\n--- SEARCH QUALITY TEST ---")
test_queries = [
    "low-priced skincare product",
    "organic shampoo for sensitive scalp",
    "vitamins for energy and focus",
    "toothpaste for whitening",
    "moisturizer for dry skin",
]
for query in test_queries:
    lines.append(f"\nQuery: '{query}'")
    results = engine.search(query, top_k=5)
    if not results:
        lines.append("  No results found.")
        continue
    for i, r in enumerate(results, 1):
        lines.append(f"  {i}. [score={r['score']:.3f}] {r['product_title']}")
        lines.append(f"     Rating: {r['average_rating']} | Store: {r['store']}")
        if r["summary"]:
            summary_text = r["summary"]
            # Only mark the preview as truncated when text was actually cut.
            ellipsis = "..." if len(summary_text) > 150 else ""
            lines.append(f"     Summary: {summary_text[:150]}{ellipsis}")

# ------------------------------------------------------- recommendation check
lines.append("\n\n--- RECOMMENDATION TEST ---")
if df_summaries.empty:
    # `.iloc[0]` on an empty frame raises IndexError; report the gap instead.
    lines.append("No summarized products available; recommendation test skipped.")
else:
    test_asin = df_summaries.iloc[0]["parent_asin"]
    meta = engine.metadata_lookup.get(test_asin, {})
    lines.append(f"Source product: {meta.get('product_title', 'Unknown')} ({test_asin})")
    recs = engine.recommend(test_asin, top_k=5)
    for i, r in enumerate(recs, 1):
        lines.append(f"  {i}. [score={r['score']:.3f}] {r['product_title']}")

lines.append("\n" + "=" * 70)
lines.append("END OF REPORT")
lines.append("=" * 70)

report_text = "\n".join(lines)

# Explicit UTF-8 so product titles with non-ASCII characters are written safely.
with open(REPORT_PATH, "w", encoding="utf-8") as f:
    f.write(report_text)

print(report_text)
print(f"\nReport saved to: {REPORT_PATH}")

## 8. Interactive Demo
Launch the Gradio interface with semantic search and product explorer tabs.
A public URL will be generated for sharing.

In [None]:
from src.demo import create_demo

# Build the demo application with the project's factory and start it.
# `share=True` is passed to `launch` so that a shareable public link is requested.
app = create_demo()
app.launch(share=True)