In [3]:
# ==========================================
# Step 1: Authentication & Setup
# ==========================================
# Authenticate this Colab session up front; the gcloud/gsutil commands in the
# following cells need credentials to reach the bucket.
from google.colab import auth
auth.authenticate_user()

# Project and bucket settings shared by the cells below -- edit these to
# point the notebook at a different project or bucket.
PROJECT_ID = 'bookbridge-477802'
BUCKET_NAME = 'book_bridge'
# GCS prefix holding the gzipped training-data parts that Step 2 downloads.
SOURCE_PATH = f"gs://{BUCKET_NAME}/item2vec_training_data"
# GCS destination prefix that Step 4 copies the saved model files to.
DEST_MODEL_PATH = f"gs://{BUCKET_NAME}/models/"

In [4]:
# Make PROJECT_ID the active project for the Cloud SDK tools (gcloud/gsutil).
# IPython substitutes {PROJECT_ID} with the Python variable before the shell
# command runs.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [5]:
# ==========================================
# Step 2: Load & Prepare Data
# ==========================================
import os

# Local staging directory for the downloaded parts. Nothing is cleared here,
# so .gz files left over from an earlier run stay in place and will also
# match the zcat glob below.
os.makedirs("data_parts", exist_ok=True)

print("Downloading training data parts from GCS...")
# -m lets gsutil copy the part files in parallel.
!gsutil -m cp -r {SOURCE_PATH}/*.gz ./data_parts/

print("Merging and decompressing files...")
# zcat decompresses every part and writes them back-to-back to stdout, so the
# redirect yields one plain-text corpus.txt for the training step to read.
!zcat ./data_parts/*.gz > corpus.txt

# Eyeball the first lines as a format check.
# NOTE(review): presumably each line is one user's sequence of item IDs
# separated by whitespace -- confirm against the job that wrote the parts.
print("\n--- First 3 lines of Corpus ---")
!head -n 3 corpus.txt

Downloading training data parts from GCS...
Copying gs://book_bridge/item2vec_training_data/part-00001-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00002-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00006-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00003-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00007-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00000-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00004-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00005-a2f627d1-b68a-494a-88b6-5e59e18cf828-c000.txt.gz...
Copying gs://book_bridge/item2vec_training_data/part-00008-a2f627d1-

In [3]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m27.9/27.9 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [8]:
# ==========================================
# Step 3: Train Item2Vec (Word2Vec)
# ==========================================
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import multiprocessing

print("\nStarting Training...")

# LineSentence reads corpus.txt lazily, one whitespace-tokenised line at a
# time, so the whole corpus never has to sit in memory.
sentences = LineSentence("corpus.txt")

# Hyperparameters collected in one place so they are easy to scan and tweak.
w2v_params = dict(
    vector_size=100,                      # dimensionality of each item embedding
    window=5,                             # max distance between an item and its context items on a line
    min_count=3,                          # drop items seen fewer than 3 times in the corpus
    sg=1,                                 # 1 selects skip-gram (0 would be CBOW)
    negative=10,                          # noise items drawn per positive example (negative sampling)
    workers=multiprocessing.cpu_count(),  # one training thread per available CPU
    epochs=5,                             # number of passes over the corpus
)

# Constructing Word2Vec with `sentences` builds the vocabulary and trains in one call.
model = Word2Vec(sentences=sentences, **w2v_params)

print(f"Training Complete! Vocab size: {len(model.wv.index_to_key)} books")


Starting Training...
Training Complete! Vocab size: 99667 books


In [9]:
# ==========================================
# Step 4: Save & Upload Artifacts
# ==========================================

# 1. Save the Lightweight KeyedVectors (Best for API)
model.wv.save("item_embeddings.kv")

# 2. Save the Full Model (Optional - Good for retraining later)
model.save("item2vec_full.model")

print(f"Uploading models to {DEST_MODEL_PATH}...")
!gsutil cp item_embeddings.kv {DEST_MODEL_PATH}
!gsutil cp item2vec_full.model {DEST_MODEL_PATH}

print("‚úÖ DONE. Your model is safe in GCS.")

Uploading models to gs://book_bridge/models/...
Copying file://item_embeddings.kv [Content-Type=application/octet-stream]...
/
Operation completed over 1 objects/41.2 MiB.                                     
Copying file://item2vec_full.model [Content-Type=application/octet-stream]...
|
Operation completed over 1 objects/79.2 MiB.                                     
‚úÖ DONE. Your model is safe in GCS.


In [8]:
import json
import re
import os
from google.colab import auth

# ==========================================
# 1. Setup & Authentication
# ==========================================
# Authenticate the Colab session before any gcloud/gsutil command runs.
auth.authenticate_user()

# Bucket and project this cell operates on; change both to target another environment.
BUCKET_NAME = "book_bridge"
PROJECT_ID = "bookbridge-477802"

# Paths
REMOTE_SOURCE_DIR = f"gs://{BUCKET_NAME}/filtered_metadata" # Spark output folder
# Local file name the downloaded metadata is stored under.
LOCAL_META_FILE = "metadata.jsonl"
# Local file name of the title -> ID index; reused as the object name in GCS.
LOCAL_INDEX_FILE = "title_to_id_index.json"
REMOTE_DEST_PATH = f"gs://{BUCKET_NAME}/indexes/{LOCAL_INDEX_FILE}"

# Make PROJECT_ID the active project for the Cloud SDK tools.
!gcloud config set project {PROJECT_ID}

# ==========================================
# 2. Download Data from Spark Folder
# ==========================================
print(f"Downloading metadata from {REMOTE_SOURCE_DIR}...")

# The source is a folder, so the *.json wildcard selects the data file inside
# it and the copy stores it locally under LOCAL_META_FILE.
# NOTE(review): copying to a single local file name assumes the wildcard
# matches exactly one object -- confirm the Spark job writes a single part.
!gsutil cp {REMOTE_SOURCE_DIR}/*.json {LOCAL_META_FILE}

# Printed unconditionally; the gsutil exit status is not checked.
print("Download complete.")

# ==========================================
# 3. Build the Normalized Lookup Map
# ==========================================
def normalize_title(title):
    """
    Collapse a title into a lowercase ASCII-alphanumeric lookup key.

    Example:
        "Harry Potter and the Sorcerer's Stone (Book 1)"
        -> "harrypotterandthesorcerersstonebook1"

    Falsy input (None, "") yields "". After lowercasing, every character
    outside a-z / 0-9 is dropped -- spaces and punctuation, but also accented
    and non-Latin letters.
    """
    if title:
        return re.sub(r'[^a-z0-9]', '', title.lower())
    return ""

# Build the lookup table: normalized title key -> ASIN.
title_to_asin_map = {}
count = 0            # records that produced a usable key (indexed or duplicate)
duplicates = 0       # records whose key was already taken by an earlier record
empty_keys = 0       # records whose title normalizes to "" (no a-z / 0-9 characters)
malformed_lines = 0  # lines that are not valid JSON

print("Building index (this may take 1-2 minutes)...")

# Explicit encoding so decoding does not depend on the runtime's locale
# (assumes the metadata file is UTF-8).
with open(LOCAL_META_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        # Only the parse is guarded: a bad line is counted and skipped, while
        # any other error still surfaces instead of being hidden.
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            malformed_lines += 1
            continue

        asin = item.get('asin')
        title = item.get('title')
        if not (title and asin):
            continue

        # Create the "slug" key
        key = normalize_title(title)

        # A title with no ASCII letters/digits normalizes to "". Indexing it
        # would funnel all such titles onto one shared empty key, and any
        # query that also normalizes to "" would "match" an unrelated book.
        if not key:
            empty_keys += 1
            continue

        # Collision strategy: the first record seen for a key wins.
        if key not in title_to_asin_map:
            title_to_asin_map[key] = asin
        else:
            duplicates += 1

        count += 1
        if count % 20000 == 0:
            print(f"Processed {count} books...")

print(f"\n--- Index Build Complete ---")
print(f"Total Books Processed: {count}")
print(f"Unique Titles in Index: {len(title_to_asin_map)}")
print(f"Duplicate Titles Skipped: {duplicates}")
print(f"Titles Skipped (empty key after normalization): {empty_keys}")
print(f"Malformed Lines Skipped: {malformed_lines}")

# ==========================================
# 4. Save & Upload
# ==========================================
# Serialize the title-key -> ASIN dictionary as one JSON object on local disk.
print(f"Saving to {LOCAL_INDEX_FILE}...")
with open(LOCAL_INDEX_FILE, 'w') as out:
    json.dump(title_to_asin_map, out)

# Copy the index file to its GCS destination.
print(f"Uploading to GCS: {REMOTE_DEST_PATH}...")
!gsutil cp {LOCAL_INDEX_FILE} {REMOTE_DEST_PATH}

# Printed unconditionally; the gsutil exit status is not checked.
print("‚úÖ SUCCESS: Index is ready for your API.")

Updated property [core/project].
Downloading metadata from gs://book_bridge/filtered_metadata...
Copying gs://book_bridge/filtered_metadata/part-00000-4a0f39af-08f9-4855-85a1-d8b1c1a3ef89-c000.json...
==> NOTE: You are downloading one or more large file(s), which would
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

| [1 files][473.0 MiB/473.0 MiB]                                                
Operation completed over 1 objects/473.0 MiB.                                    
Download complete.
Building index (this may take 1-2 minutes)...
Processed 20000 books...
Processed 40000 books...
Processed 60000 books...
Processed 80000 books...

--- Index Build Complete ---
Total Books Processed: 99797
Unique Titles in Index: 92073
Duplicate Titles Skipped: 7724
Saving to title_to_id_index.json...
Uploading to GCS: gs://book_bridge/indexes/title_to_id_index.json...
C

In [None]:
!pip install gensim

In [6]:
# Authenticate the Colab session so gsutil can read from the bucket.
from google.colab import auth
auth.authenticate_user()

import os
from gensim.models import KeyedVectors

# ==========================================
# 1. Configuration
# ==========================================
BUCKET_NAME = 'book_bridge' # Replace with your bucket
# File name of the saved vectors, used both locally and as the object name in GCS.
MODEL_FILE = "item_embeddings.kv"
REMOTE_PATH = f"gs://{BUCKET_NAME}/models/{MODEL_FILE}"

# ==========================================
# 2. Download from GCS
# ==========================================
# This message is printed before the existence check, so it also appears
# when the download is skipped.
print(f"Downloading model from {REMOTE_PATH}...")

# Skip the copy when the file is already in the working directory.
# Only the file's presence is checked, not its contents.
if not os.path.exists(MODEL_FILE):
    !gsutil cp {REMOTE_PATH} .
    print("Download complete.")
else:
    print("Model file already exists locally.")

# ------------------------------------------
# 3. Load into Gensim
# ------------------------------------------
print("Loading vectors into memory...")

# The file was written with KeyedVectors.save(), so KeyedVectors.load() is the
# matching reader. mmap='r' requests read-only memory mapping; gensim applies
# it only to arrays that were stored as separate files at save time.
model_vectors = KeyedVectors.load(MODEL_FILE, mmap='r')

vocab_keys = model_vectors.index_to_key
print("Model loaded successfully!")
print(f"Vocabulary size: {len(vocab_keys)}")

# ------------------------------------------
# 4. Test the Model (Sanity Check)
# ------------------------------------------
# With gensim's default vocabulary sorting, index_to_key is ordered by
# descending corpus frequency, so position 0 is the most frequent item.
test_book = vocab_keys[0]
print(f"\nTest Recommendation for Book ID: {test_book}")

# Five nearest neighbours of the test item, as (item_id, similarity) pairs.
similar_items = model_vectors.most_similar(test_book, topn=5)
for neighbour_id, similarity in similar_items:
    print(f"- {neighbour_id} (Score: {similarity:.4f})")

Downloading model from gs://book_bridge/models/item_embeddings.kv...
Copying gs://book_bridge/models/item_embeddings.kv...
- [1 files][ 41.2 MiB/ 41.2 MiB]                                                
Operation completed over 1 objects/41.2 MiB.                                     
Download complete.
Loading vectors into memory...
Model loaded successfully!
Vocabulary size: 99667

Test Recommendation for Book ID: B00L9B7IKE
- B0027MJU00 (Score: 0.6783)
- B00AEDDSZW (Score: 0.6262)
- B0151YQUTE (Score: 0.6215)
- B00IB5BSBG (Score: 0.6210)
- B00KU4PW86 (Score: 0.6056)


In [9]:
import json
import re
from gensim.models import KeyedVectors

# ==========================================
# 1. Setup Resources
# ==========================================
print("‚è≥ Loading resources into memory...")

# A. Load the Title -> ID index (normalized title key -> book ID).
with open("title_to_id_index.json", "r", encoding="utf-8") as f:
    title_to_id = json.load(f)

# B. Load the vectors. mmap='r' requests read-only memory mapping; gensim
#    applies it only to arrays that were stored as separate files.
model = KeyedVectors.load("item_embeddings.kv", mmap='r')

# C. Build the ID -> Title map (for display only).
#    Needed to turn recommended IDs (e.g. "B001...") back into titles.
id_to_title = {}
with open("metadata.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        # Best-effort parse: skip lines that are not valid JSON or lack the
        # expected fields. Only these data errors are caught -- a bare
        # `except:` would also swallow KeyboardInterrupt and genuine bugs.
        try:
            item = json.loads(line)
            asin, title = item['asin'], item['title']
        except (json.JSONDecodeError, KeyError, TypeError):
            continue

        # Ignore null/empty values so a later lookup falls back to its
        # placeholder text instead of displaying "None".
        if asin and title:
            id_to_title[asin] = title

print(f"‚úÖ Resources Ready. Catalog size: {len(title_to_id)} titles.")

# ==========================================
# 2. Input from OpenAI (Titles Only)
# ==========================================
# Hard-coded sample input: plain title strings (no IDs), standing in for the
# titles produced by the OpenAI step.
openai_titles = [
    "Dune", "The Name of the Wind", "The Left Hand of Darkness",
    "Mistborn: The Final Empire", "The Way of Kings", "Hyperion",
    "The Lies of Locke Lamora", "The Fifth Season", "Neuromancer",
    "The Blade Itself",
]

# ==========================================
# 3. The Recommender Logic
# ==========================================
def normalize_title(title):
    """Turn a raw title into its index lookup key.

    Lowercases the title and strips every character that is not a-z or 0-9;
    falsy input (None, "") returns "". The key format has to stay identical
    to the one used when the title index was built, otherwise lookups miss.
    """
    if not title:
        return ""
    lowered = title.lower()
    key = re.sub(r'[^a-z0-9]', '', lowered)
    return key

print("\n--- üöÄ Starting Recommendation Pipeline ---\n")

# Number of input titles that resolved to a book ID in the title index.
hits = 0

for raw_title in openai_titles:
    # Step 1: reduce the title to the same key format the index uses.
    search_key = normalize_title(raw_title)

    # Step 2: validate against the index; a title without an entry is
    # reported as a miss and skipped.
    if search_key not in title_to_id:
        print(f"‚ùå MISS: '{raw_title}' (Key: {search_key}) - Not in Top 100k Catalog.")
        continue

    book_id = title_to_id[search_key]
    hits += 1
    print(f"‚úÖ MATCH: '{raw_title}' -> ID: {book_id}")

    # Step 3: the ID can be in the index yet absent from the vectors
    # (e.g. dropped by the min_count filter at training time).
    if book_id not in model:
        print(f"   ‚ö†Ô∏è ID found, but it had too few interactions to form a vector.")
        continue

    # Five nearest neighbours in embedding space, as (id, similarity) pairs.
    recommendations = model.most_similar(book_id, topn=5)

    print(f"   ‚Ü≥ Based on this, you might like:")
    for rank, (rec_id, score) in enumerate(recommendations, 1):
        # Map the ID back to a human-readable title, with a placeholder
        # for IDs missing from the metadata.
        rec_title = id_to_title.get(rec_id, "[Title Not in Metadata]")
        print(f"      {rank}. {rec_title} (Sim: {score:.2f})")

print(f"\n--- Summary: Found {hits}/{len(openai_titles)} books ---")

‚è≥ Loading resources into memory...
‚úÖ Resources Ready. Catalog size: 92073 titles.

--- üöÄ Starting Recommendation Pipeline ---

‚úÖ MATCH: 'Dune' -> ID: 044100590X
   ‚Ü≥ Based on this, you might like:
      1. Shadow of the Hegemon (The Shadow Series) (Sim: 0.98)
      2. The End of Eternity: A Novel (Sim: 0.98)
      3. The Anubis Gates (Ace Science Fiction) (Sim: 0.98)
      4. The Arabian Nights: Tales of 1,001 Nights: Volume 1 (Penguin Classics) (Sim: 0.98)
      5. Weaveworld (Sim: 0.98)
‚úÖ MATCH: 'The Name of the Wind' -> ID: 075640407X
   ‚Ü≥ Based on this, you might like:
      1. Red Country (Sim: 0.98)
      2. The Wise Man's Fear (Kingkiller Chronicles, Day 2) (Sim: 0.97)
      3. The Name of the Wind (Sim: 0.97)
      4. Last Argument of Kings (First Law: Book Three) (Sim: 0.97)
      5. The Emperor's Soul (Sim: 0.97)
‚ùå MISS: 'The Left Hand of Darkness' (Key: thelefthandofdarkness) - Not in Top 100k Catalog.
‚úÖ MATCH: 'Mistborn: The Final Empire' -> ID: 076531178