In [None]:
# Download the NLTK "stopwords" corpus; a later cell loads the English list
# from it via nltk.corpus.stopwords.words("english").
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Install a pinned version of `google-cloud-storage`. The original rationale is
# that the version preinstalled on Colab and GCP is old; pip may print a
# dependency-conflict error for other preinstalled packages, which is okay.
# NOTE(review): the captured pip output shows another package requiring
# google-cloud-storage>=2.0.0 — confirm this pin is still wanted.
!pip install -q google-cloud-storage==1.43.0

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━[0m [32m92.2/106.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 1.29.0 requires google-cloud-storage>=2.0.0, but you have google-cloud-storage 1.43.0 which is incompatible.[0m[31m
[0m

In [1]:
# Authenticate the Colab user as needed; the cells below read parquet shards
# from, and upload artifacts to, Google Cloud Storage.
from google.colab import auth
auth.authenticate_user()

In [None]:
# --- build id2title from parquet on gcs and upload it back ---

!pip -q install gcsfs pyarrow pandas

import pandas as pd
import gcsfs
import pickle
import os

BUCKET = "information-retrival-ex3"
PATTERN = f"{BUCKET}/multistream*_preprocessed.parquet"   # gcsfs uses "bucket/path" (no gs://)
OUT_LOCAL = "id2title.pickle"
OUT_GCS = f"gs://{BUCKET}/artifacts/id2title.pickle"

fs = gcsfs.GCSFileSystem()
paths = sorted(fs.glob(PATTERN))

print("found parquet files:", len(paths))
print("first 3:", paths[:3])

# ---- detect actual column names (schema may vary) ----
sample_path = "gs://" + paths[0]
sample_df = pd.read_parquet(sample_path, engine="pyarrow")
cols = set(sample_df.columns)
print("sample columns:", sample_df.columns.tolist())

# common possibilities
ID_CANDIDATES = ["id", "doc_id", "wiki_id"]
TITLE_CANDIDATES = ["title", "doc_title", "document_title"]

id_col = next((c for c in ID_CANDIDATES if c in cols), None)
title_col = next((c for c in TITLE_CANDIDATES if c in cols), None)

if id_col is None or title_col is None:
    raise ValueError(f"could not find id/title columns in parquet. columns={sample_df.columns.tolist()}")

print("using:", id_col, title_col)

# ---- build mapping ----
id2title = {}

for i, p in enumerate(paths, 1):
    df = pd.read_parquet("gs://" + p, columns=[id_col, title_col], engine="pyarrow")

    ids = df[id_col].astype("int64").tolist()
    titles = df[title_col].astype(str).tolist()
    id2title.update(dict(zip(ids, titles)))

    if i % 5 == 0 or i == len(paths):
        print(f"[{i}/{len(paths)}] total mapped ids so far:", len(id2title))

with open(OUT_LOCAL, "wb") as f:
    pickle.dump(id2title, f, protocol=pickle.HIGHEST_PROTOCOL)

print("saved:", OUT_LOCAL, "size bytes:", os.path.getsize(OUT_LOCAL))

!gsutil -m cp {OUT_LOCAL} {OUT_GCS}
!gsutil ls -l {OUT_GCS}

In [None]:
# from google.colab import auth
# auth.authenticate_user()
#
# !pip -q install gcsfs pyarrow pandas

In [None]:
import re, math, os, pickle, hashlib
from collections import Counter, defaultdict

import pandas as pd
import gcsfs

# upload your inverted_index_gcp.py to colab (Files panel) before running this
from inverted_index_gcp import InvertedIndex

# tokenizer like your project
import nltk
from nltk.corpus import stopwords

# Make sure the stopword corpus is available before it is loaded below.
nltk.download("stopwords")

# A token starts with '#', '@' or a word character and continues with 2-24
# further word characters, each optionally preceded by an apostrophe or hyphen.
RE_WORD = re.compile(
    r"""[\#\@\w](['\-]?\w){2,24}""",
    flags=re.UNICODE,
)

# Stopword set = NLTK's English list plus a few corpus-specific terms.
# NOTE(review): "extenal" looks like a typo for "external"; it is left as-is so
# this tokenizer keeps matching whatever tokenizer the rest of the project uses
# — confirm before changing.
english_stopwords = set(stopwords.words("english"))
corpus_stopwords = {
    "category",
    "references",
    "also",
    "links",
    "extenal",
    "see",
    "thumb",
}
all_stopwords = english_stopwords | corpus_stopwords

def tokenize(text: str):
    """Split ``text`` into lower-cased tokens with stopwords removed.

    Tokens are the RE_WORD matches found in the lower-cased text; any token
    present in ``all_stopwords`` is dropped. Falsy input (``None`` or an empty
    string) yields an empty list.
    """
    if text:
        candidates = (match.group() for match in RE_WORD.finditer(text.lower()))
        return [token for token in candidates if token not in all_stopwords]
    return []

# Build TITLE inverted index

In [None]:
# Build an in-memory inverted index over document titles from the
# preprocessed parquet shards stored in the bucket.
BUCKET = "information-retrival-ex3"
PATTERN = f"{BUCKET}/multistream*_preprocessed.parquet"  # gcsfs style (no gs://)

fs = gcsfs.GCSFileSystem()
paths = sorted(fs.glob(PATTERN))
print("parquet shards:", len(paths))
print("first 3:", paths[:3])

# Stop here when the glob matched nothing. Otherwise an empty index is built
# silently, and the export cell below removes the existing copy on GCS before
# uploading whatever was produced.
if not paths:
    raise FileNotFoundError(f"no parquet shards matched gs://{PATTERN}")

# build in-memory title index
title_index = InvertedIndex()
total_docs = 0

for i, p in enumerate(paths, 1):
    # Only the two needed columns are loaded from each shard.
    df = pd.read_parquet("gs://" + p, columns=["id", "title"], engine="pyarrow")

    ids = df["id"].astype("int64").tolist()
    titles = df["title"].astype(str).tolist()

    for doc_id, title in zip(ids, titles):
        toks = tokenize(title)
        # Titles that tokenize to nothing are not added to the index.
        if toks:
            title_index.add_doc(int(doc_id), toks)

    # Counts every row seen, including those whose title produced no tokens.
    total_docs += len(ids)
    if i % 5 == 0 or i == len(paths):
        print(f"[{i}/{len(paths)}] docs seen: {total_docs:,} | vocab so far: {len(title_index.df):,}")

print("done building title index.")
print("final docs seen:", total_docs)
print("final vocab:", len(title_index.df))

In [None]:
import shutil
from pathlib import Path

# Export settings for the title index.
OUT_DIR = "title_postings_gcp"
INDEX_NAME = "title_index"
N_BUCKETS = 124  # common setting in the course

# Start from an empty local output folder: remove any previous export, then
# (re)create the directory.
_out_path = Path(OUT_DIR)
if _out_path.exists():
    shutil.rmtree(_out_path)
_out_path.mkdir(parents=True, exist_ok=True)

def bucket_id_for_term(term: str, n_buckets: int = N_BUCKETS) -> str:
    """Return the bucket id for ``term`` as a string zero-padded to 3 characters.

    The bucket is the MD5 digest of the UTF-8 encoded term, read as a
    big-endian integer, modulo ``n_buckets``. MD5 is used instead of Python's
    built-in ``hash`` so the mapping is deterministic.
    """
    digest = hashlib.md5(term.encode("utf-8")).digest()
    bucket = int.from_bytes(digest, "big") % n_buckets
    return str(bucket).zfill(3)

# group terms into buckets: bucket_id -> [(term, posting_list)]
bucket_to_list = defaultdict(list)

# IMPORTANT: this uses the in-memory posting lists created during add_doc()
# InvertedIndex stores them in _posting_list while building.
posting_dict = title_index._posting_list  # internal

print("bucketing terms...")
for t, pl in posting_dict.items():
    bid = bucket_id_for_term(t)
    # sort posting list by doc_id (good practice)
    pl_sorted = sorted(pl, key=lambda x: x[0])
    bucket_to_list[bid].append((t, pl_sorted))

print("writing posting bins...")
for bid in sorted(bucket_to_list.keys()):
    InvertedIndex.write_a_posting_list((bid, bucket_to_list[bid]), OUT_DIR, bucket_name=None)

print("merging posting_locs into the index object...")
title_index.posting_locs = defaultdict(list)
for bid in sorted(bucket_to_list.keys()):
    locs_path = os.path.join(OUT_DIR, f"{bid}_posting_locs.pickle")
    with open(locs_path, "rb") as f:
        d = pickle.load(f)
    for term, locs in d.items():
        title_index.posting_locs[term].extend(locs)

# now we can drop the heavy in-memory postings before pickling the index
# (InvertedIndex.__getstate__ may do this anyway, but we do it explicitly)
del title_index._posting_list

print("writing title_index globals (df, term_total, posting_locs)...")
title_index.write_index(OUT_DIR, INDEX_NAME, bucket_name=None)

print("local output folder size (rough):")
!du -sh {OUT_DIR}

# upload to gcs
DEST = f"gs://{BUCKET}/{OUT_DIR}/"
print("uploading to:", DEST)
!gsutil -m rm -r {DEST} 2>/dev/null || true
!gsutil -m cp -r {OUT_DIR} {DEST}

print("verify on gcs:")
!gsutil ls {DEST} | head
!gsutil ls {DEST} | grep -E "{INDEX_NAME}\.(pkl|pickle)$" || true

# Run the app

In [2]:
# you need to upload your implementation of search_frontend.py
import search_frontend as se

In [None]:
# uncomment the code below and execute to reload the module when you make
# changes to search_frontend.py (after you upload again).
# import importlib
# importlib.reload(se)

In [None]:
# find Colab's public URL
import multiprocessing, time

from google.colab.output import eval_js

server_url = eval_js("google.colab.kernel.proxyPort(5000)")
# Join with exactly one '/' whether or not the proxy URL ends with a slash.
search_url = server_url.rstrip("/") + "/search?query=hello+world"
print(f"""Test your search engine by navigating to
{search_url}
This URL is only accessible from the same browser session. In other words, this
will not be accessible from a different machine, browser, or incognito session.
""")

# Uncomment the following line of code to run the frontend in the main
# process and wait for HTTP requests (colab will hang). The debug parameter
# lets you see incoming requests and get debug print outs if exceptions occur.
# se.run(debug=False, use_reloader=False)

# Alternatively, the next few lines run the frontend in a background process.
# Just don't forget to terminate the process when you update your search engine
# or want to reload it.
proc = multiprocessing.Process(target=se.run,
                               kwargs={"debug": True, "use_reloader": False,
                                       "host": "0.0.0.0", "port": 5000})
proc.start()

time.sleep(1) # give Flask time to boot

print(f"Open this URL:\n{search_url}")
# Use proc.terminate() to stop the process

# Testing your app

Once your app is running you can query it. You can simply do that by clicking on the URL printed above (the one looking like https://XXXXX-5000-colab.googleusercontent.com/search?query=hello+world) or by issuing an HTTP request through code (from colab).

The code below shows how to issue a query from python. This is also how our testing code will issue queries to your search engine, so make sure to test your search engine this way after you deploy it to GCP and before submission. Command line instructions for deploying your search engine to GCP are available at `run_frontend_in_gcp.sh`. Note that we will not only issue training queries to your search engine, but also test queries, i.e. queries that you've never seen before.

In [None]:
import json

# Load the training queries; the evaluation loop later iterates this object
# with .items(). JSON text is UTF-8, so decode it explicitly instead of relying
# on the platform's default encoding.
with open('queries_train.json', 'rt', encoding='utf-8') as f:
  queries = json.load(f)

In [None]:
def average_precision(true_list, predicted_list, k=40):
    """Average of precision@rank over the relevant hits in the top ``k``.

    For each position in ``predicted_list[:k]`` holding an id from
    ``true_list``, the precision at that rank is recorded; the mean of those
    values, rounded to 3 decimals, is returned. Returns 0.0 when there is no
    hit.
    """
    relevant = frozenset(true_list)
    hits = 0
    precision_sum = 0
    for rank, doc_id in enumerate(predicted_list[:k], start=1):
        if doc_id not in relevant:
            continue
        hits += 1
        precision_sum += hits / rank
    if hits == 0:
        return 0.0
    return round(precision_sum / hits, 3)

In [None]:
def precision_at_k(true_list, predicted_list, k):
    """Fraction of the first ``k`` predictions that appear in ``true_list``.

    The denominator is the number of predictions actually available in the
    top ``k`` (not ``k`` itself). Returns 0.0 when there are no predictions;
    the result is rounded to 3 decimals.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    n_predicted = len(top_k)
    if n_predicted == 0:
        return 0.0
    n_hits = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(n_hits / n_predicted, 3)
def recall_at_k(true_list, predicted_list, k):
    """Fraction of the ids in ``true_list`` found among the first ``k`` predictions.

    Returns 1.0 when ``true_list`` is empty; the result is rounded to
    3 decimals.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    if not relevant:
        return 1.0
    n_hits = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(n_hits / len(relevant), 3)
def f1_at_k(true_list, predicted_list, k):
    """Harmonic mean of precision@k and recall@k, rounded to 3 decimals.

    Returns 0.0 when either component is zero.
    """
    precision = precision_at_k(true_list, predicted_list, k)
    recall = recall_at_k(true_list, predicted_list, k)
    if precision != 0.0 and recall != 0.0:
        return round(2.0 / (1.0 / precision + 1.0 / recall), 3)
    return 0.0
def results_quality(true_list, predicted_list):
    """Harmonic mean of precision@5 and F1@30, rounded to 3 decimals.

    Returns 0.0 when either component is zero.
    """
    precision_5 = precision_at_k(true_list, predicted_list, 5)
    f1_30 = f1_at_k(true_list, predicted_list, 30)
    if precision_5 != 0.0 and f1_30 != 0.0:
        return round(2.0 / (1.0 / precision_5 + 1.0 / f1_30), 3)
    return 0.0

# Sanity checks for the metric helpers defined above. Each assert pins the
# rounded value returned for a small hand-checkable input, including the
# empty-ground-truth and empty-prediction edge cases.
assert precision_at_k(range(10), [1,2,3] , 2) == 1.0
assert recall_at_k(   range(10), [10,5,3], 2) == 0.1
assert precision_at_k(range(10), []      , 2) == 0.0
assert precision_at_k([],        [1,2,3],  5) == 0.0
assert recall_at_k(   [],        [10,5,3], 2) == 1.0
assert recall_at_k(   range(10), [],       2) == 0.0
assert f1_at_k(       [],        [1,2,3],  5) == 0.0
assert f1_at_k(       range(10), [],       2) == 0.0
assert f1_at_k(       range(10), [0,1,2],  2) == 0.333
assert f1_at_k(       range(50), range(5), 30) == 0.182
assert f1_at_k(       range(50), range(10), 30) == 0.333
assert f1_at_k(       range(50), range(30), 30) == 0.75
assert results_quality(range(50), range(5))  == 0.308
assert results_quality(range(50), range(10)) == 0.5
assert results_quality(range(50), range(30)) == 0.857
# No hit in the top 5 makes precision@5 zero, so the combined score is zero.
assert results_quality(range(50), [-1]*5 + list(range(5,30))) == 0.0


In [None]:
import requests
from time import time
# In GCP the public URL for your engine should look like this:
# url = 'http://35.232.59.3:8080'
# In colab, we are going to send HTTP requests to localhost (127.0.0.1)
# and direct them to port where the server is listening (5000).
url = 'http://127.0.0.1:5000'

# One (query, duration in seconds or None, results_quality or None) tuple is
# collected per training query.
qs_res = []
for q, true_wids in queries.items():
  # Reset both values for every query so that a failed or non-200 request is
  # recorded as None instead of reusing the previous query's score.
  duration, rq = None, None
  t_start = time()
  try:
    res = requests.get(url + '/search', {'query': q}, timeout=35)
    duration = time() - t_start
    if res.status_code == 200:
      # Each result row is unpacked as a pair whose first item is the
      # predicted id; an empty result list gives an empty prediction list.
      pred_wids = [wid for wid, _ in res.json()]
      rq = results_quality(true_wids, pred_wids)
  except Exception as exc:
    # Best effort: report the failure and continue with the next query.
    # Unlike a bare `except:`, this does not swallow KeyboardInterrupt.
    print(f"query {q!r} failed: {exc!r}")

  qs_res.append((q, duration, rq))