# scrape topic index


In [None]:
%pip install chromadb
%pip install bs4
%pip install numpy

In [None]:
#only run if using gpu
%pip install tensorflow-gpu

In [None]:
#if not using gpu
%pip install tensorflow

In [None]:
%pip install tensorflow_hub

In [None]:
import requests
import time
import chromadb
from bs4 import BeautifulSoup
import tensorflow_hub as hub
import tensorflow as tf
import numpy as np
import os

In [None]:
#if using a remote session, download the gzipped wiki dump
url = "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz"

#download the file
r = requests.get(url, stream = True)

#save the file
with open("enwiki-latest-all-titles-in-ns0.gz", "wb") as f:
    for chunk in r.iter_content(chunk_size = 1024):
        if chunk:
            f.write(chunk)

#unzip the file to ./data/
!gunzip -k enwiki-latest-all-titles-in-ns0.gz

#copy and rename the file to ./data/enwiki-latest-all-titles.txt
!cp enwiki-latest-all-titles-in-ns0 enwiki-latest-all-titles.txt

In [None]:
#read the article titles from the dump, one title per line

wiki_titles = []
with open("./data/enwiki-latest-all-titles.txt", "r", encoding="utf-8") as titles_file:
    #trim surrounding whitespace, then turn underscores into spaces
    wiki_titles.extend(raw.strip().replace("_", " ") for raw in titles_file)

print(len(wiki_titles))

In [None]:
#load the universal sentence encoder (version 4) from tensorflow hub
#`embed` is called later with a list of strings to get their embeddings
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
#list the GPUs that tensorflow can see
#this only reports the devices; nothing here selects or configures one
#GPU 0 is GTX 1050 Ti

devices = tf.config.experimental.list_physical_devices("GPU")

print(devices)

In [None]:
def custom_ef(text):
    """Embed `text` with the loaded encoder and return the values as one flat list.

    The array produced by `embed` is collapsed to a single dimension, so if
    several inputs are passed their vectors end up joined end to end.
    """
    vectors = np.array(embed(text))

    #collapse to one dimension and convert to plain Python numbers
    return vectors.reshape(-1).tolist()

In [None]:
def time_to_str(t: float) -> str:
    """Format a duration given in seconds using the largest fitting time unit.

    Args:
        t: Duration in seconds. May be fractional.

    Returns:
        The value rounded to two decimals followed by its unit: days, hours,
        minutes, seconds, or (for anything under one second) milliseconds.
    """
    # Inclusive lower bounds, so that exactly one minute/hour/day is shown in
    # that unit ("1.0 minutes") rather than as 60 of the smaller unit.
    if t >= 3600 * 24:
        return f"{round(t / (3600 * 24), 2)} days"
    if t >= 3600:
        return f"{round(t / 3600, 2)} hours"
    if t >= 60:
        return f"{round(t / 60, 2)} minutes"
    if t < 1:
        return f"{round(t * 1000, 2)} milliseconds"

    return f"{round(t, 2)} seconds"

In [None]:
batch_size = 10000
batches = [
    wiki_titles[i:i + batch_size] for i in range(0, len(wiki_titles), batch_size)
]

#for each batch, get the embeddings
#save the embeddings to disk in a .npy file, one file per batch (batch_<index>.npy)

os.makedirs("./content/embeddings", exist_ok=True)

for i, batch in enumerate(batches):
    print(f"Processing batch {i + 1} of {len(batches)}")

    start_time = time.time()

    embeddings = embed(batch)

    #convert the tensor to a numpy array and save that array
    embeddings_np = np.array(embeddings)

    np.save(f"./content/embeddings/batch_{i}.npy", embeddings_np)

    end_time = time.time()

    #rough estimate: assumes every remaining batch takes as long as this one did
    batches_left = len(batches) - i - 1
    time_remaining = batches_left * (end_time - start_time)

    print(
        f"Batch {i + 1}/{len(batches)} complete. Time remaining: {time_to_str(time_remaining)}"
    )

In [None]:
#remove every saved embedding file from disk
#only run this to start over: the embeddings then have to be computed again
file_path = "./content/embeddings"
stale_paths = [os.path.join(file_path, name) for name in os.listdir(file_path)]
for stale_path in stale_paths:
    os.remove(stale_path)

In [None]:
#load the embeddings from disk
#concatenate the embeddings into one large array
#convert the np array to a list

embeddings = []
file_path = "./content/embeddings"

#os.listdir returns names in arbitrary order, and a plain string sort would put
#batch_10 before batch_2. Row k of the result is later paired with wiki_titles[k],
#so the files have to be read back in numeric batch order.
prefix, suffix = "batch_", ".npy"
batch_files = sorted(
    (
        name for name in os.listdir(file_path)
        if name.startswith(prefix)
        and name.endswith(suffix)
        and name[len(prefix):-len(suffix)].isdecimal()
    ),
    key=lambda name: int(name[len(prefix):-len(suffix)]),
)

for idx, file in enumerate(batch_files):
    print(f"Loading file {idx + 1} of {len(batch_files)}")
    embeddings.append(np.load(os.path.join(file_path, file)))

embeddings = np.concatenate(embeddings).tolist()

print(len(embeddings))
print(type(embeddings))

In [None]:
#persistent chroma client whose data lives under ./data/chromadb
Client = chromadb.PersistentClient("./data/chromadb")

In [None]:
#delete collection "wiki_titles"
#only run this to start over: the collection has to be filled again afterwards
#NOTE(review): presumably this raises if the collection does not exist yet - confirm

Client.delete_collection("wiki_titles")

In [None]:
#add wikipedia article titles to chromadb

collection = Client.get_or_create_collection("wiki_titles")

batch_size = 1000
batches = [
    embeddings[i:i + batch_size] for i in range(0, len(embeddings), batch_size)
]

for i, batch in enumerate(batches):
    print(f"Processing batch {i + 1} of {len(batches)}")

    start_time = time.time()

    #the last batch is usually shorter than batch_size, so size the ids and the
    #documents from the batch itself to keep all three lists the same length
    first_id = i * batch_size
    last_id = first_id + len(batch)

    collection.add(
        embeddings=batch,
        documents=wiki_titles[first_id:last_id],
        metadatas=None,
        #the id is the title's position in wiki_titles, as a string
        ids=[str(n) for n in range(first_id, last_id)]
    )

    end_time = time.time()

    #rough estimate: assumes every remaining batch takes as long as this one did
    batches_left = len(batches) - i - 1
    time_remaining = batches_left * (end_time - start_time)

    print(
        f"Batch {i + 1}/{len(batches)} complete. Time remaining: {time_to_str(time_remaining)}"
    )

In [None]:
def query(query: str, num_results: int = 10):
    """Search the "wiki_titles" collection for the entries nearest to `query`.

    Args:
        query: Text to search for. It is embedded with `custom_ef` before the lookup.
        num_results: How many results to ask the collection for.

    Returns:
        Whatever `collection.query` returns for the embedded text.
    """
    collection = Client.get_collection("wiki_titles")

    #custom_ef flattens its output, so for this single text we get one flat vector
    query_embedding = custom_ef([query])

    return collection.query(
        query_embeddings=query_embedding,
        n_results=num_results
    )

In [None]:
#example lookup: ask for the default 10 results for the text "Belgium"
query("Belgium")