# Initialize

In [1]:
import json
import os
import random

import chromadb
import dotenv
import fireworks.client
import pandas as pd
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

config = dotenv.dotenv_values(".env")

fireworks.client.api_key = config['FIREWORKS_API_KEY']

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def get_completion(prompt, model=None, **kwargs):

    fw_model_dir = "accounts/fireworks/models/"

    if model is None:
        model = fw_model_dir + "llama-v2-7b"
    else:
        model = fw_model_dir + model

    completion = fireworks.client.Completion.create(
        model=model,
        prompt=prompt,
        **kwargs
    )

    return completion.choices[0].text

In [3]:
get_completion("Hello, world!", temperature=0.1)

' I’m back!\nI’ve been away for a while, but'

# Gather documents

In [37]:
# Import summarizer
# from transformers import AutoTokenizer
from transformers import pipeline

# checkpoint = "google-t5/t5-small"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
summarizer = pipeline("summarization", model="facebook/bart-base")  # bart-large-cnn
temp_doc = """BART (large-sized model), fine-tuned on CNN Daily Mail

BART model pre-trained on English language, and fine-tuned on CNN Daily Mail. It was introduced in the paper BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension by Lewis et al. and first released in [this repository (https://github.com/pytorch/fairseq/tree/master/examples/bart).

Disclaimer: The team releasing BART did not write a model card for this model so this model card has been written by the Hugging Face team.
Model description

BART is a transformer encoder-encoder (seq2seq) model with a bidirectional (BERT-like) encoder and an autoregressive (GPT-like) decoder. BART is pre-trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text.

BART is particularly effective when fine-tuned for text generation (e.g. summarization, translation) but also works well for comprehension tasks (e.g. text classification, question answering). This particular checkpoint has been fine-tuned on CNN Daily Mail, a large collection of text-summary pairs."""
print(summarizer(temp_doc, max_length=130, min_length=30, do_sample=False))

[{'summary_text': 'BART (large-sized model), fine-tuned on CNN Daily Mail, a large collection of text-summary pairs.BART model pre-trained on American language, and fine-tuneed on English language. It was introduced in the paper BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension by Lewis et al. and first released in [this repository (https://github.com/pytorch/fairseq/tree/master/examples/bart).]BART is a large-sized, high-resolution model with a'}]


In [38]:
root_folder = "/Users/kaz/repos/toronto-com-ttcriders-archive/scraper/output/toronto.com"
documents = os.listdir(root_folder)
results = []
for document in tqdm(documents):
    if not document.endswith('.html'):
        continue
    document = {
        'filename': os.path.join(root_folder, document),
        'body': '',
        'title': '',
    }
    with open(document['filename'], 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')
        document['title'] = soup.title.string.strip()
        document['body'] = soup.body.get_text().strip()
        document['summary'] = summarizer(document['body'][5000:6024], max_length=150, min_length=30, do_sample=True)
    results.append(document)
    # break
# print(results[0]['summary'])
documents = results

  1%|          | 1/172 [00:07<21:46,  7.64s/it]

[{'summary_text': 'The Liberals have championed a flat fare of $1 and monthly passes capped at $40 for all transit until 2024.The Green Party has committed to slashing fares in half for at least three months. The NDP has pledged to expanding a TTC discount (known as the Fair Pass program) to all low-income workers.Scarborough does not end at McCowan Road, but current funded rapid transit plans stop there. As residents of northeastern Scarborough and Durham, we know how hard it is to get around the east end of Scarborough. The Eglinton East Light Rail Transit line would connect Kennedy Station to the University of Toronto Scarborough Campus and end in Malvern, unlocking better access to education and connecting underserved Neighbourhood Improvement Areas. Toronto'}]





In [4]:
documents[12]

{'filename': '/Users/kaz/repos/toronto-com-ttcriders-archive/scraper/output/toronto.com/ward-42-kingsley-kwok-says-area-needs-better-and-faster-ttc-service.html',
 'body': 'Skip to main content\n\n\n\n\n\n\n\n\nYou have permission to edit this article.\n\n Edit\nClose\n\n\n\n\n\n\n\n\nToronto.com\n\n\n\nHome\n\n\n\n\n\nNews\n\n\nBusiness\n\nCouncil\n\nCrime\n\nMunicipal Election\n\nProvincial Election\n\nFederal Election\n\nBloor West - Parkdale\n\nBeach - East York\n\nEtobicoke\n\nNorth York\n\nScarborough\n\nYork - City Centre\n\nTopics\n\n\n\n\nEvents\n\n\nArts\n\nAttractions\n\nCommunity\n\nFestivals and Fairs\n\nMusic\n\nSeasonal\n\nShows and Expos\n\nSports\n\n\n\n\nThings to Do\n\n\nBooks And Authors\n\nContests\n\nFood And Drink\n\n\n\n\nOpinion\n\n\nAdvice\n\nColumns\n\nCommunity Voices\n\nEditorial\n\nLetters\n\n\n\n\nLife\n\n\nFashion And Beauty\n\nObituaries\n\nPersonal Finance\n\nReal Estate\n\nTravel\n\nWellness\n\nWheels\n\n\n\n\nSpecial Features\n\n\nMarketplace\n\nRead

In [5]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# class EmbedDocuments(chromadb.EmbeddingFunction):
#     def __call__(self, input: chromadb.Documents) -> chromadb.Embeddings:
#         batch_embeddings = embedding_model.encode(input)
#         return batch_embeddings.tolist()

chroma_client = chromadb.PersistentClient(path="./chromadb")
collection = chroma_client.get_or_create_collection(
    name=f"24-04-02-local-document-experiment"
)

In [6]:
# Generate embeddings, and index file contents in batches
batch_size = 10

# Loop through batches
for i in tqdm(range(0, len(documents), batch_size)):

    i_end = min(i + batch_size, len(documents))
    batch = documents[i : i + batch_size]

    # Prepare batch
    batch_titles = [doc['title'] for doc in batch]
    batch_ids = [str(sum(ord(c) + random.randint(1, 10000) for c in doc["title"])) for doc in batch]
    batch_metadata = [
        {
            "filename": doc['filename'],
            "body": doc['body']
        }
        for doc in batch
    ]
    batch_embeddings = embedding_model.encode([f"{doc['title']} {doc['body']}" for doc in batch])

    # Upsert to ChromaDB
    collection.upsert(
        ids=batch_ids,
        metadatas=batch_metadata,
        documents=batch_titles,
        embeddings=batch_embeddings.tolist(),
    )

100%|██████████| 9/9 [00:05<00:00,  1.52it/s]


In [7]:
# user query
user_query = "virus coronavirus pandemic"

# query for user query
results = collection.query(
    query_texts=[user_query],
    n_results=5,
)

results['documents']

[["Ride the 'COVID-19 East'? Group calls crowded Scarborough buses a hazard | News | toronto.com",
  'Transit workers union vows to fight TTC vaccine mandate | News | toronto.com',
  'TTC says it’s losing about $18 million a week in fares, will need bailout over coronavirus ridership crash | News | toronto.com',
  "Scarborough transit users bring 'COVID-19' bus to MPP's office | News | toronto.com",
  "Olivia Chow just couldn't be stopped this time: Analysis | News | toronto.com"]]

In [8]:
results['metadatas']


[[{'body': "Skip to main content\n\n\n\n\n\n\n\n\nYou are the owner of this article.\n\n Edit Article\n Add New Article\nClose\n\n\n\nYou have permission to edit this article.\n\n Edit\nClose\n\n\n\n\n\n\n\n\nToronto.com\n\n\n\nHome\n\n\n\n\n\nNews\n\n\nBusiness\n\nCouncil\n\nCrime\n\nMunicipal Election\n\nProvincial Election\n\nFederal Election\n\nBloor West - Parkdale\n\nBeach - East York\n\nEtobicoke\n\nNorth York\n\nScarborough\n\nYork - City Centre\n\nTopics\n\n\n\n\nEvents\n\n\nArts\n\nAttractions\n\nCommunity\n\nFestivals and Fairs\n\nMusic\n\nSeasonal\n\nShows and Expos\n\nSports\n\n\n\n\nThings to Do\n\n\nBooks And Authors\n\nContests\n\nFood And Drink\n\n\n\n\nOpinion\n\n\nAdvice\n\nColumns\n\nCommunity Voices\n\nEditorial\n\nLetters\n\n\n\n\nLife\n\n\nFashion And Beauty\n\nObituaries\n\nPersonal Finance\n\nReal Estate\n\nTravel\n\nWellness\n\nWheels\n\n\n\n\nSpecial Features\n\n\nMarketplace\n\nReaders' Choice Awards\n\nSponsored and Partners\n\nClassifieds\n\n\n\n\n\n\n\n\n

# Query documents via chat

In [25]:
user_query = {"USER_QUERY": "On what date did the TTC reduce service operations due to the pandemic?"}

prompt_template = f"""[INST] Please generate a single keyword-based search string in JSON format (with the key "SEARCH_STRING") based on the following user query:
{json.dumps(user_query)} [INST]"""

response = get_completion(prompt_template, model="mistral-7b-instruct-4k", max_tokens=2000)

print("\nResponse:")
print(response)


Response:
 Here is the search string in JSON format:
```
{
  "SEARCH_STRING": "date TTC pandemic reduction service operations"
}
``` 
This search string includes keywords such as "date," "TTC," "pandemic," and "reduction" to help with the search.


In [15]:
type(response)

str

In [41]:
# Query documents
# search_query = json.loads(response[response.index('{'):])
# search_query = search_query["SEARCH_STRING"]
search_query = "date TTC pandemic reduction service operations"

results = collection.query(
    query_texts=[search_query],
    n_results=5,
)

print("Search results:")
results['documents']

Search results:


[['Transit workers union vows to fight TTC vaccine mandate | News | toronto.com',
  'Increased spending authority for TTC CEO opposed by advocacy group, union | News | toronto.com',
  'Assessing Access: Can the TTC become fully accessible by 2025? | News | toronto.com',
  'TTC fare hikes hurt the poor the most: TTCriders | News | toronto.com',
  'Analysis: How can the TTC improve transit service on Dufferin Street? | News | toronto.com']]

In [43]:
search_documents = []
for i in tqdm(range(5)):
    document = {
        "title": results['documents'][0][i],
        # "body": results['metadatas'][0][i]['body']
        "summary": summarizer(results['metadatas'][0][i]['body'][5000:6024], max_length=150, min_length=30, do_sample=True)
    }
    search_documents.append(document)

prompt_json = {
    "USER_QUERY": user_query["USER_QUERY"],
    "DOCUMENTS": search_documents
}

prompt_template = \
f'''[INST] Using your knowledge and the following 5 documents, please answer the USER_QUERY to the best of your ability if possible. The information is provided below in JSON format.

{json.dumps(prompt_json)}
[/INST]
'''

print("Prompt:")
print(prompt_template)

100%|██████████| 5/5 [00:37<00:00,  7.47s/it]

Prompt:
[INST] Using your knowledge and the following 5 documents, please answer the USER_QUERY to the best of your ability if possible. The information is provided below in JSON format.

[/INST]






In [45]:
response = get_completion(prompt_template, model="mistral-7b-instruct-4k", max_tokens=2000)

print("\nResponse:")
print(response)


Response:

The Transit workers union vows to fight TTC vaccine mandate, as reported by the Toronto Star, announced a deadline for workers to provide proof of vaccination or medical exemption by Sept. 13, which is before the city's policy of requiring employees to provide proof of their vaccination status. While this date may vary, it is important to note that the TTC's decision to reduce service operations due to the pandemic was not mentioned in any of the available documents.

If you have any further questions or if there is anything else I can be of assistance with, please let me know.
