In [77]:
import os
import random

pdf_folder_path1 = "CleanedORC"

In [None]:
from tqdm.auto import tqdm

def open_and_read_pdfs(pdf_folder_path) -> list[dict]: 
    pages_and_texts = []
    for article in sorted(os.listdir(pdf_folder_path)):
        file_path = os.path.join(pdf_folder_path, article)
        with open(file_path, 'r', encoding='utf-8') as file:
                text = file.read()
        file_name = os.path.splitext(article)[0]
        pages_and_texts.append({"article_name": file_name,
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,
                                "text": text
                                })
    return pages_and_texts

pages_and_text = open_and_read_pdfs(pdf_folder_path1)
pages_and_text[:2]

[{'article_name': 'the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_1',
  'page_char_count': 14611,
  'page_word_count': 2664,
  'page_sentence_count_raw': 140,
  'page_token_count': 3652.75,
  'text': '% The Lamont Chronicle. VOL. 7. SPRING GOODS. We have just received full line of OSWEGO FORKS, HOES, and I BAKE. Henry Sears and Son\'s CUTLERY and SHEARS. Utah Metal SPOONS, KNIVES AND FORKS. Washing Machines, Gasoline and Oil Stoves. All clean, fresh goods, the best that can be bought. An l\xad most everything \'pertaining to the Hardware Trade we have. We are ready for spring business \'with good goods at right prices. Our motto fair representation.\' Yours Truly, Lamont Hardware Co. ^uuuauLitiktuuuumiuuiiiuuiiimiuuuuuuimiiiiimiiUiUiiiUUiiAiimAiii^ The C h r o n i c l e appears this issue under a new heading and with the addition of Mr John Scott to the firm. Mi Scott is well known to a great many of our readers and will need no introduction from us. His connection with the Hera

In [79]:
import pandas as pd

df = pd.DataFrame(pages_and_text)
df.head()
df.page_token_count.describe().round(2)

count       5.00
mean     5790.60
std      2410.61
min      3522.75
25%      3652.75
50%      5854.75
75%      6529.25
max      9393.50
Name: page_token_count, dtype: float64

In [80]:
# Using spacy because it's more robust then just splitting sentences from ". " and also the news paper scan didn't pick up the period
from spacy.lang.en import English 
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x2c1fafd0d50>

In [81]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])


  0%|          | 0/5 [00:00<?, ?it/s]

In [82]:
pages_and_text[0]

{'article_name': 'the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_1',
 'page_char_count': 14611,
 'page_word_count': 2664,
 'page_sentence_count_raw': 140,
 'page_token_count': 3652.75,
 'text': '% The Lamont Chronicle. VOL. 7. SPRING GOODS. We have just received full line of OSWEGO FORKS, HOES, and I BAKE. Henry Sears and Son\'s CUTLERY and SHEARS. Utah Metal SPOONS, KNIVES AND FORKS. Washing Machines, Gasoline and Oil Stoves. All clean, fresh goods, the best that can be bought. An l\xad most everything \'pertaining to the Hardware Trade we have. We are ready for spring business \'with good goods at right prices. Our motto fair representation.\' Yours Truly, Lamont Hardware Co. ^uuuauLitiktuuuumiuuiiiuuiiimiuuuuuuimiiiiimiiUiUiiiUUiiAiimAiii^ The C h r o n i c l e appears this issue under a new heading and with the addition of Mr John Scott to the firm. Mi Scott is well known to a great many of our readers and will need no introduction from us. His connection with the Herald Off

In [83]:
# spacy split a more than just on ". "
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,5.0,5.0,5.0,5.0,5.0
mean,23162.4,5285.6,275.4,5790.6,210.4
std,9642.44,2669.53,95.69,2410.61,58.07
min,14091.0,2664.0,140.0,3522.75,116.0
25%,14611.0,2776.0,216.0,3652.75,194.0
50%,23419.0,5841.0,308.0,5854.75,239.0
75%,26117.0,6064.0,340.0,6529.25,244.0
max,37574.0,9083.0,373.0,9393.5,259.0


In [84]:
# Chunk size
num_sentence_chunk_size = 10

def split_list(input_list: list, slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [85]:
# split sentences in chunks
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                        slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/5 [00:00<?, ?it/s]

In [86]:
# Splitting each chunk into its own item
import re

pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["article_name"] = item["article_name"]
        

        #join the lists of paragraphs 
        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])',r'. \1', joined_sentence_chunk)
        joined_sentence_chunk = re.sub(r'\?([A-Z])',r'. \1', joined_sentence_chunk)
        joined_sentence_chunk = re.sub(r'\!([A-Z])',r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/5 [00:00<?, ?it/s]

107

In [87]:
random.sample(pages_and_chunks, k = 1)

[{'article_name': 'the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_3',
  'sentence_chunk': 'He would pull them out of every pin cushion In the house when he could get a chance, so you might search through room after room, and not come across a single pin. Nobody knew what he did with them all. He was fond OT eggs, too, and I am afraid this was the REA\xad so nth at the children had such a long hunt for them, and found so few. At last, they climbed up the long ladder into the mow The hay was piled almost to the roof and covered the windows. It was so dark that Nanny and Kitty were a little bit afraid, but Billy went first, floundering along in the hay, just as you wade through a snow drift. "Guess there are nests on this beam,\' said Billy, "but it\'s so dark I can\'t see. I\'ll feel\' In the la s t few years artificial egg3 of candy, china and other materials, and egg-shaped articles of all kinds, have largely replaced the real eggs as Easter gifts. The shop windows each "CHRIST.

In [88]:
# get chunks with under 120 tokens of length. These chunks probaly don't have a lot of usefull info and are most likely headings
df = pd.DataFrame(pages_and_chunks)
min_token_length = 120
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[1]

{'article_name': 'the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_1',
 'sentence_chunk': "T h e r e has been considerable talk of late in railroad circles about the Burlington system having passed into the control of J. J. Hill, who is directing the Northern and Union Pacific systems According to re\xad ports this was accomplished by the Morgan-Harriman syndicate purchasing a controlling interest in the road an<i subsequently leasing the road to Hill. In some places this re\xad port is denied. Whether or not the report is true as co the Burlington system, it is clearly evident that the railroads are fast accomplishing what is the equivalent of pooling their interests, and that despite the fact that the government has legislated against pooling. It is not pooling, however, for the combination is accomplished in a strictly legitimate and practical way, namely, by the syndicate purchasing controlling interests in the various l'roads they desire to combine and this prosecution of the

In [89]:
# Note for future research best embedding models
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")

In [90]:
# Running on slow bum CPU
# CPU no batching takes 9.12 mins
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/96 [00:00<?, ?it/s]

In [91]:
%%time
# Embedding model using GPU

# embedding_model.to("cuda")

# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: total: 0 ns
Wall time: 0 ns


In [92]:
%%time
# running with CPU chunking
# CPU with batching takes 7.16 mins
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

text_chunk_embeddings = embedding_model.encode(text_chunks,
 batch_size=16, convert_to_tensors=True)

text_chunk_embeddings

CPU times: total: 4.11 s
Wall time: 591 ms


array([[ 0.0126965 ,  0.07359759,  0.02510833, ...,  0.02915846,
        -0.10578252, -0.0090372 ],
       [-0.05968514,  0.14332505,  0.02289345, ...,  0.02090998,
        -0.02865135, -0.01604334],
       [-0.01827556,  0.12791066,  0.02157426, ...,  0.0154283 ,
        -0.04676846, -0.00781298],
       ...,
       [ 0.02069793,  0.0308956 , -0.01870852, ...,  0.0271231 ,
        -0.05607637, -0.08762081],
       [ 0.0507438 ,  0.06419544, -0.00236332, ...,  0.03081468,
        -0.00185361, -0.06439706],
       [ 0.00949541,  0.10498811, -0.00046495, ...,  0.0275246 ,
        -0.05283398, -0.02711323]], dtype=float32)

In [93]:
#Save embeddings
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [94]:
import pandas as pd

In [95]:
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)

text_chunks_and_embeddings_df_load.head()

Unnamed: 0,article_name,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,embedding
0,the_lamoni_chronicle_usa_iowa_lamoni_19010404_...,We are ready for spring business 'with good go...,2054,378,513.5,[ 1.26965372e-02 7.35975802e-02 2.51083169e-...
1,the_lamoni_chronicle_usa_iowa_lamoni_19010404_...,T h e r e has been considerable talk of late i...,2440,438,610.0,[-5.96851595e-02 1.43325061e-01 2.28934307e-...
2,the_lamoni_chronicle_usa_iowa_lamoni_19010404_...,What the ultimate goal may be toward which we ...,2184,406,546.0,[-1.82755776e-02 1.27910659e-01 2.15742495e-...
3,the_lamoni_chronicle_usa_iowa_lamoni_19010404_...,The best class of the natives will be happy at...,963,169,240.75,[ 6.30823197e-03 2.47935466e-02 -1.42200300e-...
4,the_lamoni_chronicle_usa_iowa_lamoni_19010404_...,When the nature of the country is taken into c...,1166,210,291.5,[-2.69240309e-02 1.02799527e-01 7.51518458e-...


In [96]:
import torch
import numpy as np

# convert embeddins colum back to np.array from csv
# text_chunks_and_embeddings_df_load = pd.read_csv("text_chunk_embeddings_df.csv")
# text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df_load["embedding"].apply(lambda x: np.fromstring(x.strip("[]")))

device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = text_chunks_and_embeddings_df["embedding"].tolist()
embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([96, 768])

In [97]:
# Create model
from sentence_transformers import util, SentenceTransformer
from time import perf_counter as timer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)

query = "Students at graceland"
print(f"Query: {query}")

#Note: embed query with the same model you embedded your passage with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

Query: Students at graceland


In [98]:
#Checking data type
print(query_embedding.dtype)
print(embeddings[0].dtype)

torch.float32
torch.float32


In [99]:
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_timer = timer()

print(f"Time taken to get scores on {(len(embeddings))} embeddings: {end_timer-start_time:.5f} seconds.")

# Get top 5 results of query search
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Time taken to get scores on 96 embeddings: 0.00044 seconds.


torch.return_types.topk(
values=tensor([0.3850, 0.3071, 0.2975, 0.2336, 0.2293], device='cuda:0'),
indices=tensor([14, 12, 22, 20, 48], device='cuda:0'))

In [100]:
# Not great results
pages_and_chunks[1]

{'article_name': 'the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_1',
 'sentence_chunk': "We are ready for spring business 'with good goods at right prices. Our motto fair representation.'Yours Truly, Lamont Hardware Co. ^uuuauLitiktuuuumiuuiiiuuiiimiuuuuuuimiiiiimiiUiUiiiUUiiAiimAiii^ The C h r o n i c l e appears this issue under a new heading and with the addition of Mr John Scott to the firm. Mi Scott is well known to a great many of our readers and will need no introduction from us. His connection with the Herald Office for about 30 years has given him large experience in the 'art preservative' and will be of great benefit to the C h r o n i c l e. We have also added to our equipment a 6 column quarto Campbell cylinder press, that will be in position some time next week. This addition will put us in a position to do our own press work, and give us added facilities for Job work. An n o t a b l e change took place in newspaper circles last week, when the Chicago Record was pur

In [101]:
# For vector search read up on Faiss, nearest neighbour search
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [102]:
query = "Chicago's 'land-marks"
query_embedding = embedding_model.encode(query, convert_to_tensor=True)
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product


print(f"Query: {query}\n")
print("Results:")
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    print(f"Page number: {pages_and_chunks[idx]["article_name"]}")
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print("\n")

Query: Chicago's 'land-marks

Results:
Score: 0.4339
Page number: the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_3
Text:
downstairs to p la n t a resounding kiss on each of grandma's soft cheeks.
"Those Easter eggs are something like !'-Lizzie E. Johnson. Schools>of'» T r i b
u t e t o B u r r o u g h s. How close the a t veteran friend of birds and
animals and trees, John Bur­ roughs, gets to the heart of mankind is illustrated
by a letter which he re cently received from a schoolboy. The letter, as printed
In an article by Cliff­ ton Johnson in Outing is as follows; 'I recently got one
of your books through the mail, marked 'second-class m at­ the r.' But it isn't
't second-class matter. I h ave read it, and it is first-class m a t­ TER. The
binding and get-up may be second class, h u t the matter is first-class.'The boy
wrote to John Bur­ roughs as he would write to any other boyfriend whom he
considered had been dealt with unjustly. He had read the book, and he knew, and
w a

In [103]:
# Functionizing resources
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    
    query_embedding = model.encode(query, convert_to_tensor=True)

    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")
    
    scores, indices = torch.topk(input=dot_scores,
                                k=n_resources_to_return)
    
    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]= pages_and_chunks,
                                 n_resources_to_return: int=5):
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    for scores, indices in zip(scores, indices):
        print(f"Score: {scores:.4f}")
        print(f"Page number: {pages_and_chunks[indices]["article_name"]}")
        print("Text:")
        print_wrapped(pages_and_chunks[indices]["sentence_chunk"])
        print("\n")

In [104]:
query = "Chicago's 'land-marks"
print_top_results_and_scores(query=query,embeddings=embeddings)

Time taken to get scores on 96 embeddings: 0.00006 seconds.
Query: Chicago's 'land-marks

Results:
Score: 0.4339
Page number: the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_3
Text:
downstairs to p la n t a resounding kiss on each of grandma's soft cheeks.
"Those Easter eggs are something like !'-Lizzie E. Johnson. Schools>of'» T r i b
u t e t o B u r r o u g h s. How close the a t veteran friend of birds and
animals and trees, John Bur­ roughs, gets to the heart of mankind is illustrated
by a letter which he re cently received from a schoolboy. The letter, as printed
In an article by Cliff­ ton Johnson in Outing is as follows; 'I recently got one
of your books through the mail, marked 'second-class m at­ the r.' But it isn't
't second-class matter. I h ave read it, and it is first-class m a t­ TER. The
binding and get-up may be second class, h u t the matter is first-class.'The boy
wrote to John Bur­ roughs as he would write to any other boyfriend whom he
considered had been dea

In [105]:
# checking hardware relative to LLM model
#7b model size model is what we'll be looking for this project.

import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))

In [106]:
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma2 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma2-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma2 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma2-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma2 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma2-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 12 | Recommended model: Gemma2 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma2-2b-it


In [107]:

from huggingface_hub import login

In [108]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# Create quantization config
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

from dotenv import load_dotenv, find_dotenv

dot_env_path = find_dotenv()
load_dotenv(dot_env_path)
access_token = os.getenv("HUGGING_FACE_KEY")


In [109]:
# loading LLM
# Model: gemma 7b
# Use Flash Attention 2 if possible

#login to huggingface
login(token=access_token)
print("Logged in successfully!")

# Flash attention 2 if possible
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"

# Loading model
model_id = "google/gemma-2-2b-it"

# Instantiate tokenizer 
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")

# Load model
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    torch_dtype=torch.bfloat16,
    device_map=None,
    attn_implementation=attn_implementation
).to("cuda")

Logged in successfully!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [110]:
# Inspect model
llm_model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemm

In [111]:
inputs_text = "Tell me about east locust street."
print(f"Input text:\n{inputs_text}")

Input text:
Tell me about east locust street.


In [112]:
dialog_template = [
    {"role":"user",
     "content": inputs_text}
]

prompt = tokenizer.apply_chat_template(conversation=dialog_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

print(f"\nPrompt (formatted):\n{prompt}")


Prompt (formatted):
<bos><start_of_turn>user
Tell me about east locust street.<end_of_turn>
<start_of_turn>model



In [113]:
tokenizer

GemmaTokenizerFast(name_or_path='google/gemma-2-2b-it', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_w

In [114]:
%%time

prompt = "Chicago's 'land-marks"

dialog_template = [
    {"role":"user",
     "content": prompt}
]

prompt = tokenizer.apply_chat_template(conversation=dialog_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Tokenize and move to GPU
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to("cuda")
attention_mask = inputs["attention_mask"].to("cuda")

# Generate
outputs = llm_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95
)

# Decode the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

user
Chicago's 'land-marks
model
Chicago is packed with iconic landmarks, so it really depends on what kind of landmarks you're interested in! Here's a breakdown to get you started:

**Architecture & Design:**

* **Skydeck Chicago:**  For breathtaking views of the city, this observation deck on the Willis Tower (formerly Sears Tower) is a must-do.
* **Chicago Riverwalk:** A scenic walkway along the Chicago River, perfect for a stroll, bike ride, or boat tour.
* **Wrigley Field:** Home of the Chicago Cubs, a historic ballpark with a rich history and vibrant atmosphere.
* **Cloud Gate ("The Bean"):** A stunning public sculpture by Anish Kapoor, also known as the Bean, offering a unique reflection experience.
* **Art Institute of Chicago:** One of the world's greatest art museums, boasting an impressive collection of Impressionist, Post-Impressionist, and modern art.

**Culture & History:**

* **Lincoln Park Zoo:** A world-renowned zoo with a variety of animals from around the globe.
* **

In [115]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    base_prompt = """Based on the following context items, please answer the query.
    Context items:
{context}
    Query: {query}
    Answer: 
    """

    prompt = base_prompt.format(context=context,
                                query=query)
    return prompt



In [116]:
query = "Chicago's 'land-marks"

scores, indices = retrieve_relevant_resources(query=prompt,
                                              embeddings=embeddings)

context_items = [pages_and_chunks[i] for i in indices]

prompt = prompt_formatter(query=query,
                          context_items=context_items)

print(prompt)

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = llm_model.generate(**input_ids,
                             temperature=0.5,
                             do_sample=True,
                             max_new_tokens=512)

output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Time taken to get scores on 96 embeddings: 0.00010 seconds.
Based on the following context items, please answer the query.
    Context items:
- CITY DIRECTORY. M a y o r.-G H. D e r r y. Co u n c IL m k n.-J o h n H gas, I. W. Al­ lender, Wm. A Anderson. Daniel Jon’s. A. II. T yr rel, an n d J o h n Smith. M a k s h an i. .-W. II. Graham. Deputy marshal laud n IG h t watch Henry II.
- downstairs to p la n t a resounding kiss on each of grandma's soft cheeks. "Those Easter eggs are something like !'-Lizzie E. Johnson. Schools>of'» T r i b u t e t o B u r r o u g h s. How close the a t veteran friend of birds and animals and trees, John Bur­ roughs, gets to the heart of mankind is illustrated by a letter which he re cently received from a schoolboy. The letter, as printed In an article by Cliff­ ton Johnson in Outing is as follows; 'I recently got one of your books through the mail, marked 'second-class m at­ the r.' But it isn't 't second-class matter. I h ave read it, and it is first-c

In [117]:
# takes query and finds relevant news articles and then generates an answer based on the relevant resources.
def ask(query: str,
        temperature: float=0.7,
        max_new_tokens: int=512,
        fromat_answer_text = True,
        return_answer_only = True):
    
    # Retrieval 
    scores, indices = retrieve_relevant_resources(query=query,
                                                 embeddings=embeddings)
    
    context_items = [pages_and_chunks[i] for i in indices]

    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu()


    # Augmentation
    prompt = prompt_formatter(query=query,
                              context_items=context_items)
    
    
    #Generation
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = llm_model.generate(**input_ids,
                                 temperature= temperature,
                                 do_sample = True,
                                 max_new_tokens=max_new_tokens)
    
    output_text = tokenizer.decode(outputs[0])

    if fromat_answer_text:
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>","")
    
    if return_answer_only:
        return output_text, context_items


In [123]:
query = "Tell me about Mr John Scott in Lamont"

ask(query=query)

Time taken to get scores on 96 embeddings: 0.00006 seconds.


('The provided text does not mention anything about Mr. John Scott in Lamont. \n<end_of_turn>',
 [{'article_name': 'the_lamoni_chronicle_usa_iowa_lamoni_19010404_english_2',
   'sentence_chunk': "T w ill remove any impure tho u g h ts in the h u m an n fa m fly.35c. D. J. Walker. By letter received from Texas, Mrs. George Males learns that her son, Will Spain, has been laid up in the pest house will the smallpox, from which he has just been released cured, fie was turned adrift with only a pair of overalls and a shirt to wear; and he has not heard from his broth\xad her Joe for some weeks, and does not know whether he is dead or alive. Don't fail to see those new pianos and organs at Smith & Te ale's. Dr. J. W- Crawford was placed under $10,000 bonds for appearance to the August term of district court. Tho doctor had no difficulty in get\xad ting signers to the bond who scheduled about $50,000 worth of property. Smith & Tale's is the place to buy carpets, carpet living, rugs, tapestry 