In [15]:
import fitz  # pymupdf
from tqdm.auto import tqdm
import os


def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip()
    return cleaned_text

def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries containing extracted data for each page.
    """
    doc = fitz.open(pdf_path)
    document_name = os.path.basename(pdf_path)  # Extract document name
    pages_and_texts = []
    
    for page_number, page in tqdm(enumerate(doc), desc=f"Processing {document_name}"):
        text = page.get_text()
        text = text_formatter(text)
        pages_and_texts.append({
            "document_name": document_name,
            "page_number": page_number,  # Adjust page numbers
            "page_char_count": len(text),
            "page_word_count": len(text.split(" ")),
            "page_sentence_count_raw": len(text.split(". ")),
            "page_token_count": len(text) / 4,
            "text": text
        })
    
    return pages_and_texts

# Directory containing PDFs
directory_path = r"C:\Users\IT Lab VR\Desktop\LamoniAI\GracelandPDFs\Horizons"
pages_and_text = []

# Process all PDFs in the Horizons directory
for filename in os.listdir(directory_path):
    if filename.endswith(".pdf"):
        file_path = os.path.join(directory_path, filename)
        pages_and_text.extend(open_and_read_pdf(file_path))

# Preview first two entries
pages_and_text[:2]


Processing 2009FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2009SpringHorizons.pdf: 0it [00:00, ?it/s]

Processing 2010FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2010SpringHorizons.pdf: 0it [00:00, ?it/s]

Processing 2010WinterHorizons.pdf: 0it [00:00, ?it/s]

Processing 2011FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2011SpSumHorizons.pdf: 0it [00:00, ?it/s]

Processing 2011WinterHorizons.pdf: 0it [00:00, ?it/s]

Processing 2012FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2012SummerHorizons.pdf: 0it [00:00, ?it/s]

Processing 2013FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2013SummerHorizons.pdf: 0it [00:00, ?it/s]

Processing 2014FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2014SummerHorizons.pdf: 0it [00:00, ?it/s]

Processing 2015FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2015SpringHorizons.pdf: 0it [00:00, ?it/s]

Processing 2015WinterHorizons.pdf: 0it [00:00, ?it/s]

Processing 2016FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2016SummerHorizons.pdf: 0it [00:00, ?it/s]

Processing 2016WinterHorizons-1.pdf: 0it [00:00, ?it/s]

Processing 2017FallHorizons.pdf: 0it [00:00, ?it/s]

Processing 2017SummerHorizons.pdf: 0it [00:00, ?it/s]

Processing 2018MBBChampsHorizons.pdf: 0it [00:00, ?it/s]

Processing 2018WinterHorizons.pdf: 0it [00:00, ?it/s]

Processing GracelandU-Horizons_FA23-WEB-NEW.pdf: 0it [00:00, ?it/s]

Processing GracelandU-Horizons_SP23_WEB.pdf: 0it [00:00, ?it/s]

Processing Horizons-Winter2017v2.pdf: 0it [00:00, ?it/s]

Processing HorizonsFA18_FINAL.pdf: 0it [00:00, ?it/s]

Processing HorizonsSU18_FINAL.pdf: 0it [00:00, ?it/s]

Processing HorizonsSU19Issue.pdf: 0it [00:00, ?it/s]

Processing HorizonsWinter2009.pdf: 0it [00:00, ?it/s]

Processing Horizons_FA19.pdf: 0it [00:00, ?it/s]

Processing Horizons_FA20-FINAL-WEB.pdf: 0it [00:00, ?it/s]

Processing Horizons_FA22-FINAL-WEB.pdf: 0it [00:00, ?it/s]

Processing Horizons_SP19_WEB.pdf: 0it [00:00, ?it/s]

Processing Horizons_SP22_WEB_Spread-NEW.pdf: 0it [00:00, ?it/s]

Processing Winter2013Horizons.pdf: 0it [00:00, ?it/s]

[{'document_name': '2009FallHorizons.pdf',
  'page_number': 0,
  'page_char_count': 128,
  'page_word_count': 20,
  'page_sentence_count_raw': 3,
  'page_token_count': 32.0,
  'text': 'Fall 2009 Vol. 25, No. 2 Alumni and Friends Magazine  Sherri Kirkpatrick’s HealthEd Connect Serves the World’s Neediest Children'},
 {'document_name': '2009FallHorizons.pdf',
  'page_number': 1,
  'page_char_count': 2418,
  'page_word_count': 434,
  'page_sentence_count_raw': 21,
  'page_token_count': 604.5,
  'text': 'I  am dressed for SPEC in this impromptu photo with  Sherri and Jac Kirkpatrick. They joined 1,200 high  school students and volunteers in late July for the an- nual Community of Christ SPECTACULAR event on our  Lamoni campus. See the photos on pages 8-11 of this issue  of Horizons for glimpses of the energy and excitement that  prevail during this wonderful weeklong camp. It was my pleasure to meet with Sherri and Jac as they  inaugurate their new HealthEd Connect project that will  bring

In [16]:
import pandas as pd

df = pd.DataFrame(pages_and_text)
df.head()
df.page_token_count.describe().round(2)

count    1006.00
mean      884.22
std       551.64
min         0.00
25%       537.56
50%       869.00
75%      1134.25
max      3441.25
Name: page_token_count, dtype: float64

In [17]:
# Using spacy because it's more robust then just splitting sentences from ". " and also the news paper scan didn't pick up the period
from spacy.lang.en import English 
nlp = English()
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x1c0458afe50>

In [18]:
for item in tqdm(pages_and_text):
    item["sentences"] = list(nlp(item["text"]).sents)
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]

    item["page_sentence_count_spacy"] = len(item["sentences"])


  0%|          | 0/1006 [00:00<?, ?it/s]

In [19]:
pages_and_text[0]

{'document_name': '2009FallHorizons.pdf',
 'page_number': 0,
 'page_char_count': 128,
 'page_word_count': 20,
 'page_sentence_count_raw': 3,
 'page_token_count': 32.0,
 'text': 'Fall 2009 Vol. 25, No. 2 Alumni and Friends Magazine  Sherri Kirkpatrick’s HealthEd Connect Serves the World’s Neediest Children',
 'sentences': ['Fall 2009 Vol.',
  '25, No.',
  '2 Alumni and Friends Magazine  Sherri Kirkpatrick’s HealthEd Connect Serves the World’s Neediest Children'],
 'page_sentence_count_spacy': 3}

In [20]:
# spacy split a more than just on ". "
df = pd.DataFrame(pages_and_text)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,1006.0,1006.0,1006.0,1006.0,1006.0,1006.0
mean,15.03,3536.86,665.27,34.19,884.22,26.22
std,10.9,2206.57,474.01,63.06,551.64,38.61
min,0.0,0.0,1.0,1.0,0.0,0.0
25%,6.0,2150.25,387.0,15.25,537.56,8.0
50%,13.0,3476.0,624.0,26.0,869.0,22.0
75%,21.0,4537.0,830.75,37.0,1134.25,34.0
max,47.0,13765.0,5797.0,850.0,3441.25,541.0


In [21]:
# Chunk size
num_sentence_chunk_size = 10

def split_list(input_list: list, slice_size: int=num_sentence_chunk_size) -> list[list[str]]:
    return [input_list[i:i+slice_size] for i in range(0, len(input_list), slice_size)]

test_list = list(range(25))
split_list(test_list)

[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
 [20, 21, 22, 23, 24]]

In [22]:
# split sentences in chunks
for item in tqdm(pages_and_text):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                        slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/1006 [00:00<?, ?it/s]

In [23]:
# Splitting each chunk into its own item
import re

pages_and_chunks = []
for item in tqdm(pages_and_text):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["document_name"] = item["document_name"]
        

        #join the lists of paragraphs 
        joined_sentence_chunk = "".join(sentence_chunk).replace("  "," ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])',r'. \1', joined_sentence_chunk)
        joined_sentence_chunk = re.sub(r'\?([A-Z])',r'. \1', joined_sentence_chunk)
        joined_sentence_chunk = re.sub(r'\!([A-Z])',r'. \1', joined_sentence_chunk)
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4
        pages_and_chunks.append(chunk_dict)

len(pages_and_chunks)

  0%|          | 0/1006 [00:00<?, ?it/s]

3127

In [24]:
import random
random.sample(pages_and_chunks, k = 1)

[{'document_name': 'Horizons_FA19.pdf',
  'sentence_chunk': 'ANNUAL REPORT 2019 HORIZONS  | 27 “I believe we are navigating the journey between the continuity of our values and mission with an everchanging and ever more complex world.” O\u200a ver and over, we hear that higher education is changing, and it is — rapidly. The days when a student applied to one or two schools, attended one of them on campus for four years and graduated with a bachelor’s degree are disappearing rapidly. Students now compare financial aid packages from multiple schools before choosing based on cost, degree programs are earned online, and the public is even growing skeptical about the essential value of higher education. To survive, universities must prove that what they offer is worth the investment and better than the competition. Those that do not, disappear. For Graceland, however, the challenge is not just delivering well- designed courses. Of course, we need to do that, but we must also be true to our 

In [79]:
# get chunks with under 120 tokens of length. These chunks probaly don't have a lot of usefull info and are most likely headings
df = pd.DataFrame(pages_and_chunks)
min_token_length = 120
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[0]

{'document_name': '2009FallHorizons.pdf',
 'sentence_chunk': 'I am dressed for SPEC in this impromptu photo with Sherri and Jac Kirkpatrick. They joined 1,200 high school students and volunteers in late July for the an- nual Community of Christ SPECTACULAR event on our Lamoni campus. See the photos on pages 8-11 of this issue of Horizons for glimpses of the energy and excitement that prevail during this wonderful weeklong camp. It was my pleasure to meet with Sherri and Jac as they inaugurate their new HealthEd Connect project that will bring village health care and community-based orphan programs to Sub-Sahara Africa, and help provide for the significant number of children in those countries who have AIDS. Sherri is truly the Albert Schweitzer of Graceland. She has traveled the world bringing health- care education and hope to those most in need. Her work with the Graceland-affiliated International Health Center for the last 20 years has been nothing short of miraculous. She has taken

In [84]:
# Note for future research best embedding models
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device="cuda")

In [81]:
# Running on slow bum CPU
# CPU no batching takes 9.12 mins
# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/2469 [00:00<?, ?it/s]

In [28]:
%%time
# Embedding model using GPU

# embedding_model.to("cuda")

# for item in tqdm(pages_and_chunks_over_min_token_len):
#     item["embedding"] = embedding_model.encode(item["sentence_chunk"])

CPU times: total: 0 ns
Wall time: 0 ns


In [85]:
%%time
# running with CPU chunking
# CPU with batching takes 7.16 mins
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

text_chunk_embeddings = embedding_model.encode(text_chunks,
 batch_size=16, convert_to_tensors=True)

text_chunk_embeddings

CPU times: total: 1min 22s
Wall time: 10.9 s


array([[ 0.01565549,  0.05134168,  0.00469097, ...,  0.01968559,
         0.04327909,  0.00772345],
       [-0.00152143,  0.09567615,  0.00168284, ...,  0.02596665,
         0.01168879,  0.02988   ],
       [ 0.07282847,  0.04050159, -0.01421707, ..., -0.05362667,
         0.00808336, -0.01800963],
       ...,
       [ 0.00455149,  0.10821065,  0.01038493, ..., -0.0101995 ,
        -0.02181573,  0.00449938],
       [ 0.01957653,  0.06110422, -0.00176075, ..., -0.00284812,
         0.04169122, -0.02332427],
       [ 0.03256436,  0.10554793, -0.04640035, ...,  0.01929192,
         0.05620907,  0.01205661]], dtype=float32)

In [86]:
#Save embeddings
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [87]:
import pandas as pd

In [88]:
text_chunks_and_embeddings_df_load = pd.read_csv(embeddings_df_save_path)

text_chunks_and_embeddings_df_load.head()

Unnamed: 0,document_name,sentence_chunk,chunk_char_count,chunk_word_count,chunk_token_count,score,embedding
0,2009FallHorizons.pdf,I am dressed for SPEC in this impromptu photo ...,1234,204,308.5,,[ 1.56554971e-02 5.13416603e-02 4.69097216e-...
1,2009FallHorizons.pdf,He understands how to function in these far-an...,1095,187,273.75,,[-1.52143801e-03 9.56762135e-02 1.68284110e-...
2,2009FallHorizons.pdf,"Administration John Sellars, Ph. D. \t Preside...",3075,520,768.75,,[ 7.28284568e-02 4.05015796e-02 -1.42170694e-...
3,2009FallHorizons.pdf,She and her husband Jac Kirkpatrick are now la...,1182,204,295.5,,[ 4.19127801e-03 6.59697279e-02 -4.01556864e-...
4,2009FallHorizons.pdf,HealthEd Connect Expands Sherri’s Lifelong Pas...,1219,192,304.75,,[ 1.52009120e-02 9.64530185e-02 -1.43753057e-...


In [93]:
import torch
import numpy as np

# convert embeddins colum back to np.array from csv
# text_chunks_and_embeddings_df_load = pd.read_csv("text_chunk_embeddings_df.csv")
# text_chunks_and_embeddings_df["embedding"] = text_chunks_and_embeddings_df_load["embedding"].apply(lambda x: np.fromstring(x.strip("[]")))

device = "cuda" if torch.cuda.is_available() else "cpu"

embeddings = text_chunks_and_embeddings_df["embedding"].tolist()
embeddings = torch.tensor(np.stack(text_chunks_and_embeddings_df["embedding"].tolist(), axis=0), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([2469, 768])

In [98]:
# Create model
from sentence_transformers import util, SentenceTransformer
from time import perf_counter as timer

embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2",
                                      device=device)

query = "Graceland"
print(f"Query: {query}")

#Note: embed query with the same model you embedded your passage with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

Query: Graceland


In [100]:
start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_timer = timer()

print(f"Time taken to get scores on {(len(embeddings))} embeddings: {end_timer-start_time:.5f} seconds.")

# Get top 5 results of query search
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Time taken to get scores on 2469 embeddings: 0.00035 seconds.


torch.return_types.topk(
values=tensor([0.5841, 0.5794, 0.5716, 0.5634, 0.5628], device='cuda:0'),
indices=tensor([2399,  188, 1899,  796,  175], device='cuda:0'))

In [102]:
# Not great results
pages_and_chunks[1]

{'document_name': '2009FallHorizons.pdf',
 'sentence_chunk': 'I am dressed for SPEC in this impromptu photo with Sherri and Jac Kirkpatrick. They joined 1,200 high school students and volunteers in late July for the an- nual Community of Christ SPECTACULAR event on our Lamoni campus. See the photos on pages 8-11 of this issue of Horizons for glimpses of the energy and excitement that prevail during this wonderful weeklong camp. It was my pleasure to meet with Sherri and Jac as they inaugurate their new HealthEd Connect project that will bring village health care and community-based orphan programs to Sub-Sahara Africa, and help provide for the significant number of children in those countries who have AIDS. Sherri is truly the Albert Schweitzer of Graceland. She has traveled the world bringing health- care education and hope to those most in need. Her work with the Graceland-affiliated International Health Center for the last 20 years has been nothing short of miraculous. She has taken

In [75]:
# For vector search read up on Faiss, nearest neighbour search
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

In [105]:
query = "Alumna Chris Helene Bridge"
query_embedding = embedding_model.encode(query, convert_to_tensor=True)
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product


print(f"Query: {query}\n")
print("Results:")
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    print(f"Page number: {pages_and_chunks[idx]["document_name"]}")
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    print("\n")

Query: Alumna Chris Helene Bridge

Results:
Score: 0.5127
Page number: 2012FallHorizons.pdf
Text:
Graceland is now a 1st-tier school in the arts. Students from around the world
will follow their hearts and dreams to Graceland just for the chance to work and
learn in this magnificent facility. In the short-term, I have some very exciting
travel plans coming up this fall to meet with donors concerning athletics, the
sciences, the arts, music, agriculture, and more. In the long-term, I feel an
excitement for Grace- land’s future that seems to grow as each day passes. I
thank you for making this excitement possible. Sincerely, John Sellars, Ph. D.
President Annual Report G r a c e l a n d U n i v e r s i t y 2 0 12


Score: 0.4947
Page number: 2013FallHorizons.pdf
Text:
Fall 2013 Horizons  | 5 Powell House Created Tom Powell ’73 joined the Graceland
faculty in 1980 and then served as VP of Student Life from 1984 until January
2010. Graceland honors Tom’s legacy by naming a new mens’ house,

In [44]:
# Functionizing resources
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    
    query_embedding = model.encode(query, convert_to_tensor=True)

    start_time = timer()
    dot_scores = util.dot_score(query_embedding, embeddings)[0]
    end_time = timer()

    if print_time:
        print(f"Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")
    
    scores, indices = torch.topk(input=dot_scores,
                                k=n_resources_to_return)
    
    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]= pages_and_chunks,
                                 n_resources_to_return: int=5):
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    for scores, indices in zip(scores, indices):
        print(f"Score: {scores:.4f}")
        print(f"Page number: {pages_and_chunks[indices]["document_name"]}")
        print("Text:")
        print_wrapped(pages_and_chunks[indices]["sentence_chunk"])
        print("\n")

In [45]:
query = "Resch"
print_top_results_and_scores(query=query,embeddings=embeddings)

Time taken to get scores on 2469 embeddings: 0.00006 seconds.
Query: Resch

Results:
Score: 0.5490
Page number: 2010WinterHorizons.pdf
Text:
. . . . . . . . . . . . . . . . . . . .4,062 . .


Score: 0.4510
Page number: HorizonsSU18_FINAL.pdf
Text:
So, who among the class of 2018 will capture headlines in their lifetime. It’s
too soon to tell. They will take many different roads as they pursue their
passions. Some may lead to fame and fortune; others may provide more private
satisfactions of family and friendship. It is both a thrilling and unsettling
fact of teaching that you never know if what you’re saying may have a profound
life-changing impact, even when you wondered if they were listening at all. A
philosopher tells us that the goal in life is not to try to do extraordinary
things, but to do ordinary things with an appreciation of their extraordinary
significance. And so, we send our graduates out to do ordinary things with
confidence that, with their Graceland experience, they w

In [46]:
# checking hardware relative to LLM model
#7b model size model is what we'll be looking for this project.

import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))

In [47]:
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma2 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma2-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma2 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma2-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma2 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma2-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

GPU memory: 12 | Recommended model: Gemma2 2B in float16 or Gemma 7B in 4-bit precision.
use_quantization_config set to: False
model_id set to: google/gemma2-2b-it


In [48]:

from huggingface_hub import login

In [49]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available

# Create quantization config
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

from dotenv import load_dotenv, find_dotenv

dot_env_path = find_dotenv()
load_dotenv(dot_env_path)
access_token = os.getenv("HUGGING_FACE_KEY")


In [50]:
# loading LLM
# Model: gemma 7b
# Use Flash Attention 2 if possible

#login to huggingface
login(token=access_token)
print("Logged in successfully!")

# Flash attention 2 if possible
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
    attn_implementation = "flash_attention_2"
else:
    attn_implementation = "sdpa"

# Loading model
model_id = "google/gemma-2-2b-it"

# Instantiate tokenizer 
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it")

# Load model
llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    token=access_token,
    torch_dtype=torch.bfloat16,
    device_map=None,
    attn_implementation=attn_implementation
).to("cuda")

Logged in successfully!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [51]:
# Inspect model
llm_model

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (pre_feedforward_layernorm): Gemm

In [52]:
inputs_text = "Tell me about the Resch building."
print(f"Input text:\n{inputs_text}")

Input text:
Tell me about the Resch building.


In [53]:
dialog_template = [
    {"role":"user",
     "content": inputs_text}
]

prompt = tokenizer.apply_chat_template(conversation=dialog_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

print(f"\nPrompt (formatted):\n{prompt}")


Prompt (formatted):
<bos><start_of_turn>user
Tell me about the Resch building.<end_of_turn>
<start_of_turn>model



In [54]:
tokenizer

GemmaTokenizerFast(name_or_path='google/gemma-2-2b-it', vocab_size=256000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<bos>', 'eos_token': '<eos>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<eos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("<bos>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=False),
	5: AddedToken("<2mass>", rstrip=False, lstrip=False, single_w

In [57]:
%%time

prompt = "Tell me about North Park."

dialog_template = [
    {"role":"user",
     "content": prompt}
]

prompt = tokenizer.apply_chat_template(conversation=dialog_template,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Tokenize and move to GPU
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to("cuda")
attention_mask = inputs["attention_mask"].to("cuda")

# Generate
outputs = llm_model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_new_tokens=512,
    do_sample=True,
    temperature=0.7,
    top_p=0.95
)

# Decode the output
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

user
Tell me about North Park.
model
## North Park: A Vibrant Neighborhood in San Diego

North Park is a diverse and trendy neighborhood in San Diego, California, known for its unique blend of eclectic shops, restaurants, breweries, and nightlife.  Here's a glimpse:

**What makes North Park special?**

* **Eclectic Vibe:** It boasts a bohemian, artsy feel with vintage shops, independent boutiques, and quirky cafes. 
* **Foodie Paradise:** From farm-to-table restaurants to trendy food trucks, North Park offers a culinary scene that caters to every palate.
* **Craft Beer Capital:** Home to numerous craft breweries and brewpubs, North Park is a haven for beer lovers. 
* **Outdoor Activities:** The neighborhood boasts a vibrant park scene with the popular **North Park Community Park**, a dog-friendly space with a playground and grassy fields.
* **Walkable and bikeable:** North Park is a pedestrian-friendly neighborhood, with a network of well-maintained sidewalks and bike lanes.
* **Commun

In [58]:
def prompt_formatter(query: str,
                     context_items: list[dict]) -> str:
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    base_prompt = """Based on the following context items, please answer the query.
    Context items:
{context}
    Query: {query}
    Answer: 
    """

    prompt = base_prompt.format(context=context,
                                query=query)
    return prompt



In [59]:
query = "Tell me about the Resch building."

scores, indices = retrieve_relevant_resources(query=prompt,
                                              embeddings=embeddings)

context_items = [pages_and_chunks[i] for i in indices]

prompt = prompt_formatter(query=query,
                          context_items=context_items)

print(prompt)

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

outputs = llm_model.generate(**input_ids,
                             temperature=0.5,
                             do_sample=True,
                             max_new_tokens=512)

output_text = tokenizer.decode(outputs[0])
print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Time taken to get scores on 2469 embeddings: 0.00008 seconds.
Based on the following context items, please answer the query.
    Context items:
- Anne sees each addition to the milking herd she now owns as a distinct character and names her accordingly after an appropriate performer, goddess, NPR news broadcaster, or politician. There will likely be a Hillary frolicking among this season’s kids. Anne’s father (John Rufus Topham ’28) coached her on the finer points of hand milking, and her quest for the best use of the pure, white goat milk was on. Anne wanted to re-create the first goat cheese she had ever tasted, a confection that had been carried from Paris to Madison by the mother of a college friend. “It was a lovely, blooming-rind round of cheese resting on a bed of straw, and I’ve never forgotten it,” she says. Anne’s mother, Alta Royer ’28 Topham, was a piano stu- dent at GU and went on to become a valued member of the first RLDS Hymnal Committee. “My mother drove a Model T all 

In [60]:
# takes query and finds relevant news articles and then generates an answer based on the relevant resources.
def ask(query: str,
        temperature: float=0.7,
        max_new_tokens: int=512,
        fromat_answer_text = True,
        return_answer_only = True):
    
    # Retrieval 
    scores, indices = retrieve_relevant_resources(query=query,
                                                 embeddings=embeddings)
    
    context_items = [pages_and_chunks[i] for i in indices]

    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu()


    # Augmentation
    prompt = prompt_formatter(query=query,
                              context_items=context_items)
    
    
    #Generation
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    outputs = llm_model.generate(**input_ids,
                                 temperature= temperature,
                                 do_sample = True,
                                 max_new_tokens=max_new_tokens)
    
    output_text = tokenizer.decode(outputs[0])

    if fromat_answer_text:
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>","")
    
    if return_answer_only:
        return output_text, context_items


In [63]:
query = "North Park"

ask(query=query)

Time taken to get scores on 2469 embeddings: 0.00006 seconds.


('"The context you provided does not mention North Park." \n<end_of_turn>',
 [{'document_name': '2009SpringHorizons.pdf',
   'sentence_chunk': 'Join me in celebrating a handful of the thousands of ‘Graceland Stories’ we would like to tell. The stellar alumni you will read about here graduated as long ago as 1953 and as recently as 2008. Their visions and efforts cover the gamut of the human experience. I champion their achievements. To all Gracelanders I would like to say, hold on to your most-cherished goals and be guided by the life-path vi- sions you took away from this university. Do you remem- ber the dreams you had on your Commencement Day? Reach for them.     John Sellars      President Administration John Sellars, Ph. D. \t President Steven L. Anders, Ph. D. ’73 \t Vice President for Academic Affairs and Dean of Faculty Kathleen M. Clauson Bash, Ph. D. \t Vice President for Institutional Effectiveness Kelly W. Everett, B. A. ’77 \t Vice President for Institutional Advancement S