In [29]:
import requests
from bs4 import BeautifulSoup

# Pages to scrape; each one becomes one record in `test_data`.
urls = [
    "https://www.charlotte.edu/",
    "https://www.charlotte.edu/research",
    "https://www.charlotte.edu/landing/campus-life",
]
test_data = []

for url in urls:
    # A timeout keeps a stalled server from hanging the notebook, and HTTP
    # errors are raised instead of silently scraping an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # find() returns None when a tag is absent; fall back instead of failing
    # with AttributeError on pages without <title> or <body>.
    title = soup.find('title')
    body = soup.find('body')
    text_root = body if body is not None else soup
    body_text = text_root.get_text(strip=True, separator=' ')

    test_data.append({
        "url": url,
        "title": title.text if title is not None else "",
        "body_text": body_text
    })

In [30]:
import pandas as pd

# One row per scraped page, with the columns 'url', 'title' and 'body_text'
# taken from the keys of the dicts collected in `test_data`.
df = pd.DataFrame(test_data) 

In [31]:
# Navigation header and footer text that should not take part in summarization.
text_to_remove = ["Skip to main content Admissions & Financial Aid Academics Research Athletics Campus Life Diversity Alumni & Friends Faculty & Staff Prospective Students Community Current Students Parents & Family My UNC Charlotte Directory Make a Gift Library About Us Apply Now Visit Our Campus Give to UNC Charlotte Take a Virtual Tour Corporate Engagement ",
                  "Click For More Less toggle footer Campus Links Alerts Jobs Make a Gift Maps / Directions Accessibility Resources Alumni & Friends Faculty & Staff Prospective Students Community Current Students Parents and Family Stay In Touch facebook instagram flickr linkedin twitter youtube maps The University of North Carolina at Charlotte 9201 University City Blvd, Charlotte, NC 28223-0001 704-687-8622 © 2024 UNC Charlotte | All Rights Reserved Contact Us | Terms of Use | University Policies Report a Concern"]

for text in text_to_remove:
    # regex=False makes this a literal substring removal on every pandas
    # version. If the text were interpreted as a regular expression, the '|'
    # characters in the footer would act as alternation and the full footer
    # would never be matched as one string.
    df['body_text'] = df['body_text'].str.replace(text, '', regex=False)

In [32]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Single source of truth for the checkpoint used by both tokenizer and model.
MODEL_ID = "google/gemma-2b-it"
# Use the first GPU when one is available; otherwise fall back to the CPU
# instead of failing while loading the model.
DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map=DEVICE, torch_dtype=torch.bfloat16)

# Prompt template with the placeholders {url}, {title} and {body_text}.
# It ends with an opening ``` fence; the summary is the text the model
# generates after that fence.
PROMPT = (
    "Given the url: {url}, and title: {title}, "
    "Describe the content of the webpage in 3 sentences: {body_text} ```"
)

def extract_summary(text):
    """Return the segment that follows the first ``` fence in ``text``.

    The segment ends at the next ``` fence, or at the end of the string when
    there is no further fence. Every "<eos>" marker is removed and surrounding
    whitespace is stripped.

    Args:
        text (str): Decoded model output containing the ``` delimiter.

    Returns:
        str | None: The cleaned segment, or None when ``text`` contains no
        ``` fence at all.
    """
    parts = text.split("```")
    if len(parts) < 2:
        return None
    # Remove the end-of-sequence marker first, so whitespace that sat in front
    # of it is stripped as well.
    return parts[1].replace("<eos>", "").strip()

def generate_summary(row):
    """Generate a short description of one scraped page.

    Args:
        row: Mapping-like row (a DataFrame row when used with ``df.apply``)
            providing the 'url', 'title' and 'body_text' fields.

    Returns:
        str: The generated text, cut at the first ``` fence the model emits
        (if any) and stripped of surrounding whitespace.
    """
    prompt = PROMPT.format(url=row['url'], title=row['title'], body_text=row['body_text'])
    # Place the inputs on whatever device the model was loaded on rather than
    # hard-coding a GPU name.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(**inputs, max_new_tokens=50)

    # The returned sequence is the prompt tokens followed by the new tokens.
    # Decode only the new ones, so a ``` that happens to occur in the page
    # text can never be mistaken for the delimiter that ends the prompt.
    prompt_length = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    return generated.split("```")[0].strip()



# Call generate_summary once per row (axis=1 passes each row to the function)
# and store the returned values in a new 'summary' column.
df['summary'] = df.apply(generate_summary, axis=1)

# Show the new column as the cell output.
df['summary']


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

0    This webpage is a portal to the University of ...
1    This webpage is a website for researchers at t...
2    This webpage provides a comprehensive overview...
Name: summary, dtype: object

In [33]:
# Print the first summary in full (the column display above truncates it).
# .iloc selects by position, so this stays the first row even if the frame's
# index labels do not start at 0.
print(df['summary'].iloc[0])


This webpage is a portal to the University of North Carolina at Charlotte, highlighting various news and events, academic programs, research opportunities, and student life experiences.


In [34]:
import gc

# The weights stay allocated for as long as a name still refers to them, so
# drop the notebook-level references to the language model first.
model = None
tokenizer = None
# Collect unreachable objects before emptying the CUDA cache; the cache can
# only hand back blocks that no live tensor is using.
gc.collect()
torch.cuda.empty_cache()

20

In [42]:
from sentence_transformers import SentenceTransformer, util
import torch

# NOTE: this rebinds `model`, which earlier in the notebook held the text
# generation model used by generate_summary.
model = SentenceTransformer('msmarco-distilbert-base-v3')

# Encode a plain list of strings, replacing missing summaries with an empty
# string, so the encoder never receives None. Row order is preserved:
# embeddings[i] belongs to the i-th row of `df`.
summary_texts = df["summary"].fillna("").tolist()
embeddings = model.encode(summary_texts, convert_to_tensor=True)

def search(query, top_k):
    """Rank the summarized pages by cosine similarity to ``query``.

    Args:
        query (str): Free-text search query.
        top_k (int): Maximum number of results to return. Values larger than
            the number of embedded pages are clamped; values below 1 give an
            empty list.

    Returns:
        list[dict]: Best matches first, each with the keys 'url', 'score',
        'title' and 'summary'.
    """
    # torch.topk raises when k is negative or exceeds the number of scores,
    # so clamp it to the valid range up front.
    k = max(0, min(int(top_k), embeddings.shape[0]))
    if k == 0:
        return []

    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, embeddings)[0].cpu()

    top_scores, top_indices = torch.topk(cos_scores, k=k)

    results = []
    for idx, score in zip(top_indices.tolist(), top_scores.tolist()):
        # The indices are positions in `embeddings`, so select rows by
        # position rather than by index label.
        row = df.iloc[idx]
        results.append({
            'url': row['url'],
            'score': score,
            'title': row['title'],
            'summary': row['summary'],
        })

    return results

In [43]:
# Try a few example queries, asking for a different number of hits each time.
for example_query, n_results in (('research', 1), ('campus life', 2), ('home page', 3)):
    print(search(example_query, n_results))

[{'url': 'https://www.charlotte.edu/research', 'score': 0.40909749269485474, 'title': 'Research at the University of North Carolina at Charlotte | UNC Charlotte', 'summary': 'This webpage is a website for researchers at the University of North Carolina at Charlotte. It provides information about the research conducted by faculty and staff, as well as the research collaborations and funding opportunities available to researchers.'}]
[{'url': 'https://www.charlotte.edu/landing/campus-life', 'score': 0.303379625082016, 'title': 'Campus life at the University of North Carolina at Charlotte | UNC Charlotte', 'summary': 'This webpage provides a comprehensive overview of the various activities and resources available on campus at the University of North Carolina at Charlotte. It highlights the many opportunities for students to get involved and make new friends, as well as the many events and activities that are'}, {'url': 'https://www.charlotte.edu/', 'score': 0.3017105460166931, 'title': 'T

In [52]:
import nest_asyncio
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import uvicorn

app = FastAPI()

@app.get("/search")
def search_api(query: str, top_k: int):
    """HTTP endpoint that returns the best-matching pages for ``query``.

    Args:
        query: Search text. A blank or whitespace-only query yields no results.
        top_k: Maximum number of results. It is capped at the number of rows
            in ``df``; values below 1 yield no results.

    Returns:
        list[dict]: The result of ``search(query, top_k)``, or an empty list.
    """
    if not query.strip():
        return []
    # Cap at the corpus size and reject non-positive values so an invalid k
    # never reaches the ranking step and turns into a server error.
    top_k = min(top_k, len(df))
    if top_k < 1:
        return []
    # `search` is looked up in the notebook globals at request time, so it
    # must still refer to the embedding-based search when requests arrive.
    return search(query, top_k)

# Allow browser frontends on any origin to call the API. Credentials are
# disabled because a wildcard origin must not be combined with credentialed
# requests, and this read-only endpoint does not use cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Required for running server in Jupyter notebook
nest_asyncio.apply()
# This call does not return until the server is stopped (for example by
# interrupting the kernel), so the cell stays busy while the API is served.
uvicorn.run(app, host='localhost', port=8000)

INFO:     Started server process [1368]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://localhost:8000 (Press CTRL+C to quit)


INFO:     127.0.0.1:64560 - "GET /search?query=%20&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64560 - "GET /search?query=%20%20&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64560 - "GET /search?query=%20%20%20&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64560 - "GET /search?query=&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64560 - "GET /search?query=t&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64563 - "GET /search?query=tes&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64562 - "GET /search?query=te&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64563 - "GET /search?query=test&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64563 - "GET /search?query=&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64563 - "GET /search?query=r&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64562 - "GET /search?query=re&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64563 - "GET /search?query=res&top_k=2 HTTP/1.1" 200 OK
INFO:     127.0.0.1:64563 - "GET /search?query=rese&top_k=2 HTTP/1.1" 200 OK
INFO:     127

INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1368]


KeyboardInterrupt: 

In [48]:
# Call the search api

import requests

# Let requests build and encode the query string, fail fast if the server is
# not reachable, and surface HTTP errors instead of parsing an error body.
response = requests.get(
    "http://localhost:8000/search",
    params={"query": "research", "top_k": 1},
    timeout=10,
)
response.raise_for_status()
print(response.json())

Task exception was never retrieved
future: <Task finished name='Task-1' coro=<Server.serve() done, defined at C:\Users\Ryan\AppData\Roaming\Python\Python312\site-packages\uvicorn\server.py:67> exception=KeyboardInterrupt()>
Traceback (most recent call last):
  File "C:\Users\Ryan\AppData\Roaming\Python\Python312\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Ryan\AppData\Local\Temp\ipykernel_1368\3586425972.py", line 13, in <module>
    uvicorn.run(app, host='localhost', port=8000)
  File "C:\Users\Ryan\AppData\Roaming\Python\Python312\site-packages\uvicorn\main.py", line 575, in run
    server.run()
  File "C:\Users\Ryan\AppData\Roaming\Python\Python312\site-packages\uvicorn\server.py", line 65, in run
    return asyncio.run(self.serve(sockets=sockets))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Ryan\AppData\Roaming\Python\Python312\site-packages\nest_asyncio.py", 

KeyboardInterrupt: 

In [None]:
# Small frontend to interact with the search API

from ipywidgets import interact, widgets
import requests

def search_via_api(query, top_k):
    """Query the local search API and return the decoded JSON response.

    This helper deliberately has its own name: calling it `search` would
    replace the embedding-based `search` function that the API endpoint calls.

    Args:
        query (str): Search text typed into the widget.
        top_k (int): Number of results requested via the slider.

    Returns:
        The JSON body returned by ``GET /search``.
    """
    response = requests.get(
        "http://localhost:8000/search",
        # Passing params lets requests URL-encode spaces and characters such
        # as '&' or '#' that would otherwise corrupt the query string.
        params={"query": query, "top_k": top_k},
        timeout=10,
    )
    response.raise_for_status()
    return response.json()


# One interactive search box per example query.
for default_query in ("research", "campus life", "home page"):
    interact(
        search_via_api,
        query=default_query,
        top_k=widgets.IntSlider(min=1, max=5, step=1, value=1),
    )
