In [3]:
# Environment setup
from dotenv import load_dotenv
import os

# Load the .env file
load_dotenv()

# Get the access token
access_token = os.getenv("HUGGINGFACE_TOKEN")

In [1]:
# Summary generation

# import ./data/endpoints.csv with rows URL,Text
import pandas as pd
endpoints = pd.read_csv('./data/endpoints.csv')

# remove any rows with empty URL or Text
endpoints = endpoints.dropna(subset=['URL', 'Text'])

#filter out urls with search term in the text ex. ?e=, ?q=, ?s=, ?search=
filter_out_terms = ['?']
# Create a mask that is True for rows where the 'Text' column contains any of the filter_out_terms
mask = endpoints['URL'].apply(lambda x: any(term in x for term in filter_out_terms))
# Use ~ to negate the mask, i.e., keep only the rows where the mask is False
endpoints = endpoints[~mask]

#sort endpoints by how many paths they have ex. None, /path, /path/path, /path/path/path
# get the number of paths in a url
def get_path_count(url: str) -> int:
    return url.count('/') - 2
endpoints['path_count'] = endpoints['URL'].apply(get_path_count)
endpoints = endpoints.sort_values(by=['path_count'])

# endpoints where path_count is 
test_set = endpoints[endpoints['path_count'] < 2]

# Ensure "Text" column is a string
test_set['Text'] = test_set['Text'].astype(str)

# Remove any rows with "Text" less than 50 characters
test_set = test_set[test_set['Text'].str.len() > 50]

# Remove any rows with duplicate URLs
test_set = test_set.drop_duplicates(subset=['URL'])

# save test set to csv
test_set.to_csv('./data/test_set.csv', index=False)

# get the first 30 rows
test_set.head(100)

# convert the test set to a list of tuples (URL, Text)
test_set = list(test_set.itertuples(index=False, name=None))

test_set

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_set['Text'] = test_set['Text'].astype(str)


[('https://charlotte.edu',
  'The University of North Carolina at Charlotte | UNC Charlotte Skip to main content News & Events News Music students participating in touring education production Tue, 02/06/2024 UNC Charlotte receives Library Excellence in Access and Diversity Award Fri, 02/02/2024 Excellence in Leadership Awards bestowed on 10 outstanding alumni Fri, 02/02/2024 Young alumni advancing in their fields and communities Thu, 01/25/2024 Noted neuroscience researcher Kelly Cartwright named Spangler Distinguished Professor of Early Literacy Wed, 01/24/2024 View All News Events UNC Charlotte Shape What\'s Next UNC Charlotte Icons 0 doctoral programs UNC Charlotte Icons 0 Living Alumni UNC Charlotte Icons 0 #NinerNation Undergrads to Overachievers Variety is more than the spice of life. It is life! The world offers a broader range of career opportunities than ever before, which is why we offer the way to explore and prepare for so many of them right. Choose from diverse majors in 

In [4]:
# LLM Summary Generation 
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
from textwrap import dedent

MODEL = "google/gemma-2b-it" # Newer and small model, promising given the size to performance as well as open source
# MODEL = "yam-peleg/Experiment26-7B" # too large for my GPU

device = "cuda:0" if torch.cuda.is_available() else "cpu"
#device = "cpu"
print(device)

config = AutoConfig.from_pretrained(MODEL, use_auth_token=access_token, max_new_tokens=800)
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)

model = AutoModelForCausalLM.from_pretrained(MODEL, use_auth_token=access_token, config=config, quantization_config=quantization_config)#.to(device) #.to not needed if quantization is used, defaults to cuda
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_auth_token=access_token, config=config)

generator = pipeline("text-generation", model=MODEL, config=config)

cuda:0


`low_cpu_mem_usage` was None, now set to True since model is quantized.


bin C:\Users\Ryan\AppData\Roaming\Python\Python312\site-packages\bitsandbytes\libbitsandbytes_cuda121.dll


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
import json

LLM_INSTRUCT_PROMPT = dedent('''\
Given the URL "{url}", and the following text from the site: "{text}" provide a concise summary that includes: 
1. The main topics in the text.
2. The purpose or objective of the website, inferred from the text and url (including subdomain and path).
3. Tags or keywords that a user may search to try and find the site in a search engine.

You must utilize information from the URL (such as the specific path and subdomain) to contextualize and add to the understanding of the text.

Format the response in a json like the following:

'summary': 'The description of url and summary of the text goes here.',
'topics': ['topic1', 'topic2', 'topic3'],
'tags': ['tag1', 'tag2', 'tag3']
''')

# This function takes a prompt and returns generated text with transformers generator
def generate_summary(url: str, text: str, debug: bool = False) -> str:
    prompt = LLM_INSTRUCT_PROMPT.format(url=url, text=text)
    return generator(prompt, max_length=800, do_sample=True)

# Manual
def gen_summary(url: str, text:str, debug: bool = False) -> str:
    chat = [
        { "role": "user", "content": LLM_INSTRUCT_PROMPT.format(url=url, text=text) },
    ]
    prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
    input_ids = tokenizer([prompt], add_special_tokens=False, return_tensors="pt").to(device)
    output = model.generate(**input_ids, max_new_tokens=1000)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Helpers
def fill_prompt(input_text: str) -> str:
    return LLM_INSTRUCT_PROMPT + " " + input_text

def extract_summary(llm_output: str):
    model_index = llm_output.find("'tags': ['tag1', 'tag2', 'tag3']\nmodel")
    if model_index != -1:
        json_string = llm_output[model_index + 40:]
        
        # Replace ",] with "] to fix json formatting, common issue with LLM
        json_string = json_string.replace('",]', '"]')
        
        # If last character is not } then add it
        if json_string[-1] != '}':
            json_string += '}'
            
        # If first character is not { then add it
        if json_string[0] != '{':
            json_string = '{' + json_string
        
        try:
            json_object = json.loads(json_string)
            return json_object
        except:
            return None
    else:
        raise ValueError('Could not find model output in LLM output')

In [17]:
print(len(test_set))

4380


In [18]:
from IPython.display import clear_output, display
import time
import json

current_display = display('Starting...', display_id=True)
progress_display = display('Starting...', display_id=True)
time_display = display('Starting...', display_id=True)
success_count = 0
failure_count = 0
avg_time = 0
last_time = 0
current_time = 0

# Test llm summaries
summary_outputs = []
extract_summaries = []
final_urls = []

for t in test_set:
    start_time = time.time()
    try:
        sum = gen_summary(t[0],t[1])
        summary_outputs.append(sum)
        extract = extract_summary(sum)
        extract_summaries.append(extract)
        if extract is not None:
            extract['url']=t[0]
            final_urls.append(extract)
            success_count += 1
            current_display.update(f'Most recent URL: {t[0]}, Extract: {extract["summary"][:60]}...')
            progress_display.update(f'Successes: {success_count}, Failures: {failure_count}')
            
            # Save the final urls to a json file every 10 summaries
            if success_count % 10 == 0:
                with open('./data/sublist_llm_summaries_from_pipeline.json', 'w') as f:
                    json.dump(final_urls, f, indent=4)
        else:
            raise Exception('Error parsing json')
    except Exception as e:
        failure_count += 1
        current_display.update(f'Most recent URL: {t[0]} \nError: {e}')
        progress_display.update(f'Successes: {success_count}, Failures: {failure_count}')
        continue
    end_time = time.time()
    last_time = end_time - start_time
    avg_time = (avg_time * (success_count - 1) + last_time) / success_count
    time_display.update(f'Average time: {avg_time}, Last time: {last_time}')
            
# Save the final urls to a json file one last time at the end
with open('./data/sublist_llm_summaries_from_pipeline.json', 'w') as f:
    json.dump(final_urls, f, indent=4)

'Most recent URL: https://geoearth.charlotte.edu/martha-cary-missy-eppes, Extract: Dr. Martha Cary (Missy) Eppes is a professor in the Departme...'

'Successes: 558, Failures: 4'

KeyboardInterrupt: 

In [19]:
# Save the final urls to a json file
with open('./data/full_llm_summaries_from_pipeline.json', 'w') as f:
    json.dump(final_urls, f, indent=4)