In [1]:
import requests
import json
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
# Chat-completions endpoint of the Ollama server running on this machine.
# `info` posts the prompt here and reads the reply from `choices[0].message.content`.
OLLAMA_API_URL = "http://localhost:11434/v1/chat/completions"  # Default Ollama API URL

In [3]:
#This function defines what you want to ask OLLAMA to do with the HTML output from the other script. Make sure you have loaded the correct model in OLLAMA.

def info(article_text, article_tables, model="llama2", timeout=600):
    """Ask the Ollama model for the income eligibility limits found in an article.

    Builds a single prompt from the article text (plus any table content) and
    posts it to the chat-completions endpoint at ``OLLAMA_API_URL``.

    Args:
        article_text: Plain text of the article. ``None`` is treated as empty.
        article_tables: Table content appended after the text. A string is used
            as-is; a list/tuple is joined with blank lines; any other non-empty
            value is converted with ``str``. Falsy values are skipped.
        model: Name of the Ollama model to query. Make sure it matches the
            model you have loaded in Ollama.
        timeout: Seconds to wait for the server before giving up. ``None``
            waits indefinitely (the previous behavior).

    Returns:
        The model's answer with surrounding whitespace removed, or ``None`` if
        the request fails, the server answers with a non-200 status, or the
        response body does not have the expected shape.
    """
    full_content = (article_text or "").strip()

    if article_tables:
        # The crawler may hand tables over as a list rather than one string;
        # normalize so that .strip() below cannot fail.
        if isinstance(article_tables, str):
            tables_text = article_tables
        elif isinstance(article_tables, (list, tuple)):
            tables_text = "\n\n".join(str(table) for table in article_tables)
        else:
            tables_text = str(article_tables)

        full_content += "\n\n--- TABLES ---\n"
        full_content += tables_text.strip()

    # This is the prompt for OLLAMA; edit it to ask whatever you want.
    prompt = (
        "From the following article, extract the income eligibility limits (if mentioned) for the Homelessness Prevention Program. "
        "Be concise and quote the source if possible.\n\n"
        "--- ARTICLE START ---\n"
        f"{full_content}\n"
        "--- ARTICLE END ---\n\n"
        "What are the income eligibility limits for this program?"
    )

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt}
        ]
    }

    headers = {
        "Content-Type": "application/json"
    }

    # A timeout keeps one stuck request from hanging a long batch run, and
    # catching transport errors lets the caller carry on with the next article.
    try:
        response = requests.post(
            OLLAMA_API_URL, json=payload, headers=headers, timeout=timeout
        )
    except requests.RequestException as exc:
        print(f"Error: request to Ollama failed - {exc}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    # A 200 reply can still carry a body that is not JSON or lacks the expected keys.
    try:
        return response.json()['choices'][0]['message']['content'].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        print(f"Error: unexpected response format - {exc!r}")
        return None


In [None]:
# NOTE: this cell can take about an hour to run, depending on how many results you have.
# Read the JSON created by the income limits crawler; replace the path with your own JSON file.
with open("C:\\Users\\mt886\\OneDrive\\Documents\\GitHub\\HPP_Crawler\\hpp10r.json", 'r', encoding='utf-8') as file:
    articles = json.loads(file.read())

# One result dict is collected here for every article that has text
article_data = []

for article in articles:
    # Pull out the fields of interest in one go
    article_text, article_url, article_location, article_tables = (
        article.get(field) for field in ("text", "link", "location", "tables")
    )

    # Nothing to send to the model when the article has no text
    if not article_text:
        continue

    # Let OLLAMA parse the text and tables
    info_you_want = info(article_text, article_tables)

    data_entry = {
        "location": article_location,
        "url": article_url,
        "limits": info_you_want,
    }

    # Keep the raw tables next to the answer only when there are any
    if article_tables:
        data_entry["tables"] = article_tables

    article_data.append(data_entry)

In [5]:
# Persist the LLM-processed results as pretty-printed JSON,
# keeping non-ASCII characters unescaped.
with open('hppincomellm.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(article_data, indent=4, ensure_ascii=False))

In [6]:
# This cell saves a second copy of the results without the bulky table text.

# Load the LLM-processed results from the JSON file
with open('hppincomellm.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Drop the 'tables' key from every entry; items that are not dicts are left untouched
for item in data:
    if isinstance(item, dict):
        item.pop('tables', None)

# ensure_ascii=False keeps non-ASCII characters readable instead of \uXXXX escapes,
# matching how hppincomellm.json itself is written
with open('hppincomellmnotables.json', 'w', encoding='utf-8') as file:
    json.dump(data, file, indent=4, ensure_ascii=False)
