In [None]:
import api_test
import old_rag
import os
import requests
import importlib
importlib.reload(old_rag)
importlib.reload(api_test)

from newspaper import Article
from newspaper.article import ArticleException
from keybert import KeyBERT
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer

import psycopg2
from psycopg2 import extensions
from pgvector.psycopg2 import register_vector
import psycopg2.extras

In [4]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
NEWS_API_KEY = os.getenv("NEWS_API_KEY")
POSTGRESQL_PWD = os.getenv("POSTGRESQL_PWD")
print(GEMINI_API_KEY)
print(NEWS_API_KEY)
print(POSTGRESQL_PWD)

AIzaSyDZJa4XnMVzxwE4CekARlI2jsbJM8JoMaw
05c5488a934a4e33a702e25364b898c1
CebuCity1!


In [81]:
# ONLY RUN WHEN NEEDED
url = ('https://newsapi.org/v2/everything?'
       'language=en&'
       'q=Gaza&'
       'from=2025-07-01&'
       'sortBy=relevancy&'
       'pageSize=100&'
       f'apiKey={NEWS_API_KEY}')

response = requests.get(url)
print(response.json())

{'status': 'ok', 'totalResults': 6382, 'articles': [{'source': {'id': None, 'name': 'BBC News'}, 'author': None, 'title': 'Gaza aid contractor tells BBC he saw colleagues fire on hungry Palestinians', 'description': 'The Israel- and US-backed Gaza Humanitarian Foundation has strongly denied the allegation.', 'url': 'https://www.bbc.com/news/articles/cnvmry71q5yo', 'urlToImage': 'https://ichef.bbci.co.uk/news/1024/branded_news/b134/live/e8e06df0-5847-11f0-b5c5-012c5796682d.jpg', 'publishedAt': '2025-07-03T20:14:26Z', 'content': "Lucy WilliamsonBBC Middle East correspondent\r\nThe contractor shared footage from inside a GHF site with the BBC\r\nA former security contractor for Gaza's controversial new Israel- and US-backed aid di… [+4847 chars]"}, {'source': {'id': None, 'name': 'BBC News'}, 'author': None, 'title': "Jeremy Bowen: Israel's allies see evidence of war crimes in Gaza mounting up", 'description': 'The Middle East has been transformed since 7 October, and almost two years on,

In [106]:
#--------------------- NEW RAG JUST DROPPED ---------------------#
# REWRITING ARTICLE CONTENT 
def rewrite_content(article_url):
    """
    given a url to some article, return its content
    this is really just a helper function. also has safeguards for articles that block the scraping tool. 
    """
    article = Article(article_url)
    try: 
        article.download()
        article.parse()
    
        # cleaning it up
        text = article.text
        filtered_lines = filter(str.strip, text.splitlines())
        cleaned_text = "\n".join(filtered_lines)
        return cleaned_text

    except ArticleException as e:
        print(f"[!] Skipping article at {article_url} — {e}")
        return None
    
def extract_content(article_list):
    """
    given a list of articles, get the content for them
    """

    for i in range(len(article_list)):
        content = rewrite_content(article_list[i]['url'])

        # skip articles that contain nothing or that block the scraper
        if (article_list[i]['content'] == None) or (content == None):
            continue
        article_list[i]['content'] = content


# CHUNK THE DATA
class ArticleChunk:
    def __init__(self, author, url, title, source, content):
        self.author = author
        self.title = title
        self.source = source
        self.url = url
        self.content = content
        self.embedding = None
    
    def get_citation(self):
        return "citation here"
    
    def embed(self, embedding):
        self.embedding = embedding.tolist()
    
def divide_content(content, max_tokens=300, overlap=25):
    """
    input: a string of content from an article
    output: a flat list of said content in max 300 token long chunks each, with 25 token of overlap each
    """
    # Split content into tokens (assuming whitespace tokenization)
    tokens = content.split()

    # Validate overlap
    if overlap >= max_tokens:
        raise ValueError("Overlap must be smaller than max_tokens.")

    chunks = []
    start = 0

    while start < len(tokens):
        end = start + max_tokens
        chunk = tokens[start:end]
        chunks.append(" ".join(chunk))

        # Move start forward, keeping overlap
        start += max_tokens - overlap

    return chunks
    
def chunkify(raw_articles):
    """
    each item here is a dict with content. need to break each into 300 token chunks
    output will be a list of ArticleChunks. 
    """
    chunks = []

    for article in raw_articles:
        # small content can go straight into a chunk

        if len(article['content'].split()) < 300:
            chunks.append(ArticleChunk(article['author'], article['url'], article['title'], article['source']['name'], article['content']))
        else:
            # divide article content into max 300 token long chunks with 30 token overlap each, this is a list of content
            divided_content = divide_content(article['content'])
            for content in divided_content:
                chunks.append(ArticleChunk(article['author'], article['url'], article['title'], article['source']['name'], content))

    return chunks

def add_embeddings(article_chunks, sentence_model):
    """
    input: list of article chunks
    output: chunks with embeddings
    """
    
    for chunk in article_chunks:
        
        embedding = sentence_model.encode(chunk.content)

        chunk.embed(embedding)


#------------------------- SQL STUFF -------------------------#
def store_sql(article_chunks):
    """
    stores chunks into a postgreSQL database. 
    input: list of objects of class ArticleChunk, that have vector embeddings
    """
    # SQL connection params
    hostname = 'localhost'
    database = 'newsdash'
    username = 'postgres'
    pwd = POSTGRESQL_PWD
    port_id = 5432
    conn = None
    cur = None

    print("article chunk length: ", len(article_chunks))
    
    try:
        # initializing connection object
        conn = psycopg2.connect(
                    host = hostname,
                    database = database,
                    user = username,
                    password = pwd,
                    port = port_id)

        register_vector(conn)

        # Turn on autocommit so CREATE EXTENSION runs immediately
        conn.set_isolation_level(extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        
        # open a cursor, performs SQL operations
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
        cur.execute('CREATE EXTENSION IF NOT EXISTS "uuid-ossp";')
        cur.execute("DROP TABLE IF EXISTS chunked_data;")

        # Switch back to transactional mode to make the table
        conn.set_isolation_level(extensions.ISOLATION_LEVEL_READ_COMMITTED)
        
        create_script = ''' CREATE TABLE IF NOT EXISTS chunked_data (
                                chunk_id UUID NOT NULL PRIMARY KEY,
                                author VARCHAR(50),
                                url VARCHAR(300),
                                title VARCHAR(200),
                                source VARCHAR(100),
                                content TEXT,
                                embedding VECTOR(384))'''
        
        cur.execute(create_script)
        # insert data into the table

        insert_script = 'INSERT INTO chunked_data (chunk_id, author, url, title, source, content, embedding) VALUES (uuid_generate_v4(), %s, %s, %s, %s, %s, %s)'
        
        for chunk in article_chunks:
            data = [chunk.author, chunk.url, chunk.title, chunk.source, chunk.content, chunk.embedding]
            cur.execute(insert_script, data)

        # always place this at the bottom
        conn.commit()

    except Exception as error:
        import traceback; traceback.print_exc()
        raise
    finally:
        # close the cursor and the connection
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

def fetch_sql(encoded_query, n_chunks):
    """
    stores chunks into a postgreSQL database. 
    input: list of objects of class ArticleChunk, that have vector embeddings
    """
    # SQL connection params
    hostname = 'localhost'
    database = 'newsdash'
    username = 'postgres'
    pwd = POSTGRESQL_PWD
    port_id = 5432
    conn = None
    cur = None
    
    try:
        # initializing connection object
        conn = psycopg2.connect(
                    host = hostname,
                    database = database,
                    user = username,
                    password = pwd,
                    port = port_id)

        register_vector(conn)

        # Turn on autocommit so CREATE EXTENSION runs immediately
        conn.set_isolation_level(extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        
        # open a cursor, performs SQL operations
        cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)

        # Switch back to transactional mode to make the table
        conn.set_isolation_level(extensions.ISOLATION_LEVEL_READ_COMMITTED)

        # fetching data
        featch_script ="""
        SELECT
            chunk_id,
            author,
            url,
            title,
            source,
            content,
            embedding <=> %s   AS cosine_dist
        FROM chunked_data
        ORDER BY cosine_dist
        LIMIT %s;
        """

        cur.execute(featch_script, (encoded_query, n_chunks))
        top_n_chunks = cur.fetchall()
        
        print("SQL select completed")
        # always place this at the bottom
        conn.commit()

    except Exception as error:
        import traceback; traceback.print_exc()
        raise
    finally:
        # close the cursor and the connection
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
        
        return top_n_chunks
        

#------------------------- GEMINI -------------------------#
def read_with_gemini(top_n_chunks):
    """
    feed the chunks, a prompt, and system instructions into gemini
    """
    just_contnent = []

    for chunk in top_n_chunks:
        just_contnent.append(chunk['content'])

    # turn the list of chunks into a string
    sep = ".\n"
    content = sep.join(just_contnent)
    
    instructions = "You don't know anything except the information provided for you. Base your answer solely off of this information provided."
    prompt = "Evaluate the validity of the users question, or generate an accurate summary from the information provided."

    gemini_response = api_test.ask_gemini(prompt, content, instructions, test_mode=True)

    return gemini_response

def get_citation(articles):
    """
    get citation format of the sources used
    """
    final_citation = "Response is based on the following sources: \n"
    citation_list = []

    for article in articles:
        citation_list.append(
                        f"{article['author']}. "
                        f"\"{article['title']}\". "
                        f"{article['source']}. "
                        f"{article['url']}\n"
                    )
        
    # remove duplicates
    unique_citation = list(set(citation_list))

    # turn into string
    sep = " "
    final_citation += sep.join(unique_citation)
    final_citation = final_citation.replace("None.", "").strip()
    
    return final_citation


In [107]:
def new_rag_system():
    """
    1. get a user input, use its keyword to search the API
    2. rewrite the said content of each in a list of articles, each is a dict
    3. chunk up the data, with some overlap between chunks, also get an embedding for each of them
    4. store each of these chunks, along with info into a database
    5. gather the N most similar chunks to the user's query 
    6. feed these chunks into the LLM prompt
    7. return the answer as well as the sources used
    """

    # 1. get a user input, use its keyword to search the API
    # user input
    query = "Gaza conflict"
    date = "2025-07-01"
    everything = True
    sentence_model = SentenceTransformer('all-MiniLM-L6-v2')
    n_chunks = 5             # how many chunks to base llm response on
    n_articles = 20          # how many articles to base llm response on

    # put this keyword into the API search as its "q" value and sort by relevance
    keyword = old_rag.get_keyword(query)

    # call the news_api with a set of parameters, remember there's a limit so comment this out whenever you can.
    # response = call_news_api(keyword, date, NEWS_API_KEY, everything)
    response_data = response.json()

    # 2. rewrite the said content of each in a list of articles, each is a dict
    # format the articles into a list called all_articles. each item in this list is a dictionary
    all_articles = response_data.get('articles', [])[:n_articles] 

    # write the actual content into this list of articles
    extract_content(all_articles)
    all_articles = [a for a in all_articles if a['content'] is not None]

    # 3. chunk up the data, with some overlap between chunks and add embeddings
    article_chunks = chunkify(all_articles)
    add_embeddings(article_chunks, sentence_model)

    # 4. store each of these chunks, along with info into a postgreSQL database
    store_sql(article_chunks)

    # 5. gather the N most similar chunks to the user's query 
    encoded_query = sentence_model.encode(query)
    top_n_chunks = fetch_sql(encoded_query, n_chunks)

    # 6. feed these chunks into the LLM prompt
    final_answer = read_with_gemini(top_n_chunks)
    
    # 7. return the answer as well as the sources used
    citation = get_citation(top_n_chunks)

    print(final_answer)
    print(citation)
    
new_rag_system()

article chunk length:  88
SQL select completed
In test mode, will not call API
Response is based on the following sources: 
James Hider. "Israeli Prime Minister Benjamin Netanyahu to meet President Trump today". NPR. https://www.npr.org/2025/07/07/g-s1-76296/israel-netanyahu-meet-trump
  "Jeremy Bowen: Israel's allies see evidence of war crimes in Gaza mounting up". BBC News. https://www.bbc.com/news/articles/cp863mln0pmo
 Matt Novak. "The Rise of AI Is Making Life Even Harder for Real People in Gaza". Gizmodo.com. https://gizmodo.com/the-rise-of-ai-is-making-life-even-harder-for-real-people-in-gaza-2000607395


https://newspaper.readthedocs.io/en/latest/
- easy webscraping of news sources https://scrapeops.io/python-web-scraping-playbook/newspaper3k/


News API gets you the most relevant articles, authors, description etc. 
BUT it cannot get you the actual content of each article but newspaper3k can. it can also give us the main keywords, although it's not reliable

now that we have all the stories for a certain keyword in one string we have several options
 - use it as ground truth fact checker
   - remember that this is based on a time range from when we set our API request to and what keyword
 - provide a summary of it all
   - convinenet because we also a have a list of sources used


In [None]:
# SUMMARIZATION
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# basic summarization function
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=5120, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# basic summarizer, note its not very good.
# misses a lot of key details and ignores many stories, even after increasing the max length 5 fold. 
# probably needs some fine tuning
text = all_articles
summary = summarize(text)
print(summary)

In [None]:
# fine tune summarization https://huggingface.co/docs/transformers/en/tasks/summarization
# use the provided descriptions as targets