In [1]:
import requests
from newspaper import Article
from newspaper.article import ArticleException
from keybert import KeyBERT
from scipy.spatial import distance
from sentence_transformers import SentenceTransformer
from api_test import ask_gemini

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
NEWS_API_KEY = os.getenv("NEWS_API_KEY")

In [5]:
# ONLY RUN WHEN NEEDED
url = ('https://newsapi.org/v2/top-headlines?'
       'q=Trump&'
       'from=2025-07-00&'
       'sortBy=relevancy&'
       'pageSize=100&'
       f'apiKey={NEWS_API_KEY}')

response = requests.get(url)
print(response)

<Response [200]>


In [6]:
def call_news_api(keyword, date, everything=True):
    API_KEY = NEWS_API_KEY
    
    # choose endpoint
    endpoint = "everything" if everything else "top-headlines"
    # build URL with the passed-in variables
    url = (
        f"https://newsapi.org/v2/{endpoint}"
        f"?q={keyword}"
        f"&from={date}"
        "&sortBy=relevancy"
        f"&apiKey={API_KEY}"
    )
    response = requests.get(url)

    return response
    
def search_article_type(lean_list, all_articles, ratio):
    """
    combs through articles and returns N amount of a certain type of article, meaning N amount of left, right, and center
    """
    count = 0
    i = 0
    type_list = []
    while count < ratio:
        i += 1
        if(all_articles[i]['source']['name'] in lean_list):
            type_list.append(all_articles[i])
            count += 1

    return type_list

def get_blend(all_articles, n_search):
    blended_articles = []
    
    # compile list of sources for each type
    right_leaning = ['Breitbart News', ]
    left_leaning = []
    center_leaning = ['NBC News','The Washington Post','ABC News',]

    # how much of each type of source to get. this automatically assumes an equal blend of each source
    ratio = n_search // 3       
    
    # loop through each type
    types = ['right', 'left', 'center']
    for x in types:
        result = search_article_type(x, all_articles, ratio)
        blended_articles.append(result)

    return blended_articles

def rewrite_content(article_url):
    """
    given a url to some article, return its content
    this is really just a helper function. also has safeguards for articles that block the scraping tool. 
    """
    article = Article(article_url)
    try: 
        article.download()
        article.parse()
    
        # cleaning it up
        text = article.text
        filtered_lines = filter(str.strip, text.splitlines())
        cleaned_text = "\n".join(filtered_lines)
        return cleaned_text

    except ArticleException as e:
        print(f"[!] Skipping article at {url} — {e}")
        return None
    
def get_keyword(query):
    """
    simply return the main keyword of the query the user has asked
    """
    keyword_model = KeyBERT('distilbert-base-nli-mean-tokens')
    keywords = keyword_model.extract_keywords(query)            # a list of keywords with the most relevant listed first

    return keywords[0][0]

def get_similarities(all_articles, query):
    """
    sort the articles in order of their semantic similarity to the query 
    return only the top n results
    need to add a "score" attribute to all of the articles
    """
    semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
    encoded_query = semantic_model.encode(query)

    for article in all_articles:
        encoded_description = semantic_model.encode(article['description'])
        article["sem_sim"] = 1-distance.cosine(encoded_query, encoded_description)

def get_n_matches(all_articles, N):
    """
    all_articles is a list of dictionaries. need to sort this list to get N of the dictionaries with the largest sem_sim value
    """
    sorted_articles = sorted(all_articles, key=lambda x: x['sem_sim'], reverse=True)
    
    return sorted_articles[:N]
    
def extract_content(article_list):
    """
    given a list of articles, get the content for them
    save this in a text file
    """
    all_content = ""
    article_list

    for article in article_list:
        content = rewrite_content(article['url'])

        # skip articles that contain nothing or that block the scraper
        if (article['content'] == None) or (content == None):
            continue
        article['content'] = content
        all_content += article['source']['name'] + "\n"
        all_content += article['content'] + "\n----------------------------\n"

    return all_content

def get_citation(articles):
    citation = "Response is based on the following sources: \n"

    for article in articles:
        citation += (
                        f"{article['author']}. "
                        f"\"{article['title']}\". "
                        f"{article['source']['name']}. "
                        f"{article['url']}\n"
                    )
    
    return citation




In [None]:
# MAIN
def main():
    # user input
    query = "Trump wants to defund healthcare"
    date = "2025-07-01"
    everything = True
    n_search = 5             # how many articles to base the llm response on
    blend = False             # TODO: get an even mix of right, center, and left wing sources
    
    # put this keyword into the API search as its "q" value and sort by relevance
    keyword = get_keyword(query)

    # call the news_api with a set of parameters, remember there's a limit so comment this out whenever you can.
    # response = call_news_api(keyword, date, everything)
    response_data = response.json()

    # format the articles into a list called all_articles. each item in this list is a dictionary
    all_articles = response_data.get('articles', [])   
    
    # adds a semantic similarity score between the query and the article's description to the dictionary
    get_similarities(all_articles, query)

    # gets the N articles with the best semantic similarity scores
    top_matches = get_n_matches(all_articles, n_search)

    if blend:
        top_matches = get_blend(all_articles, n_search)
    
    # rewrite the content of each article AND also return a string that's all of the articles content
    top_content = extract_content(top_matches)
    
    # save this to a txt file for our own viewing
    output_file = open("top_content.txt", "w", encoding="utf-8")
    output_file.write(top_content)    
    output_file.close()

    # ask gemini to search the contents based on the query
    prompt = "Based soley on the articles provided, is it true that "
    prompt += query
    gemini_response = ask_gemini(prompt, top_content, test_mode=False)    # set test mode to true to just call the function without the API. formatting purposes, avoiding too many API requests
    print(gemini_response)

    # return the response and the list of sources used
    citation = get_citation(top_matches)
    print(citation)
    
    # TODO: finish the list of sources for right, center, and left leaning sources. Limit is 5min a day. 

if __name__ == "__main__":
    main()

[!] Skipping article at https://newsapi.org/v2/top-headlines?q=Trump&from=2025-07-00&sortBy=relevancy&pageSize=100&apiKey=05c5488a934a4e33a702e25364b898c1 — Article `download()` failed with 403 Client Error: Forbidden for url: https://thespun.com/soccer/everyone-had-same-reaction-to-photo-of-president-trumps-ankles on URL https://thespun.com/soccer/everyone-had-same-reaction-to-photo-of-president-trumps-ankles
Yes, based solely on the articles provided, it is stated that President Donald Trump's "signature 'One Big Beautiful Bill' ... will offer tax breaks to the wealthy while kicking millions off of Medicaid."

One article also notes that Elon Musk "has issued no objection to the cuts to food benefits and healthcare," implying that cuts to healthcare are part of this bill.
Response is based on the following sources: 
AJ Dellinger. "Elon Musk Rekindles Trump Criticism, Attacks ‘Big, Beautiful Bill’". Gizmodo.com. https://gizmodo.com/elon-musk-rekindles-trump-criticism-attacks-big-beaut

https://newspaper.readthedocs.io/en/latest/
- easy webscraping of news sources https://scrapeops.io/python-web-scraping-playbook/newspaper3k/


News API gets you the most relevant articles, authors, description etc. 
BUT it cannot get you the actual content of each article but newspaper3k can. it can also give us the main keywords, although it's not reliable

now that we have all the stories for a certain keyword in one string we have several options
 - use it as ground truth fact checker
   - remember that this is based on a time range from when we set our API request to and what keyword
 - provide a summary of it all
   - convinenet because we also a have a list of sources used


In [None]:
# SUMMARIZATION
from transformers import T5Tokenizer, T5ForConditionalGeneration

model_name = 't5-small'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# basic summarization function
def summarize(text):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=5120, truncation=True)
    summary_ids = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# basic summarizer, note its not very good.
# misses a lot of key details and ignores many stories, even after increasing the max length 5 fold. 
# probably needs some fine tuning
text = all_articles
summary = summarize(text)
print(summary)

In [None]:
# fine tune summarization https://huggingface.co/docs/transformers/en/tasks/summarization
# use the provided descriptions as targets