In [None]:
import os
import time
from datetime import datetime, timedelta

import nltk
# nltk.download('punkt_tab')
from dotenv import load_dotenv
from newspaper import Article
from newspaper import ArticleException
from serpapi import GoogleSearch

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [5]:
# Load variables from a local .env file into the process environment, then
# read the SerpApi key that the search cell passes to GoogleSearch.
# The value is None when SERP_API_KEY is not set.
load_dotenv()
api_key = os.environ.get("SERP_API_KEY")

# Testing news article extraction with Newspaper3k

In [6]:
# Sample article used to exercise the extraction steps in the following cells.
url = "https://www.cbsnews.com/news/trump-tariffs-on-mexico-canada-and-china-on-first-day-in-office/"
# Wrap the URL in a newspaper Article; download(), parse() and nlp() are
# called on it in the next cell.
article = Article(url)

In [None]:
# Fetch the page, extract the article fields, then run newspaper's NLP step.
# The title, keywords and summary shown in the cells below are read after
# these three calls have run, in this order.
# NOTE(review): nlp() presumably relies on the NLTK 'punkt_tab' tokenizer
# referenced in the import cell - confirm on a fresh environment.
article.download()
article.parse()
article.nlp()

In [9]:
# Keep the parsed headline; it is interpolated into the search query below.
article_title = article.title
article_title

'Trump threatens to impose sweeping new tariffs on Mexico, Canada and China on first day in office'

In [10]:
# Display the keywords extracted for the article.
article.keywords

['day',
 'fentanyl',
 'china',
 'tariffs',
 'office',
 'impose',
 'illegal',
 'canada',
 'trump',
 'mexico',
 'states',
 'united',
 'border',
 'sweeping',
 'threatens']

In [15]:
# print() so the newline-separated summary sentences render on separate lines.
print(article.summary)

President-elect Donald Trump is threatening to impose sweeping new tariffs on Mexico, Canada and China as soon as he takes office as part of his efforts to crack down on illegal immigration and drugs.
The U.S. is the largest importer of goods in the world, with Mexico, China and Canada its top three suppliers, according to the most recent Census data.
Trump made the announcements on his Truth Social site Monday evening as he railed against an influx of illegal migrants.
Trump also turned his ire on China, saying he has "had many talks with China about the massive amounts of drugs, in particular Fentanyl, being sent into the United States – But to no avail."
If Trump were to move forward with the threatened tariffs, the new taxes would pose an enormous challenge for the economies of Canada and Mexico, in particular.


In [16]:
# Search Google through SerpApi, using the parsed article title as the query
# text, and keep the organic (non-ad) hits for the processing step below.
# NOTE(review): in the recorded output the first hit is the source article
# itself - confirm whether it should be filtered out of the "similar" set.
params = {
    "engine": "google",
    "q": f"related: {article_title}",
    # "location": "Seattle-Tacoma, WA, Washington, United States",  # don't need location
    "hl": "en",
    "gl": "us",
    "google_domain": "google.com",
    "num": "10",
    # "start": "10",
    "safe": "active",
    "api_key": api_key,
    "device": "desktop",
}

search = GoogleSearch(params)
results = search.get_dict()

# A failed search (bad key, exhausted quota, ...) comes back as a payload with
# an "error" field and no "organic_results" key, so indexing it directly
# would only produce an uninformative KeyError. Surface the API message.
if "error" in results:
    raise RuntimeError(f"SerpApi search failed: {results['error']}")

# Default to an empty list so the downstream cells still run when the
# response simply contains no organic hits.
organic_results = results.get("organic_results", [])

In [17]:
# Inspect the raw organic results returned by the search.
organic_results

[{'position': 1,
  'title': 'Trump threatens to impose sweeping new tariffs on Mexico ...',
  'link': 'https://www.cbsnews.com/news/trump-tariffs-on-mexico-canada-and-china-on-first-day-in-office/',
  'redirect_link': 'https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.cbsnews.com/news/trump-tariffs-on-mexico-canada-and-china-on-first-day-in-office/&ved=2ahUKEwjoyIqOlP2JAxX96ckDHUh3HP8QFnoECC4QAQ',
  'displayed_link': 'https://www.cbsnews.com › U.S.',
  'favicon': 'https://serpapi.com/searches/674766e3993688a7f0da330d/images/930dded951de406061ac6b15cfa0bf4aec734ceb73c2af8796531310f6c51a4e.png',
  'date': '1 day ago',
  'snippet': 'President-elect Donald Trump threatened to impose new tariffs on Mexico, Canada and China as soon as he takes office as part of his efforts ...',
  'snippet_highlighted_words': ['Trump threatened to impose new tariffs on Mexico',
   'Canada and China',
   'office'],
  'source': 'CBS News'},
 {'position': 2,
  'title': 'Trump threate

In [18]:
def relative_date_to_absolute(relative_date):
    """Convert a search-result date string to a ``YYYY-MM-DD`` string.

    Accepts relative phrases such as "1 day ago", "3 hours ago" or
    "45 minutes ago" (seconds and weeks are handled too, and "a"/"an" is
    read as a count of one), measured back from the current local time.
    Absolute dates written like "Nov 26, 2024", "November 26, 2024" or
    "2024-11-26" are parsed and re-formatted.

    Args:
        relative_date: The date text, or ``None``/empty when unknown.

    Returns:
        str | None: The date formatted as ``%Y-%m-%d``, or ``None`` when the
        input is empty or not in one of the recognised formats.
    """
    if not relative_date:
        return None

    text = str(relative_date).strip()
    words = text.lower().split()

    # Singular unit word -> timedelta keyword argument.
    unit_to_kwarg = {
        "second": "seconds",
        "minute": "minutes",
        "hour": "hours",
        "day": "days",
        "week": "weeks",
    }

    if len(words) >= 2:
        unit = words[1].rstrip("s")  # "days" -> "day"
        if unit in unit_to_kwarg:
            try:
                amount = 1 if words[0] in ("a", "an") else int(words[0])
            except ValueError:
                amount = None
            if amount is not None:
                moment = datetime.now() - timedelta(**{unit_to_kwarg[unit]: amount})
                return moment.strftime('%Y-%m-%d')

    # Not a relative phrase: try the absolute formats. The original code
    # called datetime.strftime on the string itself, which always raised
    # TypeError; the string has to be parsed first.
    for fmt in ("%b %d, %Y", "%B %d, %Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(text, fmt).strftime('%Y-%m-%d')
        except ValueError:
            continue

    return None

In [19]:
def process_organic_results(results):
    """Build one article record per organic search result.

    Each result's link is downloaded, parsed and summarised with newspaper's
    ``Article``. When the link is missing, or the download/parse step raises
    ``ArticleException``, the record falls back to the title and snippet
    that came with the search result itself.

    Args:
        results: Iterable of organic-result dicts. The keys read here are
            'link', 'title', 'snippet', 'date' and 'source'; any that are
            absent produce ``None`` in the record instead of a ``KeyError``.

    Returns:
        list[dict]: One dict per result, in input order, with the keys
        'title', 'authors', 'summary', 'full_text', 'publish_date' and
        'source'.
    """
    # Page bodies that are site boilerplate rather than article text; records
    # whose text matches one of these get an empty summary and full_text.
    irrelevant_texts = [
        "You have permission to edit this article.\n\nEdit Close",
        "Some other irrelevant text",
    ]

    def _date_from_result(result):
        # Publish date derived from the search result's own 'date' field, or
        # None when the field is absent or cannot be converted.
        raw_date = result.get('date')
        if not raw_date:
            return None
        try:
            return relative_date_to_absolute(raw_date)
        except (TypeError, ValueError):
            # One unreadable date should not abort the whole batch.
            return None

    similar_article_info = []
    for result in results:
        link = result.get('link')
        article = None
        if link:
            try:
                article = Article(link, language='en')
                article.download()
                article.parse()
                article.nlp()
            except ArticleException:
                article = None

        if article is None:
            # Fallback: only the metadata carried by the search result.
            article_dict = {
                'title': result.get('title'),
                'authors': None,
                'summary': result.get('snippet'),
                'full_text': None,
                'publish_date': _date_from_result(result),
                'source': result.get('source'),
            }
        else:
            is_boilerplate = article.text in irrelevant_texts
            if article.publish_date:
                publish_date = str(article.publish_date.date())
            else:
                publish_date = _date_from_result(result)
            article_dict = {
                'title': article.title,
                'authors': article.authors,
                'summary': '' if is_boilerplate else article.summary,
                'full_text': '' if is_boilerplate else article.text,
                'publish_date': publish_date,
                'source': result.get('source'),
            }
        similar_article_info.append(article_dict)
    return similar_article_info


In [20]:
# Download and summarise every search hit (one article download per result),
# then display the resulting list of records.
similar_article_info = process_organic_results(organic_results)
similar_article_info

[{'title': 'Trump threatens to impose sweeping new tariffs on Mexico, Canada and China on first day in office',
  'authors': [],
  'summary': 'President-elect Donald Trump is threatening to impose sweeping new tariffs on Mexico, Canada and China as soon as he takes office as part of his efforts to crack down on illegal immigration and drugs.\nThe U.S. is the largest importer of goods in the world, with Mexico, China and Canada its top three suppliers, according to the most recent Census data.\nTrump made the announcements on his Truth Social site Monday evening as he railed against an influx of illegal migrants.\nTrump also turned his ire on China, saying he has "had many talks with China about the massive amounts of drugs, in particular Fentanyl, being sent into the United States – But to no avail."\nIf Trump were to move forward with the threatened tariffs, the new taxes would pose an enormous challenge for the economies of Canada and Mexico, in particular.',
  'full_text': 'Presiden

### Create new collection in chromadb (name: news_articles)

In [None]:
# imports
import chromadb
import pandas as pd

In [None]:
# Chroma client talking over HTTP to a server at localhost:8000.
# NOTE(review): assumes a Chroma server is already running there (the notes
# further down mention docker) - confirm before running this cell.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [None]:
# Data preprocessing.
# Done here for testing only; once this is implemented in app.py, the whole
# file will be rerun with the URL supplied by the user.
train_data = pd.DataFrame(data = similar_article_info)
train_data.head(1)

Unnamed: 0,title,authors,summary,full_text,publish_date,source
0,Trump threatens to impose sweeping new tariffs...,[],President-elect Donald Trump is threatening to...,President-elect Donald Trump is threatening to...,2024-11-26,CBS News


In [None]:
# # Haven't run this yet; need to check with Calvin about what to do for this.
# # get chroma client
# collection = chroma_client.get_or_create_collection(name="news_article")

In [None]:
# # If/else check to avoid adding the same doc to the collection in docker: drop a row when its closest match has distance == 0.
# all_title = train_data['title']
# for i in range(train_data.shape[0]):
#     dist = collection.query(query_texts=[all_title[i]], n_results=1)
#     dist = dist["distances"][0][0]
#     if dist == 0:
#         train_data = train_data.drop([i])

In [None]:
# # put non-repetitive data into documents, metadatas, ids lists.
# documents = []
# metadatas = []
# ids = []
# prev_df_size = collection.count()
# prev_df_plus_serp_api_df_size = train_data.shape[0] + prev_df_size
# for i in range(prev_df_size, prev_df_plus_serp_api_df_size):
#     documents.append(train_data.loc[i - prev_df_size, 'title'])
#     metadatas.append({"authors": train_data.loc[i - prev_df_size, 'authors'], "summary": train_data.loc[i - prev_df_size, "summary"], "publish_date": train_data.loc[i - prev_df_size, "publish_date"], "source": train_data.loc[i - prev_df_size, "source"], })
#     ids.append("id" + str(i))

In [None]:
# Haven't run this yet, need to check with Calvin to see how we want to use this
# # add those data to collection
# collection.add(documents = documents, 
#                metadatas=metadatas, 
#                ids=ids)

In [None]:
# # Test query to ensure it works! Returns the top 3 closest statements from our data to the text inputted!
# results = collection.query(query_texts=["Promise Kept: Planned Parenthood regained fede"], 
#                  n_results=3,
#                #   where=
#                #   {
#                #      "label": "true"
#                #   })
# )
# print(results['documents'])
# print(results["distances"])

### Adding the SerpApi search results to the normal and FCoT prompts so Gemini can use them

In [None]:
def ask_normal_prompting_questions(event: me.ClickEvent):
  """Loop through our normal prompted questions to ask Gemini to give us a score of 1 to 10
    for the sensationalism and political stance, with the similar articles
    found through SerpApi appended to every question.

    Args:
        event: this function is activated when the button associated with it is clicked

    NOTE(review): `me`, `State` and `transform` are not defined in this
    notebook; presumably they come from app.py - confirm before running here.
  """
  state = me.state(State)
  # similar_article_info is a list of dicts, so it has to be rendered as text
  # before it can be concatenated (str + list raises TypeError). The suffix is
  # the same for every question, so it is built once.
  text_to_add = (
      " Please also consider these articles' information in your analysis of the score. "
      + str(similar_article_info)
  )
  for question in state.normal_prompting_question:
    # editing the question that will be going into gemini
    prompt = question + text_to_add
    print("start asking normal prompting questions")
    print(f"Question:{prompt}")
    # transform() yields the response in pieces; join them into one string.
    response_generator = transform(prompt, state.chat_history)
    response = ''.join(response_generator)
    print(f"Response:{response}")
    # Pause between questions. NOTE(review): presumably to stay under an API
    # rate limit - confirm the required delay.
    time.sleep(5)

In [None]:
def ask_fcot_prompting_questions(event: me.ClickEvent):
  """Loop through our fractal chain of thought prompted questions (3 iterations) to ask Gemini
    to give us a score of 1 to 10 for the sensationalism and political stance,
    with the similar articles found through SerpApi appended to every question.

    Args:
        event: this function is activated when the button associated with it is clicked

    NOTE(review): `me`, `State` and `transform` are not defined in this
    notebook; presumably they come from app.py - confirm before running here.
  """
  state = me.state(State)
  # similar_article_info is a list of dicts, so it has to be rendered as text
  # before it can be concatenated (str + list raises TypeError). The suffix is
  # the same for every question, so it is built once.
  text_to_add = (
      " With in each iteration please also consider these articles' information in your analysis of the score. "
      + str(similar_article_info)
  )
  for question in state.fcot_prompting_question:
    # editing the question that will be going into gemini
    prompt = question + text_to_add
    print("start asking fcot prompting questions")
    print(f"Question:{prompt}")
    # transform() yields the response in pieces; join them into one string.
    response_generator = transform(prompt, state.chat_history)
    response = ''.join(response_generator)
    print(f"Response:{response}")
    # Pause between questions. NOTE(review): presumably to stay under an API
    # rate limit - confirm the required delay.
    time.sleep(5)