In [53]:
from serpapi import GoogleSearch
from dotenv import load_dotenv
import os
from newspaper import Article
from newspaper import ArticleException
from datetime import datetime, timedelta

In [25]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# SerpApi credential; this is None when SERP_API_KEY is not defined.
api_key = os.environ.get("SERP_API_KEY")

# Testing news article extraction with Newspaper3k


In [82]:
# Sample news article used to exercise the newspaper Article API in the cells below.
url = "https://www.cbsnews.com/news/trump-tariffs-on-mexico-canada-and-china-on-first-day-in-office/"
article = Article(url)

In [83]:
# Fetch the page, then parse it; the following cells read title, publish_date, etc.
article.download()
article.parse()

In [84]:
# Keep the title in its own variable: the SerpApi cell further down
# interpolates it into the search query.
article_title = article.title
article_title

'Trump threatens to impose sweeping new tariffs on Mexico, Canada and China on first day in office'

In [88]:
# Quick check that a publish date is available before it is used below.
if article.publish_date:
    print("true")

In [31]:
# Run newspaper's NLP step; the next cells read article.keywords and article.summary.
article.nlp()

In [32]:
# Keyword list for the article (inspected after the nlp() call in the previous cell).
article.keywords

['tariffs',
 'china',
 'goods',
 'trade',
 'mexico',
 'trumps',
 'massive',
 'trump',
 'taxes',
 'vowing',
 'proposed',
 'day',
 'united',
 'states',
 'tariff',
 'ups']

In [38]:
# print() rather than a bare expression so the newline-separated summary
# sentences render on separate lines.
print(article.summary)

What America importsThe United States’ top import from Canada is oil, which reached a record 4.3 million barrels per day in July, according to the US Energy Information Administration.
The Trump tariffs also hit foreign steel, aluminum, washing machines and solar panels.
It’s not clear how Trump would plan to implement the proposed tariffs without violating the USMCA.
Most mainstream economists believe tariffs will be inflationary, and the Peterson Institute for International Economics has estimated Trump’s proposed tariffs (before the new tariffs announced Monday night) would cost the typical US household over $2,600 a year.
He did just that when he was last in the White House, placing large tariffs on goods, primarily from China.


In [43]:
# Show only the calendar date (YYYY-MM-DD) of the publish timestamp.
print(article.publish_date.date())

2024-11-25


In [36]:
# NOTE(review): the saved output of this cell shows 'https://www.cnn.com' although
# `url` above points at cbsnews.com, and the execution counts are out of order.
# The output looks stale from an earlier run -- re-run with Restart & Run All.
article.source_url

'https://www.cnn.com'

# Testing related-article search with SerpApi

In [None]:
# Search Google (through SerpApi) for pages related to the article title
# extracted above. No "location" parameter is sent; it is not needed here.
if not api_key:
    # Fail before spending a request: without a key SerpApi only returns an error.
    raise RuntimeError(
        "SERP_API_KEY is not set; add it to the environment or the .env file."
    )

params = {
    "engine": "google",
    "q": f"related: {article_title}",
    "hl": "en",
    "gl": "us",
    "google_domain": "google.com",
    "num": "10",
    "safe": "active",
    "api_key": api_key,
    "device": "desktop",
}

search = GoogleSearch(params)
results = search.get_dict()

# SerpApi reports failures (invalid key, exhausted quota, no results) through an
# "error" field in the response instead of raising, so surface it explicitly
# rather than hitting an uninformative KeyError on "organic_results".
if "error" in results:
    raise RuntimeError(f"SerpApi search failed: {results['error']}")

organic_results = results.get("organic_results", [])

In [35]:
# Inspect the raw organic results returned by the search cell above.
organic_results

[{'position': 1,
  'title': 'Trump tries to force other countries to negotiating table on ...',
  'link': 'https://www.cnn.com/2024/11/26/politics/trump-tariffs-negotiating-tactic/index.html',
  'redirect_link': 'https://www.google.com/url?sa=t&source=web&rct=j&opi=89978449&url=https://www.cnn.com/2024/11/26/politics/trump-tariffs-negotiating-tactic/index.html&ved=2ahUKEwi-p5yl2fuJAxVZCnkGHX9nCWkQFnoECCEQAQ',
  'displayed_link': 'https://www.cnn.com › trump-tariffs-negotiating-tactic',
  'favicon': 'https://serpapi.com/searches/6746a2fddc969fd66e6a5a8e/images/150bfd05a6e4024a7030ad89a1d96d3b989a4d1d3a6c04b3fc151fde8cfb2d91.png',
  'date': '7 hours ago',
  'snippet_highlighted_words': ["Trump's", 'massive tariffs'],
  'source': 'CNN'},
 {'position': 2,
  'title': 'Trump threatens to impose sweeping new tariffs on Mexico ...',
  'link': 'https://www.cbsnews.com/news/trump-tariffs-on-mexico-canada-and-china-on-first-day-in-office/',
  'redirect_link': 'https://www.google.com/url?sa=t&sour

In [54]:
def relative_date_to_absolute(relative_date):
    """Convert a search-result date string into a ``YYYY-MM-DD`` string.

    Two shapes of input are understood:

    * relative ages such as ``"7 hours ago"``, ``"3 days ago"`` or
      ``"2 weeks ago"`` -- resolved against the current local time;
    * absolute dates such as ``"Nov 25, 2024"``, ``"November 25, 2024"``,
      ``"25 Nov 2024"`` or ``"2024-11-25"``.

    Args:
        relative_date: The date text to convert. May be ``None`` or empty.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the input is
        missing, is not a string, or is in a format that is not recognised
        (for example month/year ages, which have no exact day count).
    """
    if not relative_date or not isinstance(relative_date, str):
        return None

    text = relative_date.strip()
    now = datetime.now()

    # Map the (singular) unit word to the matching timedelta keyword.
    unit_to_kwarg = {
        "second": "seconds",
        "sec": "seconds",
        "minute": "minutes",
        "min": "minutes",
        "hour": "hours",
        "hr": "hours",
        "day": "days",
        "week": "weeks",
    }

    # Relative form: "<count> <unit>[s] ago". Matching on the unit token
    # (instead of a substring test) avoids misfiring on words like "Monday".
    tokens = text.lower().split()
    if len(tokens) >= 2 and tokens[0].isdecimal():
        kwarg = unit_to_kwarg.get(tokens[1].rstrip("s"))
        if kwarg is not None:
            delta = timedelta(**{kwarg: int(tokens[0])})
            return (now - delta).strftime("%Y-%m-%d")

    # Absolute form: try the known layouts in turn.
    for fmt in ("%b %d, %Y", "%B %d, %Y", "%d %b %Y", "%Y-%m-%d"):
        try:
            return datetime.strptime(text, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue

    return None

In [101]:
def _serp_publish_date(result):
    """Return the search result's ``date`` as ``YYYY-MM-DD``, or None if absent or unconvertible."""
    raw_date = result.get('date')
    if not raw_date:
        return None
    try:
        return relative_date_to_absolute(raw_date)
    except (TypeError, ValueError):
        # The date text is in a shape the converter cannot handle; a missing
        # date is preferable to aborting the whole batch.
        return None


def _snippet_only_record(result):
    """Build an article record from search-result metadata alone (no article download)."""
    return {
        'title': result.get('title'),
        'authors': None,
        'summary': result.get('snippet'),
        'full_text': None,
        'publish_date': _serp_publish_date(result),
        'source': result.get('source'),
    }


def process_organic_results(results):
    """Turn SerpApi organic results into a list of article-info dicts.

    For every result the linked page is downloaded, parsed and run through
    ``Article.nlp()``. When that raises ``ArticleException`` (or the result
    has no ``link``), the record is built from the search-result metadata
    instead.

    Args:
        results: Iterable of organic-result dicts. The keys read are
            ``link``, ``title``, ``snippet``, ``date`` and ``source``; all of
            them are optional.

    Returns:
        A list with one dict per result, each holding the keys ``title``,
        ``authors``, ``summary``, ``full_text``, ``publish_date``
        (``YYYY-MM-DD`` or None) and ``source``.
    """
    # Page bodies that are known boilerplate rather than article content.
    irrelevant_texts = [
            "You have permission to edit this article.\n\nEdit Close",
            "Some other irrelevant text"
        ]

    similar_article_info = []
    for result in results:
        link = result.get('link')
        if not link:
            # Nothing to download; keep whatever the search result tells us.
            similar_article_info.append(_snippet_only_record(result))
            continue

        try:
            article = Article(link, language='en')
            article.download()
            article.parse()
            article.nlp()
        except ArticleException:
            similar_article_info.append(_snippet_only_record(result))
            continue

        is_boilerplate = article.text in irrelevant_texts

        if article.publish_date:
            publish_date = str(article.publish_date.date())
        else:
            # Fall back to the date reported by the search result, if any.
            publish_date = _serp_publish_date(result)

        similar_article_info.append({
            'title': article.title,
            'authors': article.authors,
            'summary': '' if is_boilerplate else article.summary,
            'full_text': '' if is_boilerplate else article.text,
            'publish_date': publish_date,
            'source': result.get('source'),
        })
    return similar_article_info


In [102]:
# Smoke-test the pipeline on a single result: index 1 is the CBS News entry
# (position 2) in the organic results shown above.
similar_article_info = process_organic_results([organic_results[1]])
similar_article_info

[{'title': 'Trump threatens to impose sweeping new tariffs on Mexico, Canada and China on first day in office',
  'authors': [],
  'summary': 'President-elect Donald Trump is threatening to impose sweeping new tariffs on Mexico, Canada and China as soon as he takes office as part of his efforts to crack down on illegal immigration and drugs.\nThe U.S. is the largest importer of goods in the world, with Mexico, China and Canada its top three suppliers, according to the most recent Census data.\nTrump made the announcements on his Truth Social site Monday evening as he railed against an influx of illegal migrants.\nTrump also turned his ire on China, saying he has "had many talks with China about the massive amounts of drugs, in particular Fentanyl, being sent into the United States – But to no avail."\nIf Trump were to move forward with the threatened tariffs, the new taxes would pose an enormous challenge for the economies of Canada and Mexico, in particular.',
  'full_text': 'Presiden