In [None]:
import os
import re
from datetime import datetime, timedelta

from dotenv import load_dotenv
from newspaper import Article
from newspaper import ArticleException
from serpapi import GoogleSearch

In [None]:
# Load variables from a local .env file into the process environment, then
# read the SerpApi key from it (None when SERP_API_KEY is not set).
load_dotenv()
api_key = os.environ.get("SERP_API_KEY")

# Testing news article extraction with Newspaper3k


In [None]:
# Sample news article used for the extraction experiments in the cells below.
url = "https://www.cbsnews.com/news/trump-tariffs-on-mexico-canada-and-china-on-first-day-in-office/"
article = Article(url)

In [None]:
# Fetch the page and parse it; the following cells read the parsed fields
# (title, publish_date, ...) from `article`.
article.download()
article.parse()

In [None]:
# Keep the title in its own variable: it is reused later to build the search query.
article_title = article.title
article_title

In [None]:
# Quick check that a publish date was extracted for this article.
if article.publish_date:
    print("true")

In [None]:
# Run newspaper's NLP step; the keywords and summary shown in the next cells
# are read from `article` after this call.
article.nlp()

In [None]:
# Display the keywords attached to the article.
article.keywords

In [None]:
# print() rather than a bare expression so the summary is shown as plain text
# instead of its quoted repr.
print(article.summary)

In [None]:
# publish_date can be empty (the earlier cell checks for exactly that), so guard
# the .date() call instead of failing with AttributeError on such articles.
print(article.publish_date.date() if article.publish_date else "publish date not found")

In [None]:
# Display the source URL that newspaper associates with the article.
article.source_url

# Serp API testing

In [None]:
# Fail before spending a request if the key was never loaded from the environment.
if not api_key:
    raise RuntimeError("SERP_API_KEY is not set; add it to your environment or .env file")

# Google search parameters for SerpApi. No "location" is sent (not needed here)
# and no "start" offset, so only the first page of results is requested.
params = {
    "engine": "google",
    "q": f"related: {article_title}",
    "hl": "en",
    "gl": "us",
    "google_domain": "google.com",
    "num": "10",
    "safe": "active",
    "api_key": api_key,
    "device": "desktop",
}

search = GoogleSearch(params)
results = search.get_dict()

# SerpApi reports failures (invalid key, exhausted quota, no results) through an
# "error" field in the response; surface that message instead of a bare
# KeyError on "organic_results".
if "error" in results:
    raise RuntimeError(f"SerpApi search failed: {results['error']}")

organic_results = results.get("organic_results", [])

In [None]:
# Inspect the raw organic results returned by the search above.
organic_results

In [None]:
def relative_date_to_absolute(relative_date):
    """Convert a search-result date label into a ``YYYY-MM-DD`` string.

    Understands relative labels ("3 days ago", "an hour ago", "2 weeks ago",
    "yesterday"), a few absolute formats ("Nov 26, 2024", "2024-11-26") and
    objects that already provide ``strftime`` (``date`` / ``datetime``).

    Args:
        relative_date: The label (or date object) to convert; may be ``None``.

    Returns:
        The date formatted as ``YYYY-MM-DD``, or ``None`` when the input is
        missing or cannot be interpreted.

    Note:
        Relative labels are resolved against the local ``datetime.now()``.
        Months and years are approximated as 30 and 365 days.
    """
    date_format = '%Y-%m-%d'

    if relative_date is None:
        return None

    # Already a date/datetime object: only formatting is needed.
    if hasattr(relative_date, 'strftime'):
        return relative_date.strftime(date_format)

    label = str(relative_date).strip()
    lowered = label.lower()
    if not lowered:
        return None

    now = datetime.now()
    if lowered in ('today', 'just now'):
        return now.strftime(date_format)
    if lowered == 'yesterday':
        return (now - timedelta(days=1)).strftime(date_format)

    unit_lengths = {
        'minute': timedelta(minutes=1),
        'min': timedelta(minutes=1),
        'hour': timedelta(hours=1),
        'hr': timedelta(hours=1),
        'day': timedelta(days=1),
        'week': timedelta(weeks=1),
        'month': timedelta(days=30),
        'year': timedelta(days=365),
    }
    # Match "<count> <unit>" as whole words so that e.g. "Monday" is not
    # mistaken for a "day" offset (a plain substring test would be).
    match = re.search(
        r'\b(\d+|an?)\s+(minute|min|hour|hr|day|week|month|year)s?\b',
        lowered,
    )
    if match:
        count_text, unit = match.groups()
        count = 1 if count_text in ('a', 'an') else int(count_text)
        try:
            return (now - count * unit_lengths[unit]).strftime(date_format)
        except OverflowError:
            # Absurdly large offsets fall outside the supported date range.
            return None

    # Not a relative label: try the absolute formats seen in search results.
    for absolute_format in ('%b %d, %Y', '%B %d, %Y', '%Y-%m-%d', '%d %b %Y', '%d %B %Y'):
        try:
            return datetime.strptime(label, absolute_format).strftime(date_format)
        except ValueError:
            continue
    return None

In [None]:
def _fallback_publish_date(result):
    """Return the search result's 'date' label as YYYY-MM-DD, or None if absent/unusable."""
    raw_date = result.get('date')
    if not raw_date:
        return None
    try:
        return relative_date_to_absolute(raw_date)
    except (TypeError, ValueError):
        # A label the converter cannot interpret must not abort the whole batch.
        return None


def _fetch_article(link):
    """Download, parse and NLP-process `link`; return None when there is no link or newspaper fails."""
    if not link:
        return None
    try:
        fetched = Article(link, language='en')
        fetched.download()
        fetched.parse()
        fetched.nlp()
    except ArticleException:
        return None
    return fetched


def process_organic_results(results):
    """Build one summary record per search result.

    For every result the linked page is fetched with newspaper. When that
    works, the record is filled from the article; when it fails (or the result
    has no link), the record falls back to the title/snippet of the search
    result itself.

    Args:
        results: Iterable of search-result dicts. The keys read are 'link',
            'title', 'snippet', 'date' and 'source'; missing keys yield
            ``None`` values instead of raising.

    Returns:
        A list of dicts with the keys 'title', 'authors', 'summary',
        'full_text', 'publish_date' and 'source', in input order.
    """
    # Page bodies that are placeholders rather than article text; for these the
    # summary and full text are blanked out.
    irrelevant_texts = (
        "You have permission to edit this article.\n\nEdit Close",
        "Some other irrelevant text",
    )

    similar_article_info = []
    for result in results:
        fallback_date = _fallback_publish_date(result)
        fetched = _fetch_article(result.get('link'))

        if fetched is None:
            # Scraping failed: use what the search result itself provides.
            article_dict = {
                'title': result.get('title'),
                'authors': None,
                'summary': result.get('snippet'),
                'full_text': None,
                'publish_date': fallback_date,
            }
        else:
            is_placeholder = fetched.text in irrelevant_texts
            if fetched.publish_date:
                publish_date = str(fetched.publish_date.date())
            else:
                # No date on the page: fall back to the search result's label
                # (None when the result carries no date either).
                publish_date = fallback_date
            article_dict = {
                'title': fetched.title,
                'authors': fetched.authors,
                'summary': '' if is_placeholder else fetched.summary,
                'full_text': '' if is_placeholder else fetched.text,
                'publish_date': publish_date,
            }

        article_dict['source'] = result.get('source')
        similar_article_info.append(article_dict)
    return similar_article_info


In [None]:
# Try the pipeline on a single search result (index 1) before running it on all of them.
# NOTE(review): indexing [1] raises IndexError if the search returned fewer than
# two results — confirm that skipping index 0 is intentional.
similar_article_info = process_organic_results([organic_results[1]])
similar_article_info