# Google News Scraper

In [3]:
!pip install gnews
!pip install newspaper3k
!pip install lxml_html_clean
!pip install googlenewsdecoder

Collecting gnews
  Downloading gnews-0.4.0-py3-none-any.whl.metadata (19 kB)
Collecting feedparser~=6.0.2 (from gnews)
  Downloading feedparser-6.0.11-py3-none-any.whl.metadata (2.4 kB)
Collecting dnspython (from gnews)
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Collecting sgmllib3k (from feedparser~=6.0.2->gnews)
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading gnews-0.4.0-py3-none-any.whl (18 kB)
Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: sgmllib3k
  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
  Created wheel for sgmllib3k: filename=s

In [5]:
import pandas as pd
import numpy as np
from gnews import GNews
import datetime
from newspaper import Article, fulltext
from googlenewsdecoder import gnewsdecoder
from tqdm import tqdm
tqdm.pandas()
import random
import requests
from nltk.tokenize import sent_tokenize
pd.set_option('display.max_rows', 200)

In [6]:
# Colab-only step: mount Google Drive at /content/drive for this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
def decode(url, min_interval=10, max_interval=20):
    '''
    function to convert google news hashed rss link back into original article link from domain

    url: str
    min_interval: number, lower bound of the random interval passed to gnewsdecoder (default 10)
    max_interval: number, upper bound of the random interval passed to gnewsdecoder (default 20)
    returns: str, or None when the link is missing or could not be decoded

    extraction from google news returns a google news link to the article which passes through a redirect
    newspaper3k and scrapers have difficulty with the redirect and therefore need the original link
    gnewsdecoder is able to get the link.

    process:
    1. skip missing links (None, NaN, empty strings) without contacting google at all
    2. set interval for sleep
       as we are sending requests to google to get the redirects, too many repeated requests acts similarly
       to a rapid scraper and leads to rate limits. the interval is drawn at random from
       [min_interval, max_interval] (presumably seconds - confirm against the googlenewsdecoder docs)
    3. attempt to get the url and return as a string. if any kind of error, return None and skip
    '''
    # a missing value can never be decoded, so do not spend a rate-limited request on it
    if not isinstance(url, str) or not url.strip():
        return None

    interval_time = random.uniform(min_interval, max_interval)

    try:
        decoded = gnewsdecoder(url, interval=interval_time)

        if decoded.get("status"):
            return decoded["decoded_url"]

        # .get so that a response without a message is still reported as "cannot decode"
        print("Cannot decode:", decoded.get("message"))
    except Exception as e:
        # deliberate best-effort: one bad link must not abort a long progress_apply run
        print(f"Error occurred: {e}")

    return None

In [8]:
# Build the GNews client and run the search.
# A single query yields at most 100 results, even when more exist in the window,
# so thorough coverage of a period requires splitting it into shorter windows.

search_window = dict(
    # period='7d',
    start_date=datetime.datetime(2001, 1, 1),
    end_date=datetime.datetime(2005, 12, 31),
)
google_news = GNews(language='en', country='US', max_results=100, **search_window)

r = google_news.get_news('market')

In [9]:
# convert search to df
# Build a DataFrame from the search results held in `r`; the bare `df.shape`
# expression displays its (rows, columns) as the cell output.
df = pd.DataFrame(r)
df.shape

(100, 5)

In [15]:
# df = pd.DataFrame() ## first run only
# df = pd.concat([df,pd.DataFrame(r)])
# df.shape

(21, 5)

In [10]:
# Run decode() on every value of the `url` column and store the results in a new `links`
# column. progress_apply shows a tqdm progress bar; decode() returns None for links it
# cannot resolve.
df['links'] = df.url.progress_apply(decode)

  2%|▏         | 2/100 [00:13<11:02,  6.76s/it]


KeyboardInterrupt: 

In [27]:
# df.to_csv('market_2005.csv',index=False)

In [11]:
# news is just a concat of the dfs if multiple (i.e. for time intervals)
# news = pd.concat([pd.read_csv('sp500_2005.csv'),pd.read_csv('market_2005.csv')]).reset_index(drop=True)
# NOTE(review): `news` is only assigned in the commented-out line above, so on a fresh
# kernel this cell raises NameError - confirm how `news` is meant to be loaded.
news.head()

Unnamed: 0,title,description,published date,url,publisher,short
0,Targeting stocks in 2002 - CNN,Targeting stocks in 2002 CNN,"Mon, 31 Dec 2001 08:00:00 GMT",https://news.google.com/rss/articles/CBMib0FVX...,"{'href': 'https://money.cnn.com', 'title': 'CNN'}",https://money.cnn.com/2001/12/31/markets/strat...
1,Market Holds Little Risk for Privatized Social...,Market Holds Little Risk for Privatized Social...,"Thu, 03 May 2001 07:00:00 GMT",https://news.google.com/rss/articles/CBMiSEFVX...,"{'href': 'https://www.mackinac.org', 'title': ...",https://www.mackinac.org/V2001-19
2,"Stocks fall to 3-year lows - Sep. 17, 2001 - CNN","Stocks fall to 3-year lows - Sep. 17, 2001 CNN","Mon, 17 Sep 2001 07:00:00 GMT",https://news.google.com/rss/articles/CBMiaEFVX...,"{'href': 'https://money.cnn.com', 'title': 'CNN'}",https://money.cnn.com/2001/09/17/markets/marke...
3,"Dow digs out of a hole - Apr. 27, 2001 - CNN","Dow digs out of a hole - Apr. 27, 2001 CNN","Fri, 27 Apr 2001 07:00:00 GMT",https://news.google.com/rss/articles/CBMiaEFVX...,"{'href': 'https://money.cnn.com', 'title': 'CNN'}",https://money.cnn.com/2001/04/27/markets/marke...
4,Cold Calculation Of Terror - Forbes,Cold Calculation Of Terror Forbes,"Tue, 28 May 2002 07:00:00 GMT",https://news.google.com/rss/articles/CBMiXkFVX...,"{'href': 'https://www.forbes.com', 'title': 'F...",https://www.forbes.com/2002/05/28/0529simons.html


In [34]:
# link = news.links[0]
# link

'https://money.cnn.com/2001/12/31/markets/strategists/index.htm'

In [15]:
def _lead_excerpt(body, min_words=100, max_words=150):
    '''
    approximate the lead paragraph of an article body

    body: str
    returns: str

    take the first two chunks delimited by a period followed by a newline. if that span holds more
    than min_words words it is returned as is; otherwise it is most likely noise (author, figure,
    etc), so return the first max_words words of the whole body instead.
    '''
    lead = ' '.join(body.split('.\n')[:2])
    if len(lead.split()) > min_words:
        return lead
    return ' '.join(body.split()[:max_words])


def process_text(link, min_words=100, max_words=150, timeout=10):
    '''
    this function uses newspaper3k to extract the approximate first paragraph from the articles
    link: str
    min_words: int, minimum word count for the "first two paragraphs" excerpt to be trusted (default 100)
    max_words: int, number of leading words taken when that excerpt is too short (default 150)
    timeout: number, seconds to wait for the fallback requests.get call (default 10)
    returns: text (str), or None when the link is missing or nothing could be extracted

    process:
    1. skip missing links (None, NaN, empty strings) - decoding can fail and leave such values behind
    2. initialize article object with newspaper3k by passing in article link
    3. newspaper initial steps require to download and parse, then begin extraction
    4. extraction:
      a. attempt to extract via newspaper3k built in functionality:
         newspaper3k is able to locate the text body of the article as an object (article) attribute.
         generally, paragraphs split by newline, so looking for about 2 sentences worth (to match nyt)
           by splitting with period and newline and taking the first 2 instances.
         however, sometimes due to extras being caught in the text attribute (author, figure, etc), put
            a limit of min_words words minimum to use this method, otherwise take the first max_words words.
            (to try and capture enough words to surpass the noise and include a full thought)
      ---
      b. attempt to extract via entire page html:
         sometimes newspaper3k fails, but the page can still be fetched with requests and passed to
         newspaper's fulltext(). the same excerpt rule is then applied to that text.
         the page is fetched once, and the excerpt is taken from the fetched text.
    5. return text
    '''
    if not isinstance(link, str) or not link.strip():
        return None

    try:
        article = Article(link)
        article.download()
        article.parse()
        return _lead_excerpt(article.text, min_words, max_words)
    except Exception:
        # Exception (not a bare except) so Ctrl-C can still stop a long progress_apply run;
        # any extraction error falls through to the raw-html fallback below
        pass

    try:
        # without a timeout a stalled server would block the whole run indefinitely
        page = requests.get(link, timeout=timeout)
        return _lead_excerpt(fulltext(page.text), min_words, max_words)
    except Exception:
        # best-effort: a missing extraction is recorded as None rather than raised
        return None

In [16]:
# extract ~first paragraph
# Apply process_text() to every link in the `short` column (with a tqdm progress bar) and
# store the result in a new `text` column; entries may be None where extraction failed.
news['text'] = news.short.progress_apply(process_text)

100%|██████████| 99/99 [01:11<00:00,  1.38it/s]


In [18]:
# cleanup
# Replace newline characters remaining in the extracted text with spaces.
news.text = news.text.str.replace('\n',' ')

In [19]:
# news.to_csv('news.csv',index=False)

In [92]:
# trying to find a good number of words to stop at
# Summary statistics of the whitespace-split word count of each `lead_paragraph` value.
# NOTE(review): `nyt` is not defined in any visible cell - presumably an NYT dataset loaded
# elsewhere; confirm before relying on Restart & Run All.
nyt['lead_paragraph'].str.split().str.len().describe()

Unnamed: 0,lead_paragraph
count,163549.0
mean,35.734318
std,18.051906
min,0.0
25%,26.0
50%,34.0
75%,44.0
max,350.0


Notes

1. Missing extraction: I think these can just be ignored and are not worth the effort.
2. The extracted text contains a lot of noise, such as author names and figure captions. I don't have a good blanket way of dealing with this unless we decide to skip a buffer of words at the beginning of each extraction, which could cut into the beginning of the content. If the noise is in the middle of the content, I am not sure it can be removed easily.

# manual (irrelevant)

In [None]:
# Parameters for the manual Google News RSS search. start_date / end_date are interpolated
# into the search URL built in a later cell.
# NOTE(review): `query` is not referenced by any visible cell (the URL hard-codes "economy").
query = 'business'
start_date = '2000-01-01'
end_date = '2010-01-31'

In [None]:
# business code = CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx6TVdZU0FtVnVHZ0pWVXlnQVAB

In [None]:
# Google News RSS search URL limited by the before:/after: operators to the
# start_date..end_date window, with US / English locale parameters (ceid, hl, gl).
# NOTE(review): the search term is hard-coded to "economy"; the `query` variable defined
# earlier is not used - confirm which one is intended.
newslist = f'https://news.google.com/rss/search?q="economy"+before:{end_date}+after:{start_date}&ceid=US:en&hl=en-US&gl=US'

In [None]:
newslist

'https://news.google.com/rss/search?q="economy"+before:2010-01-31+after:2000-01-01&ceid=US:en&hl=en-US&gl=US'

In [None]:
# feedparser is installed as a gnews dependency but is not imported by the imports cell,
# so without this line the cell raises NameError on a fresh kernel.
import feedparser

# Fetch and parse the RSS search feed built in `newslist`.
feed = feedparser.parse(newslist)

# Extract relevant data: one record per feed entry. `published` is parsed into a
# timestamp when the entry carries that field and left as None otherwise.
data = [
    {
        "title": entry.title,
        "url": entry.link,
        "published": pd.to_datetime(entry.published) if "published" in entry else None,
    }
    for entry in feed.entries
]