# Google News Version

In [1]:
# Imports the JSON library to open search parameters
import json

# Opens the parameters file
parameters_file = open('./news_parameters.json', 'r')

# Reads the parameters file and converts to dict
parameters = json.loads(parameters_file.read())

In [2]:
class Article(object):
    
    def __init__(self, p_name, p_key, p_link, p_lang):
        self.name = p_name
        self.key = p_key
        self.link = p_link
        self.lang = p_lang
        
    def __str__(self):
        return self.name
    
    def __eq__(self, other):
        if not isinstance(other, Article):
            return NotImplemented
        
        return self.key == other.key
        

In [3]:
# Imports the required package or installs it if not found
try:
    from pygooglenews import GoogleNews
except ModuleNotFoundError as e:
    print("GoogleNews package is not installed. Intalling...")
    !python3 -m pip install pygooglenews
    
# Imports datetime to handle dates
import datetime

In [9]:
# Stop flag for the iteration loop
finish = False

# List to store the articles information
articles_list = []

# Retrieves the date start parameters
START_YEAR = parameters["start_year"]
START_MONTH = parameters["start_month"]
START_DAY = parameters["start_day"]

# Retrieves the final date parameters
END_YEAR = parameters["end_year"]
END_MONTH = parameters["end_month"]
END_DAY = parameters["end_day"]

# Creates the date object
date = datetime.datetime(START_YEAR, START_MONTH, START_DAY)

# Iterates until the end 
while not finish:
    
    # Creates the start date string
    START = "{}-{}-{}".format(date.year, date.month, date.day)
    
    # Creates the end date string
    end_date = date + datetime.timedelta(days=1)
    END = "{}-{}-{}".format(end_date.year, end_date.month, end_date.day)
    
    # Prints feedback information
    print("Date: {}. Documents: {}".format(START, len(articles_list)), end='\r')
    
    # Iterates over countries
    for country in parameters["countries"]:
        
        # Iterates over languages
        for lang in parameters["languages"]:
            
            # Creates the API instance
            api = GoogleNews(lang=lang, country=country)
            
            # Iterate over keywords
            for kw in parameters["keywords"]:
            
                # Asks a response from the API
                response = api.search(kw, from_=START, to_=END)

                # Checks that more than one document is retrieved
                
                # Iterates over every response
                for entry in response["entries"]:

                    article = Article(entry["title"], entry["id"], entry["link"], lang)

                    
                    if article not in articles_list:
                        articles_list.append(article)
    
    # Checks the stop conditions
    stop_year = date.year == END_YEAR
    stop_month = date.month == END_MONTH
    stop_day = date.day == END_DAY
    
    # Stops if all three conditions are met
    if stop_year and stop_month and stop_day:
        finish = True
        
    # Increments date by one day
    else:
        date += datetime.timedelta(days=1)


Date: 2021-1-20. Documents: 575842

KeyError: 'link'

In [24]:
for i, article in enumerate(articles_list):

    # Sets filename
    filename = "./../../data/news/URL_{0:06d}.json".format(i)
    
    data ={
        "name" : article.name,
        "key" : article.key,
        "link" : article.link,
        "lang" : article.lang
    }
    
    # 
    with open(filename, 'w') as file:
    
        # 
        writer = json.dump(data, file)


In [25]:
count = {
    "en":0,
    "es":0,
    "fr":0,
    "it":0
}


for article in articles_list:
    count[article.lang] += 1
    

In [26]:
print(count)

{'en': 22734, 'es': 7431, 'fr': 17012, 'it': 10424}


In [27]:
print(len(articles_list))

57601
