Neste notebook é feita uma requisição automática à News API a cada hora (sempre na hora cheia, ou seja, em horários terminados em 00 minutos).
A resposta de cada requisição é, então, salva em uma tabela, que é atualizada com os novos dados obtidos a cada hora.

In [0]:
import requests
import json
import time
import urllib.parse
from datetime import datetime, timedelta
from pyspark.sql.types import StructType, StructField, StringType, MapType
from pyspark.sql.functions import col 

In [0]:
%sql
-- Remove the table_articles table, if present, before the ingestion loop runs.
-- NOTE(review): the ingestion cell writes this table with an explicit 'path'
-- option; confirm whether this DROP also removes the Parquet files stored at
-- that path, otherwise previously written rows stay on disk.
DROP TABLE IF EXISTS table_articles

In [0]:
# News API home page / documentation: https://newsapi.org/

# Minute of the hour that the polling loop compares the clock against
# before issuing a request ('00' = on the hour).
MINUTE_TO_RUN_REQUEST = '00'  # 00

# NOTE(review): the API key is hard-coded in the notebook; consider moving it
# to a secret store. The two commented values below look like alternate keys.
# 7076253965e946c7bd9fce398dac2626
# 8e7ff5750c804b09a640c4a0509ea78d
API_KEY = '0264241255ba4cf2a91242c8c66784a7'

# Search expression for the `q` parameter, plus its percent-encoded form
# ready to be placed in the request URL.
key_words = ' '.join(('research', 'AND', 'medicinal cannabis', 'NOT', 'recreative'))
key_words_url_encoded = urllib.parse.quote(key_words)

# Schema applied to the 'articles' list of the API response. Every field is a
# nullable string except 'source', which is a nullable string-to-string map.
schema = StructType([
    StructField(
        field_name,
        MapType(StringType(), StringType(), True) if field_name == 'source' else StringType(),
        True,
    )
    for field_name in (
        'author',
        'content',
        'description',
        'publishedAt',
        'source',
        'title',
        'url',
        'urlToImage',
    )
])

def request(date_from):
    """Query the News API `everything` endpoint and return the decoded JSON body.

    The query uses the module-level `key_words_url_encoded` search expression
    and `API_KEY`, with results sorted by publication date.

    Args:
        date_from: Oldest publication date/time to include; sent as the
            `from` query parameter.

    Returns:
        The response body parsed from JSON.

    Raises:
        requests.exceptions.RequestException: On connection problems or when
            the server does not answer within the timeout.
    """
    # Percent-encode the date as well: a value such as str(datetime) contains
    # a space and colons, which must not be placed raw into a query string.
    date_from_url_encoded = urllib.parse.quote(str(date_from))

    url = ('https://newsapi.org/v2/everything?'
        'q={}&'
        'from={}&'
        'sortBy=publishedAt&'
        'apiKey={}').format(key_words_url_encoded, date_from_url_encoded, API_KEY)

    # Without a timeout, requests can wait forever on a stalled connection,
    # which would freeze the hourly polling loop.
    response = requests.get(url, timeout=30)
    resposta_json = response.json()

    return resposta_json

# Oldest publication time for the first request: 2 days and 1 hour ago, in
# ISO 8601 form (str(datetime) would put a space between date and time).
# NOTE(review): datetime.now() is naive local time, while the `publishedAt`
# values seen in the output carry a 'Z' (UTC) suffix — confirm the driver
# clock is UTC.
date_from = (datetime.now() - timedelta(days=2, hours=1)).isoformat(timespec='seconds')
articles_date_exist = False

TABLE_NAME = 'table_articles'
TABLE_PATH = '/FileStore/tables/table_articles'

# Compare minutes as integers: str(datetime.now().minute) yields '0', which
# never equals the zero-padded '00' configured in MINUTE_TO_RUN_REQUEST.
run_minute = int(MINUTE_TO_RUN_REQUEST)

while True:
    if datetime.now().minute != run_minute:
        time.sleep(10)  # wait between clock checks instead of spinning the CPU
        continue

    resposta_json = request(date_from)
    if resposta_json.get('status') != 'ok':
        # Per the News API docs, error responses carry `code` and `message`
        # instead of `articles`; report the problem and try again next hour.
        print(f"Request failed: {resposta_json.get('code')} - {resposta_json.get('message')}")
        print('---------------------')
        time.sleep(100)  # 100 seconds, so the trigger minute has passed
        continue

    df_data_articles = spark.createDataFrame(resposta_json['articles'], schema=schema)
    # Overwrite the map column 'source' with a string column holding only the
    # source name (the source id is discarded).
    df_data_articles = df_data_articles.withColumn('source', col('source.name'))

    new_date_from = []
    if spark.catalog.tableExists(TABLE_NAME):
        df_temp = spark.read.parquet(TABLE_PATH)
        # Keep only the rows that are not already stored.
        df_data_articles = df_data_articles.subtract(df_temp)
        # Most recent publication date already stored.
        new_date_from.append(df_temp.agg({'publishedAt': 'max'}).first()[0])

    # Most recent publication date among the newly fetched rows.
    new_date_from.append(df_data_articles.agg({'publishedAt': 'max'}).first()[0])

    # The aggregate is None for an empty DataFrame, so drop missing values.
    new_date_from = [published_at for published_at in new_date_from if published_at is not None]
    if new_date_from:  # otherwise date_from keeps its previous value
        articles_date_exist = True
        date_from = max(new_date_from)

    df_data_articles = df_data_articles.filter(~col('source').contains('[Removed]'))
    df_data_articles = df_data_articles.dropDuplicates()

    # Count before writing: the DataFrame is lazy and its plan reads the same
    # path the append below writes to, so counting afterwards could be
    # re-evaluated against the updated data.
    new_articles_count = df_data_articles.count()

    (df_data_articles.write
        .mode('append')
        .format('parquet')
        .option('path', TABLE_PATH)
        .saveAsTable(TABLE_NAME))

    print(f'Request done! Total NEW articles found: {new_articles_count}')
    print(f'Most recent article date/hour: {date_from}')
    print('---------------------')

    time.sleep(100) # 100 seconds, so the same trigger minute is not hit twice

Request done! Total NEW articles found: 6
Most recent article date/hour: 2023-10-03T19:00:36Z
---------------------
