In [1]:
import os
import pandas as pd
import requests
import json
import time
import dateutil
import datetime
from dateutil.relativedelta import relativedelta
import urllib3
from dotenv import load_dotenv
import logging
# from marshmallow import schema,fields
# from azure.cosmos import exceptions, CosmosClient, PartitionKey
import pymongo
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.basicConfig(filename="logs/scraping.log",level=logging.INFO)

In [2]:
load_dotenv('.env') 

True

In [3]:
API_KEY = os.environ.get('NEW_YORK_TIMES_API_KEY')
COSMOS_CONNECTION_STRING = os.environ["COSMOS_CONNECTION_STRING"]
COSMOS_DB_DATABASE = os.environ["COSMOS_DB_DATABASE"]
COSMOS_DB_COLLECTION = os.environ["COSMOS_DB_COLLECTION"]

In [4]:
class AzureCosmosConnector():
    def __init__(
        self,
        conn_string=COSMOS_CONNECTION_STRING,
        db_name=COSMOS_DB_DATABASE,
        collection_name = COSMOS_DB_COLLECTION
    ):

        self.conn_string = conn_string
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = self.db_connect()
        # connect database
        
    def db_connect(self):
        try:
            self.client = pymongo.MongoClient(self.conn_string)
            try:
                print(self.client.server_info())
                return self.client
            except Exception as e:
                logging.WARNING(f"Get DB server info failed: {e}")
                return e
        except  pymongo.errors.ConnectionFailure as e:
            logging.WARNING(f"DB Connection failed: {e}")
            return e
    def stop_client(self):
        """Gracefully close the Mongo client"""

        self.client.close()
        self.client = None

    # def get_db(self):
    #     """Get the <db_name> database"""

    #     # db = self.().get_database(COSMOS_DB_DATABASE)
    #     # if db is None:
    #     #     raise Exception({"message": "Database not found"})
    #     # return db

    def insert_articles(self,articles=pd.DataFrame):
        db = self.client[self.db_name]
        collection = db[self.collection_name]
        articles = json.loads(articles.T.to_json()).values()
        collection.insert(articles)
        pass
    def select(self):
        pass
    def delete(self):
        pass
    def update(self):
        pass

In [26]:
class NYT_SCRAPER:
    def __init__(
            self,api_key,query,
            news_desk,news_type,type_of_material,
            begin_date,end_date):
        self.api_key = api_key
        self.news_desk = news_desk
        self.query = query
        self.news_type = news_type
        self.type_of_material = type_of_material
        self.begin_date = begin_date
        self.end_date = end_date
    
    def requests_query(self):
        
    
    def nytimes_query(self):
        # Set the base url for the query
        base_url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?q={query}'

            # Empty dictionary for filters
        filter_queries = {}
        # empty dictionary for dates
        dates = {}
        # Populate the filter dictionary
        if self.news_desk:
            filter_queries.update({'news_desk': self.news_desk})
            print(filter_queries)
        if self.news_type:
            filter_queries.update({'news_type': self.news_type})
            print(filter_queries)
        if self.type_of_material:
            filter_queries.update({'type_of_material': self.type_of_material})
            print(filter_queries)
        # Populate the date dictionary
        if self.begin_date:
            dates.update({'begin_date': self.begin_date})
            print(dates)
        if self.end_date:
            dates.update({'end_date':self.end_date})
            print(dates)
        # If 1 filter is present, and/or date params, add to URL and execute query
        if len(filter_queries) == 0:
            if len(dates) == 1:
                base_url += f'&{list(dates.keys())[0]}={list(dates.values())[0]}'
                print(base_url)
            elif len(dates) == 2:
                base_url += '&'
                for i in dates.keys():
                    base_url += f'{i}={dates[i]}&'
                print(base_url)
        elif len(filter_queries) == 1:
            base_url += f'&fq={list(filter_queries.keys())[0]}:("{list(filter_queries.values())[0]}")'
            print(base_url)
            if len(dates) == 1:
                base_url += f'&{list(dates.keys())[0]}={list(dates.values())[0]}'
                print(base_url)
            elif len(dates) == 2:
                base_url += '&'
                for i in dates.keys():
                    base_url += f'{i}={dates[i]}&'
                print(base_url)
        # If 2 or more filters are present, concatenate with AND, add dates if present and execute
        elif  len(filter_queries) > 1:
            base_url += '&fq='
            for i in filter_queries.keys():
                base_url += f'{i}:({filter_queries[i]}) AND '
            # remove the last 'AND ' at the end of the loop
            base_url = base_url[:-5]
            print(base_url)

            if len(dates) == 1:
                base_url += f'&{list(filter_queries.keys())[0]}={list(filter_queries.values())[0]}&'
                print(base_url)

            elif len(dates) == 2:
                base_url += '&'
                print(base_url)

                for i in dates.keys():
                    base_url += f'{i}={dates[i]}&'
                    print(base_url)

        # concatenate page number and api key and make the request.
        # Returns a truncated JSON indexed past the metadata
        # If you want the full json, simply remove ['response']['docs']
        # from the return line
        base_url += f'&page={n_page}'
        base_url += f'&api-key={api_key}'
        print(f'Final query : {base_url}')
        r = requests.get(base_url)
        json_data = r.json()
        return r.json()['response']['docs']


In [None]:
# Main

# DB connection
db = AzureCosmosConnector()
db_client = db.db_connect()

# initiate module
scraper = NYT_SCRAPER()
# company name list for searching articles
query_list = ['Microsoft','Tesla']

# setting date
begin_date = '20220101'
end_date='20220105'
date_index = pd.date_range(start=begin_date,end=end_date)
date_list = date_index.strftime("%Y%m%d").to_list()
date_list

['20220101', '20220102', '20220103', '20220104', '20220105']

In [28]:
#Tesla,Microsoft
# Business Day,Business, Technology
# begin_date = '20210101' end_date = '20240628'
query = scraper.nytimes_query(api_key= API_KEY,query='Nvidia',news_desk='Business',begin_date='20220101',end_date='20220131')
# query = scraper.nytimes_query(API_KEY,query='Microsoft',news_desk=["Business Day","Business", "Technology"],begin_date='20220102',end_date='20220102')

query

{'news_desk': 'Business'}
{'begin_date': '20220101'}
{'begin_date': '20220101', 'end_date': '20220131'}
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Nvidia&fq=news_desk:("Business")
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Nvidia&fq=news_desk:("Business")&begin_date=20220101&end_date=20220131&
Final query : https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Nvidia&fq=news_desk:("Business")&begin_date=20220101&end_date=20220131&&page=0&api-key=YPrMAd93hm7pmTqscIQVlnGZiv1EwSOI


[{'abstract': 'Some big events are delaying their return to in-person gatherings, but the organizers of CES, the Winter Olympics and the Australian Open have decided it’s time to gather in person again.',
  'web_url': 'https://www.nytimes.com/2022/01/05/business/ces-omicron.html',
  'snippet': 'Some big events are delaying their return to in-person gatherings, but the organizers of CES, the Winter Olympics and the Australian Open have decided it’s time to gather in person again.',
  'lead_paragraph': 'After going virtual last year, CES, a mega-conference in Las Vegas that’s the traditional launchpad for many of the tech industry’s latest gadgets, is trying to make a comeback. The trade show kicks off on Wednesday, with an estimated 2,200 exhibitors set to show up in person.',
  'source': 'The New York Times',
  'multimedia': [{'rank': 0,
    'subtype': 'xlarge',
    'caption': None,
    'credit': None,
    'type': 'image',
    'url': 'images/2022/01/05/multimedia/05db-newsletter-ces/05

In [29]:
tmp = pd.DataFrame(query)
tmp

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,print_section,print_page,subsection_name
0,Some big events are delaying their return to i...,https://www.nytimes.com/2022/01/05/business/ce...,Some big events are delaying their return to i...,"After going virtual last year, CES, a mega-con...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Is it time for big in-person events ...,"[{'name': 'subject', 'value': 'Coronavirus (20...",2022-01-05T14:01:29+0000,article,Business,Business Day,"{'original': 'By Michael J. de la Merced', 'pe...",News,nyt://article/093785c6-b590-572f-8a85-5da3fda5...,363,nyt://article/093785c6-b590-572f-8a85-5da3fda5...,,,
1,"After a record year of transactions in 2021, d...",https://www.nytimes.com/2022/01/08/business/de...,"After a record year of transactions in 2021, d...",The DealBook newsletter delves into a single t...,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'What’s Next for the Deal Boom?', 'ki...","[{'name': 'subject', 'value': 'Mergers, Acquis...",2022-01-08T13:00:04+0000,article,Business,Business Day,"{'original': 'By Michael J. de la Merced', 'pe...",News,nyt://article/58c56b6f-f600-5014-bb6a-96945b80...,917,nyt://article/58c56b6f-f600-5014-bb6a-96945b80...,B,6.0,DealBook
2,"Stocks recovered after a sharp drop, but they ...",https://www.nytimes.com/2022/01/25/business/de...,"Stocks recovered after a sharp drop, but they ...","So far, 2022 is off to a rocky start for the s...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'Was the Market Sell-Off Overdone?', ...","[{'name': 'subject', 'value': 'United States E...",2022-01-25T12:40:52+0000,article,Business,Business Day,"{'original': 'By Andrew Ross Sorkin, Jason Kar...",News,nyt://article/8fcc334e-daf3-597d-ad85-1d290731...,1799,nyt://article/8fcc334e-daf3-597d-ad85-1d290731...,,,DealBook
3,"Organizers insist that the conference, which f...",https://www.nytimes.com/2022/01/05/business/de...,"Organizers insist that the conference, which f...","After going virtual last year, the Consumer El...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'CES Tests an Appetite for Live Event...,"[{'name': 'subject', 'value': 'Labor and Jobs'...",2022-01-05T12:48:04+0000,article,Business,Business Day,"{'original': 'By Andrew Ross Sorkin, Jason Kar...",News,nyt://article/60b61d97-56df-5899-a1ea-185db71e...,1802,nyt://article/60b61d97-56df-5899-a1ea-185db71e...,,,DealBook
4,The exit of SoftBank’s chief operating officer...,https://www.nytimes.com/2022/01/28/business/de...,The exit of SoftBank’s chief operating officer...,SoftBank said today that Marcelo Claure was st...,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Marcelo Claure Leaves Masa Son’s Orb...,"[{'name': 'organizations', 'value': 'SOFTBANK ...",2022-01-28T12:42:52+0000,article,Business,Business Day,"{'original': 'By Andrew Ross Sorkin, Jason Kar...",News,nyt://article/227dd2c8-95bd-55ab-a8b4-2cd5c3b0...,1799,nyt://article/227dd2c8-95bd-55ab-a8b4-2cd5c3b0...,,,DealBook


'headline': {'main': 'Is it time for big in-person events again? Organizers are divided.',
   'kicker': None,
   'content_kicker': None,
   'print_headline': None,
   'name': None,
   'seo': None,
   'sub': None},

In [None]:
#+company abstract	web_url	source	headline['main']	keywords	pub_date	document_type	news_desk	section_name	type_of_material