In [3]:
import os
import pandas as pd
import requests
import json
import time
import dateutil
import datetime
from dateutil.relativedelta import relativedelta
import urllib3
from dotenv import load_dotenv
import logging
# from marshmallow import schema,fields
# from azure.cosmos import exceptions, CosmosClient, PartitionKey
import pymongo
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.basicConfig(filename="logs/scraping.log",level=logging.DEBUG)

In [4]:
load_dotenv('.env') 

True

In [3]:
API_KEY = os.environ.get('NEW_YORK_TIMES_API_KEY')
COSMOS_CONNECTION_STRING = os.environ["COSMOS_CONNECTION_STRING"]
COSMOS_DB_DATABASE = os.environ["COSMOS_DB_DATABASE"]
COSMOS_DB_COLLECTION = os.environ["COSMOS_DB_COLLECTION"]

In [None]:
class AzureCosmosConnector(object):
    def __init__(
        self,
        conn_string=COSMOS_CONNECTION_STRING,
        db_name=COSMOS_DB_DATABASE,
        collection_name = COSMOS_DB_COLLECTION
    ):

        self.conn_string = conn_string
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = pymongo.MongoClient(self.conn_string)

        
# connect database
        try:
            client = pymongo.MongoClient(conn_string)
        except pymongo.errors.ConnectionFailure:
            self.database = self.client.get_database_client(database=self.db_id)
# connect container
        try:
            self.contact_container_name = "Contact"
            self.contact_container = self.database.create_container(
                id=self.contact_container_name,
                partition_key=PartitionKey(path="/id"),
            )
        except exceptions.CosmosResourceExistsError:
            self.contact_container = self.database.get_container_client(
                self.contact_container_name
            )

       
# search contact
    def search_contact(self, querydict=None):
        query = {"query": "SELECT * FROM contact s"}
        options = {}
        options["enableCrossPartitionQuery"] = True

        if querydict:
            query["query"] += " where " + " and ".join(
                "s." + str(each_key) + "='" + str(querydict[each_key]) + "'"
                for each_key in querydict
            )

        docs = self.contact_container.query_items(
            query=query["query"],
            enable_cross_partition_query=True,
        )
        return {"Items": list(docs)}

# create contact
    def create_contact(self, doc):
        return self.contact_container.create_item(body=doc)

# delete contact
    def delete_contact(self, doc_id):
        try:
            return self.contact_container.delete_item(
                item=doc_id, partition_key=doc_id
            )
        except exceptions.CosmosResourceNotFoundError:
            return {"error": "CosmosResourceNotFoundError"}

# update contact
    def update_contact(self, doc):
        try:
            return self.contact_container.upsert_item(doc)
        except exceptions.CosmosResourceNotFoundError:
            return {"error": "CosmosResourceNotFoundError"}

In [21]:
class NYT_SCRAPER:
    def __init__(self,):
        pass
    
    def nytimes_query(
        
        api_key, query,news_desk = None, news_type = None, type_of_material = None,
        begin_date = None, end_date = None, n_page = 0
        ):
        # Set the base url for the query
        base_url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?q={query}'

            # Empty dictionary for filters
        filter_queries = {}
        # empty dictionary for dates
        dates = {}
        # Populate the filter dictionary
        if news_desk:
            filter_queries.update({'news_desk': news_desk})
            print(filter_queries)
        if news_type:
            filter_queries.update({'news_type': news_type})
            print(filter_queries)
        if type_of_material:
            filter_queries.update({'type_of_material': type_of_material})
            print(filter_queries)
        # Populate the date dictionary
        if begin_date:
            dates.update({'begin_date': begin_date})
            print(dates)
        if end_date:
            dates.update({'end_date':end_date})
            print(dates)
        # If 1 filter is present, and/or date params, add to URL and execute query
        if len(filter_queries) == 0:
            if len(dates) == 1:
                base_url += f'&{list(dates.keys())[0]}={list(dates.values())[0]}'
                print(base_url)
            elif len(dates) == 2:
                base_url += '&'
                for i in dates.keys():
                    base_url += f'{i}={dates[i]}&'
                print(base_url)
        elif len(filter_queries) == 1:
            base_url += f'&fq={list(filter_queries.keys())[0]}:("{list(filter_queries.values())[0]}")'
            print(base_url)
            if len(dates) == 1:
                base_url += f'&{list(dates.keys())[0]}={list(dates.values())[0]}'
                print(base_url)
            elif len(dates) == 2:
                base_url += '&'
                for i in dates.keys():
                    base_url += f'{i}={dates[i]}&'
                print(base_url)
        # If 2 or more filters are present, concatenate with AND, add dates if present and execute
        elif  len(filter_queries) > 1:
            base_url += '&fq='
            for i in filter_queries.keys():
                base_url += f'{i}:({filter_queries[i]}) AND '
            # remove the last 'AND ' at the end of the loop
            base_url = base_url[:-5]
            print(base_url)

            if len(dates) == 1:
                base_url += f'&{list(filter_queries.keys())[0]}={list(filter_queries.values())[0]}&'
                print(base_url)

            elif len(dates) == 2:
                base_url += '&'
                print(base_url)

                for i in dates.keys():
                    base_url += f'{i}={dates[i]}&'
                    print(base_url)

        # concatenate page number and api key and make the request.
        # Returns a truncated JSON indexed past the metadata
        # If you want the full json, simply remove ['response']['docs']
        # from the return line
        base_url += f'&page={n_page}'
        base_url += f'&api-key={api_key}'
        print(f'Final query : {base_url}')
        r = requests.get(base_url)
        json_data = r.json()
        return r.json()['response']['docs']


In [42]:
#Tesla,Microsoft
# Business Day,Business, Technology
# begin_date = '20210101' end_date = '20240628'
# query = nytimes_query(API_KEY,query='Nvidia',news_desk='Business',begin_date='20220101',end_date='20220131')
query = nytimes_query(API_KEY,query='Microsoft',news_desk=["Business Day","Business", "Technology"],begin_date='20220102',end_date='20220102')

query

{'begin_date': '20220102'}
{'begin_date': '20220102', 'end_date': '20220102'}
Final query : https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Microsoft&page=0&api-key=YPrMAd93hm7pmTqscIQVlnGZiv1EwSOI


[{'abstract': 'The tech giant has been accused of stifling competition by packaging its video conferencing app with other tools like Word and Excel.',
  'web_url': 'https://www.nytimes.com/2024/06/25/business/european-union-microsoft-teams-antitrust.html',
  'snippet': 'The tech giant has been accused of stifling competition by packaging its video conferencing app with other tools like Word and Excel.',
  'lead_paragraph': 'European Union regulators on Tuesday charged Microsoft with breaking antitrust rules by bundling its Teams video conferencing and collaboration software with a suite of other productivity tools, giving it an unfair advantage over rivals.',
  'print_section': 'B',
  'print_page': '5',
  'source': 'The New York Times',
  'multimedia': [{'rank': 0,
    'subtype': 'xlarge',
    'caption': None,
    'credit': None,
    'type': 'image',
    'url': 'images/2024/06/25/multimedia/25eu-microsoft1-gjct/25eu-microsoft1-gjct-articleLarge.jpg',
    'height': 400,
    'width': 600

In [43]:
tmp = pd.DataFrame(query)
tmp

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,subsection_name
0,The tech giant has been accused of stifling co...,https://www.nytimes.com/2024/06/25/business/eu...,The tech giant has been accused of stifling co...,European Union regulators on Tuesday charged M...,B,5.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'E.U. Charges Microsoft With Antitrus...,"[{'name': 'organizations', 'value': 'Microsoft...",2024-06-25T10:40:44+0000,article,Business,Business Day,"{'original': 'By Adam Satariano', 'person': [{...",News,nyt://article/52cb3926-2b9a-5ed9-ae48-05713fa5...,448,nyt://article/52cb3926-2b9a-5ed9-ae48-05713fa5...,
1,Brad Smith testified before a House committee ...,https://www.nytimes.com/2024/06/13/technology/...,Brad Smith testified before a House committee ...,Republican lawmakers questioned a senior Micro...,B,5.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Lawmakers Question Microsoft’s Presi...,"[{'name': 'subject', 'value': 'Computers and t...",2024-06-13T20:03:16+0000,article,Business,Technology,"{'original': 'By Karen Weise', 'person': [{'fi...",News,nyt://article/1b10293a-5061-5bf7-8a21-05531197...,854,nyt://article/1b10293a-5061-5bf7-8a21-05531197...,
2,The chip maker’s stock price has jumped over t...,https://www.nytimes.com/2024/06/18/technology/...,The chip maker’s stock price has jumped over t...,"Move over, Microsoft and Apple. The stock mark...",B,1.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Nvidia Becomes Most Valuable Public ...,"[{'name': 'subject', 'value': 'Computers and t...",2024-06-18T20:12:08+0000,article,Business,Technology,{'original': 'By Tripp Mickle and Joe Rennison...,News,nyt://article/d6132dd9-c937-5a0e-8da7-22ba9573...,856,nyt://article/d6132dd9-c937-5a0e-8da7-22ba9573...,
3,The Justice Department and the Federal Trade C...,https://www.nytimes.com/2024/06/05/technology/...,The Justice Department and the Federal Trade C...,Federal regulators have reached a deal that al...,B,1.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'U.S. Clears Way for Antitrust Inquir...,"[{'name': 'subject', 'value': 'Artificial Inte...",2024-06-06T03:38:59+0000,article,Business,Technology,"{'original': 'By David McCabe', 'person': [{'f...",News,nyt://article/94129125-9eec-5633-83ce-5b4fe711...,951,nyt://article/94129125-9eec-5633-83ce-5b4fe711...,
4,"Apple, Microsoft and Google need more access t...",https://www.nytimes.com/2024/06/23/technology/...,"Apple, Microsoft and Google need more access t...","Apple, Microsoft and Google are heralding a ne...",B,1.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'What the Arrival of A.I. Phones and ...,"[{'name': 'subject', 'value': 'Artificial Inte...",2024-06-23T04:01:25+0000,article,Business,Technology,"{'original': 'By Brian X. Chen', 'person': [{'...",News,nyt://article/12bc7d15-60b8-5064-ad3c-9f470586...,1406,nyt://article/12bc7d15-60b8-5064-ad3c-9f470586...,Personal Tech
5,“They really sort of make you feel like it’s C...,https://www.nytimes.com/2024/06/14/podcasts/ha...,“They really sort of make you feel like it’s C...,"This week we go to Cupertino, Calif., for Appl...",,,The New York Times,[],"{'main': 'Apple Joins the A.I. Party, Elon’s W...","[{'name': 'subject', 'value': 'Science and Tec...",2024-06-14T09:06:07+0000,article,Podcasts,Podcasts,"{'original': 'By Kevin Roose, Casey Newton, Ra...",News,nyt://article/8cbce16d-0aac-5643-ba0c-e739aa0a...,166,nyt://article/8cbce16d-0aac-5643-ba0c-e739aa0a...,
6,The leading companies are co-opting Silicon Va...,https://www.nytimes.com/2024/06/13/opinion/big...,The leading companies are co-opting Silicon Va...,Silicon Valley prides itself on disruption: St...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'How Big Tech Is Killing Innovation',...","[{'name': 'subject', 'value': 'Computers and t...",2024-06-13T09:03:37+0000,article,OpEd,Opinion,{'original': 'By Mark Lemley and Matt Wansley'...,Op-Ed,nyt://article/3d8e7a0c-6d0a-5ade-a986-81ea0368...,1394,nyt://article/3d8e7a0c-6d0a-5ade-a986-81ea0368...,
7,"Mr. Burgum, North Dakota’s governor, is a prim...",https://www.nytimes.com/2024/06/09/us/politics...,"Mr. Burgum, North Dakota’s governor, is a prim...",After taking his software company public in 19...,A,14.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'For Trump, Doug Burgum Emerges as a ...","[{'name': 'subject', 'value': 'Presidential El...",2024-06-09T09:02:41+0000,article,Politics,U.S.,"{'original': 'By Michael C. Bender', 'person':...",News,nyt://article/0205efe1-dcbe-5a0b-a3e9-03564ca3...,1615,nyt://article/0205efe1-dcbe-5a0b-a3e9-03564ca3...,Politics
8,Humane’s Ai Pin was supposed to free people fr...,https://www.nytimes.com/2024/06/06/technology/...,Humane’s Ai Pin was supposed to free people fr...,Days before gadget reviewers weighed in on the...,B,1.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': '‘This Is Going to Be Painful’: How a...,"[{'name': 'organizations', 'value': 'Humane In...",2024-06-06T09:02:53+0000,article,Business,Technology,{'original': 'By Tripp Mickle and Erin Griffit...,News,nyt://article/701029fb-c8a3-5947-9d7f-ca5834a6...,1448,nyt://article/701029fb-c8a3-5947-9d7f-ca5834a6...,
9,“Did you ever think we would have a literal Av...,https://www.nytimes.com/2024/05/24/podcasts/sc...,“Did you ever think we would have a literal Av...,"This week, more drama at OpenAI: The company w...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'ScarJo vs. ChatGPT, Neuralink’s Firs...","[{'name': 'subject', 'value': 'Science and Tec...",2024-05-24T09:05:37+0000,article,Podcasts,Podcasts,"{'original': 'By Kevin Roose, Casey Newton, Ra...",News,nyt://article/8cc69ccb-a762-5774-b341-000f726f...,203,nyt://article/8cc69ccb-a762-5774-b341-000f726f...,
