In [1]:
import os
import pandas as pd
import requests
import json
import time
import dateutil
import datetime
from dateutil.relativedelta import relativedelta
import urllib3
from dotenv import load_dotenv
import logging
# from marshmallow import schema,fields
# from azure.cosmos import exceptions, CosmosClient, PartitionKey
import pymongo
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.basicConfig(filename="logs/scraping.log",level=logging.INFO)

In [2]:
load_dotenv('.env') 

True

In [3]:
API_KEY = os.environ.get('NEW_YORK_TIMES_API_KEY')
COSMOS_CONNECTION_STRING = os.environ["COSMOS_CONNECTION_STRING"]
COSMOS_DB_DATABASE = os.environ["COSMOS_DB_DATABASE"]
COSMOS_DB_COLLECTION = os.environ["COSMOS_DB_COLLECTION"]

In [7]:
class AzureCosmosConnector():
    def __init__(
        self,
        conn_string=COSMOS_CONNECTION_STRING,
        db_name=COSMOS_DB_DATABASE,
        collection_name = COSMOS_DB_COLLECTION
    ):

        self.conn_string = conn_string
        self.db_name = db_name
        self.collection_name = collection_name
        self.client = self.db_connect()
        self.db = self.client[self.db_name]
        self.collection = self.db[self.collection_name]
        # connect database
        
    def db_connect(self):
        try:
            self.client = pymongo.MongoClient(self.conn_string)
            try:
                print(self.client.server_info())
                return self.client
            except Exception as e:
                logging.WARNING(f"Get DB server info failed: {e}")
                return e
        except  pymongo.errors.ConnectionFailure as e:
            logging.WARNING(f"DB Connection failed: {e}")
            return e
    def stop_client(self):
        """Gracefully close the Mongo client"""

        self.client.close()
        self.client = None

    # def get_db(self):
    #     """Get the <db_name> database"""

    #     # db = self.().get_database(COSMOS_DB_DATABASE)
    #     # if db is None:
    #     #     raise Exception({"message": "Database not found"})
    #     # return db

    def insert_articles(self,articles=pd.DataFrame):
        db = self.client[self.db_name]
        collection = db[self.collection_name]
        # articles = json.loads(articles.T.to_json()).values()
        collection.insert_many(articles.to_dict('records'))
        pass
    def select(self):
        pass
    def delete(self):
        pass
    def update(self):
        pass

In [10]:
class NYT_SCRAPER:
    def __init__(
            self,api_key,query,
            begin_date,end_date):
        self.api_key = api_key
        self.query = query
        self.begin_date = begin_date
        self.end_date = end_date
        self.db_client = AzureCosmosConnector().db_connect()

    def scraping_nyt(self):
        date_list =  pd.date_range(start=self.begin_date,end=self.end_date).strftime("%Y%m%d").to_list()
        final_articles = pd.DataFrame()
        for date in date_list:
            articles = self.requests_query(date)
            
            final_articles = pd.concat([final_articles,articles])
        print(f'ARTICLES INFO :::::{final_articles.info()}')
        return final_articles
    
    def requests_query(self):
        articles_df_with_query = pd.DataFrame(columns=['query','abstract','web_url','source','headline','keywords','pub_date','document_type','news_desk','section_name','type_of_material'])
        for q in self.query:
            base_url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?q={q}'
            base_url+= '&fq=news_desk:(\"Business\",\"Business Day\",\"Technology\")'
            # AND news_type:(\"News\",\"Article\")
            base_url += f'&begin_date={self.begin_date}&end_date={self.end_date}'
            
            n_page = 0
            articles = []
            while True:
                base_url_with_page = base_url+ f'&page={n_page}'
                base_url_with_page += f'&api-key={self.api_key}'
                print(f'REQUEST QUERY : {base_url_with_page}')
                try:
                    r = requests.get(base_url_with_page,verify=False)
                    
                except Exception as e:
                    print(f"REQUEST ERROR::::{e}")
                    break
                try:
                    response = r.json()['response']['docs'] 
                except Exception as e:
                    print(f"There is no response ERROR::::{e}::::::::::{r}")
                    break
                articles.extend(response)
                print(f'LENGTH OF ARTICLES LIST ::::::::{q}::::::::::{len(articles)}')
                if len(response) <10:
                    break
                n_page +=1
                time.sleep(12)
            articles_df = pd.DataFrame(articles)
            print(f'@@@@@@{q}@@@@@ARTICLES DF INFO :::::{len(articles_df)}')
            # print(f'ARTICLES DATAFRAME :{articles_df}')
            #+query abstract	web_url	source	headline['main']	keywords	pub_date	document_type	news_desk	section_name	type_of_material
            articles_df['query'] = q
            articles_df = articles_df.loc[:,~articles_df.columns.isin(['snippet','lead_paragraph','multimedia','byline','_id','word_count','uri','print_section','print_page','subsection_name'])]
            # articles_df.drop(['snippet','lead_paragraph','multimedia','byline','_id','word_count','uri','print_section','print_page','subsection_name'],axis=1)
            print(f'@@@@{q}@@@@@@AFTER DROP COLUMNS ARTICLES DATAFRAME INFO :{len(articles_df)}')
            articles_df_with_query = pd.concat([articles_df_with_query,articles_df])
            # print(f'ARTICLES DATAFRAME INFO :{articles_df_with_query.info()}')
        print(f'@@@@FINAL@@@@@@AFTER DROP COLUMNS ARTICLES DATAFRAME INFO :{len(articles_df_with_query)}')
        return articles_df_with_query
        
    def nytimes_query(self,query):
        # news_desk_list = ['Business','Business Day','Technology']
        # type_of_material_list=['News','Article']
        #fq=news_desk:("Sports","Business") AND glocations:("NEW YORK CITY")
        # Set the base url for the query
        base_url = f'https://api.nytimes.com/svc/search/v2/articlesearch.json?q={query}'
        base_url+= '&fq=news_desk:(\"Business\",\"Business Day\",\"Technology\") AND news_type:(\"News\",\"Article\")'
        print(f'BASE QUERY : {base_url}')
        return base_url


In [181]:
# Main
query_list = ['Microsoft','Tesla']
begin_date = '20201201'
end_date='20201231'

nyt_scraper = NYT_SCRAPER(api_key=API_KEY,query=query_list,
                          begin_date= begin_date,end_date= end_date)

tmp = nyt_scraper.requests_query()
tmp



# date_index = pd.date_range(start=begin_date,end=end_date)
# date_list = date_index.strftime("%Y%m%d").to_list()
# date_list

{'version': '6.0.0', 'versionArray': [6, 0, 0, 0], 'bits': 64, 'maxBsonObjectSize': 16777216, 'ok': 1.0}
{'version': '6.0.0', 'versionArray': [6, 0, 0, 0], 'bits': 64, 'maxBsonObjectSize': 16777216, 'ok': 1.0}
REQUEST QUERY : https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Microsoft&fq=news_desk:("Business","Business Day","Technology")&begin_date=20201201&end_date=20201231&page=0&api-key=YPrMAd93hm7pmTqscIQVlnGZiv1EwSOI
LENGTH OF ARTICLES LIST ::::::::Microsoft::::::::::10
REQUEST QUERY : https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Microsoft&fq=news_desk:("Business","Business Day","Technology")&begin_date=20201201&end_date=20201231&page=1&api-key=YPrMAd93hm7pmTqscIQVlnGZiv1EwSOI
LENGTH OF ARTICLES LIST ::::::::Microsoft::::::::::20
REQUEST QUERY : https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Microsoft&fq=news_desk:("Business","Business Day","Technology")&begin_date=20201201&end_date=20201231&page=2&api-key=YPrMAd93hm7pmTqscIQVlnGZiv1EwSOI
LENGTH 

Unnamed: 0,query,abstract,web_url,source,headline,keywords,pub_date,document_type,news_desk,section_name,type_of_material
0,Microsoft,The hackers gained more access than the compan...,https://www.nytimes.com/2020/12/31/technology/...,The New York Times,{'main': 'Microsoft Says Russian Hackers Viewe...,"[{'name': 'organizations', 'value': 'Microsoft...",2020-12-31T18:02:02+0000,article,Business,Technology,News
1,Microsoft,The Salesforce C.E.O.’s planned acquisition of...,https://www.nytimes.com/2020/12/02/technology/...,The New York Times,{'main': 'Marc Benioff Sets His Sights on Micr...,"[{'name': 'subject', 'value': 'Computers and t...",2020-12-02T14:22:23+0000,article,Business,Technology,News
2,Microsoft,As regulators seek ways to curb the company’s ...,https://www.nytimes.com/2020/12/14/technology/...,The New York Times,{'main': 'Google Dominates Thanks to an Unriva...,"[{'name': 'subject', 'value': 'Search Engines'...",2020-12-14T10:00:12+0000,article,Business,Technology,News
3,Microsoft,Silicon Valley is building a powerful influenc...,https://www.nytimes.com/2020/12/14/technology/...,The New York Times,{'main': 'Big Tech Turns Its Lobbyists Loose o...,"[{'name': 'subject', 'value': 'Computers and t...",2020-12-14T08:00:13+0000,article,Business,Technology,News
4,Microsoft,The move caps an acquisitive streak by Salesfo...,https://www.nytimes.com/2020/12/01/technology/...,The New York Times,{'main': 'Salesforce to Acquire Slack for $27....,"[{'name': 'subject', 'value': 'Computers and t...",2020-12-01T21:10:57+0000,article,Business,Technology,News
5,Microsoft,More than 30 states said that the company down...,https://www.nytimes.com/2020/12/17/technology/...,The New York Times,{'main': 'Google’s Legal Peril Grows in Face o...,"[{'name': 'organizations', 'value': 'Google In...",2020-12-17T17:55:37+0000,article,Business,Technology,News
6,Microsoft,The person writing the check is still the most...,https://www.nytimes.com/2020/12/02/technology/...,The New York Times,"{'main': 'Why Your Workplace Software Stinks',...","[{'name': 'organizations', 'value': 'Salesforc...",2020-12-02T18:28:48+0000,article,Business,Technology,News
7,Microsoft,Since the release of the highly anticipated Cy...,https://www.nytimes.com/2020/12/21/business/a-...,The New York Times,{'main': 'A much-hyped video game has turned i...,[],2020-12-21T10:55:06+0000,article,Business,Business Day,News
8,Microsoft,Amazon’s cloud computing business and Apple’s ...,https://www.nytimes.com/2020/12/01/technology/...,The New York Times,{'main': 'Amazon and Apple Are Powering a Shif...,"[{'name': 'subject', 'value': 'Computer Chips'...",2020-12-01T16:00:10+0000,article,Business,Technology,News
9,Microsoft,A group of experts gathered to debate the pros...,https://www.nytimes.com/2020/12/14/business/de...,The New York Times,{'main': 'How Climate Policy Will Change in 20...,"[{'name': 'subject', 'value': 'Global Warming'...",2020-12-14T11:00:04+0000,article,Business,Business Day,News


In [183]:
tmp = tmp.to_dict('records')
len(tmp)

39

In [8]:
db = AzureCosmosConnector()
db.db_connect()
a =db.collection


{'version': '6.0.0', 'versionArray': [6, 0, 0, 0], 'bits': 64, 'maxBsonObjectSize': 16777216, 'ok': 1.0}
{'version': '6.0.0', 'versionArray': [6, 0, 0, 0], 'bits': 64, 'maxBsonObjectSize': 16777216, 'ok': 1.0}


In [184]:
for row in tmp:
    a.insert_one(row)
    time.sleep(0.1)

In [185]:
count = a.count_documents({ "query": "Tesla" }) + a.count_documents({ "query": "Microsoft" })
count

2949

In [187]:
pre_count = count

In [186]:
if count == (pre_count + len(tmp)):
    print('True')
else:
    print('False')
    

True


272
312

In [37]:
articles_df_with_query = pd.DataFrame(columns=['query','abstract','web_url','source','headline','keywords','pub_date','document_type','news_desk','section_name','type_of_material'])
articles_df_with_query

Unnamed: 0,query,abstract,web_url,source,headline,keywords,pub_date,document_type,news_desk,section_name,type_of_material


In [28]:
#Tesla,Microsoft
# Business Day,Business, Technology
# begin_date = '20210101' end_date = '20240628'
query = scraper.nytimes_query(api_key= API_KEY,query='Nvidia',news_desk='Business',begin_date='20220101',end_date='20220131')
# query = scraper.nytimes_query(API_KEY,query='Microsoft',news_desk=["Business Day","Business", "Technology"],begin_date='20220102',end_date='20220102')

query

{'news_desk': 'Business'}
{'begin_date': '20220101'}
{'begin_date': '20220101', 'end_date': '20220131'}
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Nvidia&fq=news_desk:("Business")
https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Nvidia&fq=news_desk:("Business")&begin_date=20220101&end_date=20220131&
Final query : https://api.nytimes.com/svc/search/v2/articlesearch.json?q=Nvidia&fq=news_desk:("Business")&begin_date=20220101&end_date=20220131&&page=0&api-key=YPrMAd93hm7pmTqscIQVlnGZiv1EwSOI


[{'abstract': 'Some big events are delaying their return to in-person gatherings, but the organizers of CES, the Winter Olympics and the Australian Open have decided it’s time to gather in person again.',
  'web_url': 'https://www.nytimes.com/2022/01/05/business/ces-omicron.html',
  'snippet': 'Some big events are delaying their return to in-person gatherings, but the organizers of CES, the Winter Olympics and the Australian Open have decided it’s time to gather in person again.',
  'lead_paragraph': 'After going virtual last year, CES, a mega-conference in Las Vegas that’s the traditional launchpad for many of the tech industry’s latest gadgets, is trying to make a comeback. The trade show kicks off on Wednesday, with an estimated 2,200 exhibitors set to show up in person.',
  'source': 'The New York Times',
  'multimedia': [{'rank': 0,
    'subtype': 'xlarge',
    'caption': None,
    'credit': None,
    'type': 'image',
    'url': 'images/2022/01/05/multimedia/05db-newsletter-ces/05

In [29]:
tmp = pd.DataFrame(query)
tmp

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,byline,type_of_material,_id,word_count,uri,print_section,print_page,subsection_name
0,Some big events are delaying their return to i...,https://www.nytimes.com/2022/01/05/business/ce...,Some big events are delaying their return to i...,"After going virtual last year, CES, a mega-con...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Is it time for big in-person events ...,"[{'name': 'subject', 'value': 'Coronavirus (20...",2022-01-05T14:01:29+0000,article,Business,Business Day,"{'original': 'By Michael J. de la Merced', 'pe...",News,nyt://article/093785c6-b590-572f-8a85-5da3fda5...,363,nyt://article/093785c6-b590-572f-8a85-5da3fda5...,,,
1,"After a record year of transactions in 2021, d...",https://www.nytimes.com/2022/01/08/business/de...,"After a record year of transactions in 2021, d...",The DealBook newsletter delves into a single t...,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'What’s Next for the Deal Boom?', 'ki...","[{'name': 'subject', 'value': 'Mergers, Acquis...",2022-01-08T13:00:04+0000,article,Business,Business Day,"{'original': 'By Michael J. de la Merced', 'pe...",News,nyt://article/58c56b6f-f600-5014-bb6a-96945b80...,917,nyt://article/58c56b6f-f600-5014-bb6a-96945b80...,B,6.0,DealBook
2,"Stocks recovered after a sharp drop, but they ...",https://www.nytimes.com/2022/01/25/business/de...,"Stocks recovered after a sharp drop, but they ...","So far, 2022 is off to a rocky start for the s...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'Was the Market Sell-Off Overdone?', ...","[{'name': 'subject', 'value': 'United States E...",2022-01-25T12:40:52+0000,article,Business,Business Day,"{'original': 'By Andrew Ross Sorkin, Jason Kar...",News,nyt://article/8fcc334e-daf3-597d-ad85-1d290731...,1799,nyt://article/8fcc334e-daf3-597d-ad85-1d290731...,,,DealBook
3,"Organizers insist that the conference, which f...",https://www.nytimes.com/2022/01/05/business/de...,"Organizers insist that the conference, which f...","After going virtual last year, the Consumer El...",The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'CES Tests an Appetite for Live Event...,"[{'name': 'subject', 'value': 'Labor and Jobs'...",2022-01-05T12:48:04+0000,article,Business,Business Day,"{'original': 'By Andrew Ross Sorkin, Jason Kar...",News,nyt://article/60b61d97-56df-5899-a1ea-185db71e...,1802,nyt://article/60b61d97-56df-5899-a1ea-185db71e...,,,DealBook
4,The exit of SoftBank’s chief operating officer...,https://www.nytimes.com/2022/01/28/business/de...,The exit of SoftBank’s chief operating officer...,SoftBank said today that Marcelo Claure was st...,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",{'main': 'Marcelo Claure Leaves Masa Son’s Orb...,"[{'name': 'organizations', 'value': 'SOFTBANK ...",2022-01-28T12:42:52+0000,article,Business,Business Day,"{'original': 'By Andrew Ross Sorkin, Jason Kar...",News,nyt://article/227dd2c8-95bd-55ab-a8b4-2cd5c3b0...,1799,nyt://article/227dd2c8-95bd-55ab-a8b4-2cd5c3b0...,,,DealBook


'headline': {'main': 'Is it time for big in-person events again? Organizers are divided.',
   'kicker': None,
   'content_kicker': None,
   'print_headline': None,
   'name': None,
   'seo': None,
   'sub': None},