In [1]:
import scrapy
import requests
import pandas as pd
from sqlalchemy import create_engine
from scrapy.http import HtmlResponse
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
import os

In [2]:
class NWSpider(scrapy.Spider):
    """Scrape Network World cloud-computing article cards into Postgres.

    Cards are ``<a>`` elements carrying the exact class token ``card``. For
    each card whose date text contains ``month_filter`` (case-insensitive),
    a record with ``name``, ``link``, ``date`` and ``source`` is collected,
    and the accumulated records are written to the ``raw`` table.

    Raises:
        RuntimeError: at construction time if the environment variable
            ``POSTGRES_CONNECTION_STRING`` is missing or empty.
    """
    name = 'network_world_spider'
    # Case-insensitive substring a card's date must contain to be kept.
    # Override on a subclass to collect a different month.
    month_filter = 'dec'
    # Column order of the persisted table; passing it explicitly keeps the
    # table schema stable even when no card matched.
    columns = ['name', 'link', 'date', 'source']

    def __init__(self, *args, **kwargs):
        # Forward arguments so Scrapy can pass spider arguments through
        # ``process.crawl(NWSpider, ...)`` without a TypeError.
        super().__init__(*args, **kwargs)
        conn_url = os.environ.get('POSTGRES_CONNECTION_STRING')
        if not conn_url:
            # Without this guard the f-string below would build the URL
            # 'Nonecloudnewsdb' and fail with an unrelated parse error.
            raise RuntimeError(
                'Environment variable POSTGRES_CONNECTION_STRING is not set'
            )
        # The database name is appended directly, so the variable presumably
        # ends with a trailing '/' -- TODO confirm against the deployment.
        self.engine = create_engine(f'{conn_url}cloudnewsdb')
        self.data = []

    def start_requests(self):
        """Yield the initial request(s); each response is handled by ``parse``."""
        urls = ['https://www.networkworld.com/cloud-computing/']
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Collect matching cards from ``response`` and persist all records.

        Cards missing a date, a title or a link are skipped rather than
        aborting the whole page. The ``raw`` table is replaced with every
        record accumulated so far in ``self.data``.
        """
        wanted = self.month_filter.lower()
        cards = response.xpath(
            "//a[@class and contains(concat(' ', normalize-space(@class), ' '), ' card ')]"
        )
        for a in cards:
            # ``.get()`` returns None when nothing matched, where
            # ``.extract()[0]`` would raise IndexError.
            date = a.xpath('div[contains (@class, "card__info--light")]/span[1]/text()').get()
            title = a.xpath('h4/text()').get()
            link = a.xpath('@href').get()
            if not (date and title and link):
                continue
            if wanted in date.lower():
                self.data.append({
                    # Titles can carry surrounding whitespace/newlines.
                    'name': title.strip(),
                    'link': link,
                    'date': date.strip(),
                    'source': 'Network World'
                })
        df = pd.DataFrame(self.data, columns=self.columns)
        df.to_sql('raw', self.engine, if_exists='replace')

In [3]:
# Run the spider in-process: build a crawler process with default settings,
# schedule NWSpider on it, then start crawling.
# NOTE(review): the Twisted reactor presumably cannot be started twice in one
# kernel, so re-running this cell likely requires a kernel restart -- confirm.
process = CrawlerProcess()
process.crawl(NWSpider)
process.start()

2024-01-01 14:18:19 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
2024-01-01 14:18:19 [scrapy.utils.log] INFO: Versions: lxml 4.9.2.0, libxml2 2.9.12, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.11.1 (tags/v3.11.1:a7a450f, Dec  6 2022, 19:58:39) [MSC v.1934 64 bit (AMD64)], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.7, Platform Windows-10-10.0.19045-SP0
2024-01-01 14:18:19 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-01-01 14:18:19 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor
2024-01-01 14:18:19 [scrapy.extensions.telnet] INFO: Telnet Password: aac3e0567fb8e333
2024-01-01 14:18:19 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.lo

In [2]:
# Fetch the listing page once so the XPath expressions can be explored
# interactively without running the spider.
URL = 'https://www.networkworld.com/cloud-computing/'
# A timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as content.
page_response = requests.get(URL, timeout=30)
page_response.raise_for_status()
html = page_response.content
sel = scrapy.Selector(text=html)

In [6]:
# List the href of every anchor whose class attribute contains the substring
# "card" (a looser match than the exact-token test used further down).
sel.xpath('//a[contains(@class, "card")]/@href').getall()

['https://www.networkworld.com/article/1250120/startup-oxide-computing-seeks-to-put-the-cloud-back-in-on-prem-private-clouds.html',
 'https://www.networkworld.com/article/1251939/ibm-cloud-service-aims-to-deliver-secure-multicloud-connectivity.html',
 'https://www.networkworld.com/article/1251281/eu-approves-1-3b-in-aid-for-cloud-edge-computing.html',
 'https://www.networkworld.com/article/1250044/mainframe-modernization-gets-a-boost-from-kyndryl-aws-collaboration.html',
 'https://www.networkworld.com/article/1249451/cisco-aws-further-integrate-cloud-management-capabilities.html',
 'https://www.networkworld.com/article/1247908/alibaba-scraps-plans-to-hive-off-cloud-business-amid-growing-chip-uncertainties.html',
 'https://www.networkworld.com/article/1247268/microsofts-maia-ai-azure-cobalt-chips-to-rev-up-efficiency-performance.html',
 'https://www.networkworld.com/article/1247134/cloud-management-skills-gap-drives-hybrid-cloud-adoption.html',
 'https://www.networkworld.com/article/957

In [7]:
# Prototype of the spider's extraction logic against the fetched page:
# keep only cards whose date text mentions December.
data = []
for a in sel.xpath("//a[@class and contains(concat(' ', normalize-space(@class), ' '), ' card ')]"):
    # ``.get()`` yields None for a missing node, where ``.extract()[0]``
    # would raise IndexError and abort the whole loop.
    date = a.xpath('div[contains (@class, "card__info--light")]/span[1]/text()').get()
    title = a.xpath('h4/text()').get()
    link = a.xpath('@href').get()
    if not (date and title and link):
        continue
    if 'dec' in date.lower():
        data.append({
            # Some titles come back with a trailing newline; strip it.
            'name': title.strip(),
            'link': link,
            'date': date.strip()
        })
data

[{'name': "Oxide puts the 'cloud' back in on-prem private clouds",
  'link': 'https://www.networkworld.com/article/1250120/startup-oxide-computing-seeks-to-put-the-cloud-back-in-on-prem-private-clouds.html',
  'date': 'Dec 12, 2023'},
 {'name': 'IBM cloud service aims to deliver secure, multicloud connectivity',
  'link': 'https://www.networkworld.com/article/1251939/ibm-cloud-service-aims-to-deliver-secure-multicloud-connectivity.html',
  'date': 'Dec 07, 2023'},
 {'name': 'EU approves $1.3B in aid for cloud, edge computing\n',
  'link': 'https://www.networkworld.com/article/1251281/eu-approves-1-3b-in-aid-for-cloud-edge-computing.html',
  'date': 'Dec 05, 2023'},
 {'name': 'BEYOND THE WALL The story of six VFX studios behind the epic Game of Thrones',
  'link': 'https://us.resources.networkworld.com/resources/beyond-the-wall-the-story-of-six-vfx-studios-behind-the-epic-game-of-thrones-3?utm_source=rss-feed&utm_medium=rss&utm_campaign=feed',
  'date': '15 Dec 2023'},
 {'name': '6 Secr

In [9]:
# Tabular preview of the collected records (one row per article card).
pd.DataFrame(data=data)

Unnamed: 0,name,link,date
0,Oxide puts the 'cloud' back in on-prem private...,https://www.networkworld.com/article/1250120/s...,"Dec 12, 2023"
1,"IBM cloud service aims to deliver secure, mult...",https://www.networkworld.com/article/1251939/i...,"Dec 07, 2023"
2,"EU approves $1.3B in aid for cloud, edge compu...",https://www.networkworld.com/article/1251281/e...,"Dec 05, 2023"
3,BEYOND THE WALL The story of six VFX studios b...,https://us.resources.networkworld.com/resource...,15 Dec 2023
4,6 Secrets to Successful Cloud-Based Workflows,https://us.resources.networkworld.com/resource...,15 Dec 2023
5,Why observability with business insights is cr...,https://us.resources.networkworld.com/resource...,15 Dec 2023
