# Scraping websites to populate MongoDB

In [1]:
import requests
from pymongo import MongoClient
import numpy as np
from scrapy import Selector
import scrapy
from scrapy.crawler import CrawlerProcess

from random import randint
from time import sleep

In [2]:
# Connect to MongoDB using MongoClient's default connection settings and
# select the 'webScrape' database that the rest of the notebook writes to.
client = MongoClient()
db = client['webScrape']

In [3]:
# Empty both target collections (an empty filter matches every document) so
# that re-running the notebook does not accumulate documents from earlier runs.
db['slashdot'].delete_many({})
db['reuters'].delete_many({})

<pymongo.results.DeleteResult at 0x20ff74a0dc8>

# Slashdot

In [4]:
# Download the Slashdot front page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops the notebook
# on an HTTP error instead of silently parsing an error page.
front_page = requests.get('https://slashdot.org/', timeout=30)
front_page.raise_for_status()
response_ = front_page.content
sel = Selector(text = response_)

# Summary text (all nested text flattened by string(.)) of each story body.
body_text_sd = [r.xpath('string(.)').extract_first()
                for r in sel.xpath('//article[@data-fhtype = "story"]//div[@class = "body"]/div/i')]

# Headline text of each story.
title_sd = sel.xpath('//span[@class = "story-title"]/a/text()').extract()

# Links for the idx-th story article (CSS :nth-of-type is 1-based, hence idx + 1).
# The selector is evaluated once per story; an empty result is replaced by the
# ['no hyperlink'] placeholder so every story still gets an entry.
hyperlinks = []
for idx in range(len(title_sd)):
    story_links = sel.css(f'article[data-fhtype = "story"]:nth-of-type({idx + 1}) a[class^="story"]::attr(href)').extract()
    hyperlinks.append(story_links or ['no hyperlink'])

In [5]:
# Assemble one MongoDB document per Slashdot headline. Summaries and links are
# looked up by position, so a shorter sibling list still raises IndexError
# rather than being silently truncated.
collection = []
for pos, headline in enumerate(title_sd):
    collection.append({
        'title': headline,
        'summary': body_text_sd[pos],
        'hyperlink': hyperlinks[pos],
    })

In [6]:
# Bulk-insert every assembled Slashdot story document into the 'slashdot' collection.
db['slashdot'].insert_many(collection)

<pymongo.results.InsertManyResult at 0x20ff73c9408>

In [7]:
# Sanity check: read one stored document back to confirm the insert worked.
db['slashdot'].find_one()

{'_id': ObjectId('5fac23910d0f85e18d9eb032'),
 'title': "Disaster 'Prepping' Was Once an American Pastime. Today, It's Mainstream Again.",
 'summary': ' There\'s a reason "preppers," people who plan for the worst-case scenario, like to talk about the zombie apocalypse. The idea of an army of walking dead swarming the country pervades their thoughts because, says Roman Zrazhevskiy, "If you prepare as if a zombie apocalypse is going to happen, you have all the bases covered." That means: an escape route, medical supplies, a few weeks\' worth of food. Zrazhevskiy has been thinking about this for decades. He was born in Russia a few months after the nuclear meltdown at Chernobyl. At the dinner table, his family often talked about the disaster and what went wrong. Then, after they relocated to New York, Zrazhevskiy stood on the waterfront outside his Brooklyn high school on September 11, 2001, and watched the World Trade Center towers collapse. Even then, he had a small go-bag prepared with

# Reuters

In [8]:
# Accumulator for the Reuters crawl: despite the name this is a list. It is
# handed to the spider, receives one dict per scraped story, and is later
# passed to insert_many().
dc_dict = []

In [9]:
class reuters_spider(scrapy.Spider):
    """Collect Reuters UK technology stories into a caller-supplied list.

    The spider requests two technology listing pages, follows every story
    link found on them, and appends one dict per story page (keys 'title',
    'author', 'hyperlink', 'story_text') to the list given at construction.

    Parameters
    ----------
    dc_dict : list
        List that receives the scraped story dicts. It is mutated in place.
    *args, **kwargs
        Forwarded unchanged to ``scrapy.Spider.__init__``.
    """

    name = 'reuters'

    # Politeness delay handled by Scrapy's downloader rather than time.sleep():
    # sleeping inside a callback blocks the Twisted reactor and stalls every
    # in-flight request. With randomisation on, Scrapy waits between 0.5x and
    # 1.5x DOWNLOAD_DELAY (1.25-3.75 s here) between requests to the same site.
    custom_settings = {
        'DOWNLOAD_DELAY': 2.5,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
    }

    def __init__(self, dc_dict, *args, **kwargs):
        """Run the base Spider initialisation and remember the output list."""
        super().__init__(*args, **kwargs)
        self.dc_dict = dc_dict

    def start_requests(self):
        """Yield one request per technology listing page."""
        urls = ['https://uk.reuters.com/news/technology/',
                'https://uk.reuters.com/news/archive/technologynews?view=page&page=2&pageSize=10']
        for url in urls:
            yield scrapy.Request(url = url, callback = self.parse_main_page)

    def parse_main_page(self, response):
        """Follow every story link found on a listing page."""
        story_hrefs = response.css('article.story > div.story-content > a::attr(href)').extract()
        for href in story_hrefs:
            # response.follow resolves relative hrefs against the page URL, so
            # no manual host prefix is needed and absolute hrefs work as well.
            yield response.follow(url = href, callback = self.parse_story_page)

    def parse_story_page(self, response):
        """Extract headline, author, URL and body text from a story page."""
        body_of_text = ''.join(response.css('p[class^="Paragraph"]::text').extract())
        titles_reut = response.css('h1[class^="Headline"]::text').extract()
        link = response.request.url
        author = response.css('a[class*=author]::text').extract()

        document = {'title': titles_reut,
                    'author': author,
                    'hyperlink': link,
                    'story_text': body_of_text}
        # Append to the list handed to __init__, not to a same-named global.
        self.dc_dict.append(document)

In [10]:
# Create the Scrapy crawler process; no settings overrides are passed here.
process = CrawlerProcess()

2020-11-11 17:46:57 [scrapy.utils.log] INFO: Scrapy 2.3.0 started (bot: scrapybot)
2020-11-11 17:46:57 [scrapy.utils.log] INFO: Versions: lxml 4.5.0.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 20.3.0, Python 3.7.6 (default, Jan  8 2020, 20:23:39) [MSC v.1916 64 bit (AMD64)], pyOpenSSL 19.1.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.8, Platform Windows-10-10.0.19041-SP0
2020-11-11 17:46:57 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.selectreactor.SelectReactor


In [11]:
# Schedule the Reuters spider; dc_dict is forwarded to the spider's __init__.
# This only registers the crawl -- pages are fetched once process.start() runs.
process.crawl(reuters_spider, dc_dict)

2020-11-11 17:46:57 [scrapy.crawler] INFO: Overridden settings:
{}
2020-11-11 17:46:57 [scrapy.extensions.telnet] INFO: Telnet Password: e25cd827c4cb2ea5
2020-11-11 17:46:57 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.logstats.LogStats']
2020-11-11 17:46:57 [scrapy.middleware] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
 'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
 'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
 'scrapy.downloadermiddlewares.retry.RetryMiddleware',
 'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
 'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
 'scrapy.downloadermiddlewares.redirect.RedirectMiddleware',
 'scrapy.downloadermiddlewares.cookies.CookiesMidd

<Deferred at 0x20ff76da148>

In [12]:
# Run the scheduled crawl; this call blocks until the spider has finished.
# NOTE(review): Twisted's reactor is not restartable, so re-running this cell
# in the same kernel is expected to fail -- restart the kernel first.
process.start()

2020-11-11 17:46:59 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET http://uk.reuters.com/news/technology> from <GET https://uk.reuters.com/news/technology/>
2020-11-11 17:46:59 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://uk.reuters.com/news/archive/technologynews?view=page&page=2&pageSize=10> (referer: None)
2020-11-11 17:47:02 [scrapy.downloadermiddlewares.redirect] DEBUG: Redirecting (301) to <GET https://uk.reuters.com/news/technology> from <GET http://uk.reuters.com/news/technology>
2020-11-11 17:47:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://uk.reuters.com/article/us-autos-electric-bmw/bmw-unveils-electric-suv-to-challenge-tesla-plans-u-s-launch-in-early-2022-idUKKBN27R1T6> (referer: https://uk.reuters.com/news/archive/technologynews?view=page&page=2&pageSize=10)
2020-11-11 17:47:09 [scrapy.core.engine] DEBUG: Crawled (200) <GET https://uk.reuters.com/news/technology> (referer: None)
2020-11-11 17:47:15 [scrapy.core.engine] DEBUG:

In [13]:
# Bulk-insert the story documents gathered by the spider into the 'reuters' collection.
db['reuters'].insert_many(dc_dict)

<pymongo.results.InsertManyResult at 0x20ff79f76c8>

In [14]:
# Sanity check: read one stored Reuters document back to confirm the insert worked.
db['reuters'].find_one()

{'_id': ObjectId('5fac23c30d0f85e18d9eb041'),
 'title': ['BMW unveils electric SUV to challenge Tesla, plans U.S. launch in early 2022'],
 'author': ['Joseph White'],
 'hyperlink': 'https://uk.reuters.com/article/us-autos-electric-bmw/bmw-unveils-electric-suv-to-challenge-tesla-plans-u-s-launch-in-early-2022-idUKKBN27R1T6',
 'story_text': 'BMW said the iX should have a driving range of 300 miles (480 km). That’s less than the estimated driving range of the Tesla Model X Long Range, which is rated at 371 miles in the United States. BMW said drivers would be able to add 75 miles of range in ten minutes at a fast-charging station.BMW said the iX would be comparable in size to the current BMW X5 SUV. The dashboard will be a sweeping, curved screen.The electric iX will enter a fast-growing field of battery-powered SUVs aimed at affluent customers.'}