# Import scrapy

In [1]:
import scrapy
from scrapy.crawler import CrawlerProcess

# Set up the pipeline

In [2]:
import os
import json
class JsonPipeline(object):
    """Item pipeline that stores each item as one JSON object per line.

    Items are written to ``./data1/songresult.jsonl``. The file is opened
    in ``'w'`` mode, so any previous content is truncated on every run.
    """

    def open_spider(self, spider):
        """Ensure the output directory exists and open the output file.

        Args:
            spider: The spider being opened (not used here).
        """
        # exist_ok=True replaces the exists()/mkdir() pair, which could fail
        # if the directory appeared between the check and the creation.
        os.makedirs('./data1', exist_ok=True)
        # Pin the encoding so the file does not depend on the platform default.
        self.file = open('./data1/songresult.jsonl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file opened by ``open_spider``.

        Args:
            spider: The spider being closed (not used here).
        """
        self.file.close()

    def process_item(self, item, spider):
        """Append ``item`` to the output file as a single JSON line.

        Args:
            item: The scraped item; converted with ``dict(item)`` before
                serialization.
            spider: The spider that produced the item (not used here).

        Returns:
            The same ``item`` object, unchanged.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

# Define the spider

In [3]:
import logging

class SongSpider(scrapy.Spider):
    """Spider for the Billboard Hot 100 chart page.

    Every chart row is yielded as a dict with the keys ``rank``, ``song``,
    ``singer``, ``peak`` and ``duration``. The spider's settings send items
    through ``__main__.JsonPipeline`` and also configure a JSON feed export
    to ``./data1/songresult.json``.
    """

    name = "song"
    start_urls = ['https://www.billboard.com/charts/hot-100']
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonPipeline': 1},
        'FEED_FORMAT': 'json',
        'FEED_URI': './data1/songresult.json',
    }

    def parse(self, response):
        """Yield one dict per chart row selected from ``response``.

        Args:
            response: The downloaded chart page.

        Yields:
            dict: Output key mapped to the first text match of its selector
            within the row.
        """
        # (output key, CSS selector evaluated relative to one chart row),
        # listed in the order the keys appear in each yielded dict.
        # NOTE(review): 'duration' is read from the element carrying the
        # ``text--week`` class -- presumably weeks on chart; confirm.
        field_selectors = (
            ('rank', 'button span span.chart-element__rank__number::text'),
            ('song', 'button span span.chart-element__information__song.text--truncate.color--primary::text'),
            ('singer', 'button span span.chart-element__information__artist.text--truncate.color--secondary::text'),
            ('peak', 'button span span.chart-element__meta.text--center.color--secondary.text--peak::text'),
            ('duration', 'button span span.chart-element__meta.text--center.color--secondary.text--week::text'),
        )

        chart_rows = response.css('li.chart-list__element.display--flex')
        for row in chart_rows:
            yield {
                key: row.css(selector).extract_first()
                for key, selector in field_selectors
            }

# Start the crawler

In [4]:
# Process-wide settings: every request is sent with this User-Agent string.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}
process = CrawlerProcess(crawler_settings)

# Register the spider with the process, then start the crawl.
# NOTE(review): re-running this cell in the same kernel is likely to fail
# because the underlying reactor cannot be restarted -- confirm; restart the
# kernel before crawling again.
process.crawl(SongSpider)
process.start()


2020-02-18 21:48:56 [scrapy.utils.log] INFO: Scrapy 1.8.0 started (bot: scrapybot)
2020-02-18 21:48:56 [scrapy.utils.log] INFO: Versions: lxml 4.4.1.0, libxml2 2.9.9, cssselect 1.1.0, parsel 1.5.2, w3lib 1.21.0, Twisted 19.10.0, Python 3.7.4 (default, Aug 13 2019, 15:17:50) - [Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1d  10 Sep 2019), cryptography 2.7, Platform Darwin-18.7.0-x86_64-i386-64bit
2020-02-18 21:48:56 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': './data1/songresult.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}


# Check the files

In [5]:
# List the output files written to ./data1 by the crawl.
# An explicit shell escape is used instead of the `ll` alias so the cell does
# not depend on IPython automagic or on platform-specific aliases.
!ls -l ./data1/songresult.*

-rw-r--r--  1 lfs  staff  10078  2 18 21:48 ./data1/songresult.json
-rw-r--r--  1 lfs  staff   9976  2 18 21:48 ./data1/songresult.jsonl


In [6]:
# Show the last two lines of the JSON Lines file written by JsonPipeline.
!tail -n 2 ./data1/songresult.jsonl

{"rank": "99", "song": "Camelot", "singer": "NLE Choppa", "peak": "37", "duration": "20"}
{"rank": "100", "song": "ORANGE SODA", "singer": "Baby Keem", "peak": "98", "duration": "2"}


In [7]:
# Show the last two lines of the JSON feed export configured via FEED_URI.
!tail -n 2 ./data1/songresult.json

{"rank": "100", "song": "ORANGE SODA", "singer": "Baby Keem", "peak": "98", "duration": "2"}
]

# Database to store the data

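A relational database is a good fit for this data. Every scraped record has the same fields (rank, song, singer, peak, duration), and these fields are related to each other, so the data fits naturally in a table with one row per song and one column per field. In addition, information can be retrieved through related data, for example by looking up a song's rank from its title or listing all songs by a given singer.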