# Scrapy


### About

Scrapy is an application framework for crawling web sites and extracting structured data which can be used for a wide range of useful applications, like data mining, information processing or historical archival.

##  

In [35]:
# #Installation
# #pip install scrapy

##  

In [36]:
# #Simple Scrapy Project - http://scrapy.readthedocs.org/en/latest/intro/tutorial.html

# #Domain to scrape - http://www.dmoz.org/

In [37]:
# #01 - Creating a new Scrapy Project

# #Start a new Scrapy Project - "my_web_crawler"
# $ scrapy startproject my_web_crawler

# #This will create a new folder - "my_web_crawler"
# #The directory will contain : 
# #my_web_crawler/

# #    scrapy.cfg        #Configuration File
# #    my_web_crawler/   #Project Module
# #        __init__.py
# #        items.py         
# #        pipelines.py      
# #        settings.py   
# #        spiders/      #Directory containing Spiders

In [38]:
# #02 - Defining our item

# #Items are containers that will be loaded with the scraped data. They work like Python dictionaries.
# #Python File - items.py
import scrapy

class DmozItem(scrapy.Item):
    """Container for one scraped record, declaring three fields.

    Each attribute below is declared as a ``scrapy.Field()``; values are
    expected to be read and written with dict-style access
    (e.g. ``item['title']``).
    """
    # NOTE(review): the field meanings below are inferred from their names
    # only -- confirm against the spider that populates them.
    title = scrapy.Field()  # presumably the listing's title
    link = scrapy.Field()  # presumably the listing's URL
    desc = scrapy.Field()  # presumably the listing's description

In [39]:
# #03 - Our first Spider
# #Spiders contain the initial list of URLs to download, which links to follow & how to parse the extracted data.
# #Python File under the spiders/ directory - dmoz_spiders.py
import scrapy

class DmozSpider(scrapy.Spider):
    """Spider that saves the raw body of each downloaded page to a local file.

    It starts from the two dmoz.org category pages listed in ``start_urls``
    and writes every response handed to ``parse`` to ``<segment>.html``,
    where ``<segment>`` is the last path segment of the response URL.
    """
    # #Unique Name for each Spider
    name = "dmoz"
    allowed_domains = ["dmoz.org"]
    # #Spiders start crawling from these URLs & then successively crawl the other 
    # #extracted URLs.
    start_urls = [
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Books/",
        "http://www.dmoz.org/Computers/Programming/Languages/Python/Resources/"
    ]

    @staticmethod
    def _filename_for(url):
        """Return the output filename for ``url``.

        The name is the last non-empty ``/``-separated segment of the URL,
        ignoring any ``?query`` or ``#fragment``, with ``.html`` appended.
        A URL ending in ``/`` and the same URL without it map to the same
        name (``.../Books/`` and ``.../Books`` both give ``Books.html``).
        A bare host URL yields ``<host>.html``; ``index.html`` is the last
        resort when no segment can be extracted at all.
        """
        # #Drop the fragment and query first so slashes inside them are not
        # #mistaken for path separators.
        without_suffix = url.split("#", 1)[0].split("?", 1)[0]
        # #Strip trailing slashes so the last segment is never empty.
        segment = without_suffix.rstrip("/").split("/")[-1]
        return (segment or "index") + '.html'

    # #Parsing the Response data & extracting the scraped data.
    def parse(self, response):
        """Write ``response.body`` to a file named after ``response.url``.

        The file is opened in binary mode relative to the current working
        directory and is overwritten if it already exists.
        """
        filename = self._filename_for(response.url)
        with open(filename, 'wb') as f:
            f.write(response.body)

In [40]:
# #04 - Crawling
# #From the project's root directory, run

# $ scrapy crawl dmoz
# #This command runs the Spider with the name dmoz