# Tìm hiểu và làm ví dụ về Scrapy  

## Đáp ứng điều kiện cài đặt

In [13]:
# Identify the platform this notebook runs on

import platform
from IPython.core.interactiveshell import InteractiveShell

# Notebook setting: display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Show the Python version of the running kernel
platform.python_version()

'3.11.7'

## Cài đặt và import 

In [2]:
# Cài đặt và import scrapy 

try:
    import scrapy
except:
    !pip install scrapy
    import scrapy
from scrapy.crawler import CrawlerProcess

## Thiết lập một đường ống pipeline  
  
Lớp này tạo một pipeline đơn giản, ghi tất cả các mục (item) tìm thấy vào một tệp JSON Lines, trong đó mỗi dòng chứa một đối tượng JSON.

In [3]:
import json

class JsonWriterPipeline:
    """Item pipeline that writes every scraped item to ``quoteresult.jl``.

    The output is in JSON Lines form: one JSON object per line. The file is
    opened when the spider starts and closed when the spider finishes.
    """

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts.

        Args:
            spider: The spider being opened (unused).
        """
        # Explicit encoding so the result does not depend on the platform's
        # default locale encoding.
        self.file = open('quoteresult.jl', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes.

        Args:
            spider: The spider being closed (unused).
        """
        self.file.close()

    def process_item(self, item, spider):
        """Serialize one item as a single JSON line and append it to the file.

        Args:
            item: The scraped item; anything accepted by ``dict()``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged, so it can continue through any
            later pipeline stages.
        """
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

## Define the spider  
  
Lớp QuotesSpider xác định các URL bắt đầu thu thập dữ liệu và những giá trị cần trích xuất. Mức ghi log của trình thu thập được đặt thành WARNING, nếu không notebook sẽ bị ngập trong các thông báo DEBUG về dữ liệu được truy xuất.

In [4]:
import logging

class QuotesSpider(scrapy.Spider):
    """Spider that scrapes quotes from the first two pages of quotes.toscrape.com.

    Every quote is yielded as a plain dict and written to two outputs:
    ``quoteresult.jl`` (through the ``JsonWriterPipeline`` item pipeline) and
    ``quoteresult.json`` (through Scrapy's feed export).
    """

    name = "quotes"
    start_urls = [
        'http://quotes.toscrape.com/page/1/',
        'http://quotes.toscrape.com/page/2/',
    ]
    custom_settings = {
        # Only warnings and above, so the notebook output is not flooded with
        # per-item DEBUG messages.
        'LOG_LEVEL': logging.WARNING,
        # Output 1: custom item pipeline defined in this notebook.
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # Output 2: built-in feed export. FEEDS replaces the deprecated
        # FEED_FORMAT / FEED_URI pair. 'overwrite' makes a re-run replace the
        # file; appending a second JSON array would leave it unparseable.
        'FEEDS': {
            'quoteresult.json': {
                'format': 'json',
                'overwrite': True,
            },
        },
    }

    def parse(self, response):
        """Extract every quote found on a downloaded page.

        Args:
            response: The response for one of ``start_urls``.

        Yields:
            dict: One dict per ``div.quote`` element with the keys ``text``
            and ``author`` (first matching text node, or ``None`` when there
            is no match) and ``tags`` (list of all matching tag texts).
        """
        for quote in response.css('div.quote'):
            yield {
                # .get() / .getall() are the current names of
                # .extract_first() / .extract().
                'text': quote.css('span.text::text').get(),
                'author': quote.css('span small::text').get(),
                'tags': quote.css('div.tags a.tag::text').getall(),
            }



## Start the crawler 

In [5]:
# Build a crawler process whose requests carry a browser-like User-Agent header.
process = CrawlerProcess(
    settings={'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)

# Schedule the spider, then run the crawl.
process.crawl(QuotesSpider)
process.start()

2024-02-28 15:55:31 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: scrapybot)
2024-02-28 15:55:31 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.12.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.11.7 (main, Jan 29 2024, 16:03:57) [GCC 13.2.1 20230801], pyOpenSSL 24.0.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.7, Platform Linux-6.7.4-zen1-1-zen-x86_64-with-glibc2.39


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

  exporter = cls(crawler)



<Deferred at 0x715f3c2b3d10>

## Kiểm tra và hiện ra kết quả thu được 

In [6]:
ll quoteresult.*

-rwxrwxrwx 1 root 5551 Feb 28 15:55 [0m[01;32mquoteresult.jl[0m*
-rwxrwxrwx 1 root 5573 Feb 28 15:55 [01;32mquoteresult.json[0m*


In [7]:
!tail -n 2 quoteresult.jl

{"text": "\u201cGood friends, good books, and a sleepy conscience: this is the ideal life.\u201d", "author": "Mark Twain", "tags": ["books", "contentment", "friends", "friendship", "life"]}
{"text": "\u201cLife is what happens to us while we are making other plans.\u201d", "author": "Allen Saunders", "tags": ["fate", "life", "misattributed-john-lennon", "planning", "plans"]}


In [8]:
!tail -n 2 quoteresult.json

{"text": "\u201cLife is what happens to us while we are making other plans.\u201d", "author": "Allen Saunders", "tags": ["fate", "life", "misattributed-john-lennon", "planning", "plans"]}
]

In [9]:
import pandas as pd
# Load the feed-export file (quoteresult.json) into a DataFrame and display it.
dfjson = pd.read_json('quoteresult.json')
dfjson

Unnamed: 0,text,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
5,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
6,“It is better to be hated for what you are tha...,André Gide,"[life, love]"
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison,"[edison, failure, inspirational, paraphrased]"
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt,[misattributed-eleanor-roosevelt]
9,"“A day without sunshine is like, you know, nig...",Steve Martin,"[humor, obvious, simile]"


In [10]:
# Load the pipeline's output file; lines=True parses one JSON object per line.
dfjl = pd.read_json('quoteresult.jl', lines=True)
dfjl

Unnamed: 0,text,author,tags
0,“The world as we have created it is a process ...,Albert Einstein,"[change, deep-thoughts, thinking, world]"
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling,"[abilities, choices]"
2,“There are only two ways to live your life. On...,Albert Einstein,"[inspirational, life, live, miracle, miracles]"
3,"“The person, be it gentleman or lady, who has ...",Jane Austen,"[aliteracy, books, classic, humor]"
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe,"[be-yourself, inspirational]"
5,“Try not to become a man of success. Rather be...,Albert Einstein,"[adulthood, success, value]"
6,“It is better to be hated for what you are tha...,André Gide,"[life, love]"
7,"“I have not failed. I've just found 10,000 way...",Thomas A. Edison,"[edison, failure, inspirational, paraphrased]"
8,“A woman is like a tea bag; you never know how...,Eleanor Roosevelt,[misattributed-eleanor-roosevelt]
9,"“A day without sunshine is like, you know, nig...",Steve Martin,"[humor, obvious, simile]"


In [11]:
# Persist both DataFrames as pickle files in the current working directory.
dfjson.to_pickle('quotejson.pickle')
dfjl.to_pickle('quotejl.pickle')

In [12]:
ll *pickle

-rwxrwxrwx 1 root 5454 Feb 28 15:57 [0m[01;32mquotejl.pickle[0m*
-rwxrwxrwx 1 root 5454 Feb 28 15:57 [01;32mquotejson.pickle[0m*


# Thử nghiệm việc sử dụng Scraper để crawl data làm project nhóm

Do nhóm em đã có sẵn nguồn dữ liệu tải về từ trước nên dự án sẽ không cần dùng đến Scrapy.  
Tuy nhiên, trong khuôn khổ bài thực hành này, nhóm em sẽ thử crawl ảnh (loại dữ liệu mà dự án của nhóm em sẽ sử dụng).

### Cách 1.

In [14]:
!pip install ImageScraper



In [15]:
!image-scraper --max-images 10 'https://vnexpress.net/'


ImageScraper
Requesting page....

Found 2 images: 
Progress: 100% ||||||||||||||||||||||||||||||||||||||| Time: 00:00:00   1.82 K/s

Done!
Downloaded 2 images
Failed: 0



Vậy là đã tải về được 2 ảnh từ trang web.

### Cách 2. Vẫn dùng Scrapy

In [16]:
!scrapy startproject oral_cancer_images

New Scrapy project 'oral_cancer_images', using template directory '/home/harito/venv/py/lib/python3.11/site-packages/scrapy/templates/project', created in:
    /mnt/DataK/Univer/UniSubject/_3th_year/_2nd_term/3ii_DM/Lec_Ass/oral_cancer_images

You can start your first spider with:
    cd oral_cancer_images
    scrapy genspider example example.com


In [17]:
%cd oral_cancer_images/oral_cancer_images/spiders/

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]



/mnt/DataK/Univer/UniSubject/_3th_year/_2nd_term/3ii_DM/Lec_Ass/oral_cancer_images/oral_cancer_images/spiders


In [18]:
!scrapy genspider images_spider "https://oralcancerfoundation.org/dental/oral-cancer-images"

Created spider 'images_spider' using template 'basic' in module:
  oral_cancer_images.spiders.images_spider


In [19]:
!scrapy crawl images_spider -o output.json

2024-02-28 16:27:51 [scrapy.utils.log] INFO: Scrapy 2.11.1 started (bot: oral_cancer_images)
2024-02-28 16:27:51 [scrapy.utils.log] INFO: Versions: lxml 5.1.0.0, libxml2 2.12.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 23.10.0, Python 3.11.7 (main, Jan 29 2024, 16:03:57) [GCC 13.2.1 20230801], pyOpenSSL 24.0.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.7, Platform Linux-6.7.4-zen1-1-zen-x86_64-with-glibc2.39
2024-02-28 16:27:51 [scrapy.addons] INFO: Enabled addons:
[]
2024-02-28 16:27:51 [asyncio] DEBUG: Using selector: EpollSelector
2024-02-28 16:27:51 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.asyncioreactor.AsyncioSelectorReactor
2024-02-28 16:27:51 [scrapy.utils.log] DEBUG: Using asyncio event loop: asyncio.unix_events._UnixSelectorEventLoop
2024-02-28 16:27:51 [scrapy.extensions.telnet] INFO: Telnet Password: d51179d5211b3bbd
2024-02-28 16:27:52 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions