In [1]:
import json
import os
import pickle

import requests
import scrapy
from scrapy import Selector
from scrapy.http import HtmlResponse

from importlib import reload
import sys
sys.path.append('/home/nur/projects/analysis/range_matching')
import imdb_parser

### HW [Link](https://www.imdb.com/search/name/?gender=male%2Cfemale&ref_=nv_cel_m)

In [None]:
DEBUG_URL = 'https://www.imdb.com/title/tt11041332/fullcredits?ref_=tt_cl_sm#cast'

# Fetch the page once so XPath expressions can be tried interactively.
# The timeout keeps a stalled connection from hanging the kernel.
req = requests.get(DEBUG_URL, timeout=30)
# Fail here on an HTTP error rather than querying an error page below.
req.raise_for_status()
response = HtmlResponse(url=DEBUG_URL, body=req.content)

selector = Selector(response=response)

In [None]:
# Same title expression that ImdbSpider.parse_cast evaluates, tried against
# the debug selector built in the previous cell.
title_xpath = (
    ".//div[@class='subpage_title_block__right-column']"
    "/div[@class='parent']/h3/a/text()"
)
raw_title = selector.xpath(title_xpath).extract_first()
res = raw_title.strip()
res

### Actual crawler

In [2]:
import logging

# Silence Scrapy's own logger entirely ...
scrapy_logger = logging.getLogger('scrapy')
scrapy_logger.disabled = True

# ... and let only warnings and above through the root logger.
root_logger = logging.getLogger()
root_logger.setLevel(logging.WARNING)

class ActorItem(scrapy.Item):
    """Scraped record for one actor page, filled in by ``ImdbSpider.parse_actor``."""
    # Stripped text of the name-overview header on the actor page.
    name = scrapy.Field()
    # Value of the first ``<time datetime=...>`` attribute on the page
    # (presumably the birth date -- confirm against the page layout).
    born = scrapy.Field()
    # Up to 15 stripped titles from the filmography section.
    movies = scrapy.Field()
    # Actor page URL, passed through the request's ``meta``.
    url = scrapy.Field()
    # Bio text fragments joined with spaces, cut at the "See full bio" marker.
    bio = scrapy.Field()

class MovieItem(scrapy.Item):
    """Scraped record for one movie's full-credits page, filled in by ``ImdbSpider.parse_cast``."""
    # Movie URL, passed through the request's ``meta``.
    url = scrapy.Field()
    # Stripped title text from the credits page header.
    title = scrapy.Field()
    # Stripped link texts from the second column of the ``cast_list`` table.
    cast = scrapy.Field()

# Crawl mode switch, read by ImdbSpider.parse and by the feed-export settings:
#   True  -> request actor pages and emit ActorItem records (actors.json);
#   False -> collect each actor's filmography and emit MovieItem records (movies.json).
IS_ACTOR = False
    
class ImdbSpider(scrapy.Spider):
    """Crawl IMDb's name search page and emit either actor or movie records.

    The module-level ``IS_ACTOR`` flag selects the mode:

    * ``True``  -- every actor linked from the search page is requested and
      turned into an ``ActorItem`` by :meth:`parse_actor`.
    * ``False`` -- the filmography links of every listed actor are collected
      by :meth:`gen_movie` (cached on disk), and each movie's ``fullcredits``
      page is turned into a ``MovieItem`` by :meth:`parse_cast`.
    """

    name = 'imdb'
    allowed_domains = ["imdb.com"]
    base_url = "https://www.imdb.com"
    start_urls = ["https://www.imdb.com/search/name/?gender=male%2Cfemale&ref_=nv_cel_m"]

    # Pickle file caching the movie URL list, so the blocking filmography
    # scrape in gen_movie only has to run once.
    movie_cache_path = 'gen_movie.pkl'
    # Timeout in seconds for the blocking ``requests`` calls in gen_movie.
    request_timeout = 30

    def parse(self, response):
        """Handle the search result page and schedule the follow-up requests.

        Yields one ``scrapy.Request`` per actor (actor mode) or per movie
        (movie mode). The URL stored in ``meta['url']`` is copied into the
        item by the callback.
        """
        url_parts = response.xpath(".//*[@class='lister-item-content']/h3[@class='lister-item-header']"
                                   "/a/@href").extract()
        actor_url_list = [self.base_url + x for x in url_parts]
        if IS_ACTOR:
            for actor_url in actor_url_list:
                yield scrapy.Request(actor_url,
                                     callback=self.parse_actor,
                                     meta={'url': actor_url + "/"})
        else:
            for movie_url in self._load_or_build_movie_urls(actor_url_list):
                yield scrapy.Request(movie_url + 'fullcredits',
                                     callback=self.parse_cast,
                                     meta={'url': movie_url})

    def _load_or_build_movie_urls(self, actor_url_list):
        """Return the movie URL list, reading the on-disk cache when it exists.

        On a cache miss the list is built with :meth:`gen_movie` and written
        to ``movie_cache_path``. The file is only written after the whole
        list has been collected, so a failed scrape never leaves a partial
        cache behind.
        """
        if os.path.exists(self.movie_cache_path):
            # NOTE(security): pickle.load can execute arbitrary code from the
            # file. Acceptable only because this cache is written locally by
            # this spider; never point movie_cache_path at an untrusted file.
            with open(self.movie_cache_path, 'rb') as f:
                return pickle.load(f)

        movie_url_list = list(self.gen_movie(actor_url_list))
        with open(self.movie_cache_path, 'wb') as f:
            pickle.dump(movie_url_list, f)
        return movie_url_list

    def gen_movie(self, actor_url_list):
        """Yield absolute movie URLs found in the filmography of each actor page.

        ``actor_url_list`` must hold absolute URLs, because each page is
        fetched synchronously with ``requests``. An href already seen on an
        earlier actor page is not yielded again; the comparison is made on
        the raw href strings.
        """
        already_parsed = set()
        for actor_url in actor_url_list:
            page = requests.get(actor_url, timeout=self.request_timeout)
            response = HtmlResponse(url=actor_url, body=page.content)
            selector = Selector(response=response)

            movie_url_list = selector.xpath(".//div[@class='filmo-category-section']"
                                        "/div/b/a/@href").extract()
            movie_url_list = set(movie_url_list) - already_parsed
            for movie_url in movie_url_list:
                yield self.base_url + movie_url
            already_parsed |= movie_url_list

    def parse_cast(self, response):
        """Build a ``MovieItem`` from a movie's ``fullcredits`` page.

        Raises ``AttributeError`` when the title node is missing, because
        ``extract_first()`` then returns ``None``.
        """
        item = MovieItem()

        item['url'] = response.meta['url']
        item['title'] = response.xpath(".//div[@class='subpage_title_block__right-column']"
                    "/div[@class='parent']/h3/a/text()").extract_first().strip()
        res = response.xpath(".//table[@class='cast_list']//tr/td[2]/a/text()").extract()
        item['cast'] = list(map(str.strip, res))
        return item

    def parse_actor(self, response):
        """Build an ``ActorItem`` from an actor page.

        Raises ``AttributeError`` when the name node is missing, because
        ``extract_first()`` then returns ``None``.
        """
        item = ActorItem()

        item['name'] = response.xpath(".//td[@class='name-overview-widget__section']"
                                     "/h1/span/text()").extract_first().strip()

        bio_part = response.xpath(".//div[@class='name-trivia-bio-text']/div/"
                    "/descendant-or-self::text()").extract()
        # Keep the text fragments that precede the "See full bio" link text.
        stop_word = "See full bio"
        bio_str_list = []
        for r in bio_part:
            if r == stop_word:
                break
            bio_str_list.append(r)
        item['bio'] = " ".join(bio_str_list).strip()

        item['born'] = response.xpath(".//time/@datetime").extract_first()

        item['url'] = response.meta['url']

        movie_list = response.xpath(".//div[@class='filmo-category-section']/div/b/a/text()").extract()
        # Only the first 15 filmography entries are kept.
        item['movies'] = list(map(str.strip, movie_list[:15]))

        return item

In [None]:
# Smoke-test gen_movie on a single actor page. gen_movie fetches each URL
# with requests, so it needs an absolute URL, as parse() passes it.
r = ImdbSpider()
for s in r.gen_movie([ImdbSpider.base_url + '/name/nm3480246/']):
    print(s)

In [None]:
# NOTE(review): the crawl cell in this notebook exports to actors.json or
# movies.json, not items.json -- confirm this cleanup targets the intended file.
!rm -rf items.json

In [3]:
from scrapy.crawler import CrawlerProcess

# One feed file per crawl mode, matching the item type the spider emits.
feed_path = "actors.json" if IS_ACTOR else "movies.json"

process = CrawlerProcess(
    settings={
        "FEEDS": {
            # Local-file feeds append by default, and appending a second JSON
            # array to an existing file makes it unparseable. Overwrite it so
            # re-running the crawl always leaves valid JSON.
            feed_path: {"format": "json", "overwrite": True}
        }
    }
)
process.crawl(ImdbSpider)
# Blocks until the crawl has finished.
process.start()

2021-06-01 23:24:27 [scrapy.utils.log] INFO: Scrapy 2.5.0 started (bot: scrapybot)
2021-06-01 23:24:27 [scrapy.utils.log] INFO: Versions: lxml 4.6.3.0, libxml2 2.9.10, cssselect 1.1.0, parsel 1.6.0, w3lib 1.22.0, Twisted 21.2.0, Python 3.7.10 (default, May  3 2021, 02:48:31) - [GCC 7.5.0], pyOpenSSL 20.0.1 (OpenSSL 1.1.1k  25 Mar 2021), cryptography 3.4.7, Platform Linux-5.4.72-microsoft-standard-WSL2-x86_64-with-Ubuntu-18.04-bionic
2021-06-01 23:24:27 [scrapy.utils.log] DEBUG: Using reactor: twisted.internet.epollreactor.EPollReactor
2021-06-01 23:24:27 [scrapy.crawler] INFO: Overridden settings:
{}
2021-06-01 23:24:27 [scrapy.extensions.telnet] INFO: Telnet Password: 00c2fcfbf8ad7ef9
2021-06-01 23:24:27 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.feedexport.FeedExporter',
 'scrapy.extensions.logstats.LogStats']
2021-06-01 23:24:27 [scra

In [None]:
# Inspect the feed for the current crawl mode, using the same file name rule
# as the export settings (actors.json in actor mode, movies.json otherwise).
feed_path = 'actors.json' if IS_ACTOR else 'movies.json'
with open(feed_path, 'r') as f:
    res = json.load(f)
res[0].keys()

In [None]:
# NOTE(review): 'bio' is an ActorItem field; MovieItem records carry only
# url/title/cast, so this lookup presumably expects `res` to hold actor
# records rather than the contents of movies.json -- confirm.
res[0]['bio']

Второй файл
1. Ссылка на страницу фильма (ключ url). https://www.imdb.com/ в начале и закрывающий / в конце обязательны для прохождения проверки.
1. Название фильма (ключ title). 
1. Список актёров (ключ cast) - List[str] с перечислением имён (чтобы они вязались с п.4 предыдущего списка).

### Legacy

In [None]:
class ImdbSpider(scrapy.Spider):
    """Legacy spider: scrape title, year and rating from the IMDb Top-250 chart.

    This class reuses the name ``ImdbSpider``, so running this cell rebinds
    the name in the kernel.
    """

    name = "imdb"
    allowed_domains = ["imdb.com"]
    start_urls = ['http://www.imdb.com/chart/top']

    def parse(self, response):
        """Yield one ``{"title", "year", "rating"}`` dict per chart row."""
        # Select the table rows that describe the top-rated movies.
        rows_xpath = (
            './/*[@class="chart full-width" and @data-caller-name="chart-top250movie"]/'
            'tbody[@class="lister-list"]/tr'
        )

        for row in response.xpath(rows_xpath):
            # Three columns are parsed for now.
            title_text = row.xpath("./td[@class='titleColumn']/a/text()")
            year_text = row.xpath("./td[@class='titleColumn']/span/text()")
            rating_text = row.xpath("./td[@class='ratingColumn imdbRating']/strong/text()")

            yield {
                "title": title_text.extract_first(),
                # Strip surrounding parentheses and spaces from the year text.
                "year": year_text.extract_first().strip("() "),
                "rating": rating_text.extract_first(),
            }

In [None]:
DEBUG_URL = 'http://www.imdb.com/chart/top'

# Fetch the chart page once so XPath expressions can be tried interactively.
# The timeout keeps a stalled connection from hanging the kernel.
req = requests.get(DEBUG_URL, timeout=30)
# Fail here on an HTTP error rather than querying an error page below.
req.raise_for_status()
response = HtmlResponse(url=DEBUG_URL, body=req.content)

selector = Selector(response=response)

In [None]:
# Show the first matching <tbody> of the Top-250 chart table.
chart_body_xpath = (
    './/*[@class="chart full-width" and @data-caller-name="chart-top250movie"]'
    '/tbody[@class="lister-list"]'
)
chart_bodies = selector.xpath(chart_body_xpath)
chart_bodies[0].extract()

In [None]:
# Show the first <tr> of the Top-250 chart table body.
chart_row_xpath = (
    './/*[@class="chart full-width" and @data-caller-name="chart-top250movie"]'
    '/tbody[@class="lister-list"]/tr'
)
chart_rows = selector.xpath(chart_row_xpath)
chart_rows[0].extract()

In [None]:
from scrapy.crawler import CrawlerProcess

In [None]:
# Run whichever ImdbSpider class is currently bound in the kernel, using
# default settings (no FEEDS are configured in this cell).
process = CrawlerProcess()

process.crawl(ImdbSpider)
process.start()