# In [1]:
import csv
import threading
from queue import Empty, Queue

import requests
from lxml import etree

# Scheme and host of the site being scraped.
BASE_DOMAIN = 'https://www.dytt8.net'

# Request headers carrying a desktop Chrome User-Agent string. The value is
# split across adjacent literals purely for line length; it is one string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.129 Safari/537.36'
    ),
}

class PageProducer(threading.Thread):
    """Worker thread that expands list-page URLs into detail-page URLs.

    Each worker keeps taking list-page URLs from ``page_queue``, downloads
    the page and puts every link found inside the ``tbspan`` tables onto
    ``detail_queue``. The thread finishes once ``page_queue`` has no URL
    left to hand out.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
    }
    # Seconds to wait on the server per request. requests applies no timeout
    # by default, so without one a stalled connection would block the worker.
    timeout = 10

    def __init__(self, page_queue, detail_queue, *args, **kwargs):
        """Store the queues; remaining arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of list-page URLs to download.
            detail_queue: queue that receives the detail-page URLs found.
        """
        super(PageProducer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.detail_queue = detail_queue

    def run(self):
        """Process list pages until ``page_queue`` is exhausted."""
        while True:
            try:
                # Take the URL without blocking. Checking empty() and then
                # calling a blocking get() lets another worker grab the last
                # URL in between, leaving this thread waiting forever.
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                self.get_detail_urls(url)
            except requests.RequestException as exc:
                # One unreachable page must not take the whole worker down.
                print('Failed to fetch list page {}: {}'.format(url, exc))

    def get_detail_urls(self, url):
        """Download one list page and queue the detail URLs it links to.

        Args:
            url: absolute URL of a list page.

        Raises:
            requests.RequestException: if the download fails or times out.
        """
        response = requests.get(url, headers=self.headers, timeout=self.timeout)
        # response.text comes out garbled for this site, so the raw bytes are
        # decoded as GBK explicitly. The page's encoding can be read from its
        # source, or by evaluating document.charset in the browser console.
        text = response.content.decode('gbk', errors='ignore')
        html = etree.HTML(text)
        # The hrefs are prefixed with the site's domain to make them absolute.
        for href in html.xpath("//table[@class='tbspan']//a/@href"):
            self.detail_queue.put(BASE_DOMAIN + href)

class MiddlePC(threading.Thread):
    """Worker thread that turns detail-page URLs into movie dicts.

    It consumes URLs from ``detail_queue``, downloads and parses each detail
    page, and puts the resulting dict onto ``movie_queue``. ``page_queue`` is
    only inspected to decide when no more detail URLs can be expected.
    """

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
    }
    # Seconds to wait on the server per request. requests applies no timeout
    # by default, so without one a stalled connection would block the worker.
    timeout = 10
    # Seconds to wait for a new detail URL before checking whether the
    # upstream page queue has drained.
    poll_timeout = 3

    # Label prefix of a text node -> key under which its value is stored.
    # NOTE: 'catergory' is misspelled but is a runtime key, so it is kept.
    _SIMPLE_FIELDS = (
        ('◎年　　代', 'year'),
        ('◎产　　地', 'country'),
        ('◎类　　别', 'catergory'),
        ('◎豆瓣评分', 'score'),
        ('◎片　　长', 'duration'),
        ('◎导　　演', 'director'),
    )
    _ACTORS_LABEL = '◎主　　演'
    _INTRO_LABEL = '◎简　　介'

    def __init__(self, page_queue, detail_queue, movie_queue, *args, **kwargs):
        """Store the queues; remaining arguments go to ``threading.Thread``.

        Args:
            page_queue: queue of list-page URLs (read-only termination hint).
            detail_queue: queue of detail-page URLs to download and parse.
            movie_queue: queue that receives one dict per parsed movie.
        """
        super(MiddlePC, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.detail_queue = detail_queue
        self.movie_queue = movie_queue

    def run(self):
        """Parse detail pages until both input queues have run dry.

        The worker stops when no detail URL arrived within ``poll_timeout``
        seconds and ``page_queue`` is empty. A list page that is still being
        downloaded for longer than that after ``page_queue`` emptied can
        still be missed; closing that gap needs an explicit stop signal.
        """
        while True:
            try:
                # A timed get replaces the empty()-then-get() sequence, which
                # could block forever when a sibling took the last URL.
                detail_url = self.detail_queue.get(timeout=self.poll_timeout)
            except Empty:
                if self.page_queue.empty():
                    break
                continue
            try:
                self.parse_detail_page(detail_url)
            except requests.RequestException as exc:
                # One unreachable page must not take the whole worker down.
                print('Failed to fetch detail page {}: {}'.format(detail_url, exc))

    @staticmethod
    def _first(items, default=''):
        """Return the first element of ``items``, or ``default`` if empty."""
        return items[0] if items else default

    def _parse_infos(self, infos):
        """Extract the labelled fields from the text nodes of the Zoom div.

        Args:
            infos: list of text-node strings in document order.

        Returns:
            dict with whichever of the keys ``year``, ``country``,
            ``catergory``, ``score``, ``duration``, ``director``, ``actors``
            and ``introduce`` were found. A label that occurs more than once
            keeps its last value.
        """
        fields = {}
        for index, info in enumerate(infos):
            if info.startswith(self._ACTORS_LABEL):
                # The first actor shares the label's node; the rest follow in
                # separate nodes until the next "◎" label starts.
                actors = [info.replace(self._ACTORS_LABEL, '').strip()]
                for follower in infos[index + 1:]:
                    actor = follower.strip()
                    if actor.startswith('◎'):
                        break
                    if actor:  # skip whitespace-only nodes
                        actors.append(actor)
                fields['actors'] = actors
            elif info.startswith(self._INTRO_LABEL):
                # The synopsis is the node after the label, if there is one.
                if index + 1 < len(infos):
                    fields['introduce'] = infos[index + 1].strip()
            else:
                for label, key in self._SIMPLE_FIELDS:
                    if info.startswith(label):
                        fields[key] = info.replace(label, '').strip()
                        break
        return fields

    def parse_detail_page(self, detail_url):
        """Download one detail page and queue the movie dict parsed from it.

        Elements that cannot be found (title, cover image, download link)
        are stored as empty strings instead of aborting the page.

        Args:
            detail_url: absolute URL of a movie detail page.

        Raises:
            requests.RequestException: if the download fails or times out.
        """
        response = requests.get(detail_url, headers=self.headers, timeout=self.timeout)
        # Undecodable bytes are dropped rather than raising, matching how the
        # list pages are decoded.
        text = response.content.decode('gbk', errors='ignore')
        html = etree.HTML(text)

        movie = {}
        movie['title'] = self._first(
            html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")
        )

        zoom_nodes = html.xpath("//div[@id='Zoom']")
        if zoom_nodes:
            zoom = zoom_nodes[0]
            # The first image in the Zoom div is used as the cover.
            movie['cover'] = self._first(zoom.xpath(".//img/@src"))
            movie.update(self._parse_infos(zoom.xpath(".//text()")))
        else:
            movie['cover'] = ''

        movie['download_url'] = self._first(
            html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
        )
        self.movie_queue.put(movie)


class Consumer(threading.Thread):
    """Worker thread that prints every movie dict taken from ``movie_queue``.

    A ``None`` item is a stop signal: the thread that receives it exits
    without printing it. Without such a signal the thread waits on the queue
    indefinitely.
    """

    def __init__(self, movie_queue, detail_queue, *args, **kwargs):
        """Store the queues; remaining arguments go to ``threading.Thread``.

        Args:
            movie_queue: queue of parsed movie dicts to print.
            detail_queue: stored on the instance but not read by this class.
        """
        super(Consumer, self).__init__(*args, **kwargs)
        self.movie_queue = movie_queue
        self.detail_queue = detail_queue

    def run(self):
        """Print movies until a ``None`` stop signal is received."""
        while True:
            m = self.movie_queue.get()
            if m is None:
                break
            print(m)


def main():
    """Run the three-stage crawl and return once every movie is printed.

    List pages 1 through 29 are queued, then 10 list-page workers, 10
    detail-page workers and 5 printing workers are started. After the first
    two stages have finished, one ``None`` stop signal per printing worker
    is queued so that those threads (and with them the process) can exit.
    """
    page_queue = Queue(100)
    detail_queue = Queue(100)
    movie_queue = Queue(500)
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'

    # Queue every list page.
    for x in range(1, 30):
        page_queue.put(base_url.format(x))

    producers = [PageProducer(page_queue, detail_queue) for _ in range(10)]
    parsers = [
        MiddlePC(page_queue, detail_queue, movie_queue) for _ in range(10)
    ]
    consumers = [Consumer(movie_queue, detail_queue) for _ in range(5)]

    for t in producers + parsers + consumers:
        t.start()

    # Once both upstream stages are done, no further movie can be queued, so
    # the stop signals are guaranteed to sit behind every real item.
    for t in producers + parsers:
        t.join()
    for _ in consumers:
        movie_queue.put(None)
    for t in consumers:
        t.join()

# Start the crawl only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()