In [1]:
import requests
import threading
import pymongo
import random
import logging
import json
import time
import re
from queue import Queue
from bs4 import BeautifulSoup

In [2]:
class Mongo():
    
    def __init__(self):
        self._client = pymongo.MongoClient("mongodb://localhost:27017/")
        self._db = None
        self._collection = None
    
    def insert(self, doc):
        result = None
        if type(doc) is dict:
            if self.find_one(doc) is not None:
                result = self._collection.update(doc,doc,True)
        else:
            new_doc = list()
            for d in doc:
                if self.find_one(d) is not None:
                    new_doc.append(d)
            if len(new_doc) > 0:
                result = self._collection.insert_many(new_doc)
        return result

    def get_collection(self):
        return self._collection
    
    def find_one(self, doc=None):
        if doc is not None:
            result = self._collection.find_one(doc)
        else:
            result = self._collection.find_one()
        return result
    
    def find(self, doc=None):
        if doc is not None:
            result = self._collection.find(doc)
        else:
            result = self._collection.find()
        return result
    
    def count(self, doc=None):
        if doc is not None:
            count = self._collection.find(doc).count()
        else:
            count = self._collection.find().count()
        return count

In [3]:
class MongoDouban(Mongo):
    __instance_lock = threading.Lock()
    __init_flag = False
    
    def __init__(self):
        if self.__init_flag is False:
            super(MongoDouban, self).__init__()
            self._db = self._client["douban"]
            self.__init_flag = True
    
    def __new__(cls, *args, **kwargs):
        if not hasattr(MongoDouban, "_instance"):
            with MongoDouban.__instance_lock:
                if not hasattr(MongoDouban, "_instance"): 
                    MongoDouban._instance = object.__new__(cls)
        return MongoDouban._instance

In [4]:
class DoubanMovieCollection(MongoDouban):
    
    def __init__(self):
        super(DoubanMovieCollection, self).__init__()
        self._collection = self._db['movie']

In [5]:
def run_time(func):
    def wrapper(*args, **kw):
        start = time.time()
        result = func(*args, **kw)
        end = time.time()
        print('running', end-start, 's')
        return result
    return wrapper

In [6]:
class UserAgent():
    __instance_lock = threading.Lock()
    __init_flag = False
    
    def __init__(self):
        if self.__init_flag is False:
            print('init UserAgent')
            self.__agents_pool = list()
            with open('./UserAgents/useragents.txt','r') as read_ob:
                for line in read_ob.readlines():
                    self.__agents_pool.append(line.strip())
            self.__init_flag = True
    
    def __new__(cls, *args, **kwargs):
        if not hasattr(UserAgent, "_instance"):
            print('new UserAgent')
            with UserAgent.__instance_lock:
                if not hasattr(UserAgent, "_instance"): 
                    UserAgent._instance = object.__new__(cls)
        return UserAgent._instance
    
    def get_useragent_randomly(self):
        return random.choice(self.__agents_pool)
    

In [7]:
class Crawl():

    def __init__(self):
        self._session = None
        self._headers = {
            'User-Agent':UserAgent().get_useragent_randomly(),
            'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':'zh-cn',
            'Connection': 'keep-alive',
            'Accept-Encoding': 'gzip, deflate',
        }
        self._proxies = [
            {"http":"112.111.217.114:9999"},
            {"http":"180.118.128.118:9000"},
            {"http":"171.11.29.217:9999"},
            {"http":"120.83.109.191:9999"}
        ]
        
    def request_get(self, url, **kwargs):
        logging.info('scraping {}...'.format(url))
        try:
            response = requests.get(url, headers=self._headers, proxies=random.choice(self._proxies), **kwargs)
            if response.status_code == 200:
                return response
            logging.error('get invalid status code %s while scraping %s', response.status_code, url)
        except requests.RequestException:
            logging.error('error occurred while scraping %s', url, exc_info=True)
        else:
            logging.info('scraping {} finished'.format(url))
     
    def request_post(self, url, **kwargs):
        logging.info('scraping {}...'.format(url))
        try:
            response = requests.post(url, headers=self._headers, proxies=random.choice(self._proxies), **kwargs)
            if response.status_code == 200:
                return response
            logging.error('get invalid status code %s while scraping %s', response.status_code, url)
        except requests.RequestException:
            logging.error('error occurred while scraping %s', url, exc_info=True)
        else:
            logging.info('scraping {} finished'.format(url))
    
    def session_get(self, url, **kwargs):
        self.check_session()
        logging.info('scraping {}...'.format(url))
        try:
            response = self._session.get(url, headers=self._headers, proxies=random.choice(self._proxies), **kwargs)
            if response.status_code == 200:
                return response
            logging.error('get invalid status code %s while scraping %s', response.status_code, url)
        except requests.RequestException:
            logging.error('error occurred while scraping %s', url, exc_info=True)
        else:
            logging.info('scraping {} finished'.format(url))
    
    def session_post(self, url, **kwargs):
        self.check_session()
        logging.info('scraping {}...'.format(url))
        try:
            response = self._session.post(url, headers=self._headers, proxies=random.choice(self._proxies), **kwargs)
            if response.status_code == 200:
                return response
            logging.error('get invalid status code %s while scraping %s', response.status_code, url)
        except requests.RequestException:
            logging.error('error occurred while scraping %s', url, exc_info=True)
        else:
            logging.info('scraping {} finished'.format(url)) 

    def get_session(self):
        return self._session
    
    def set_session(self, session):
        self._session = session
        return True
    
    def save_cookies(self, user):
        cookieJar = requests.cookies.RequestsCookieJar()
        for cookie in user.cookies:
            cookieJar.set(cookie.name,cookie.value)
        for cookie in user.headers['Set-Cookie'].split(";"):
            key = cookie.split('=')[0]
            value = cookie.split('=')[1]
            cookieJar.set(key,value)
        return cookieJar
    
    def check_session(self):
        if self._session == None:
            self._session = requests.session()
        return
    
    def add_header(self, headers):
        for key, value in headers.items():
            self._headers[key] = value


In [9]:
class DoubanUser(Crawl):
    __instance_lock = threading.Lock()
    __init_flag = False
       
    def __init__(self):
        if self.__init_flag is False:
            super(DoubanUser, self).__init__()
            self.__user = None
    #         self._cookies = None
            self.__login()
            self.__init_flag = True

    def __new__(cls, *args, **kwargs):
        if not hasattr(DoubanUser, "_instance"):
            print('new Douban')
            with DoubanUser.__instance_lock:
                if not hasattr(Douban, "_instance"): 
                    DoubanUser._instance = object.__new__(cls)
        return DoubanUser._instance
    
    def __login(self):
        post_data = {
            'name':'18664678368',
            'password':'LJC970412',
            'remember':'false'
        }
        self.add_header({
            "Referer":'https://accounts.douban.com/passport/login'
        })
        user = self.session_post('https://accounts.douban.com/j/mobile/login/basic', data=post_data)
        login_detail = json.loads(user.text)
        if login_detail['status'] == 'success':
            print('login success!')
#             self.__cookies = self.save_cookies(user)
        else:
            print('login failed!')
        self.__user =  user.text
    
    def get_user_info(self):
        return self.__user

In [10]:
class Douban(Crawl):
    
    def __init__(self, douban_user):
        super(Douban, self).__init__()
        self.set_session(douban_user.get_session())
    
    def search(self, query, cat=''):
        res = dict()
        params = {
            'q':query,
            'cat':cat
        }
        count=0
        response = self.session_get('https://www.douban.com/search', params=params)
        soup = BeautifulSoup(response.content, 'html.parser')
        results = soup.find(class_='result-list').find_all(class_='result')
        for result in results:
            pic = result.find(class_='pic')
            content = result.find(class_='content')
            img = pic.img.get('src')
            link = content.a.get('href')
            name = content.a.text
            description = content.p.text if content.p else ''
            res[count]={
                'name':name,
                'img':img,
                'link':link,
                'description':description
            }
            count+=1
        return res

In [11]:
class DoubanMovie(Douban):
    
    def get_nowshowing_movies(self):
        response = self.request_get('https://movie.douban.com/cinema/nowplaying')
        soup = BeautifulSoup(response.text)
        nowshowing_movies = soup.find(class_='lists').find_all(class_='list-item')
        movies_list = []
        for nowshowing_movie in nowshowing_movies:
            movie = {
                'movie_id':nowshowing_movie.attrs['id'],
                'title':nowshowing_movie.attrs['data-title'],
                'actors':nowshowing_movie.attrs['data-actors'],
                'director':nowshowing_movie.attrs['data-director'],
                'score':nowshowing_movie.attrs['data-score'],
                'release':nowshowing_movie.attrs['data-release'],
                'region':nowshowing_movie.attrs['data-region']
            }
            movies_list.append(movie)
        return movies_list
    
    def get_movie_info(self, url):
        response = self.request_get(url)
        soup = BeautifulSoup(response.text)
        
        movie = {
            'movie_id':re.findall(r'https://movie.douban.com/subject/([0-9]*)',url)[0],
            'title':soup.find(property="v:itemreviewed").text.split()[0]
        }
        for info in soup.find(id='info').text.split('\n'):
            if '导演' in info:
                movie['director'] = info.split(': ')[1].split(' / ')
            elif '编剧' in info:
                movie['scriptwriter'] = info.split(': ')[1].split(' / ')
            elif '主演' in info:
                movie['actors'] = info.split(': ')[1].split(' / ')
            elif '国家' in info:
                movie['region'] = info.split(': ')[1].split(' / ')
        
        movie['score'] = soup.find(property="v:average").text
        movie['release'] = re.findall(r'[(](.*?)[)]', soup.find(class_="year").text)[0]
        movie['introduction'] = ''.join(soup.find(property="v:summary").text.split())
        movie['douban_tags'] = soup.find(class_='tags-body').text.strip().split('\n')
        return movie

    @run_time
    def get_top_250(self):
        movies = dict()
        queue = Queue()
        producer_running = True
        count = 0
        
        def producer(start=0):
            nonlocal producer_running
            params={
                'start':start,
                'filter':''
            }
            response = self.request_get('https://movie.douban.com/top250', params=params)
            soup = BeautifulSoup(response.text)
            items = soup.find_all(class_='item')
            queue.put(items)
            if start < 225:
                producer(start+25)
            else:
                producer_running=False
        
        def customer():
            nonlocal movies
            nonlocal count
            while producer_running is True or queue.empty() is False:
                items = queue.get()
                for item in items:
                    index = item.find('em').text
                    url = item.a.attrs['href']
                    info = self.get_movie_info(url)
                    movies[index] = info
                    count += 1
                print("count={}".format(count))
                time.sleep(int(random.choice([0.5, 0.2, 0.3])))
            
        
        threads = list()
        
        thread_p = threading.Thread(target=producer)
        thread_p.start()
        threads.append(thread_p)
        
        for _ in range(10):
            thread_c = threading.Thread(target=customer)
            thread_c.start()
            threads.append(thread_c)
            
        for thread in threads:
            thread.join(timeout=0.5)
#             thread.join()
        
        return movies
        
    
    def get_recently_hot_movie(self, **kwargs):
        pass
    
    @run_time
    def get_comments(self, **kwargs):
        
        count = 0
        queue = Queue()
        producer_running = True
        
        def producer(url=None, movie_id=None, page_max=50, page=0):
            nonlocal producer_running
            if producer_running is True and page <= page_max:
                movie_url = None
                if movie_id is not None:
                    movie_url = 'https://movie.douban.com/subject/{}/comments'.format(str(movie_id))
                else:
                    movie_url = url
                print("开始爬取第{0}页***********************************************************************：".format(page+1))
                params={
                    'start':page*20,
                    'limit':20,
                    'status':'P',
                    'sort':'new_score'
                }
                html = self.session_get(movie_url, params=params)
                soup = BeautifulSoup(html.content, 'html.parser')
                comments = soup.find(id='comments').find_all(class_='comment-item')
                if len(comments) > 1:
                    queue.put(comments)
                    producer(url=movie_url, page=page+1, page_max=page_max)
                else:
                    producer_running=False
            else:
                producer_running=False
            
        def customer():
            nonlocal count
            while producer_running is True or queue.empty() is False:
                comments = queue.get()
                for comment in comments:
                    content = comment.find(class_='comment-content').text
                    user_name = comment.find(class_='comment-info').a.text
                    comment_time = comment.find(class_='comment-info').find(class_='comment-time').attrs['title']
                    
                    print('comment:{}, user_name:{}, comment_time:{}'.format(content,user_name,comment_time))
                    count += 1
                print("count={}".format(count))
                time.sleep(int(random.choice([0.5, 0.2, 0.3])))
                
        threads = list()
        
        thread_p = threading.Thread(target=producer, kwargs=kwargs)
        thread_p.start()
        threads.append(thread_p)
        
        for _ in range(10):
            thread_c = threading.Thread(target=customer)
            thread_c.start()
            threads.append(thread_c)
            
        for thread in threads:
            thread.join(timeout=0.5)

In [46]:
class DoubanMovie(Douban):
    
    @run_time
    def get_comment(self, url=None, movie_id=None, page_max=50):
        movie_url = None
#         if movie_name is not None:
            
        if movie_id is not None:
            movie_url = 'https://movie.douban.com/subject/{}/comments'.format(str(movie_id))
        else:
            movie_url = url
        count = 0
        for page in range(page_max+1):
            print("开始爬取第{0}页***********************************************************************：".format(page+1))
            params={
                'start':page*20,
                'limit':20,
                'status':'P',
                'sort':'new_score'
            }
            html = self.session_get(movie_url, params=params)
            print(html.url)
            soup = BeautifulSoup(html.content, 'html.parser')
            comments = soup.find(id='comments').find_all(class_='comment-item')
            if len(comments) > 1:
                for comment in comments:
                    content = comment.find(class_='comment-content').text
                    user_name = comment.find(class_='comment-info').a.text
                    comment_time = comment.find(class_='comment-info').find(class_='comment-time').attrs['title']
                    print('comment:{}, user_name:{}, comment_time:{}'.format(content,user_name,comment_time))
                    count += 1
                print("count={}".format(count))
                time.sleep(int(random.choice([0.5, 0.2, 0.3])))
            else:
                print("大约共{0}页评论".format(page+1))
                break
            
#     def get_review_by_url(self, url , page_max=50):
#         res = dict()
#         self.set_url(url)
#         count = 0
#         for page in range(page_max+1):
#             print("开始爬取第{0}页***********************************************************************：".format(page+1))
#             self.set_params({
#                 'start':page*20
#             })
#             html = self.session_get()
#             soup = BeautifulSoup(html.content, 'html.parser')
#             reviews = soup.find(class_='review-list').find_all(class_='review-item')
#             for review in reviews:
#                 user = review.find(class_='main-hd')
#                 user_name = user.find(class_='name').text
#                 user_link = user.find(class_='name').a.get(href)
                
#                 content = review.find(class_='main-bd')
#                 title = content.find('h2').text
#                 link = content.find('h2').a.get('href')
#                 self.set_url(link)
#                 review_response = self.request_get()
#                 review_soup = BeautifulSoup(review_response, 'html.parser')
#                 article = review_soup.find(class_='article')
#                 review_content = article.find(class_='review-content').text
#                 res[title] = {
#                     'user_name':user_name,
#                     'user_link':user_link,
#                     'link':link,
#                     'review_content':review_content
#                 }
#             return res
        
#     def get_review_by_id(self, movie_id, page_max=50):
#         url = 'https://movie.douban.com/subject/{}/reviews'.format(str(movie_id))
#         self.get_review_by_url(url, page_max)
    
    def search(self, movie_name):
        result = super().search(movie_name, cat='1002')
        return result

In [12]:
user = DoubanUser()

new Douban
new UserAgent
init UserAgent
login success!


In [13]:
user.get_user_info()

'{"status":"success","message":"success","description":"处理成功","payload":{"account_info":{"name":"Ljc970412","weixin_binded":false,"phone":"18664678368","avatar":{"medium":"https://img3.doubanio.com\\/icon\\/user_large.jpg","median":"https://img1.doubanio.com\\/icon\\/user_normal.jpg","large":"https://img3.doubanio.com\\/icon\\/user_large.jpg","raw":"https://img3.doubanio.com\\/icon\\/user_large.jpg","small":"https://img1.doubanio.com\\/icon\\/user_normal.jpg","icon":"https://img3.doubanio.com\\/pics\\/icon\\/user_icon.jpg"},"id":"217184556","uid":"217184556"}}}'

In [14]:
dm = DoubanMovie(user)

In [16]:
crawl = Crawl()

In [17]:
crawl.request_get('https://movie.douban.com/subject/1305690/')

ERROR:root:get invalid status code 403 while scraping https://movie.douban.com/subject/1305690/


In [15]:
movie = dm.get_movie_info('https://movie.douban.com/subject/1305690/')

ERROR:root:get invalid status code 403 while scraping https://movie.douban.com/subject/1305690/


AttributeError: 'NoneType' object has no attribute 'text'

In [30]:
top_250 = dm.get_top_250()

running 5.532549858093262 s


In [41]:
top_250

{'1': {'movie_id': '1292052',
  'title': '肖申克的救赎',
  'director': ['弗兰克·德拉邦特'],
  'scriptwriter': ['弗兰克·德拉邦特', '斯蒂芬·金'],
  'actors': ['蒂姆·罗宾斯',
   '摩根·弗里曼',
   '鲍勃·冈顿',
   '威廉姆·赛德勒',
   '克兰西·布朗',
   '吉尔·贝罗斯',
   '马克·罗斯顿',
   '詹姆斯·惠特摩',
   '杰弗里·德曼',
   '拉里·布兰登伯格',
   '尼尔·吉恩托利',
   '布赖恩·利比',
   '大卫·普罗瓦尔',
   '约瑟夫·劳格诺',
   '祖德·塞克利拉',
   '保罗·麦克兰尼',
   '芮妮·布莱恩',
   '阿方索·弗里曼',
   'V·J·福斯特',
   '弗兰克·梅德拉诺',
   '马克·迈尔斯',
   '尼尔·萨默斯',
   '耐德·巴拉米',
   '布赖恩·戴拉特',
   '唐·麦克马纳斯'],
  'region': ['美国'],
  'score': '9.7',
  'release': '1994',
  'introduction': '一场谋杀案使银行家安迪（蒂姆•罗宾斯TimRobbins饰）蒙冤入狱，谋杀妻子及其情人的指控将囚禁他终生。在肖申克监狱的首次现身就让监狱“大哥”瑞德（摩根•弗里曼MorganFreeman饰）对他另眼相看。瑞德帮助他搞到一把石锤和一幅女明星海报，两人渐成患难之交。很快，安迪在监狱里大显其才，担当监狱图书管理员，并利用自己的金融知识帮助监狱官避税，引起了典狱长的注意，被招致麾下帮助典狱长洗黑钱。偶然一次，他得知一名新入狱的小偷能够作证帮他洗脱谋杀罪。燃起一丝希望的安迪找到了典狱长，希望他能帮自己翻案。阴险伪善的狱长假装答应安迪，背后却派人杀死小偷，让他唯一能合法出狱的希望泯灭。沮丧的安迪并没有绝望，在一个电闪雷鸣的风雨夜，一场暗藏几十年的越狱计划让他自我救赎，重获自由！老朋友瑞德在他的鼓舞和帮助下，也勇敢地奔向自由。本片获得1995年奥...',
  'douban_tags': ['经典', '励志', '信念', '自由', '人性', '人生', '美国', '希望']},
 '26':

In [100]:
dm_col = DoubanMovieCollection()

In [101]:
dm_col.find_one(top_250['1'])

{'_id': ObjectId('5ff41531eebd224986385d3b'),
 'movie_id': '1292052',
 'title': '肖申克的救赎',
 'director': ['弗兰克·德拉邦特'],
 'scriptwriter': ['弗兰克·德拉邦特', '斯蒂芬·金'],
 'actors': ['蒂姆·罗宾斯',
  '摩根·弗里曼',
  '鲍勃·冈顿',
  '威廉姆·赛德勒',
  '克兰西·布朗',
  '吉尔·贝罗斯',
  '马克·罗斯顿',
  '詹姆斯·惠特摩',
  '杰弗里·德曼',
  '拉里·布兰登伯格',
  '尼尔·吉恩托利',
  '布赖恩·利比',
  '大卫·普罗瓦尔',
  '约瑟夫·劳格诺',
  '祖德·塞克利拉',
  '保罗·麦克兰尼',
  '芮妮·布莱恩',
  '阿方索·弗里曼',
  'V·J·福斯特',
  '弗兰克·梅德拉诺',
  '马克·迈尔斯',
  '尼尔·萨默斯',
  '耐德·巴拉米',
  '布赖恩·戴拉特',
  '唐·麦克马纳斯'],
 'region': ['美国'],
 'score': '9.7',
 'release': '1994',
 'introduction': '一场谋杀案使银行家安迪（蒂姆•罗宾斯TimRobbins饰）蒙冤入狱，谋杀妻子及其情人的指控将囚禁他终生。在肖申克监狱的首次现身就让监狱“大哥”瑞德（摩根•弗里曼MorganFreeman饰）对他另眼相看。瑞德帮助他搞到一把石锤和一幅女明星海报，两人渐成患难之交。很快，安迪在监狱里大显其才，担当监狱图书管理员，并利用自己的金融知识帮助监狱官避税，引起了典狱长的注意，被招致麾下帮助典狱长洗黑钱。偶然一次，他得知一名新入狱的小偷能够作证帮他洗脱谋杀罪。燃起一丝希望的安迪找到了典狱长，希望他能帮自己翻案。阴险伪善的狱长假装答应安迪，背后却派人杀死小偷，让他唯一能合法出狱的希望泯灭。沮丧的安迪并没有绝望，在一个电闪雷鸣的风雨夜，一场暗藏几十年的越狱计划让他自我救赎，重获自由！老朋友瑞德在他的鼓舞和帮助下，也勇敢地奔向自由。本片获得1995年奥...',
 'douban_tags': ['经典', '励志', '信念', '自由', '人性', '人生', '美国', '希望']}

In [102]:
dm_col.insert(top_250['1'])

DuplicateKeyError: E11000 duplicate key error collection: douban.movie index: _id_ dup key: { _id: ObjectId('5ff41531eebd224986385d3b') }, full error: {'index': 0, 'code': 11000, 'keyPattern': {'_id': 1}, 'keyValue': {'_id': ObjectId('5ff41531eebd224986385d3b')}, 'errmsg': "E11000 duplicate key error collection: douban.movie index: _id_ dup key: { _id: ObjectId('5ff41531eebd224986385d3b') }"}

In [97]:
type(dm_col.find_one(top_250['1']))

dict

In [98]:
for index, movie in top_250.items():
    dm_col.insert(movie)

DuplicateKeyError: E11000 duplicate key error collection: douban.movie index: _id_ dup key: { _id: ObjectId('5ff41531eebd224986385d3b') }, full error: {'index': 0, 'code': 11000, 'keyPattern': {'_id': 1}, 'keyValue': {'_id': ObjectId('5ff41531eebd224986385d3b')}, 'errmsg': "E11000 duplicate key error collection: douban.movie index: _id_ dup key: { _id: ObjectId('5ff41531eebd224986385d3b') }"}