In [1]:
import logging
# Configure the root logger once for the whole notebook.
# With the threshold at WARNING, ``logging.info`` calls (such as the per-page
# "success" message) are not emitted; only warnings and errors are shown.
logging.basicConfig(
    level=logging.WARNING,
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s %(levelname)s:\t%(message)s',
    # filename='file1.log',  # uncomment to send records to a file instead
)

In [2]:
import re
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook
from lxml import etree, html as lhtml
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException        
import random
import html
from time import sleep

In [49]:
# Keys of the values parsed from the "info" table, in output-column order.
_DESCRIBE_FIELDS = ['male', 'career', 'height', 'day', 'mounth', 'year',
                    'dth_day', 'dth_mounth', 'dth_year',
                    'city', 'cntr_1', 'cntr_2',
                    'dth_city', 'dth_cntr_1', 'dth_cntr_2',
                    'genres', 'chldrns', 'spouses',
                    'total_films', 'career_start', 'career_finish']

# ``id`` attributes of the filmography sections, in output-column order.
_ROLE_IDS = ['writer', 'producer', 'director', 'editor', 'actor']

# Matches a "children" line inside the spouse row, e.g. '... 2 детей'.
_CHILDREN_RE = re.compile(r'\.\.\. .*(ребен|детей)')
# Matches a year (four or more digits).
_YEAR_RE = re.compile(r'[0-9]{4,}')


def _first_element(driver, xpath):
    """Return the first element matching ``xpath``, or ``None`` if there is none."""
    found = driver.find_elements_by_xpath(xpath)
    return found[0] if found else None


def _oscar_count(driver, css_suffix):
    """Return 0 without an Oscar block, its counter text if shown, otherwise 1."""
    block = '//div[@class="awardsLargeBlockP Oscar {}"]'.format(css_suffix)
    if _first_element(driver, block) is None:
        return 0
    counter = _first_element(driver, block + '//span[@class="num"]')
    return 1 if counter is None else counter.text


def _role_count(driver, role_id):
    """Return 0 without a role section, else its counter text (1 if empty)."""
    counter = _first_element(
        driver, '//div[@id="{}"]//span[@class="viewingCount"]'.format(role_id))
    if counter is None:
        return 0
    # The counter is rendered with one wrapping character on each side.
    return counter.text[1:-1] or 1


def _parse_date(raw):
    """Parse '<day> <month>, <year>...' into ``(day, month, year)`` or ``None``.

    ``None`` is returned when the text does not have exactly one comma or the
    part before it is not a numeric day plus one month token; ``year`` is ''
    when no four-digit number follows the comma.
    """
    parts = raw.split(',')
    if len(parts) != 2:
        return None
    day_month = parts[0].split()
    if len(day_month) != 2 or not day_month[0].isdigit():
        return None
    year = _YEAR_RE.search(parts[1])
    return int(day_month[0]), day_month[1], int(year[0]) if year else ''


def _parse_place(raw):
    """Split 'city, region, country' into three values, '' for missing parts."""
    parts = raw.split(',')
    # Items after the first keep a leading separator character; drop it.
    first = parts[1][1:] if len(parts) > 1 else ''
    second = parts[2][1:] if len(parts) > 2 else ''
    return parts[0], first, second


def _parse_family(raw):
    """Split the spouse row into ``(children, spouses)`` ';'-joined strings."""
    children, spouses = [], []
    for line in raw.split('\n'):
        match = _CHILDREN_RE.search(line)
        if match is None:
            spouses.append(line)
        else:
            # Strip the leading '... ' and the trailing 6-character word stem.
            children.append(match[0][4:-6])
    return ';'.join(children), ';'.join(spouses)


def _parse_describe_rows(rows):
    """Turn lower-cased "info" table rows into a dict keyed by _DESCRIBE_FIELDS.

    Fields whose row is absent or cannot be parsed stay as ''.
    """
    parsed = dict.fromkeys(_DESCRIBE_FIELDS, '')
    for item in rows:
        if item.startswith('карьера'):
            career = item[8:].replace(', ', ';').replace(',', ';')
            parsed['career'] = career
            if re.search('(А|а)ктриса', career) is not None:
                parsed['male'] = 0
            elif re.search('(А|а)кт(е|ё)р', career) is not None:
                parsed['male'] = 1
        elif item.startswith('рост'):
            # Assumes the value is followed by a two-character unit suffix
            # (TODO confirm against the live page).
            parsed['height'] = float(item[5:-2])
        elif item.startswith('дата рождения'):
            date = _parse_date(item[14:])
            if date is not None:
                parsed['day'], parsed['mounth'], parsed['year'] = date
        elif item.startswith('дата смерти'):
            date = _parse_date(item[12:])
            if date is not None:
                parsed['dth_day'], parsed['dth_mounth'], parsed['dth_year'] = date
        elif item.startswith('место рождения'):
            (parsed['city'], parsed['cntr_1'],
             parsed['cntr_2']) = _parse_place(item[15:])
        elif item.startswith('место смерти'):
            (parsed['dth_city'], parsed['dth_cntr_1'],
             parsed['dth_cntr_2']) = _parse_place(item[13:])
        elif item.startswith('жанры'):
            parsed['genres'] = ';'.join(g[1:] for g in item[5:].split(','))
        elif item.startswith('супруга'):
            # A wife row implies male unless the career already said otherwise.
            if parsed['male'] != 0:
                parsed['male'] = 1
            parsed['chldrns'], parsed['spouses'] = _parse_family(item[8:])
        elif item.startswith('супруг'):
            # Must come after the 'супруга' test, which shares this prefix.
            if parsed['male'] != 1:
                parsed['male'] = 0
            parsed['chldrns'], parsed['spouses'] = _parse_family(item[7:])
        elif item.startswith('всего фильмов'):
            pieces = item[14:].split(',')
            parsed['total_films'] = int(pieces[0])
            # The career span is the optional second comma-separated piece.
            # (The original tested the length of the result dict here, so the
            # span was never parsed.)
            if len(pieces) != 2:
                continue
            span = [year.strip() for year in pieces[1].split('—')]
            if span[0].isdigit():
                parsed['career_start'] = int(span[0])
            if len(span) > 1 and span[1].isdigit():
                parsed['career_finish'] = int(span[1])
    return parsed


def process_one_page(card_data, delay_range=(2, 20)):
    """Scrape one person page and append it to the output as a single TSV row.

    Args:
        card_data: tuple ``(url, driver, f_write, rank)`` where ``driver`` is a
            Selenium WebDriver, ``f_write`` a writable text file and ``rank``
            the value written to the first column.
        delay_range: inclusive ``(min, max)`` bounds, in whole seconds, of the
            random pause made after loading the page.

    The row is assembled in memory and written with one call, so a failure
    part-way through a page leaves no partial line in the file.

    Raises:
        Whatever ``driver.find_element_by_xpath`` raises when the name,
        favourites or folders element is missing, and ``ValueError`` when a
        numeric field that is expected to be a number is not one.
    """
    url, driver, f_write, rank = card_data
    driver.get(url)
    sleep(random.randint(*delay_range))

    name = driver.find_element_by_xpath('//h1[@class="moviename-big"]').text

    favors = driver.find_element_by_xpath('//div[@class="num"]').text
    if favors != '':
        favors = int(favors)

    folders_match = re.search(
        r'[0-9]+', driver.find_element_by_xpath('//div[@class="text"]').text)
    # Written as the literal 'None' when the text holds no number.
    folders = int(folders_match[0]) if folders_match is not None else None

    awards = _first_element(driver, '//table[@id="awardsBlock"]') is not None
    best_oscar = _oscar_count(driver, 'best_serial')
    not_best_oscar = _oscar_count(driver, 'notbest_serial')

    rating_element = _first_element(driver, '//i[@id="avg_rating"]')
    rating = '' if rating_element is None else rating_element.text

    row = [rank, name, favors, folders, awards,
           best_oscar, not_best_oscar, rating]
    row.extend(_role_count(driver, role_id) for role_id in _ROLE_IDS)

    row_texts = (e.text for e in
                 driver.find_elements_by_xpath('//table[@class="info"]//tr'))
    described = _parse_describe_rows(
        [text.lower() for text in row_texts if text])
    row.extend(described[key] for key in _DESCRIBE_FIELDS)

    print('\t'.join('{}'.format(value) for value in row), file=f_write)
    logging.info('success:\t\t{}'.format(url))
    
# Header of the output TSV: one name per value that process_one_page writes,
# in the same order (13 page-level columns followed by 21 "info" table columns).
# NOTE(review): 'caree_finish' looks like a typo for 'career_finish'; it is left
# as-is because renaming it would no longer match the column in tables saved by
# earlier runs when they are concatenated — confirm before changing.
fields = [
    # page-level values
    'rank', 'name', 'favors', 'folders', 'awards', 'oscar', 'nom_oscar', 'rating',
    'writer', 'producer', 'director', 'editor', 'actor',
    # values parsed from the "info" table
    'male', 'career', 'height',
    'day', 'mounth', 'year',
    'dth_day', 'dth_mounth', 'dth_year',
    'city', 'cntr_1', 'cntr_2',
    'dth_city', 'dth_cntr_1', 'dth_cntr_2',
    'genres', 'chldrns', 'spouses',
    'total_films', 'career_start', 'caree_finish',
]

# 2 этап. Обкачка этих ссылок.

In [77]:
# Parameters of this crawl run:
#   nrun   - run number embedded in the output file names;
#   start  - index of the first link to crawl (slice start, 0-based);
#   finish - index one past the last link to crawl (slice end).
nrun, start, finish = 66, 4998, 5001

def count_lines(filename, chunk_size=1 << 20):
    """Return the number of lines in the text file ``filename``.

    The file is read in chunks of ``chunk_size`` characters so it never has to
    fit in memory at once. A final line that is not terminated by a newline is
    counted too, so the result equals the number of items produced by
    iterating over the file.

    Args:
        filename: path of the file to read (opened in text mode).
        chunk_size: number of characters read per step.

    Returns:
        The line count; 0 for an empty file.
    """
    newlines = 0
    last_char = '\n'  # an empty file has no unterminated last line
    with open(filename) as file:
        for chunk in iter(lambda: file.read(chunk_size), ''):
            newlines += chunk.count('\n')
            last_char = chunk[-1]
    return newlines if last_char == '\n' else newlines + 1

# Input: the file of actor page links, one per line.
actors_links_txt = 'data/actors_links_ranked.txt'
# Output of this run; the run number is embedded in the file name.
cart_table_txt = 'data/cart_table({}).txt'.format(nrun)
# Number of links in the input file, used as the progress-bar total.
total_actors = count_lines(actors_links_txt)

In [78]:
# Crawl the links in positions start..finish-1 of the links file and write one
# TSV row per page to ``cart_table_txt`` (header row first).
pbar = tqdm_notebook(total=total_actors)
pbar.update(start)  # links before ``start`` are shown as already processed

try:
    # The output holds non-ASCII text and is read back with pandas, whose
    # default encoding is UTF-8, so it is written as UTF-8 explicitly.
    with webdriver.Firefox() as driver, \
            open(cart_table_txt, 'w', encoding='utf-8') as f_write:
        print('\t'.join(fields), file=f_write)

        with open(actors_links_txt, 'r') as f_read:
            # NOTE(review): ``item[:-2]`` drops the last two characters of each
            # line (the newline plus one more) — confirm this matches how the
            # links file was written.
            card_tuples = [(item[:-2], driver, f_write, rank + 1)
                           for rank, item in enumerate(f_read)]

        # Any exception from a page propagates and stops the run (fail fast).
        for card_data in card_tuples[start:finish]:
            process_one_page(card_data)
            pbar.update(1)
finally:
    # Release the progress-bar widget even when a page fails.
    pbar.close()

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))

In [79]:
# Append the rows crawled in this run to the cumulative table saved by the
# previous run (nrun-1) and store the result under the current run number.
little_df = pd.read_csv(cart_table_txt, sep='\t')
total_df = pd.concat(
    [
        pd.read_csv('data/total_cart_table({})'.format(nrun-1), sep='\t'),
        little_df,
    ]
)
total_df.to_csv('data/total_cart_table({})'.format(nrun), sep='\t', index=False)