In [1]:
import codecs
import gzip
import json
import re
import sys
import time
from multiprocessing.dummy import Pool, Queue
from operator import attrgetter, itemgetter

import requests
from bs4 import BeautifulSoup as bss
from tqdm import tqdm
from tqdm import tqdm_notebook

# Получение ссылок на игры

In [2]:
def get_page(url, n_attempts=5, t_sleep=1, timeout=30):
    """Download `url`, retrying when the request fails.

    Parameters
    ----------
    url : str
        Address to fetch with an HTTP GET.
    n_attempts : int
        Maximum number of requests to send.
    t_sleep : float
        Pause, in seconds, between two consecutive attempts.
    timeout : float
        Per-request timeout in seconds, so that a stalled connection
        cannot block the calling thread forever.

    Returns
    -------
    requests.Response or None
        The first response whose ``ok`` flag is set, or None when every
        attempt failed.
    """
    for attempt in range(n_attempts):
        try:
            res = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # A network-level failure is handled like a bad status: retry.
            res = None
        if res is not None and res.ok:
            return res
        # Sleeping after the last attempt would only delay the caller.
        if attempt + 1 < n_attempts:
            time.sleep(t_sleep)
    return None

In [3]:
def collect_urls_from_page(num, n_attempts=5, t_sleep=1, timeout=30):
    """Collect the game page URLs listed on one page of the gg.deals catalogue.

    The listing page is requested up to `n_attempts` times; an attempt
    counts as failed when the request errors out, the response is not OK,
    or the expected ``#games-list`` / ``.grid-list`` containers are missing.

    The shared progress bar is advanced by exactly one step per call,
    whatever the outcome. The function reads the module-level names
    `lock` and `pbar`, which must be set before it is called.

    Parameters
    ----------
    num : int
        Number of the listing page to read.
    n_attempts : int
        Maximum number of requests to send.
    t_sleep : float
        Pause, in seconds, between two consecutive attempts.
    timeout : float
        Per-request timeout in seconds.

    Returns
    -------
    list of str
        Absolute URLs of the games found on the page; an empty list when
        every attempt failed.
    """
    url = 'https://gg.deals/games/?sort=metascore&type=1&page={}'.format(num)
    prefix = 'https://gg.deals'
    try:
        for attempt in range(n_attempts):
            try:
                res = requests.get(url, timeout=timeout)
            except requests.RequestException:
                # One unreachable page must not abort the whole pool.map run.
                res = None
            if res is not None and res.ok:
                soup = bss(res.text, 'lxml')
                games_list = soup.find('div', id="games-list")
                grid = None
                if games_list is not None:
                    grid = games_list.find('div', class_='grid-list')
                if grid is not None:
                    links = grid.find_all('a', class_='full-link')
                    return [prefix + link['href'] for link in links]
            # Back off before retrying, but not after the final attempt.
            if attempt + 1 < n_attempts:
                time.sleep(t_sleep)
        return []
    finally:
        # The counter is shared between worker threads: update it under the lock.
        with lock:
            pbar.update(1)

In [4]:
# Listing pages 1 .. limit - 1 are crawled (the upper bound is exclusive).
limit = 100
page_numbers = range(1, limit)

with Pool(processes=8) as pool, tqdm(page_numbers) as pbar:
    # collect_urls_from_page reads `lock` and `pbar` as module-level names.
    lock = pbar.get_lock()
    res = pool.map(collect_urls_from_page, page_numbers)

# Flatten the per-page URL lists into a single list of game URLs.
data_urls = []
for res_in in res:
    data_urls.extend(res_in)

100%|██████████| 99/99 [00:28<00:00,  3.48it/s]


In [5]:
# Sanity check: total number of game URLs gathered from the listing pages.
len(data_urls)

2376

# Получение информации об играх

In [6]:
_SITE_PREFIX = 'https://gg.deals'
_REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging a worker


def _class_startswith(prefix):
    """Build a bs4 ``class_`` filter matching CSS classes starting with `prefix`."""
    return lambda s: s and s.startswith(prefix)


def _class_endswith(suffix):
    """Build a bs4 ``class_`` filter matching CSS classes ending with `suffix`."""
    return lambda s: s and s.endswith(suffix)


def _extract_game_id(soup):
    """Return the integer game id stored in the ``data-game-id`` attribute.

    Raises ValueError when the game card, its action blocks or the
    attribute itself cannot be found.
    """
    holder = soup.find(id='game-card')
    for prefix in ('game-info-actions', 'game-collection-actions'):
        if holder is None:
            break
        holder = holder.find('div', class_=_class_startswith(prefix))
    raw_id = None if holder is None else holder.get('data-game-id')
    if raw_id is None:
        raise ValueError('game id not found on page')
    return int(raw_id)


def _parse_reviews(details, data_pack):
    """Fill review label, positive percentage and review count, when present."""
    reviews = details.find('div', class_='score-col full')
    if reviews is None:
        return
    spans = reviews.find_all('span')
    if not spans:
        # No review summary on the page: nothing to record.
        return
    summary = spans[0]
    data_pack["review_label"] = re.sub(r'[1234567890,()]', "",
                                       summary.text).strip()
    title = summary.get('title')
    if title is not None:
        try:
            # First word of the title with its last character dropped.
            data_pack["review_positive_pctg"] = int(title.split()[0][:-1])
        except (ValueError, IndexError):
            pass
    counter = summary.find(class_="semi-transparent")
    if counter is not None:
        try:
            data_pack['review_count'] = int(re.sub(r'[,)( ]', "", counter.text))
        except ValueError:
            pass


def _parse_details(details, data_pack):
    """Fill release date, developer, scores and review data from the details box."""
    text_fields = (('release_date', 'section-release'),
                   ('developer', 'section-developer'))
    for key, suffix in text_fields:
        section = details.find('div', class_=_class_endswith(suffix))
        if section is not None:
            paragraph = section.find('p')
            if paragraph is not None:
                data_pack[key] = paragraph.text

    score_fields = (('metacritic_score', 'score-metascore', int),
                    ('user_score', 'score-userscore', float))
    for key, suffix, cast in score_fields:
        link = details.find('a', class_=_class_endswith(suffix))
        if link is not None:
            overlay = link.find('span', class_="overlay")
            if overlay is not None:
                try:
                    data_pack[key] = cast(overlay.text)
                except ValueError:
                    # A non-numeric score must not discard the whole record.
                    pass

    _parse_reviews(details, data_pack)


def _badge_texts(widget, section_id, list_class):
    """Return the badge texts of one tag section, or None if the section is absent."""
    section = widget.find('div', id=section_id)
    if section is None:
        return None
    tags_list = section.find('div', class_=list_class)
    if tags_list is None:
        return None
    return [badge.text for badge in tags_list.find_all('a', class_='badge')]


def _count_text(actions, prefix):
    """Return the text of the ``count`` span of one action box, or None."""
    box = actions.find('div', class_=_class_startswith(prefix))
    if box is None:
        return None
    counter = box.find('span', class_='count')
    return None if counter is None else counter.text


def _section_links(soup, section_id):
    """Return absolute URLs of the ``full-link`` anchors of an offer section."""
    section = soup.find('section', class_=_class_endswith("offer-section"),
                        id=section_id)
    if section is None:
        return []
    return [_SITE_PREFIX + link['href']
            for link in section.find_all('a', class_="full-link")]


def _resolve_market_url(image_box):
    """Follow the store link of the game image box; return the final URL or None."""
    link = image_box.find('a', class_='game-link-widget')
    if link is None:
        return None
    href = link.get('href')
    if href is None:
        return None
    try:
        # The final URL after redirects is what gets stored.
        return requests.get(href, timeout=_REQUEST_TIMEOUT).url
    except requests.RequestException:
        # Best effort: the record is still useful without the store link.
        return None


def _fetch_price_history(code):
    """Download the price history of game `code`; return a list of price points.

    Best effort: request or format errors yield whatever was parsed so far
    (possibly an empty list).
    """
    # Marks the request as an XMLHttpRequest; presumably the endpoint expects it.
    headers = {'X-Requested-With': 'XMLHttpRequest'}
    part1 = 'https://gg.deals/ru/games/chartHistoricalData/'
    part2 = '/?hideKeyshops=0'
    url_pr = part1 + str(code) + part2
    history = []
    try:
        payload = requests.get(url_pr, headers=headers,
                               timeout=_REQUEST_TIMEOUT).json()
        for elem in payload['chartData']['deals']:
            history.append({'ts': elem['x'],
                            "price": elem['y'],
                            'shop': elem['shop']})
    except (requests.RequestException, ValueError, KeyError, TypeError):
        pass
    return history


def process_page(url):
    """Scrape one gg.deals game page into a dictionary.

    Parameters
    ----------
    url : str
        Address of the game page.

    Returns
    -------
    dict
        Always contains ``url``, ``status``, ``dlcs``, ``packs`` and
        ``price_history``. The keys ``name``, ``image``, ``release_date``,
        ``developer``, ``metacritic_score``, ``user_score``,
        ``review_label``, ``review_positive_pctg``, ``review_count``,
        ``genres``, ``tags``, ``features``, ``pc_systems``,
        ``wishlist_count``, ``alert_count``, ``owners_count`` and
        ``market_url`` are present only when the matching markup was found.

    Raises
    ------
    RuntimeError
        If the page could not be downloaded.
    ValueError
        If the game id cannot be located on the page.
    """
    data_pack = dict()
    data_pack['url'] = url
    row_data = get_page(url)
    if row_data is None:
        raise RuntimeError('failed to download game page')
    soup = bss(row_data.text, 'lxml')

    # The id is required for the price history; a page without it is rejected.
    code = _extract_game_id(soup)
    data_pack['status'] = True

    title_link = soup.find('a', class_='active', itemprop='item')
    if title_link is not None:
        title = title_link.find('span')
        if title is not None:
            data_pack['name'] = title.text

    image_box = soup.find('div', class_="game-info-image")
    if image_box is not None:
        images = image_box.find_all('img')
        if images:
            data_pack["image"] = images[0].get('src')

    widget = soup.find('div', class_=_class_startswith('game-info-widget'))
    if widget is not None:
        details = widget.find('div', class_='game-info-details')
        if details is not None:
            _parse_details(details, data_pack)
        badge_sections = (
            ("genres", "game-info-genres", "tags-list link-list"),
            ("tags", "game-info-tags", "tags-list tags-list-dotdotdot"),
            ("features", "game-info-features", "tags-list tags-list-dotdotdot"),
        )
        for key, section_id, list_class in badge_sections:
            texts = _badge_texts(widget, section_id, list_class)
            if texts is not None:
                data_pack[key] = texts

    requirements = soup.find('div', class_="game-requirements-tabs")
    if requirements is not None:
        tabs = requirements.find_all('li', class_=_class_endswith("menu-item"))
        data_pack["pc_systems"] = [tab.text for tab in tabs]

    actions = soup.find('div', class_='game-info-actions')
    if actions is not None:
        counters = (("wishlist_count", "wishlisted-game"),
                    ("alert_count", "alerted-game"),
                    ("owners_count", "owned-game"))
        for key, prefix in counters:
            count = _count_text(actions, prefix)
            if count is not None:
                data_pack[key] = count

    data_pack["dlcs"] = _section_links(soup, "game-dlcs")
    data_pack["packs"] = _section_links(soup, "game-packs")

    if image_box is not None:
        market_url = _resolve_market_url(image_box)
        if market_url is not None:
            data_pack["market_url"] = market_url

    data_pack["price_history"] = _fetch_price_history(code)
    return data_pack

In [7]:
# Shared work queue; process_page_wrapper reads it through the global name `queue`.
queue = Queue()

# Enqueue every collected game URL.
for url in data_urls:
    queue.put(url)

In [8]:
def process_page_wrapper(i):
    """Worker: drain the shared URL queue into ``part_<i>.jsonl.gz``.

    URLs are taken from the module-level `queue` until it is empty. Each
    one is scraped with `process_page` and the resulting record is written
    as one JSON line (UTF-8, non-ASCII characters kept as is) to a gzip
    file named after the worker index. A page that raises is reported on
    stderr and skipped. The shared progress bar `pbar` is advanced under
    `lock` once per URL, whether it succeeded or not.

    Parameters
    ----------
    i : int
        Worker index, used only to build the output file name.
    """
    # Local import: at module level the name `queue` is the work queue itself.
    from queue import Empty

    with gzip.open('part_{:05d}.jsonl.gz'.format(i), mode='wt',
                   encoding='utf-8', newline='\n') as f_json:
        while True:
            # `empty()` followed by a blocking `get()` is a race: another
            # worker can take the last item in between and this one would
            # wait forever. A non-blocking get avoids that.
            try:
                cur_url = queue.get_nowait()
            except Empty:
                break
            try:
                record = process_page(cur_url)
                if record != {}:
                    record_str = json.dumps(record, ensure_ascii=False)
                    print(record_str, file=f_json)
            except Exception as e:
                message = cur_url+" : "+str(e)
                print(message, file=sys.stderr)
            finally:
                # The counter must be updated atomically.
                with lock:
                    pbar.update(1)


# One output file per worker; the count is named so that the pool size and
# the worker indices cannot drift apart (and no private Pool attribute is read).
n_workers = 8

with Pool(processes=n_workers) as pool, tqdm(total=queue.qsize()) as pbar:
    # process_page_wrapper reads `lock` and `pbar` as module-level names.
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(n_workers))

100%|██████████| 2376/2376 [13:00<00:00,  3.04it/s]
