# Class for parsing individual game html

In [28]:
import requests
from bs4 import BeautifulSoup
import re


class GameParser:
    def __init__(self):
        self.session = requests.Session()

    def get_game_html(self, url):
        try:
            r = self.session.get(url)
            r.raise_for_status()
            return r.content
        except requests.exceptions.HTTPError as err:
            print('error: ', err)

    def strip_features(self, html):
        # object to return 
        game = {}

        # desired features to gather
        keys = ['SteamURL', 'Genres', 'Indie', 'Soundtrack', 'Franchise',
                'OriginalCost', 'DiscountedCost', 'Players', 'Controller',
                'Languages', 'Graphics', 'Storage', 'RAM', 'Tags']

        # corresponding values for the desired features
        values = []

        # turn the html into a searchable soup object
        soup = BeautifulSoup(html)

        # steam_url
        steam_url = soup.find(
            'div', class_='breadcrumbs').find_all('a')[-1].get('href')
        values.append(steam_url)

        # contains several details we want
        details = soup.find('div', class_='details_block').get_text()

        # genres
        genres = self.__re_detail('genre', details)
        values.append(genres)

        # list of tags for extracting features
        tags = [tag.get_text().strip()
                for tag in soup.find_all('a', class_='app_tag')]

        # indie and soundtrack
        for tag in ['Indie', 'Soundtrack']:
            if tag in tags:
                tags.remove(tag)
                values.append(True)
            else:
                values.append(False)

        # franchise
        franchise = self.__re_detail('franchise', details)
        values.append(franchise)

        # cost
        cost_details = soup.find('div', class_='game_purchase_action')
        original_cost, discounted_cost = self.__get_prices(cost_details)
        values.append(original_cost)
        values.append(discounted_cost)

        # players
        game_details = soup.find_all('div', class_='game_area_details_specs')
        players = self.__get_players(game_details, tags)
        values.append(players)

        # controller
        controller = self.__get_controller(game_details)
        values.append(controller)

        # languages
        language_details = soup.find('table', class_='game_language_options')
        languages = self.__get_languages(language_details)
        values.append(languages)

        # system requirements
        sys_requirements = soup.find('div', 'sysreq_contents')
        sys_requirements = self.__parse_sys_reqs(sys_requirements)

        # graphics
        graphics = self.__get_graphics(sys_requirements)
        values.append(graphics)

        # storage
        storage = self.__get_storage(sys_requirements)
        values.append(storage)

        # ram
        ram = self.__get_ram(sys_requirements)
        values.append(ram)

        # add the rest of the tags as seperate feature
        values.append(', '.join(tags))

        game = zip(keys, values)
        return game

    def __re_detail(self, detail, text):
        '''Get the value after 'Detail: ' type string'''
        try:
            expression = f'{detail.title()}:[\s\n].*'
            return re.search(expression, text).group().split(':')[1].strip()
        except:
            return None

    def __get_prices(self, details):
        discounted = details.find(
            'div', 'discount_block game_purchase_discount')
        if discounted:
            discounted_cost = discounted.find(
                'div', 'discount_final_price').text.strip()
            original_cost = discounted.find(
                'div', 'discount_original_price').text.strip()
        else:
            discounted_cost = None
            original_cost = details.find(
                'div', 'game_purchase_price price').text.strip()

        return original_cost, discounted_cost

    def __get_players(self, details, tags):
        player_choices = ['singleplayer', 'multiplayer', 'pvp', 'online pvp',
                          'lan pvp', 'shared/split screen pvp', 'coop',
                          'online coop', 'lan coop', 'shared/split screen coop',
                          'shared/split screen', 'crossplatform multiplayer']

        details = [detail.get_text().lower().replace('-', '').split(' ')
                   for detail in details]
        tags = [tag.lower().replace('-', '') for tag in tags]
        details += tags
        players = [choice for choice in player_choices if choice in details]
        return ', '.join(players)

    def __get_controller(self, details):
        all_details_text = [detail.get_text().lower().replace('-', '')
                            for detail in details]
        return 'controller' in ' '.join(all_details_text)

    def __get_languages(self, details):
        languages = [detail.get_text().strip()
                     for detail in details.find_all('td')]
        languages = [language for language in languages if language.isalpha()]
        return ', '.join(languages)

    def __parse_sys_reqs(self, sys_reqs):
        sys_reqs = sys_reqs.find_all('li')
        sys_reqs = [ req.get_text() for req in sys_reqs ]

    def __get_graphics(self, sys_reqs):
        gpu = self.__re_detail('Graphics', ', '.join(sys_reqs)).split(', ')[:-1]
        return ', '.join(gpu)
    
    def __get_storage(self, sys_reqs):
        storage = self.__re_detail('Storage', ', '.join(sys_reqs))
        if storage:
            return ', '.join(storage.split(', ')[:-1])
        else:
            hard_drive = self.__re_detail('Hard Drive', ', '.join(sys_reqs))
            return ', '.join(hard_drive.split(', ')[:-1])

    def __get_ram(self, sys_reqs):


## Test

In [29]:
# instantiate
s = GameParser()

discounted_game_url = 'https://store.steampowered.com/app/447040/Watch_Dogs_2/'
normal_game_url = 'https://store.steampowered.com/app/42700/Call_of_Duty_Black_Ops/'
# get game information
html = s.get_game_html(discounted_game_url)
# html = s.get_game_html(normal_game_url)

# pull features from html
features = s.strip_features(html)
for key, val in features:
    print(f'{key}: {val}')

SteamURL: https://store.steampowered.com/app/447040/?snr=1_5_9__205
Genres: Action, Adventure
Indie: False
Soundtrack: False
Franchise: None
OriginalCost: $49.99
DiscountedCost: $9.99
Players: singleplayer, multiplayer, coop
Controller: True
Languages: English, French, Italian, German, Arabic, Czech, Dutch, Hungarian, Japanese, Korean, Polish, Russian
Graphics: NVIDIA GeForce GTX 660 with 2 GB VRAM or AMD Radeon HD 7870, with 2 GB VRAM or better - See supported List*
Storage: 27 GB available space
Tags: Illuminati, Open World, Hacking, Action, Parkour, Multiplayer, Stealth, Third Person, Shooter, Crime, Singleplayer, Third-Person Shooter, Adventure, Co-op, Sexual Content, Comedy, Mature, Atmospheric, Cute, Gore
