# Scraping Steam

Building a dataset by scraping all of the products on Steam.

In [1]:
# dependencies
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import re
import numpy as np

import time

# multitasking
from joblib import Parallel, delayed
import multiprocessing

The base page shows only the top 50 results, but we want more than just the top 50. Because the page is an infinite-scroll page, the easiest way to get the rest of the information is to find the query the page sends to load the next 50 games and call that query directly.

![image info](images/where.png)
![image info](images/qurey.png)

[All Products](https://store.steampowered.com/search/?term=)
<br/>
[Query](https://store.steampowered.com/search/results/?query&start=350&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_230_7&infinite=1)

In [2]:
# scrape url and return html
def scrap_data(url, timeout=30):
    """
    Fetch one page of the search query and return its HTML fragment.

    url     = search query URL; the response body must be JSON containing a
              'results_html' key
    timeout = seconds to wait for the server before giving up (without a
              timeout a stalled connection would block the caller forever)

    Returns the value stored under 'results_html'.

    Raises requests.HTTPError on a non-2xx response, other
    requests.RequestException subclasses on network failure or timeout, and
    KeyError if the payload has no 'results_html' entry.
    """
    r = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    r.raise_for_status()
    return r.json()['results_html']

In [3]:
# total product count
def total_results(url, timeout=30):
    """
    Return the total number of products reported by the search query.

    url     = search query URL; the response body must be JSON containing a
              'total_count' key
    timeout = seconds to wait for the server before giving up

    Returns 'total_count' converted to int.

    Raises requests.HTTPError on a non-2xx response, other
    requests.RequestException subclasses on network failure or timeout, and
    KeyError if the payload has no 'total_count' entry.
    """
    r = requests.get(url, timeout=timeout)
    # Fail with a clear HTTP error instead of a confusing JSON decode error.
    r.raise_for_status()
    return int(r.json()['total_count'])

# First page of the unfiltered search query; only its 'total_count' is used here.
url = 'https://store.steampowered.com/search/results/?query&start=0&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_230_7&infinite=1'
print("Total product count - {}".format(total_results(url)))

Total product count - 124488


In [4]:
# get data

# Labels this dataset uses for Steam's multi-word genres, keyed by the genre's
# first word (e.g. 'Free to Play' is stored as 'Free to play').
_GENRE_LABELS = {
    'Free': 'Free to play',
    'Massively': 'Massively multiplayer',
    'Early': 'Early access',
}


def _label_genre(name):
    """Map a full genre name to the label stored in the dataset."""
    words = name.split()
    if not words:
        return name
    return _GENRE_LABELS.get(words[0], name)


def _parse_genres(page_soup):
    """Return (all_genres, main_genre, sub_genre) parsed from a store page."""
    missing = ('no genre given', 'no genre given', 'no genre given')

    details = page_soup.find('div', {'class': 'block_content_inner'})
    if details is None:
        return missing

    # The genres are the words between the 'Genre:' and 'Developer:' labels.
    tokens = details.text.split()
    try:
        genre_start = tokens.index('Genre:')
        genre_end = tokens.index('Developer:')
    except ValueError:
        return missing

    raw_tokens = tokens[genre_start + 1:genre_end]
    # Re-join the words and split on commas so that multi-word genres stay
    # intact ('Free to Play' is one genre, not three tokens).
    names = [part.strip() for part in ' '.join(raw_tokens).split(',')]
    names = [name for name in names if name]
    if not names:
        return missing

    main_genre = _label_genre(names[0])
    sub_genre = _label_genre(names[1]) if len(names) > 1 else 'No genre given'
    return raw_tokens, main_genre, sub_genre


def get_features(data, timeout=30):
    """
    data    = page html (the 'results_html' fragment of one search page)
    timeout = seconds to wait for each game's store page before giving up

        find all anchor tags, for each anchor ie 'game' return price, discounted
        price, discount percent, release date, review info, game page hyperlink

        request the game's page html and return breadcrumbs, user added tags,
        genres, developer, publisher, game description

    Returns a list with one dict per game, with the keys 'title', 'hyperlink',
    'is game', 'Genres', 'main_genre', 'sub_genre', 'price_in_rand',
    'discount_price', 'discount_percent', 'release_date', 'reviews', 'tags',
    'developer', 'publisher' and 'game description'.

    Anchors without a title span or without an href are skipped, since there
    is nothing to scrape for them. A field that cannot be found on a page
    falls back to a placeholder ('No price', 'No reviews yet', 'NoneType',
    'no genre given', 'no description given', '' or None).

    'Genres' holds the raw whitespace-split tokens between 'Genre:' and
    'Developer:' (e.g. ['Action,', 'Free', 'to', 'Play']); 'main_genre' and
    'sub_genre' are the first and second comma-separated genre names.

    Raises requests.RequestException if a store page cannot be fetched.
    """

    games_list = []

    search_soup = BeautifulSoup(data, 'html.parser')
    for game in search_soup.find_all('a'):
        title_tag = game.find('span', {'class': 'title'})
        game_href = game.get('href')
        if title_tag is None or not game_href:
            # Not a search result row: no title to record or no page to fetch.
            continue
        title = title_tag.text

        # The price text is split on the currency symbol 'R':
        # part 1 is the list price, part 2 (when present) the discounted price.
        price_tag = game.find('div', {'class': 'search_price'})
        if price_tag is not None:
            price_parts = price_tag.get_text("", strip=True).split('R')
        else:
            price_parts = []
        price = price_parts[1] if len(price_parts) > 1 else "No price"
        discount_price = price_parts[2] if len(price_parts) > 2 else price

        discount_tag = game.find('div', {'class': 'search_discount'})
        if discount_tag is not None:
            discount_percent = discount_tag.get_text("", strip=True)
        else:
            discount_percent = ''

        released_tag = game.find('div', {'class': 'search_released'})
        release_date = released_tag.text if released_tag is not None else ''

        # review count and status, read straight from the tooltip attribute
        review_tag = game.find('span', {'data-tooltip-html': True})
        if review_tag is not None and review_tag['data-tooltip-html']:
            reviews = review_tag['data-tooltip-html'].replace('<br>', ' ')
        else:
            reviews = 'No reviews yet'

        # get games page data
        response = requests.get(game_href, timeout=timeout)
        page_soup = BeautifulSoup(response.text, 'html.parser')

        # is game or product
        crumbs = page_soup.find('div', class_='breadcrumbs')
        if crumbs is not None:
            game_product = crumbs.text.replace(
                '\t', '').replace('\n', '').replace('\r', ' ')
        else:
            game_product = 'NoneType'

        # user added tags
        tags = [tag.get_text(strip=True)
                for tag in page_soup.find_all('a', class_='app_tag')]

        # game genres
        all_genres, first_genre, second_genre = _parse_genres(page_soup)

        # DEVELOPER / PUBLISHER (left as None when the page has no such link)
        dev_tag = page_soup.find("a", href=re.compile("developer"))
        pub_tag = page_soup.find("a", href=re.compile("publisher"))
        dev = dev_tag.text if dev_tag is not None else None
        pub = pub_tag.text if pub_tag is not None else None

        # game description, with every run of whitespace collapsed to one space
        description_tag = page_soup.find('div', {'id': 'game_area_description'})
        if description_tag is not None:
            description = ' '.join(description_tag.text.split())
        else:
            description = 'no description given'

        games_list.append({
            'title': title,
            'hyperlink': game_href,
            'is game': game_product,
            'Genres': all_genres,
            'main_genre': first_genre,
            'sub_genre': second_genre,
            'price_in_rand': price,
            'discount_price': discount_price,
            'discount_percent': discount_percent,
            'release_date': release_date,
            'reviews': reviews,
            'tags': tags,
            'developer': dev,
            'publisher': pub,
            'game description': description})

    return games_list

In [5]:
# Smoke test: scrape the first page of results and inspect the first record.
# Alternative query restricted to top sellers (filter=topsellers), kept for reference:
# url = 'https://store.steampowered.com/search/results/?query&start=0&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_7000_7&filter=topsellers&infinite=1'
url = 'https://store.steampowered.com/search/results/?query&start=0&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_230_7&infinite=1'
# html fragment of the first results page
data = scrap_data(url)
# one dict per game; this also requests every game's own store page
test = get_features(data)
test[0]

{'title': 'Counter-Strike: Global Offensive',
 'hyperlink': 'https://store.steampowered.com/app/730/CounterStrike_Global_Offensive/?snr=1_7_7_230_150_1',
 'is game': 'All Games > Free to Play Games > Counter-Strike: Global Offensive',
 'Genres': ['Action,', 'Free', 'to', 'Play'],
 'main_genre': 'Action',
 'sub_genre': 'Free to play',
 'price_in_rand': 'No price',
 'discount_price': 'No price',
 'discount_percent': '',
 'release_date': '21 Aug, 2012',
 'reviews': 'Very Positive 88% of the 6,545,979 user reviews for this game are positive.',
 'tags': ['FPS',
  'Shooter',
  'Multiplayer',
  'Competitive',
  'Action',
  'Team-Based',
  'eSports',
  'Tactical',
  'First-Person',
  'PvP',
  'Online Co-Op',
  'Co-op',
  'Strategy',
  'Military',
  'War',
  'Difficult',
  'Trading',
  'Realistic',
  'Fast-Paced',
  'Moddable'],
 'developer': 'Valve',
 'publisher': 'Valve',
 'game description': 'About This Game Counter-Strike: Global Offensive (CS: GO) expands upon the team-based action gamepla

In [6]:
def build_dataset(start, batch_size=500, delay=1.5):
    """
    Scrape one batch of search results, page by page.

    start      = offset of the first product in the batch
    batch_size = number of products covered by the batch (default 500)
    delay      = seconds to sleep after every page (default 1.5)

    The query is requested with count=50, so offsets advance in steps of 50.

    Returns a list with one entry per page; each entry is the list of game
    dicts that get_features produced for that page. A batch_size of zero or
    less requests nothing and returns an empty list.
    """
    page_size = 50  # must match count=50 in the query below
    url = 'https://store.steampowered.com/search/results/?query&start={}&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_230_7&infinite=1'
    pages = []
    for offset in range(start, start + batch_size, page_size):
        page_html = scrap_data(url.format(offset))
        pages.append(get_features(page_html))
        # Note: this reports the offset of the page just scraped.
        print('Games scraped - {}'.format(offset))
        time.sleep(delay)
    return pages

In [13]:
# Starting offsets for 'build_dataset', one per 500-product batch, split into
# four groups so that each group can be scraped in its own cell.
inputs_one = list(range(0, 25000, 500))
print(inputs_one)

inputs_two = list(range(25000, 50000, 500))
print(inputs_two)

inputs_three = list(range(50000, 75000, 500))
print(inputs_three)

inputs_four = list(range(75000, 90000, 500))
print(inputs_four)

[0, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500, 5000, 5500, 6000, 6500, 7000, 7500, 8000, 8500, 9000, 9500, 10000, 10500, 11000, 11500, 12000, 12500, 13000, 13500, 14000, 14500, 15000, 15500, 16000, 16500, 17000, 17500, 18000, 18500, 19000, 19500, 20000, 20500, 21000, 21500, 22000, 22500, 23000, 23500, 24000, 24500]
[25000, 25500, 26000, 26500, 27000, 27500, 28000, 28500, 29000, 29500, 30000, 30500, 31000, 31500, 32000, 32500, 33000, 33500, 34000, 34500, 35000, 35500, 36000, 36500, 37000, 37500, 38000, 38500, 39000, 39500, 40000, 40500, 41000, 41500, 42000, 42500, 43000, 43500, 44000, 44500, 45000, 45500, 46000, 46500, 47000, 47500, 48000, 48500, 49000, 49500]
[50000, 50500, 51000, 51500, 52000, 52500, 53000, 53500, 54000, 54500, 55000, 55500, 56000, 56500, 57000, 57500, 58000, 58500, 59000, 59500, 60000, 60500, 61000, 61500, 62000, 62500, 63000, 63500, 64000, 64500, 65000, 65500, 66000, 66500, 67000, 67500, 68000, 68500, 69000, 69500, 70000, 70500, 71000, 71500, 72000, 72500,

In [8]:
# docs - https://joblib.readthedocs.io/en/latest/parallel.html
# Run 'build_dataset' in parallel, using one job per CPU core.
num_cores = multiprocessing.cpu_count()
print(num_cores)

# First group of starting offsets.
results_one = Parallel(n_jobs=num_cores)(
    delayed(build_dataset)(offset) for offset in inputs_one
)
print("One finished")

12
One finished


In [9]:
# Second group of starting offsets.
results_two = Parallel(n_jobs=num_cores)(
    delayed(build_dataset)(offset) for offset in inputs_two
)
print("Two finished")

Two finished


In [10]:
# Third group of starting offsets.
results_three = Parallel(n_jobs=num_cores)(
    delayed(build_dataset)(offset) for offset in inputs_three
)
print("Three finished")

Three finished


In [14]:
# Fourth group of starting offsets.
results_four = Parallel(n_jobs=num_cores)(
    delayed(build_dataset)(offset) for offset in inputs_four
)
print("Four finished")

Four finished


In [16]:
# Combine the four groups into one list of batches, keeping their order.
results = [*results_one, *results_two, *results_three, *results_four]

In [17]:
# Nesting check: batches -> pages per batch -> games per page -> fields per game
print(
    len(results),
    len(results[0]),
    len(results[0][0]),
    len(results[0][0][0]),
    sep='\n',
)

180
10
50
15


In [18]:
# Remove the batch level: 'games' becomes a flat list of pages,
# each page being a list of game dicts.
games = [page for batch in results for page in batch]

In [19]:
# Nesting check after flattening: pages -> games per page -> fields per game
print(
    len(games),
    len(games[0]),
    len(games[0][0]),
    sep='\n',
)

1800
50
15


In [20]:
# save data as csv
def output(results, path='steam_data.csv'):
    """
    Write the scraped pages to one CSV file.

    results = iterable of pages, each page being a list of game dicts
    path    = destination file (default 'steam_data.csv')

    Every page becomes a DataFrame and the frames are stacked in order; the
    row index is not written. With no pages at all an empty file is written
    instead of letting pd.concat raise on an empty list.
    """
    frames = [pd.DataFrame(page) for page in results]
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame()
    df.to_csv(path, index=False)
    print('Finished - CSV saved')

# Write every scraped page to steam_data.csv
output(results=games)

Finished - CSV saved
