# Steam top selling games

Building a dataset by scraping steams top selling games.


In [1]:
# dependencies
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd
import re
import numpy as np

import time

The base page show the top 50 games at that moment in time. But if we want more then just the top 50. Because the page is a infinite scroll page, the easiest way to get the information is to look for is the query which calls for the next 50 game.

![image info](images/xhr.png)

![image info](images/xhr_query.png)

In [2]:
# url to html for top selling games on steam 
url = 'https://store.steampowered.com/search/results/?query&start=0&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_7000_7&filter=topsellers&infinite=1'

In [3]:
# scrap url and return html
def scrap_data(url):
    r = requests.get(url)
    data = dict(r.json())
    return data['results_html']

In [4]:
data = scrap_data(url)

In [5]:
# get data

def parse_data(data):
    games_list = []
    
    soup = BeautifulSoup(data, 'html.parser')
    games = soup.find_all('a')
    for game in games:
        title = game.find('span', {'class': 'title'}).text
        try:
            price = game.find(
                'div', {'class': 'search_price'}).get_text("", strip=True).split('R')[1]
        except:
            price = "No price"
        try:
            discount_price = game.find(
                'div', {'class': 'search_price'}).get_text("", strip=True).split('R')[2]
        except:
            discount_price = price
        discount_percent = game.find(
            'div', {'class': 'search_discount'}).get_text("", strip=True)
        release_date = game.find('div', {'class': 'search_released'}).text

        # review count and status
        try:
            reviews = str(
                game.find('span', {'data-tooltip-html': True})).split('=')[2].replace('&lt;br&gt;', ' ')
            reviews = re.search('"(.*)"', reviews).group(1)
        except:
            reviews = 'No reviews yet'

        # get game link
        game_href = game.get('href')
        # get games page data
        game_data = requests.get(game_href).text

        soup = BeautifulSoup(game_data, 'html.parser')
        game_data = soup.find_all('a', class_='app_tag')
        tags = []
        [[tags.append(i.strip()) for i in tag] for tag in game_data]
        # game genres
        try:
            block = soup.find('div', {'class': 'block_content_inner'}).text
            block = block.split()
            genre_index = block.index('Genre:')
            genre_one = block[genre_index + 1].replace(',','')
            genre_two = block[genre_index + 2].replace(',','')
        except:
            genre_one = 'dont know'
            genre_two = 'dont know'
        
        # DEVELOPER / PUBLISHER
        dev = soup.find("a", href=re.compile("developer"))
        pub = soup.find("a", href=re.compile("publisher"))
        if dev != None:
            dev = dev.text
        if pub != None:
            pub = pub.text

#         print(title)
#         print(price)
#         print(discount_price)
#         print(discount_percent)
#         print(release_date)
#         print(reviews)
#         print(tags)
#         print(dev)
#         print(pub)
#         print()

        games_dict = {
            'title': title,
            'main_genre': genre_one,
            'sub_genre' : genre_two,
            'price_in_rand': price,
            'discount_price': discount_price,
            'discount_percent': discount_percent,
            'release_date': release_date,
            'reviews': reviews,
            'tags': tags,
            'developer': dev,
            'publisher': pub }
        games_list.append(games_dict)

    return games_list

In [6]:
games_data = parse_data(data)

In [13]:
games_data[0]

{'title': 'Stray',
 'main_genre': 'Adventure',
 'sub_genre': 'Indie',
 'price_in_rand': ' 195.00',
 'discount_price': ' 195.00',
 'discount_percent': '',
 'release_date': '19 Jul, 2022',
 'reviews': 'Overwhelmingly Positive 97% of the 13,901 user reviews for this game are positive.',
 'tags': ['Cats',
  'Adventure',
  'Cyberpunk',
  'Atmospheric',
  'Cute',
  'Third Person',
  'Exploration',
  'Sci-fi',
  'Singleplayer',
  'Robots',
  'Indie',
  'Beautiful',
  'Horror',
  'Puzzle',
  'Mystery',
  'Open World',
  'Dystopian',
  'Stealth',
  'Colorful',
  'Action'],
 'developer': 'BlueTwelve Studio',
 'publisher': 'Annapurna Interactive'}

In [8]:
# total games count
def total_results(url):
    r = requests.get(url)
    data = dict(r.json())
    total_results = data['total_count']
    return int(total_results)

In [9]:
num_games = total_results(url)

In [10]:
num_games / 10

2672.1

In [11]:
games_one = []
games_two = []
games_three = []
games_four = []
games_five = []
start = 0
end = 5000
increment = 5000

In [12]:
for i in range(start, end, 50):
    scraped = scrap_data(
        'https://store.steampowered.com/search/results/?query&start={}&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_7000_7&filter=topsellers&infinite=1'.format(i))
    results = parse_data(scraped)
    games_one.append(results)

start = end
end += increment

Games scrapped: 0%
Games scrapped: 1%
Games scrapped: 2%
Games scrapped: 3%
Games scrapped: 4%
Games scrapped: 5%
Games scrapped: 6%
Games scrapped: 7%
Games scrapped: 7%
Games scrapped: 8%
Games scrapped: 9%
Games scrapped: 10%
Games scrapped: 11%
Games scrapped: 12%
Games scrapped: 13%
Games scrapped: 14%
Games scrapped: 15%
Games scrapped: 16%
Games scrapped: 17%
Games scrapped: 18%


In [13]:
len(games_one)

100

In [14]:
for i in range(start, end, 50):
    scraped = scrap_data(
        'https://store.steampowered.com/search/results/?query&start={}&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_7000_7&filter=topsellers&infinite=1'.format(i))
    results = parse_data(scraped)
    games_two.append(results)
    if i % 250 == 0:
        print('Games scrapped: {}%'.format((round((i / num_games)*100))))
    time.sleep(1.5)

start = end
end += increment

Games scrapped: 19%
Games scrapped: 20%
Games scrapped: 21%
Games scrapped: 22%
Games scrapped: 22%
Games scrapped: 23%
Games scrapped: 24%
Games scrapped: 25%
Games scrapped: 26%
Games scrapped: 27%
Games scrapped: 28%
Games scrapped: 29%
Games scrapped: 30%
Games scrapped: 31%
Games scrapped: 32%
Games scrapped: 33%
Games scrapped: 34%
Games scrapped: 35%
Games scrapped: 36%
Games scrapped: 36%


In [15]:
len(games_two)

100

In [19]:
games_three = []
for i in range(start, end, 50):
    scraped = scrap_data(
        'https://store.steampowered.com/search/results/?query&start={}&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_7000_7&filter=topsellers&infinite=1'.format(i))
    results = parse_data(scraped)
    games_three.append(results)
    if i % 250 == 0:
        print('Games scrapped: {}%'.format((round((i / num_games)*100))))
    time.sleep(1.5)

start = end
end += increment

Games scrapped: 37%
Games scrapped: 38%
Games scrapped: 39%
Games scrapped: 40%
Games scrapped: 41%
Games scrapped: 42%
Games scrapped: 43%
Games scrapped: 44%
Games scrapped: 45%
Games scrapped: 46%
Games scrapped: 47%
Games scrapped: 48%
Games scrapped: 49%
Games scrapped: 50%
Games scrapped: 51%
Games scrapped: 51%
Games scrapped: 52%
Games scrapped: 53%
Games scrapped: 54%
Games scrapped: 55%


In [20]:
len(games_three)

100

In [21]:
for i in range(start, end, 50):
    scraped = scrap_data(
        'https://store.steampowered.com/search/results/?query&start={}&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_7000_7&filter=topsellers&infinite=1'.format(i))
    results = parse_data(scraped)
    games_four.append(results)
    if i % 250 == 0:
        print('Games scrapped: {}%'.format((round((i / num_games)*100))))
    time.sleep(1.5)

start = end
end += increment

Games scrapped: 56%
Games scrapped: 57%
Games scrapped: 58%
Games scrapped: 59%
Games scrapped: 60%
Games scrapped: 61%
Games scrapped: 62%
Games scrapped: 63%
Games scrapped: 64%
Games scrapped: 65%
Games scrapped: 65%
Games scrapped: 66%
Games scrapped: 67%
Games scrapped: 68%
Games scrapped: 69%
Games scrapped: 70%
Games scrapped: 71%
Games scrapped: 72%
Games scrapped: 73%
Games scrapped: 74%


In [22]:
len(games_four)

100

In [23]:
for i in range(start, end, 50):
    scraped = scrap_data(
        'https://store.steampowered.com/search/results/?query&start={}&count=50&dynamic_data=&sort_by=_ASC&snr=1_7_7_7000_7&filter=topsellers&infinite=1'.format(i))
    results = parse_data(scraped)
    games_five.append(results)
    if i % 250 == 0:
        print('Games scrapped: {}%'.format((round((i / num_games)*100))))
    time.sleep(1.5)
    
start = end
end+=increment

Games scrapped: 75%
Games scrapped: 76%
Games scrapped: 77%
Games scrapped: 78%
Games scrapped: 79%
Games scrapped: 80%
Games scrapped: 80%
Games scrapped: 81%
Games scrapped: 82%
Games scrapped: 83%
Games scrapped: 84%
Games scrapped: 85%
Games scrapped: 86%
Games scrapped: 87%
Games scrapped: 88%
Games scrapped: 89%
Games scrapped: 90%
Games scrapped: 91%
Games scrapped: 92%
Games scrapped: 93%


In [24]:
len(games_five)

100

In [25]:
games = games_one + games_two + games_three + games_four + games_five

In [26]:
len(games)

500

In [27]:
def output(results):
    df = pd.concat([pd.DataFrame(g) for g in results])
    df.to_csv('games.csv',index=True)
    print('Finished - CSV saved')

In [28]:
output(games)

Finished - CSV saved


93% is just as good as 100% 

In [32]:
# read csv
df = pd.read_csv('games.csv')

In [33]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,main_genre,sub_genre,price_in_rand,discount_price,discount_percent,release_date,reviews,tags,developer,publisher
0,0,Stray,Adventure,Indie,195.0,195.0,,"19 Jul, 2022","Overwhelmingly Positive 97% of the 1,712 user ...","['Cats', 'Adventure', 'Cyberpunk', 'Cute', 'Th...",BlueTwelve Studio,Annapurna Interactive
1,1,MultiVersus Founder's Pack - Standard Edition,dont know,dont know,719.0,719.0,,"19 Jul, 2022",Mixed 68% of the 19 user reviews for games in ...,[],Player First Games,Warner Bros. Games
2,2,Dinkum,Indie,"RPG,",130.0,117.0,-10%,"14 Jul, 2022","Overwhelmingly Positive 95% of the 1,378 user ...","['Early Access', 'Adventure', 'Co-op', 'Online...",James Bendon,James Bendon
3,3,Raft,Adventure,"Indie,",219.0,219.0,,"20 Jun, 2022","Very Positive 93% of the 165,711 user reviews ...","['Survival', 'Open World Survival Craft', 'Mul...",Redbeet Interactive,Axolot Games
4,4,Half-Life: Alyx,Action,Adventure,329.0,164.5,-50%,"23 Mar, 2020","Overwhelmingly Positive 98% of the 62,107 user...","['VR', 'FPS', 'Story Rich', 'Horror', 'Female ...",Valve,Valve


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        25000 non-null  int64 
 1   title             25000 non-null  object
 2   main_genre        25000 non-null  object
 3   sub_genre         25000 non-null  object
 4   price_in_rand     25000 non-null  object
 5   discount_price    25000 non-null  object
 6   discount_percent  5881 non-null   object
 7   release_date      22384 non-null  object
 8   reviews           25000 non-null  object
 9   tags              25000 non-null  object
 10  developer         22946 non-null  object
 11  publisher         21626 non-null  object
dtypes: int64(1), object(11)
memory usage: 2.3+ MB


Alrighty lets fix some of these data types and create new features.

In [35]:
def string_to_num(price):
    try:
        return np.float(price.replace(' ', ""))
    except:
        return 0


def discount_percent(amount):
    try:
        return int(''.join(filter(str.isdigit, amount)))
    except:
        return 0


def get_year(release_date):
    try:
        return int(release_date[-4:])
    except:
        return 0


def get_month(release_date):
    try:
        return release_date[2:6]
    except:
        return 'no date'


def get_day(release_date):
    try:
        return int(release_date[:2])
    except:
        return 0
    
def percent_positive(review):
    if review == 'No reviews yet':
        return 0
    else:
        return int(re.findall("\d*%",review)[0][:-1])
    
def reviews_count(review):
    result = re.findall('(?<= the )(.*)(?= user )', review)
    if len(result) < 1:
        return 0
    else:
        return int(result[0].replace(',',''))

In [52]:
df.rename(columns={'Unnamed: 0':'rank'}, inplace=True)
df['price_in_rand'] = df['price_in_rand'].apply(string_to_num)
df['discount_price'] = df['discount_price'].apply(string_to_num)
df['discount_percent'] = df['discount_percent'].apply(discount_percent)
df['release_year'] = df['release_date'].apply(get_year)
df['release_month'] = df['release_date'].apply(get_month)
df['release_day'] = df['release_date'].apply(get_day)
df['percent_of_reviews_positive'] = df['reviews'].apply(percent_positive)
df['number_of_reviews'] = df['reviews'].apply(reviews_count)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   rank                         25000 non-null  int64  
 1   title                        25000 non-null  object 
 2   main_genre                   25000 non-null  object 
 3   sub_genre                    25000 non-null  object 
 4   price_in_rand                25000 non-null  float64
 5   discount_price               25000 non-null  float64
 6   discount_percent             25000 non-null  int64  
 7   release_date                 22384 non-null  object 
 8   reviews                      25000 non-null  object 
 9   tags                         25000 non-null  object 
 10  developer                    22946 non-null  object 
 11  publisher                    21626 non-null  object 
 12  release_year                 25000 non-null  int64  
 13  release_month   

In [55]:
df.head()

Unnamed: 0,rank,title,main_genre,sub_genre,price_in_rand,discount_price,discount_percent,release_date,reviews,tags,developer,publisher,release_year,release_month,release_day,percent_of_reviews_positive,number_of_reviews
0,0,Stray,Adventure,Indie,195.0,195.0,0,"19 Jul, 2022","Overwhelmingly Positive 97% of the 1,712 user ...","['Cats', 'Adventure', 'Cyberpunk', 'Cute', 'Th...",BlueTwelve Studio,Annapurna Interactive,2022,Jul,19,97,1712
1,1,MultiVersus Founder's Pack - Standard Edition,dont know,dont know,719.0,719.0,0,"19 Jul, 2022",Mixed 68% of the 19 user reviews for games in ...,[],Player First Games,Warner Bros. Games,2022,Jul,19,68,19
2,2,Dinkum,Indie,"RPG,",130.0,117.0,10,"14 Jul, 2022","Overwhelmingly Positive 95% of the 1,378 user ...","['Early Access', 'Adventure', 'Co-op', 'Online...",James Bendon,James Bendon,2022,Jul,14,95,1378
3,3,Raft,Adventure,"Indie,",219.0,219.0,0,"20 Jun, 2022","Very Positive 93% of the 165,711 user reviews ...","['Survival', 'Open World Survival Craft', 'Mul...",Redbeet Interactive,Axolot Games,2022,Jun,20,93,165711
4,4,Half-Life: Alyx,Action,Adventure,329.0,164.5,50,"23 Mar, 2020","Overwhelmingly Positive 98% of the 62,107 user...","['VR', 'FPS', 'Story Rich', 'Horror', 'Female ...",Valve,Valve,2020,Mar,23,98,62107


In [56]:
df.to_csv('games_updated.csv')

In [59]:
df['main_genre'].unique()

array(['Adventure', 'dont know', 'Indie', 'Action', 'Racing', 'Racin',
       'Casual', 'Actio', 'Strateg', 'RP', 'Adventur', 'Simulation',
       'Indi', 'Massivel', 'Simulatio', 'Casua', 'RPG', 'Strategy',
       'Animatio', 'Audi', 'Fre', 'Utilitie', 'Sports', 'Utilities',
       'Educatio', 'Earl', 'Education', 'Violent', 'Sport', 'Desig', 'We',
       'Developer', 'Accounting', 'Vide', 'Softwar', 'Phot', 'Gam',
       'Sexua'], dtype=object)

In [62]:
df[df['main_genre']=='Casua']

Unnamed: 0,rank,title,main_genre,sub_genre,price_in_rand,discount_price,discount_percent,release_date,reviews,tags,developer,publisher,release_year,release_month,release_day,percent_of_reviews_positive,number_of_reviews
122,22,UNO,Casua,Developer:,169.0,67.60,60,"3 Jan, 2017","Mostly Positive 73% of the 31,378 user reviews...","['Card Game', 'Multiplayer', 'Tabletop', 'Boar...",Ubisoft Entertainment,Ubisoft Entertainment,2017,"Jan,",3,73,31378
135,35,Among Us,Casua,Developer:,42.0,33.60,20,"16 Nov, 2018","Very Positive 92% of the 552,978 user reviews ...","['Multiplayer', 'Online Co-Op', 'Social Deduct...",Innersloth,Innersloth,2018,Nov,16,92,552978
581,31,MONOPOLY® PLUS,Casua,Developer:,249.0,249.00,0,"7 Sep, 2017","Mixed 43% of the 9,371 user reviews for this g...","['Multiplayer', 'Board Game', 'Casual', 'Table...",Ubisoft Pune,Ubisoft,2017,"Sep,",7,43,9371
970,20,Spice&Wolf VR2,Casua,Developer:,150.0,90.00,40,"9 Dec, 2020",Very Positive 97% of the 378 user reviews for ...,"['VR', 'Cute', 'Anime', 'FMV', 'Fantasy', 'Cas...",SpicyTails,SpicyTails,2020,"Dec,",9,97,378
1794,44,Peggle Pack,Casua,Developer:,82.5,82.50,0,,"Overwhelmingly Positive 97% of the 4,675 user ...",[],"PopCap Games, Inc.","PopCap Games, Inc.",0,no date,0,97,4675
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24883,33,Solitaire Halloween Story,Casua,Developer:,42.0,4.20,90,"23 Nov, 2020",No reviews yet,"['Casual', 'Card Game', 'Singleplayer', 'Adven...",Creobit,8floor,2020,Nov,23,0,0
24896,46,Jigsaw Tour,Casua,Developer:,42.0,4.20,90,"23 Nov, 2020",No reviews yet,"['Casual', 'Puzzle', 'Singleplayer', 'Adventur...",Creobit,8floor,2020,Nov,23,0,0
24912,12,Egypt Picross Pharaohs Riddles,Casua,Developer:,42.0,4.20,90,"31 Aug, 2020",No reviews yet,"['Casual', 'Puzzle', 'Adventure', 'Indie', 'Si...",Somer Games,8floor,2020,Aug,31,0,0
24916,16,Fill And Cross Christmas Riddles,Casua,Developer:,42.0,4.20,90,"21 Dec, 2020",No reviews yet,"['Casual', 'Puzzle', 'Singleplayer', 'Adventur...",Creobit,8floor,2020,Dec,21,0,0


In [110]:
one = 'one'

In [111]:
one.replace(',','')

'one'