Imports

In [1]:
import pandas as pd
import requests
import time
import json
import pyarrow.parquet as pq
import pyarrow as pa
import os
import numpy as np

<h3>Getting App list</h3>
<h4>For games updated within the last 30 days (the access token must be refreshed manually)</h4>

In [2]:
# Only apps modified within this many days are requested from the store API.
LOOKBACK_DAYS = 30

# Unix timestamp of the cutoff. NOTE: despite its name this is a 30-day window,
# not six months; the name is kept so other cells that refer to it keep working.
six_month_time_frame = int(time.time() - (LOOKBACK_DAYS*24*60*60))

# The access token is a credential and must not be stored in the notebook.
# It is read from the STEAM_ACCESS_TOKEN environment variable instead; any
# token that was previously committed here should be treated as leaked.
steam_access_token = os.environ.get("STEAM_ACCESS_TOKEN", "")
if not steam_access_token:
    print("STEAM_ACCESS_TOKEN is not set; export it before requesting the app list.")

app_list_url = (
    "https://api.steampowered.com/IStoreService/GetAppList/v1/"
    f"?access_token={steam_access_token}"
    f"&if_modified_since={six_month_time_frame}"
    "&have_description_language=english&include_games=true&max_results=50000"
)

In [3]:
# Download the list of recently modified apps. The timeout keeps the cell from
# hanging indefinitely on a stalled connection.
response = requests.get(app_list_url, timeout=60)
if response.status_code != 200:
    # Stop here: carrying on would either hit a NameError on `data` or, worse,
    # silently reuse a stale `data` left over from an earlier run of this cell.
    raise RuntimeError(
        f' Error getting app list from steam: {response.status_code} {response.text}'
    )
data = response.json()

# Fall back to an empty list when the payload has no 'response'/'apps' keys.
apps = data.get('response', {}).get('apps', [])
app_list_df = pd.DataFrame(apps)
app_ids = [app['appid'] for app in apps]

In [4]:
# Sort in place so the ids are visited in ascending order by the crawl below.
app_ids.sort()

# Quick sanity checks on the downloaded list before the long-running crawl.
duplicate_count = app_list_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")
print("Number of apps to process: ", len(app_ids))
print("Largest appId in list: ", max(app_ids))

Number of duplicate rows: 0
Number of apps to process:  18137
Largest appId in list:  3339450


Testing Steam API - App Details Endpoint

In [5]:
# Smoke test of the appdetails endpoint using a single app id (10).
test_app_details_url = "https://store.steampowered.com/api/appdetails?appids=" + str(10)
test_response = requests.get(test_app_details_url)
test_data = test_response.json()
print(test_data)

{'10': {'success': True, 'data': {'type': 'game', 'name': 'Counter-Strike', 'steam_appid': 10, 'required_age': 0, 'is_free': False, 'detailed_description': "Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with teammates to complete strategic missions. Take out enemy sites. Rescue hostages. Your role affects your team's success. Your team's success affects your role.", 'about_the_game': "Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with teammates to complete strategic missions. Take out enemy sites. Rescue hostages. Your role affects your team's success. Your team's success affects your role.", 'short_description': "Play the world's number 1 online action game. Engage in an incredibly realistic brand of terrorist warfare in this wildly popular team-based game. Ally with teammates

In [6]:
# The payload is keyed by the app id as a string; the details live under 'data'.
test_app_data = test_data['10']['data']

# Pull out the nested fields the crawl will store, with empty defaults for
# keys that are absent.
developers = test_app_data.get('developers', [])
publishers = test_app_data.get('publishers', [])
price_overview = test_app_data.get('price_overview', {})
platforms = test_app_data.get('platforms', {})
metacritic = test_app_data.get('metacritic', {})
categories = test_app_data.get('categories', [])
genres = test_app_data.get('genres', [])
release_date = test_app_data.get('release_date', {})

# Show each field's shape so the Parquet schema can be modelled on it.
for label, value in (
    ("Developers", developers),
    ("Publishers", publishers),
    ("Price Overview", price_overview),
    ("Platforms", platforms),
    ("Metacritic", metacritic),
    ("Categories", categories),
    ("Genres", genres),
    ("Release Date", release_date),
):
    print(f"{label}:", value)

Developers: ['Valve']
Publishers: ['Valve']
Price Overview: {'currency': 'USD', 'initial': 999, 'final': 999, 'discount_percent': 0, 'initial_formatted': '', 'final_formatted': '$9.99'}
Platforms: {'windows': True, 'mac': True, 'linux': True}
Metacritic: {'score': 88, 'url': 'https://www.metacritic.com/game/pc/counter-strike?ftag=MCD-06-10aaa1f'}
Categories: [{'id': 1, 'description': 'Multi-player'}, {'id': 49, 'description': 'PvP'}, {'id': 36, 'description': 'Online PvP'}, {'id': 37, 'description': 'Shared/Split Screen PvP'}, {'id': 8, 'description': 'Valve Anti-Cheat enabled'}, {'id': 62, 'description': 'Family Sharing'}]
Genres: [{'id': '1', 'description': 'Action'}]
Release Date: {'coming_soon': False, 'date': 'Nov 1, 2000'}


Getting app details from Steam

In [None]:
# Output locations for the Steam store crawl: the collected rows and the
# JSON list of app ids that have already been fetched.
data_dir = '../data'
steam_apps_parquet_file = f'{data_dir}/steam_app_details.parquet'
steam_apps_json_cache = f'{data_dir}/processed_steam_apps.json'

# Rows are buffered in batch_data and flushed once batch_size have accumulated.
batch_data = []
batch_size = 200

In [None]:
# Nested types used by the Steam store table.
_platform_flags = pa.struct([
    pa.field('linux', pa.bool_()),
    pa.field('mac', pa.bool_()),
    pa.field('windows', pa.bool_()),
])
_genre_entry = pa.struct([
    pa.field('description', pa.string()),
    pa.field('id', pa.string()),
])

# Arrow schema for the rows produced by fetch_steam_app_details. Note that
# appid is stored as a string, and the numeric columns are float64 so that
# missing values can be written as NaN.
steam_apps_schema = pa.schema([
    pa.field('appid', pa.string()),
    pa.field('name', pa.string()),
    pa.field('developers', pa.list_(pa.string())),
    pa.field('publishers', pa.list_(pa.string())),
    pa.field('initial_price', pa.float64()),
    pa.field('final_price', pa.float64()),
    pa.field('platforms', _platform_flags),
    pa.field('metacritic', pa.float64()),
    pa.field('genres', pa.list_(_genre_entry)),
    pa.field('release_date', pa.string()),
])

In [27]:
# Resume support: load the ids fetched by earlier runs; a missing cache file
# simply means nothing has been processed yet.
try:
    with open(steam_apps_json_cache, 'r') as cache_file:
        cached_ids = json.load(cache_file)
except FileNotFoundError:
    cached_ids = []
processed_apps = set(cached_ids)

In [36]:
def fetch_steam_app_details(app_id, retry_delay=300, timeout=30):
    """Fetch one app's store details from the Steam ``appdetails`` endpoint.

    Args:
        app_id: Steam application id (int or str). It is converted to ``str``
            because the JSON payload is keyed by the id as a string.
        retry_delay: Seconds to sleep after an HTTP 429 before retrying.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A dict with the keys ``appid``, ``name``, ``developers``,
        ``publishers``, ``initial_price``, ``final_price``, ``platforms``,
        ``metacritic``, ``genres`` and ``release_date``; or ``None`` when the
        request fails, the body is not valid JSON, the payload has no usable
        entry for the app, or the entry reports ``success`` as false.

    Raises:
        Exception: on HTTP 403; this response is not retried.
    """
    app_id = str(app_id)
    url = f"https://store.steampowered.com/api/appdetails?appids={app_id}"

    while True:
        try:
            response = requests.get(url, timeout=timeout)
            status = response.status_code
            if status == 429:
                # Rate limited: wait, then try the same app again.
                print(f"Status code {status} for {app_id}. ZZZZzzzzz respecting stupid rate limits...")
                time.sleep(retry_delay)
                continue
            if status == 403:
                raise Exception("403 Forbidden: Mission failed soldier, We'll get 'em next time....")
            if status != 200:
                print(f"Failed to fetch data for appid, {app_id}, with status code {status}")
                return None
            data = response.json()
        except requests.exceptions.JSONDecodeError:
            # Must precede RequestException, of which it is a subclass.
            print(f"Failed to decode JSON for appid {app_id}")
            return None
        except requests.exceptions.RequestException as exc:
            # Timeouts and connection errors: skip this app; it is not marked
            # as processed by the caller, so a later run will try it again.
            print(f"Request failed for appid {app_id}: {exc}")
            return None

        # The body can be JSON null or lack a dict entry for the app; guard
        # against both instead of crashing on the membership test.
        entry = data.get(app_id) if isinstance(data, dict) else None
        if not isinstance(entry, dict) or not entry.get('success', False):
            return None

        app_data = entry.get('data') or {}
        price_overview = app_data.get('price_overview', {})
        return {
            'appid': app_id,
            'name': app_data.get('name', "Unknown"),
            'developers': app_data.get('developers', []),
            'publishers': app_data.get('publishers', []),
            # Missing numeric values become NaN to fit the float64 columns.
            'initial_price': price_overview.get('initial', float('nan')),
            'final_price': price_overview.get('final', float('nan')),
            'platforms': app_data.get('platforms', {'linux': False, 'mac': False, 'windows': False}),
            'metacritic': app_data.get('metacritic', {}).get('score', float('nan')),
            'genres': app_data.get('genres', []),
            'release_date': app_data.get('release_date', {}).get('date', "Unknown")
        }

In [15]:
def write_to_parquet(batch, file, schema):
    """Append a batch of row dicts to a Parquet file, creating it if needed.

    The existing file (if any) is read back, the new rows are concatenated to
    it and the whole table is written out again, so the cost of each call
    grows with the size of the file.

    Args:
        batch: List of dicts, one per row, whose keys match ``schema``.
        file: Path of the Parquet file to create or extend.
        schema: ``pyarrow.Schema`` used to convert the rows.

    Returns:
        None. An empty ``batch`` is a no-op.
    """
    if not batch:
        # An empty frame has none of the schema's columns; there is nothing
        # to convert or write.
        return

    df = pd.DataFrame(batch)
    table = pa.Table.from_pandas(df, schema=schema)

    if os.path.exists(file):
        existing_data = pq.read_table(file)
        table = pa.concat_tables([existing_data, table])

    # Write to a sibling temp file and swap it in, so an interruption during
    # the write cannot truncate the rows that were already collected.
    tmp_file = f"{file}.tmp"
    pq.write_table(table, tmp_file)
    os.replace(tmp_file, file)

In [37]:
# Crawl every app id that is not yet in the processed cache, flushing rows to
# Parquet (and the cache to JSON) every `batch_size` successful fetches.
try:
    for app_id in app_ids:
        if app_id in processed_apps:
            continue

        app = fetch_steam_app_details(app_id)

        # Only successful fetches are recorded; failed ids stay unprocessed
        # and are attempted again on the next run.
        if app is not None:
            batch_data.append(app)
            processed_apps.add(app_id)

        if len(batch_data) >= batch_size:
            write_to_parquet(batch_data, steam_apps_parquet_file, steam_apps_schema)
            batch_data = []
            with open(steam_apps_json_cache, 'w') as f:
                json.dump(list(processed_apps), f)
finally:
    # Runs on normal completion and when the crawl is aborted (HTTP 403,
    # manual interrupt), so the partial batch is not lost.
    if batch_data:
        write_to_parquet(batch_data, steam_apps_parquet_file, steam_apps_schema)
        # Clear the buffer: otherwise re-running this cell would append the
        # same leftover rows to the Parquet file a second time.
        batch_data = []

    with open(steam_apps_json_cache, 'w') as f:
        json.dump(list(processed_apps), f)

Failed to decode JSON for appid 3168870
Status code 429 for 3175650. ZZZZzzzzz respecting stupid rate limits...
Failed to decode JSON for appid 3187750
Status code 429 for 3188360. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3199760. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3212120. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3224480. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3235600. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3247480. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3258370. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3268180. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3277840. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3288100. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3296840. ZZZZzzzzz respecting stupid rate limits...
Status code 429 for 3305370. ZZZZzzzzz respecting stupid

Resetting appids list

In [None]:
# Rebuild app_ids from the apps that were actually stored by the Steam crawl.
# If the file was written with steam_apps_schema, appid is a string column,
# so app_ids holds strings from here on.
current_data = pq.read_table('../data/steam_app_details.parquet')
temp_df = current_data.to_pandas()
print(len(temp_df))
app_ids = temp_df['appid'].tolist()
print(len(app_ids))

# Drop the temporary table and frame; only the id list is needed afterwards.
del temp_df, current_data

18096
18096


Testing SteamSpy API

In [16]:
# Smoke test of the SteamSpy appdetails request using a single app id (10).
test_steam_spy_url = "https://steamspy.com/api.php?request=appdetails&appid=" + str(10)
test_steam_spy_response = requests.get(test_steam_spy_url)
test_data = test_steam_spy_response.json()
print(test_data)

{'appid': 10, 'name': 'Counter-Strike', 'developer': 'Valve', 'publisher': 'Valve', 'score_rank': '', 'positive': 236255, 'negative': 6239, 'userscore': 0, 'owners': '10,000,000 .. 20,000,000', 'average_forever': 0, 'average_2weeks': 0, 'median_forever': 0, 'median_2weeks': 0, 'price': '999', 'initialprice': '999', 'discount': '0', 'ccu': 14998, 'languages': 'English, French, German, Italian, Spanish - Spain, Simplified Chinese, Traditional Chinese, Korean', 'genre': 'Action', 'tags': {'Action': 5480, 'FPS': 4908, 'Multiplayer': 3455, 'Shooter': 3408, 'Classic': 2831, 'Team-Based': 1905, 'First-Person': 1747, 'Competitive': 1640, 'Tactical': 1379, "1990's": 1235, 'e-sports': 1222, 'PvP': 915, 'Old School': 813, 'Military': 657, 'Strategy': 630, 'Survival': 316, 'Score Attack': 297, '1980s': 279, 'Assassin': 238, 'Nostalgia': 186}}


In [18]:
# List the Python type of every field to guide the choice of schema types.
for key in test_data:
    value = test_data[key]
    print(f"{key}: {type(value).__name__}")

appid: int
name: str
developer: str
publisher: str
score_rank: str
positive: int
negative: int
userscore: int
owners: str
average_forever: int
average_2weeks: int
median_forever: int
median_2weeks: int
price: str
initialprice: str
discount: str
ccu: int
languages: str
genre: str
tags: dict


Getting data from steam spy

In [25]:
# Arrow schema for the rows produced by fetch_steam_spy_apps: counters are
# 32-bit integers, while score_rank, owners and both prices are kept as text.
_int32 = pa.int32()
_text = pa.string()
steam_spy_schema = pa.schema([
    pa.field('appid', _int32),
    pa.field('score_rank', _text),
    pa.field('positive', _int32),
    pa.field('negative', _int32),
    pa.field('userscore', _int32),
    pa.field('owners', _text),
    pa.field('average_playtime', _int32),
    pa.field('average_playtime_2weeks', _int32),
    pa.field('median_playtime', _int32),
    pa.field('median_playtime_2weeks', _int32),
    pa.field('ccu', _int32),
    pa.field('current_price', _text),
    pa.field('initial_price', _text),
])

In [None]:
# Output locations for the SteamSpy crawl: the collected rows and the JSON
# list of app ids that have already been fetched.
data_dir = '../data'
steam_spy_parquet_file = f'{data_dir}/steam_spy_details.parquet'
steam_spy_json_cache = f'{data_dir}/processed_steam_spy_apps.json'

# Start with an empty buffer; rows are flushed once batch_size have accumulated.
batch_data = []
batch_size = 200

In [27]:
# Resume support: load the ids fetched by earlier SteamSpy runs; a missing
# cache file simply means nothing has been processed yet.
try:
    with open(steam_spy_json_cache, 'r') as cache_file:
        cached_ids = json.load(cache_file)
except FileNotFoundError:
    cached_ids = []
steam_spy_processed_apps = set(cached_ids)

In [29]:
def fetch_steam_spy_apps(app_id, timeout=30):
    """Fetch one app's statistics from the SteamSpy ``appdetails`` request.

    Args:
        app_id: Steam application id (int or numeric str).
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A dict with the keys ``appid`` (int), ``score_rank``, ``positive``,
        ``negative``, ``userscore``, ``owners``, ``average_playtime``,
        ``average_playtime_2weeks``, ``median_playtime``,
        ``median_playtime_2weeks``, ``ccu``, ``current_price`` and
        ``initial_price``; fields absent from the payload are ``None``.
        Returns ``None`` when the request fails, the status is not 200, or
        the body is not a JSON object.
    """
    try:
        response = requests.get(
            f"https://steamspy.com/api.php?request=appdetails&appid={app_id}",
            timeout=timeout,
        )
    except requests.exceptions.RequestException as exc:
        # Timeouts and connection errors: skip this app so the crawl goes on.
        print(f"Error fetching {app_id} data: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching {app_id} data: response code {response.status_code}")
        return None

    app_id = int(app_id)
    try:
        data = response.json()
    except requests.exceptions.JSONDecodeError:
        print(f"Failed to decode JSON for appid {app_id}")
        return None

    if not isinstance(data, dict):
        # e.g. a JSON null or list body; there is nothing to read fields from.
        print(f"Unexpected payload for appid {app_id}: {type(data).__name__}")
        return None

    score_rank = data.get('score_rank')
    return {
        'appid': app_id,
        # Stored as text; a missing or null rank stays None.
        'score_rank': str(score_rank) if score_rank is not None else None,
        'positive': data.get('positive'),
        'negative': data.get('negative'),
        'userscore': data.get('userscore'),
        'owners': data.get('owners'),
        'average_playtime': data.get('average_forever'),
        'average_playtime_2weeks': data.get('average_2weeks'),
        'median_playtime': data.get('median_forever'),
        'median_playtime_2weeks': data.get('median_2weeks'),
        'ccu': data.get('ccu'),
        'current_price': data.get('price'),
        'initial_price': data.get('initialprice')
    }


In [30]:
# Crawl SteamSpy for every app id that is not yet in the processed cache,
# flushing rows to Parquet (and the cache to JSON) every `batch_size`
# successful fetches.
try:
    for app_id in app_ids:
        if app_id in steam_spy_processed_apps:
            continue

        app = fetch_steam_spy_apps(app_id)
        # Pause between requests to keep the request rate down.
        time.sleep(0.5)

        # Only successful fetches are recorded; failed ids stay unprocessed
        # and are attempted again on the next run.
        if app is not None:
            batch_data.append(app)
            steam_spy_processed_apps.add(app_id)

        if len(batch_data) >= batch_size:
            write_to_parquet(batch_data, steam_spy_parquet_file, steam_spy_schema)
            batch_data = []
            with open(steam_spy_json_cache, 'w') as f:
                json.dump(list(steam_spy_processed_apps), f)
finally:
    # Runs on normal completion and when the crawl is interrupted, so the
    # partial batch is not lost.
    if batch_data:
        write_to_parquet(batch_data, steam_spy_parquet_file, steam_spy_schema)
        # Clear the buffer: otherwise re-running this cell would append the
        # same leftover rows to the Parquet file a second time.
        batch_data = []

    with open(steam_spy_json_cache, 'w') as f:
        json.dump(list(steam_spy_processed_apps), f)