## IGDB Dataset Extraction
Pulls game data information from the IGDB API

#### Imports

In [1]:
import requests
import yaml
import time
import sys
import os
import json
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import base64

### Setup IGDB API Access and functions

In [2]:
# setup POST for authorization
url = "https://id.twitch.tv/oauth2/token"

config_yaml = "twitch.yaml"     # add your configuration file here
# Load configuration from YAML file
try:
    with open(config_yaml, 'r') as file:
        config = yaml.safe_load(file)
except FileNotFoundError:
    print(f"Configuration file {config_yaml} not found.")

params = {
    "client_id": config.get("CLIENT_ID", "your_client_id"),
    "client_secret": config.get("CLIENT_SECRET", "your_client_secret"),
    "grant_type": config.get("GRANT_TYPE", "client_credentials")
}


In [3]:
def getAccessToken():
    if 'CLIENT_ID' not in config or 'CLIENT_SECRET' not in config:
        print("Error: CLIENT_ID and CLIENT_SECRET must be set in the configuration file.")
        return None, 0

    response = requests.post(url, params=params)
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None, 0

    # Parse the JSON response
    data = response.json()
    if 'access_token' not in data:
        print("Error: 'access_token' not found in the response.")
        return None, 0

    access_token = data['access_token']
    timeout = data.get('expires_in', 0)
    return access_token, timeout

In [4]:
# set up the access token
acc_token, timeout = getAccessToken()
print(f"Client ID: {config['CLIENT_ID']}")
print(f"Access token: {acc_token}")
print(f"Timeout: {timeout} seconds")

Client ID: jzdf0pxiz7y7nqawzhfp4g4t37q8j9
Access token: 0uoj7drnpf2egh54299km8f096d475
Timeout: 5380715 seconds


### Get Top Games from IGDB website (Scraping)

In [5]:
def getGameGenres(genre_ids):
    if not genre_ids:
        return []
    url = f"https://api.igdb.com/v4/genres"
    headers = {
        'Client-ID': config['CLIENT_ID'],
        'Authorization': f'Bearer {acc_token}',
        'Content-Type': 'application/json'
    }
    body = f"fields name; where id = ({','.join(map(str, genre_ids))});"
    response = requests.post(url, headers=headers, data=body)
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return []
    return [genre['name'] for genre in response.json()]


def getGameKeywords(keyword_ids):
    if not keyword_ids:
        return []
    
    url = f"https://api.igdb.com/v4/keywords"
    headers = {
        'Client-ID': config['CLIENT_ID'],
        'Authorization': f'Bearer {acc_token}',
        'Content-Type': 'application/json'
    }
    body = f"fields name; where id = ({','.join(map(str, keyword_ids))});"
    response = requests.post(url, headers=headers, data=body)
    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return []
    return [keyword['name'] for keyword in response.json()]

In [10]:
# get basic data for a game
def getGameAPIDat(game_id):
    search_games = requests.post(
        "https://api.igdb.com/v4/games/",
        headers={
            "Accept": "application/json",
            "Client-ID": config['CLIENT_ID'],
            "Authorization": f"Bearer {acc_token}"
        },
        data=f'fields id, name, rating, genres, keywords, rating_count, summary; where id = {game_id};')

    if search_games.status_code != 200:
        print(f"Error: {search_games.status_code} - {search_games.text}")
        return None
    
    if not search_games.json():
        print(f"No data found for game ID {game_id}")
        return None

    game_dat = search_games.json()[0]

    # convert ids to names for genres and keywords
    game_dat['genres'] = getGameGenres(game_dat.get('genres', []))
    game_dat['keywords'] = getGameKeywords(game_dat.get('keywords', []))

    return game_dat

# get the cover image for a game
def getGameCover(game_id):
    search_covers = requests.post(
        "https://api.igdb.com/v4/covers/",
        headers={
            "Accept": "application/json",
            "Client-ID": config['CLIENT_ID'],
            "Authorization": f"Bearer {acc_token}"
        },
        data=f'fields image_id, url; where game = {game_id};')
    
    if search_covers.status_code != 200:
        print(f"Error: {search_covers.status_code} - {search_covers.text}")
        return None, None
    
    covers = search_covers.json()
    if not covers:
        #print(f"No covers found for game ID {game_id}")
        return None, None

    return covers[0]['image_id'], 'http:'+covers[0]['url'].replace('t_thumb', 't_cover_big')

In [11]:
getGameAPIDat(14593)

{'id': 14593,
 'genres': ['Platform', 'Adventure', 'Indie'],
 'keywords': ['hand-drawn',
  'metroidvania',
  'shop keeper',
  'skill-based',
  'crowdfunding',
  'the game awards - best debut indie game - nominee',
  '2d',
  'action-adventure',
  'steam achievements',
  'shielded enemies'],
 'name': 'Hollow Knight',
 'rating': 92.13928799478947,
 'rating_count': 1711,
 'summary': "A 2D metroidvania with an emphasis on close combat and exploration in which the player enters the once-prosperous now-bleak insect kingdom of Hallownest, travels through its various districts, meets friendly inhabitants, fights hostile ones and uncovers the kingdom's history while improving their combat abilities and movement arsenal by fighting bosses and accessing out-of-the-way areas."}

In [12]:
games_ids = [] 
num_games = 500

for i in range(num_games // 500):
    # use the popularity api
    pop_games = requests.post(
        "https://api.igdb.com/v4/popularity_primitives/",
        headers={
            "Accept": "application/json",
            "Client-ID": config['CLIENT_ID'],
            "Authorization": f"Bearer {acc_token}"
        },
        data=f'fields game_id; sort value desc; limit 500; offset {i*500}; where popularity_type = 1;')

    if pop_games.status_code == 200:
        games_ids.extend([game['game_id'] for game in pop_games.json()])
    else:
        print(f"Error fetching popular games: {pop_games.status_code}")

print(len(games_ids))

500


In [13]:
# get the game IDs and their data from the API
games_full_dat = {}
with tqdm(total=len(games_ids), desc="Processing games") as pbar:
    for game_id in games_ids:
        
        pbar.set_description(f"Processing game ID: {game_id}")
        game_dat = getGameAPIDat(game_id)
        game_cover = getGameCover(game_id)

        time.sleep(0.25)  # to avoid hitting the rate limit

        if game_dat is None:
            pbar.set_description(f"Skipping game ID {game_id} due to missing data")
            continue
    
        #print(game_dat)
        pbar.set_description(f"{game_id}: {game_dat['name']}")

        games_full_dat[game_id] = {
            'name': game_dat['name'],
            'rating': game_dat.get('rating', 0),
            'genres': game_dat.get('genres', []),
            'keywords': game_dat.get('keywords', []),
            'rating_count': game_dat.get('rating_count', 0),
            'summary': game_dat.get('summary', ''),
            'cover_image_id': game_cover[0] if game_cover else None,
            'cover_image_url': game_cover[1] if game_cover else None
        }

        pbar.update(1)


# save the data to a JSON file
with open(f'igdb_data_{num_games}.json', 'w') as f:
    json.dump(games_full_dat, f, indent=4)

Processing game ID: 340038:  41%|████      | 204/500 [07:05<09:26,  1.91s/it]                           

No data found for game ID 340038


Processing game ID: 348971:  48%|████▊     | 242/500 [08:19<08:28,  1.97s/it]                     

No data found for game ID 348971


11137: Sea of Thieves: 100%|█████████▉| 498/500 [16:29<00:03,  1.99s/it]                                    
