In [1]:
# ---------------------------------------------------------
# DATA CLEANING
# ---------------------------------------------------------

import pandas as pd
import os
import re

# Read the raw catalogue from datasets/steam-games.csv and report its size.
file_path = os.path.join('datasets', 'steam-games.csv')
df = pd.read_csv(file_path)
print(f'Righe originali: {len(df.index)}')

# Rows without a title or without genres are removed from the frame in place.
df.dropna(axis=0, subset=['title', 'genres'], inplace=True)
print(f'Righe dopo pulizia: {len(df.index)}')

# Review columns: missing values become 0; the count is additionally cast to int.
df['review_score'] = df['overall_review_%'].fillna(value=0)
df['review_count'] = df['overall_review_count'].fillna(value=0).astype(int)

# Anno
def extract_year(x, default=2020):
    """Return the first run of four digits found in ``x`` as an int.

    ``x`` is converted with ``str()`` before searching, so non-string values
    (numbers, NaN, None) are accepted; their text simply may not contain a
    match.

    Args:
        x: Value to search, e.g. a release-date string.
        default: Value returned when no four-digit run is present. Defaults to
            2020, the fallback that was previously hard-coded.

    Returns:
        The matched digits as an int, or ``default`` when nothing matches.
    """
    match = re.search(r'\d{4}', str(x))
    if match is None:
        return default
    return int(match.group(0))

# Derive the release_year column by running extract_year over every release_date value.
df['release_year'] = df['release_date'].map(extract_year)

# Generi
def clean_genres(text):
    """Turn a comma-separated genre list into space-separated single tokens.

    Each comma-separated entry is trimmed and its inner whitespace is replaced
    by a single underscore, so ``'Action, Free to Play'`` becomes
    ``'Action Free_to_Play'``.

    Entries that are empty after trimming (from a trailing or doubled comma)
    are dropped so they cannot leave stray spaces in the result, and runs of
    whitespace inside an entry collapse to one underscore.

    Args:
        text: The raw genre string, or a missing value.

    Returns:
        The normalised string; ``''`` when ``text`` is missing or holds no
        non-empty entry.
    """
    if pd.isna(text):
        return ''
    tokens = []
    for entry in str(text).split(','):
        # str.split() with no argument both trims and splits on whitespace runs.
        words = entry.split()
        if words:
            tokens.append('_'.join(words))
    return ' '.join(tokens)

# Overwrite the genres column with the output of clean_genres for every row.
df['genres'] = df['genres'].map(clean_genres)

# Export
# Columns written to the cleaned dataset, in output order.
final_cols = [
    'app_id',
    'title',
    'review_score',
    'review_count',
    'release_year',
    'genres',
]
df_out = df.loc[:, final_cols]

# Plain-text preview of the first ten rows.
print('\nAnteprima:')
print(df_out.iloc[:10].to_string())

# Persist the selection without the index column.
output_path = 'datasets/steam-games-cleaned.csv'
df_out.to_csv(output_path, index=False)
print(f'\nSalvato: {output_path}')

Righe originali: 42497
Righe dopo pulizia: 42410

Anteprima:
    app_id                             title  review_score  review_count  release_year                                               genres
0      730                  Counter-Strike 2          87.0       8062218          2012                                  Action Free_to_Play
1      570                            Dota 2          81.0       2243112          2013                         Action Strategy Free_to_Play
2  2215430  Ghost of Tsushima DIRECTOR'S CUT          89.0         12294          2024                                     Action Adventure
3  1245620                        ELDEN RING          93.0        605191          2022                                           Action RPG
4  1085660                         Destiny 2          80.0        594713          2019                        Action Adventure Free_to_Play
5  1091500                    Cyberpunk 2077          83.0        654304          2020             

In [1]:
# ---------------------------------------------------------
# FETCH TAG DA STEAMSPY
# ---------------------------------------------------------

import pandas as pd
import requests
import time
import os
import json as json_lib
from IPython.display import clear_output

# Endpoint that fetch_tags sends its GET requests to.
STEAMSPY_API = 'https://steamspy.com/api.php'
# CSV file that results are written to and that already-processed IDs are read back from.
STEAMSPY_OUTPUT = 'datasets/steamspy-tags.csv'
# Value passed to time.sleep() after every request, i.e. a pause in seconds.
REQUEST_DELAY = 1.1
# Maximum number of tags kept per app (the ones with the largest values).
TOP_N_TAGS = 5

def fetch_tags(app_id: int) -> dict:
    """Request the details of one app from STEAMSPY_API and extract its top tags.

    A GET request with ``request=appdetails`` and ``appid=app_id`` is sent
    with a 10 second timeout. When the ``tags`` entry of the JSON reply is a
    non-empty dict, its keys are ordered by value, largest first, and the
    first TOP_N_TAGS of them are joined with ``', '``.

    Args:
        app_id: Identifier sent as the ``appid`` query parameter.

    Returns:
        A dict with the keys ``app_id``, ``steam_tags`` and ``success``.
        ``steam_tags`` is ``''`` when the reply holds no usable tags.
        ``success`` is False, with an empty ``steam_tags``, whenever any
        exception is raised along the way.
    """
    query = {'request': 'appdetails', 'appid': app_id}
    try:
        response = requests.get(STEAMSPY_API, params=query, timeout=10)
        response.raise_for_status()
        tag_votes = response.json().get('tags', {})

        top_names = ''
        if isinstance(tag_votes, dict) and tag_votes:
            ranked = sorted(tag_votes.items(), key=lambda pair: pair[1], reverse=True)
            top_names = ', '.join(name for name, _ in ranked[:TOP_N_TAGS])
    except Exception:
        # Best-effort: any failure is reported through the success flag.
        return {'app_id': app_id, 'steam_tags': '', 'success': False}
    return {'app_id': app_id, 'steam_tags': top_names, 'success': True}

def load_processed_ids(path=None) -> set:
    """Return the app IDs that were already fetched successfully.

    Rows whose ``success`` column is not true are ignored, so apps whose
    request failed are not treated as done and get requested again on the
    next run. Because results are appended to the file, such an app may then
    appear in it more than once. Files without a ``success`` column are
    handled as before: every listed ``app_id`` counts as processed.

    Args:
        path: CSV file to read. Defaults to STEAMSPY_OUTPUT when omitted.

    Returns:
        A set of app IDs; empty when the file does not exist or cannot be
        read.
    """
    if path is None:
        path = STEAMSPY_OUTPUT
    if not os.path.exists(path):
        return set()
    try:
        df_done = pd.read_csv(path)
        if 'success' in df_done.columns:
            # Compare as text so bool columns and 'True'/'False' strings both work.
            succeeded = df_done['success'].astype(str).str.strip().str.lower() == 'true'
            df_done = df_done[succeeded]
        return set(df_done['app_id'].tolist())
    except Exception as exc:
        # Stay best-effort, but say so: an empty result makes the caller start over.
        import warnings
        warnings.warn(f'Could not read {path!r} ({exc}); treating every app as unprocessed.')
        return set()

def append_result(result: dict, is_first: bool):
    """Write one result as a single CSV row to STEAMSPY_OUTPUT.

    Args:
        result: Mapping that becomes the row; its keys become the columns.
        is_first: When true the file is opened in write mode, replacing any
            existing content, and a header row is written. Otherwise the row
            is appended, and a header is written only if the file does not
            exist yet.
    """
    if is_first:
        write_mode = 'w'
        write_header = is_first
    else:
        write_mode = 'a'
        write_header = not os.path.exists(STEAMSPY_OUTPUT)

    row = pd.DataFrame([result])
    row.to_csv(STEAMSPY_OUTPUT, mode=write_mode, header=write_header, index=False)

# Work list: every distinct app_id of the raw catalogue that is not recorded as processed.
df_source = pd.read_csv('datasets/steam-games.csv')
all_ids = df_source['app_id'].dropna().astype(int).unique().tolist()
processed = load_processed_ids()
remaining = [candidate for candidate in all_ids if candidate not in processed]

print(f'Totale giochi: {len(all_ids)}')
print(f'Gia processati: {len(processed)}')
print(f'Da processare: {len(remaining)}')
print(f'Tempo stimato: {len(remaining) * REQUEST_DELAY / 3600:.1f} ore')

if not remaining:
    print('Nessun gioco da processare.')
else:
    print('\nAvvio download... (Kernel > Interrupt per fermare)')
    is_first = not processed

    for idx, app_id in enumerate(remaining, start=1):
        result = fetch_tags(app_id)
        # Only the first write of a run that starts with nothing processed replaces the file.
        append_result(result, is_first and idx == 1)

        # Redraw the progress line every 50 requests.
        if idx % 50 == 0:
            clear_output(wait=True)
            total_done = len(processed) + idx
            print(f'Progresso: {total_done}/{len(all_ids)} ({total_done/len(all_ids)*100:.1f}%)')

        # Pause between consecutive requests.
        time.sleep(REQUEST_DELAY)

    print('Download completato.')

# Summary of the output file as it now stands on disk.
if os.path.exists(STEAMSPY_OUTPUT):
    df_tags = pd.read_csv(STEAMSPY_OUTPUT)
    print(f'\nFile: {STEAMSPY_OUTPUT} ({len(df_tags)} righe)')
    print(df_tags[['app_id', 'steam_tags']].head())

Progresso: 12584/42497 (29.6%)


KeyboardInterrupt: 

In [None]:
# ---------------------------------------------------------
# MERGE TAG STEAMSPY
# ---------------------------------------------------------

import pandas as pd
import os

# Inputs: the cleaned catalogue and the fetched tags. Output: the merged dataset.
CLEANED_PATH = 'datasets/steam-games-cleaned.csv'
STEAMSPY_PATH = 'datasets/steamspy-tags.csv'
OUTPUT_PATH = 'datasets/steam-games-final.csv'

if not os.path.exists(STEAMSPY_PATH):
    print(f'Errore: {STEAMSPY_PATH} non trovato.')
else:
    df_cleaned = pd.read_csv(CLEANED_PATH)
    df_tags = pd.read_csv(STEAMSPY_PATH)

    print(f'Dataset pulito: {len(df_cleaned)} righe')
    print(f'Tag SteamSpy: {len(df_tags)} righe')

    # Keep only rows flagged as successful when the file carries that flag.
    if 'success' in df_tags.columns:
        df_tags = df_tags[df_tags['success'] == True]

    # A left merge repeats a catalogue row once per matching tags row, so the
    # tags must hold each app_id at most once; the last row written wins.
    df_tags = df_tags.drop_duplicates(subset='app_id', keep='last')

    # Left merge: every catalogue row is kept, with NaN tags where no match exists.
    df_merged = pd.merge(df_cleaned, df_tags[['app_id', 'steam_tags']], on='app_id', how='left')
    df_merged = df_merged.rename(columns={'steam_tags': 'tags'})

    n_with_tags = df_merged['tags'].notna().sum()
    print(f'\nGiochi con tag: {n_with_tags}/{len(df_merged)}')

    print('\nAnteprima:')
    print(df_merged[['title', 'genres', 'tags']].head(10).to_string())

    df_merged.to_csv(OUTPUT_PATH, index=False)
    print(f'\nSalvato: {OUTPUT_PATH}')