In [1]:
import pandas as pd
import json
from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup
import lxml
import numpy as np
import os

In [2]:
def get_steam_app_list(timeout=30):
    """Download the list of applications published by the Steam Web API.

    Sends a GET request to the ``ISteamApps/GetAppList/v2`` endpoint and
    returns whatever is stored under ``applist -> apps`` in the JSON reply.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The value of ``data['applist']['apps']``, or an empty list when the
        reply does not contain those keys.

    Raises:
        requests.RequestException: If the connection fails, times out, or the
            server answers with an HTTP error status.
    """
    url = 'https://api.steampowered.com/ISteamApps/GetAppList/v2/'
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to decode an error page as JSON.
    response.raise_for_status()
    data = response.json()

    if 'applist' in data and 'apps' in data['applist']:
        return data['applist']['apps']
    return []

# Fetch the Steam app list
steam_app_list = get_steam_app_list()

# Convert to a DataFrame, keeping only the fields we need. Passing `columns`
# to the constructor (instead of indexing afterwards) also works when the API
# returns an empty list, where `df[['appid', 'name']]` would raise KeyError.
df = pd.DataFrame(steam_app_list, columns=['appid', 'name'])

# Treat empty strings as missing values and drop incomplete rows. Chained
# (non-inplace) calls keep the cell idempotent and avoid mutating a slice.
df = df.replace('', np.nan).dropna()

# Save to a CSV file
df.to_csv('steam_app_id_list.csv', index=False)

In [2]:
# Load the previously saved app ids as a plain Python list; the last
# expression displays how many ids were read.
steam_ids = pd.read_csv('steam_app_id_list.csv')['appid'].to_list()
len(steam_ids)

164392

In [3]:
def get_steam_app_data(start_index, end_index):
    """Scrape store details for ``steam_ids[start_index:end_index]``.

    Ids on the skip list are never requested. A result is kept only when the
    request succeeded and the parsed page reports at least one review; failed
    requests are reported on stdout and skipped.

    Args:
        start_index: First position (inclusive) in the module-level
            ``steam_ids`` list.
        end_index: Position (exclusive) at which to stop.

    Returns:
        A list with the dictionaries returned by ``parse_steam_detail`` for
        the apps that were kept, in the order of ``steam_ids``.
    """
    skipped_ids = (81692, 0)
    collected = []

    for app_id in steam_ids[start_index:end_index]:
        if app_id in skipped_ids:
            continue

        detail = parse_steam_detail(app_id)

        # parse_steam_detail signals a failed request with the sentinel -1.
        if detail == -1:
            print(f'connected failed:----{app_id}')
            continue

        # Apps without any reviews are dropped silently.
        if detail['all_reviews'] != 0:
            collected.append(detail)

    return collected


def _strip_layout_whitespace(text):
    """Return ``text`` with tab, carriage-return and line-feed characters removed."""
    return text.replace('\t', '').replace('\r', '').replace('\n', '')


def _find_text(soup, tag_name, css_class):
    """Return the text of the first ``tag_name`` with ``css_class``, or None if absent."""
    node = soup.find(tag_name, class_=css_class)
    return None if node is None else node.text


def _hidden_int(soup, element_id):
    """Return the ``value`` attribute of the element with ``element_id`` as int, or None."""
    node = soup.select_one(f'#{element_id}')
    if node is None:
        return None
    try:
        return int(node.get('value'))
    except (TypeError, ValueError):
        # Attribute missing (None) or not a plain integer.
        return None


def parse_steam_detail(app_id, timeout=30):
    """Fetch one Steam store page and extract its details.

    Every field is extracted independently: when an element is missing from
    the page the field keeps its default (empty string, ``0`` or ``0.0``).

    Args:
        app_id: Steam application id; used to build the store URL.
        timeout: Seconds to wait for the server before treating the request
            as failed.

    Returns:
        ``-1`` when the request fails (connection error, timeout or HTTP
        error status). Otherwise a dict with these keys, in this order:
        ``appid``, ``name``, ``released_date``, ``tags`` (list of str),
        ``url``, ``description``, ``developers``, ``publishers``, ``price``,
        ``all_reviews``, ``positice_reviews``, ``review_score`` (percentage of
        positive reviews, rounded to two decimals) and
        ``supported_languages``.

    Note:
        The key ``positice_reviews`` is misspelled on purpose: it matches the
        column name this notebook writes to ``steam_app_data.csv``, so
        renaming it here would break resuming against existing data.
    """
    url = f"https://store.steampowered.com/app/{app_id}"

    try:
        response = requests.get(url, timeout=timeout)
        # An HTTP error page carries no store data; report it as a failure
        # rather than silently recording an app with no reviews.
        response.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP problems are swallowed, so Ctrl-C still stops a crawl.
        return -1

    soup = BeautifulSoup(response.content, 'html.parser')

    # Key order matters: rows are appended to the CSV without a header.
    temp = {
        'appid': app_id,
        'name': '',
        'released_date': '',
        'tags': '',
        'url': url,
        'description': '',
        'developers': '',
        'publishers': '',
        'price': '',
        'all_reviews': 0,
        'positice_reviews': 0,
        'review_score': 0.0,
        'supported_languages': '',
    }

    name = _find_text(soup, 'div', 'apphub_AppName')
    if name is not None:
        temp['name'] = name

    released = _find_text(soup, 'div', 'date')
    if released is not None:
        temp['released_date'] = released.strip()

    # find_all returns an empty result when nothing matches, so tags is always a list.
    temp['tags'] = [tag.text.strip() for tag in soup.find_all('a', class_='app_tag')]

    snippet = _find_text(soup, 'div', 'game_description_snippet')
    if snippet is not None:
        temp['description'] = _strip_layout_whitespace(snippet)

    developer_box = soup.find('div', id='developers_list')
    if developer_box is not None:
        temp['developers'] = ', '.join(
            link.text for link in developer_box.find_all('a')
        )

    glance_box = soup.find('div', class_='glance_ctn_responsive_left')
    if glance_box is not None:
        # .get() skips anchors without an href instead of discarding the
        # whole field because of a single KeyError.
        temp['publishers'] = ', '.join(
            link.text
            for link in glance_box.find_all('a')
            if 'publisher' in link.get('href', '')
        )

    # Checked in this order so the regular purchase price, when present,
    # overrides the pre-discount price.
    for price_class in ('discount_original_price', 'game_purchase_price'):
        price = _find_text(soup, 'div', price_class)
        if price is not None:
            temp['price'] = price.strip()

    total_reviews = _hidden_int(soup, 'review_summary_num_reviews')
    if total_reviews is not None:
        temp['all_reviews'] = total_reviews
        positive_reviews = _hidden_int(soup, 'review_summary_num_positive_reviews')
        if positive_reviews is not None:
            temp['positice_reviews'] = positive_reviews
            # Guard against division by zero for apps without reviews.
            if total_reviews != 0:
                temp['review_score'] = round(
                    (positive_reviews / total_reviews) * 100, 2
                )

    language_table = soup.find('table', class_='game_language_options')
    if language_table is not None:
        temp['supported_languages'] = ', '.join(
            _strip_layout_whitespace(cell.text)
            for cell in language_table.find_all('td', class_='ellipsis')
        )

    return temp

def save_to_csv(app_data, filename):
    """Append ``app_data`` to the CSV file ``filename``.

    The records are converted to a DataFrame and written in append mode
    without an index column and without a header row.

    Args:
        app_data: Records accepted by ``pd.DataFrame`` (the notebook passes a
            list of dictionaries).
        filename: Path of the CSV file to append to.
    """
    pd.DataFrame(app_data).to_csv(filename, mode='a', header=False, index=False)

# parse_steam_detail(81692)

In [4]:
filename = 'steam_app_data.csv'
column_names = [
    'appid',
    'name',
    'released_date',
    'tags',
    'url',
    'description',
    'developers',
    'publishers',
    'price',
    'all_reviews',
    'positice_reviews',
    'review_score',
    'supported_languages',
]

if not os.path.isfile(filename):
    # The file does not exist yet: create it with only the header row.
    df = pd.DataFrame(columns=column_names)
    df.to_csv(filename, index=False)

# Index range of `steam_ids` to crawl.
start_index = 0
end_index = len(steam_ids)

start_id = 0
# Tracks whether `start_id` really came from saved data. Without it the
# placeholder 0 would be looked up in `steam_ids`, and an app id of 0 in the
# list would make a fresh run skip everything in front of it.
has_progress = False

# Check whether an earlier, interrupted run left progress behind.
try:
    existing_data = pd.read_csv(filename)
    if not existing_data.empty:
        start_id = existing_data['appid'].iloc[-1]
        has_progress = True
except pd.errors.EmptyDataError:
    pass

print(f'start_id:{start_id}')
if has_progress:
    try:
        # Resume right after the last app id that was written to disk.
        start_index = steam_ids.index(start_id) + 1
    except ValueError:
        # The saved id is not in the current id list: start from the beginning.
        pass
print(f'start_index:{start_index}')

# Parse and save a fixed number of apps per batch so that an interruption
# loses at most one batch of work.
batch_size = 30
for i in range(start_index, end_index, batch_size):
    # NOTE: the value printed here is the batch's start index in `steam_ids`,
    # not an app id.
    print(f'id:{i}')
    batch_start = i
    batch_end = min(i + batch_size, end_index)

    # Scrape the data for this batch.
    app_data = get_steam_app_data(batch_start, batch_end)

    # Append the batch to the CSV file.
    save_to_csv(app_data, filename)

start_id:81528
start_index:146301
id:146301
id:146331
id:146361
id:146391
id:146421
id:146451
id:146481
id:146511
id:146541
id:146571
id:146601
id:146631
id:146661
id:146691
id:146721
id:146751
id:146781
id:146811
id:146841
id:146871
id:146901
id:146931
id:146961
id:146991
id:147021
id:147051
id:147081
id:147111
id:147141
id:147171
id:147201
id:147231
id:147261
id:147291
id:147321
id:147351
id:147381
id:147411
id:147441
id:147471
id:147501
id:147531
id:147561
id:147591
id:147621
id:147651
id:147681
id:147711
id:147741
id:147771
id:147801
id:147831
id:147861
id:147891
id:147921
id:147951
id:147981
id:148011
id:148041
id:148071
id:148101
id:148131
id:148161
id:148191
id:148221
id:148251
id:148281
id:148311
id:148341
id:148371
id:148401
id:148431
id:148461
id:148491
id:148521
id:148551
id:148581
id:148611
id:148641
id:148671
id:148701
id:148731
id:148761
id:148791
id:148821
id:148851
id:148881
id:148911
id:148941
id:148971
id:149001
id:149031
id:149061
id:149091
id:149121
id:149151
id:149