In [1]:
import pandas as pd
import os
import time
from nba_api.stats.endpoints import boxscorefourfactorsv2
from nba_api.stats.endpoints import leaguegamefinder

# Output file names used by the scraping helpers below.
# CSV that receives the player-level frame of each fetched game.
PLAYER_STATS_FILE = "boxscore_fourfactors_player_stats.csv"
# CSV that receives the team-level frame of each fetched game.
TEAM_STATS_FILE = "boxscore_fourfactors_team_stats.csv"
# Text file with one already-processed game ID per line (used to skip games on re-runs).
PROCESSED_GAMES_FILE = "processed_games_fourfactors.txt"

def load_processed_games():
    """Return the set of game IDs already recorded in PROCESSED_GAMES_FILE.

    Every line of the file is stripped of surrounding whitespace and becomes
    one entry of the set. When the file does not exist an empty set is
    returned.
    """
    if not os.path.exists(PROCESSED_GAMES_FILE):
        return set()
    with open(PROCESSED_GAMES_FILE, "r") as handle:
        return {entry.strip() for entry in handle}

def save_processed_game(game_id):
    """Append ``game_id`` on its own line to PROCESSED_GAMES_FILE."""
    with open(PROCESSED_GAMES_FILE, "a") as handle:
        handle.write("{}\n".format(game_id))

def load_existing_data(file_path, dtype=None):
    """Load a previously saved CSV, or an empty DataFrame when there is none.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    dtype : optional
        Forwarded to ``pandas.read_csv``. Pass e.g. ``{"GAME_ID": str}`` to
        keep zero-padded identifiers as text instead of letting pandas infer
        an integer column. The default (``None``) keeps pandas' inference.

    Returns
    -------
    pandas.DataFrame
        The file contents, or an empty DataFrame if the file is missing or
        contains no data at all.
    """
    if not os.path.exists(file_path):
        return pd.DataFrame()
    try:
        return pd.read_csv(file_path, dtype=dtype)
    except pd.errors.EmptyDataError:
        # A zero-byte file (e.g. left behind by an interrupted run) has no
        # columns to parse; treat it the same as "no data saved yet".
        return pd.DataFrame()

def save_data(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, appending if the file has content.

    The header row is written only when the file is created or is still
    empty; later calls append data rows without repeating the header.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to persist.
    file_path : str
        Destination CSV path.
    """
    # An existing zero-byte file has no header yet, so appending with
    # header=False would produce a CSV whose first data row is mistaken for
    # the header on the next read. Treat it like a missing file.
    has_content = os.path.exists(file_path) and os.path.getsize(file_path) > 0
    if has_content:
        df.to_csv(file_path, mode='a', header=False, index=False)
    else:
        df.to_csv(file_path, index=False)

def fetch_game_data_with_retry(game_id, max_retries=3, wait_time=5):
    """Request the four-factors box score for one game, retrying on failure.

    Parameters
    ----------
    game_id : str
        Game identifier passed to ``BoxScoreFourFactorsV2``.
    max_retries : int
        Maximum number of attempts before giving up.
    wait_time : float
        Pause, in the units of ``time.sleep``, between two consecutive
        attempts.

    Returns
    -------
    The ``BoxScoreFourFactorsV2`` result object of the first successful
    attempt, or ``None`` when every attempt raised.
    """
    for attempt in range(1, max_retries + 1):
        try:
            return boxscorefourfactorsv2.BoxScoreFourFactorsV2(
                game_id=game_id,
                start_period=1,
                end_period=10,
                start_range=0,
                end_range=0,
                range_type=0,
                timeout=60  # raised request timeout
            )
        except Exception as e:
            # Deliberately broad: any failure of the request is retried.
            print(f"Attempt {attempt} failed for Game ID {game_id}: {e}")
            # Only pause when another attempt follows; sleeping after the
            # last failure just delays the caller for no benefit.
            if attempt < max_retries:
                time.sleep(wait_time)
    print(f"Failed to fetch data for Game ID {game_id} after {max_retries} retries.")
    return None

def process_games(game_ids):
    """Fetch and persist four-factors stats for every not-yet-processed game.

    For each ID in ``game_ids`` that is not already listed in the
    processed-games file, the box score is fetched (with retries), its first
    two data frames are appended to PLAYER_STATS_FILE and TEAM_STATS_FILE,
    and the ID is recorded as processed. Games whose fetch fails are left
    unrecorded so a later run tries them again.

    Parameters
    ----------
    game_ids : list
        Game identifiers to process, in order.
    """
    processed_games = load_processed_games()
    total_games = len(game_ids)

    for index, game_id in enumerate(game_ids, start=1):
        if game_id in processed_games:
            print(f"Skipping already processed Game ID: {game_id}")
            continue

        print(f"Fetching data for Game {index}/{total_games}: Game ID {game_id}")
        boxscore_data = fetch_game_data_with_retry(game_id)

        if boxscore_data is not None:
            # Build the frames once instead of once per table.
            frames = boxscore_data.get_data_frames()
            df_player_stats = frames[0]  # Player stats
            df_team_stats = frames[1]    # Team stats

            # Persist both tables locally.
            save_data(df_player_stats, PLAYER_STATS_FILE)
            save_data(df_team_stats, TEAM_STATS_FILE)

            # Record the game on disk and in memory, so an ID that appears
            # twice in game_ids is not fetched and appended a second time.
            save_processed_game(game_id)
            processed_games.add(game_id)

            print(f"Successfully saved data for Game ID: {game_id}")

        # Pause between requests to avoid hitting the API too quickly.
        time.sleep(1)

    print("Data fetching completed.")


In [None]:
def get_season_game_ids(season="2022-23"):
    """Return the regular-season game IDs of ``season``.

    Parameters
    ----------
    season : str
        Season label such as ``"2022-23"``; the part before the first ``-``
        is used as the start year when filtering ``SEASON_ID``.

    Returns
    -------
    list
        One ``GAME_ID`` per game, after dropping duplicate rows and rows whose
        ``MATCHUP`` does not start and end with a known team abbreviation.
    """
    print("Fetching all games...")
    gamefinder = leaguegamefinder.LeagueGameFinder(
        season_nullable=season,
        season_type_nullable='Regular Season',
    )
    games = gamefinder.get_data_frames()[0]

    # Print the column names so the data layout can be checked by eye.
    print("Columns in the games data:", games.columns)

    # Keep only rows whose SEASON_ID ends with the requested season's start
    # year. This was previously hard-coded to '2022', which silently returned
    # no games for any other ``season`` argument.
    # NOTE(review): assumes SEASON_ID ends with the four-digit start year,
    # as the original '2022' check did — confirm against the API.
    start_year = str(season).split('-')[0]
    games = games[games['SEASON_ID'].astype(str).str.endswith(start_year)]

    # The result has one row per team per game; keep one row per GAME_ID.
    games = games.drop_duplicates(subset=['GAME_ID'])

    # Abbreviations of the teams whose games should be kept.
    valid_teams = {
        'ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET',
        'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN',
        'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS',
        'TOR', 'UTA', 'WAS',
    }

    def _is_valid_matchup(matchup):
        # First and last space-separated tokens must both be known teams.
        tokens = matchup.split(' ')
        return tokens[0] in valid_teams and tokens[-1] in valid_teams

    # astype(bool) keeps the mask boolean even when no rows are left, so the
    # indexing below always acts as a row filter.
    matchup_mask = games['MATCHUP'].apply(_is_valid_matchup).astype(bool)
    games = games[matchup_mask]

    print(f"Filtered regular season games count: {len(games)}")

    # Extract the (already de-duplicated) game IDs.
    game_ids = games['GAME_ID'].tolist()
    print(f"Total games found for {season}: {len(game_ids)}")
    return game_ids

# Collect the list of game IDs for the 2022-23 season.
game_ids = get_season_game_ids(season="2022-23")

# Run the scrape over every collected game.
process_games(game_ids=game_ids)

Fetching all games...
Columns in the games data: Index(['SEASON_ID', 'TEAM_ID', 'TEAM_ABBREVIATION', 'TEAM_NAME', 'GAME_ID',
       'GAME_DATE', 'MATCHUP', 'WL', 'MIN', 'PTS', 'FGM', 'FGA', 'FG_PCT',
       'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA', 'FT_PCT', 'OREB', 'DREB',
       'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF', 'PLUS_MINUS'],
      dtype='object')
Filtered regular season games count: 1230
Total games found for 2022-23: 1230
Fetching data for Game 1/1230: Game ID 0022201224
Successfully saved data for Game ID: 0022201224
Fetching data for Game 2/1230: Game ID 0022201217
Successfully saved data for Game ID: 0022201217
Fetching data for Game 3/1230: Game ID 0022201222
Successfully saved data for Game ID: 0022201222
Fetching data for Game 4/1230: Game ID 0022201225
Successfully saved data for Game ID: 0022201225
Fetching data for Game 5/1230: Game ID 0022201216
Successfully saved data for Game ID: 0022201216
Fetching data for Game 6/1230: Game ID 0022201218
Successfully saved data

In [None]:
import pandas as pd

def remove_duplicate_rows(file_path):
    """Drop fully identical rows from a CSV file and rewrite it in place.

    The first occurrence of each row is kept. Any error while reading or
    writing is reported with ``print`` instead of being raised, so one bad
    file does not stop the notebook.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to clean.
    """
    try:
        # Read every column as text so values are written back exactly as
        # they were stored. With type inference, zero-padded identifiers such
        # as GAME_ID "0022201224" would be parsed as integers and lose their
        # leading zeros when the file is rewritten.
        df = pd.read_csv(file_path, dtype=str)

        # Remove rows that are identical in every column (keep the first).
        df_cleaned = df.drop_duplicates(keep='first')

        # Overwrite the file with the de-duplicated rows.
        df_cleaned.to_csv(file_path, index=False)

        print(f"Successfully removed duplicates and saved the cleaned file: {file_path}")
        print(f"Original rows: {len(df)}, After deduplication: {len(df_cleaned)}")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# Deduplicate the player-stats file first, then the team-stats file.
for _stats_csv in (
    "boxscore_fourfactors_player_stats.csv",
    "boxscore_fourfactors_team_stats.csv",
):
    remove_duplicate_rows(_stats_csv)

In [None]:
# Reload both CSV files and print their shapes side by side.
df1 = pd.read_csv("boxscore_fourfactors_player_stats.csv")
df2 = pd.read_csv("boxscore_fourfactors_team_stats.csv")
print(*(frame.shape for frame in (df1, df2)))