In [1]:
import pandas as pd
import os
import time
from nba_api.stats.endpoints import playbyplayv2
from nba_api.stats.endpoints import leaguegamefinder

# 定义文件目录和文件命名规则
OUTPUT_DIR = "playbyplayv2_data"
PROCESSED_GAMES_FILE = "processed_games_playbyplayv2.txt"

# 确保输出目录存在
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# 读取已处理的比赛 ID
def load_processed_games():
    if os.path.exists(PROCESSED_GAMES_FILE):
        with open(PROCESSED_GAMES_FILE, "r") as f:
            return set(line.strip() for line in f.readlines())
    return set()

# 保存已处理的比赛 ID
def save_processed_game(game_id):
    with open(PROCESSED_GAMES_FILE, "a") as f:
        f.write(f"{game_id}\n")

# 保存数据到 CSV 文件，按月份分类
def save_data(df, game_date):
    month = pd.to_datetime(game_date).strftime('%Y-%m')
    file_name = f"playbyplayv2_2022-23_{month}.csv"
    file_path = os.path.join(OUTPUT_DIR, file_name)
    
    if os.path.exists(file_path):
        df.to_csv(file_path, mode='a', header=False, index=False)
    else:
        df.to_csv(file_path, index=False)
    
    print(f"Saved data to {file_path}")

# 获取 2022-23 赛季常规赛的所有比赛 ID
def get_season_game_ids(season="2022-23"):
    print("Fetching all games...")
    gamefinder = leaguegamefinder.LeagueGameFinder(season_nullable=season, season_type_nullable='Regular Season')
    games = gamefinder.get_data_frames()[0]

    # 过滤仅包含 2022-23 赛季的常规赛
    games = games[games['SEASON_ID'].astype(str).str.endswith('2022')]

    # 删除重复的比赛记录（基于 GAME_ID）
    games = games.drop_duplicates(subset=['GAME_ID'])

    # 过滤掉异常的MATCHUP（仅保留NBA常规赛球队）
    valid_teams = ['ATL', 'BOS', 'BKN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 
                   'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 
                   'NOP', 'NYK', 'OKC', 'ORL', 'PHI', 'PHX', 'POR', 'SAC', 'SAS', 
                   'TOR', 'UTA', 'WAS']
    
    # 保留主客场球队都在NBA队伍名单的比赛
    games = games[games['MATCHUP'].apply(lambda x: x.split(' ')[0] in valid_teams and x.split(' ')[-1] in valid_teams)]

    print(f"Filtered regular season games count: {len(games)}")
    
    return games[['GAME_ID', 'GAME_DATE']].values.tolist()

# 获取比赛的PlayByPlayV2数据
def fetch_game_playbyplayv2(game_id, start_period=1, end_period=10, max_retries=3, wait_time=5):
    for attempt in range(max_retries):
        try:
            pbp_data = playbyplayv2.PlayByPlayV2(
                game_id=game_id,
                start_period=start_period,
                end_period=end_period,
                timeout=60
            )
            return pbp_data.get_data_frames()[0]
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for Game ID {game_id}: {e}")
            time.sleep(wait_time)
    print(f"Failed to fetch data for Game ID {game_id} after {max_retries} retries.")
    return None

# 处理所有比赛
def process_games(games):
    processed_games = load_processed_games()

    for index, (game_id, game_date) in enumerate(games, start=1):
        if game_id in processed_games:
            print(f"Skipping already processed Game ID: {game_id}")
            continue
        
        print(f"Fetching data for Game {index}/{len(games)}: Game ID {game_id}")
        pbp_df = fetch_game_playbyplayv2(game_id)

        if pbp_df is not None and not pbp_df.empty:
            pbp_df['GAME_DATE'] = game_date  # 添加比赛日期
            save_data(pbp_df, game_date)
            save_processed_game(game_id)
            print(f"Successfully saved data for Game ID: {game_id}")
        else:
            print(f"No data fetched for Game ID: {game_id}")

        # 添加延迟以避免被 NBA API 限制
        time.sleep(1)

    print("Data fetching completed.")

# 获取 2022-23 赛季比赛 ID 列表
games = get_season_game_ids()

# 运行数据抓取
process_games(games)


Fetching all games...
Filtered regular season games count: 1230
Fetching data for Game 1/1230: Game ID 0022201224
Saved data to playbyplayv2_data\playbyplayv2_2022-23_2023-04.csv
Successfully saved data for Game ID: 0022201224
Fetching data for Game 2/1230: Game ID 0022201221
Saved data to playbyplayv2_data\playbyplayv2_2022-23_2023-04.csv
Successfully saved data for Game ID: 0022201221
Fetching data for Game 3/1230: Game ID 0022201217
Saved data to playbyplayv2_data\playbyplayv2_2022-23_2023-04.csv
Successfully saved data for Game ID: 0022201217
Fetching data for Game 4/1230: Game ID 0022201220
Saved data to playbyplayv2_data\playbyplayv2_2022-23_2023-04.csv
Successfully saved data for Game ID: 0022201220
Fetching data for Game 5/1230: Game ID 0022201225
Saved data to playbyplayv2_data\playbyplayv2_2022-23_2023-04.csv
Successfully saved data for Game ID: 0022201225
Fetching data for Game 6/1230: Game ID 0022201230
Saved data to playbyplayv2_data\playbyplayv2_2022-23_2023-04.csv
Succes

KeyboardInterrupt: 

In [2]:
import pandas as pd
import os

# 定义目标文件夹路径
folder_path = "playbyplayv2_data/"

# 定义函数来去除重复行并保存文件
def remove_duplicate_rows(file_path):
    try:
        # 读取 CSV 文件
        df = pd.read_csv(file_path)

        # 去除完全相同的行（保留一行）
        df_cleaned = df.drop_duplicates(keep='first')

        # 保存去重后的数据
        df_cleaned.to_csv(file_path, index=False)

        print(f"Processed file: {file_path}")
        print(f"Original rows: {len(df)}, After deduplication: {len(df_cleaned)}")
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

# 遍历文件夹下所有 CSV 文件
for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        remove_duplicate_rows(file_path)

print("Deduplication process completed for all CSV files.")


  df = pd.read_csv(file_path)


Processed file: playbyplayv2_data/playbyplayv2_2022-23_2022-10.csv
Original rows: 49234, After deduplication: 49234
Processed file: playbyplayv2_data/playbyplayv2_2022-23_2022-11.csv
Original rows: 103724, After deduplication: 103724
Processed file: playbyplayv2_data/playbyplayv2_2022-23_2022-12.csv
Original rows: 104337, After deduplication: 104337
Processed file: playbyplayv2_data/playbyplayv2_2022-23_2023-01.csv
Original rows: 103724, After deduplication: 103724
Processed file: playbyplayv2_data/playbyplayv2_2022-23_2023-02.csv
Original rows: 75730, After deduplication: 75730
Processed file: playbyplayv2_data/playbyplayv2_2022-23_2023-03.csv
Original rows: 105463, After deduplication: 105463
Processed file: playbyplayv2_data/playbyplayv2_2022-23_2023-04.csv
Original rows: 32192, After deduplication: 32192
Deduplication process completed for all CSV files.
