In [14]:
import os
import json
import pandas as pd
import requests
from dotenv import load_dotenv

In [15]:
# Season to process; the input folder name is derived from it.
year = 2023  # adjust as needed
folder_path = f"../../data/nba/api_data/match_stats/{year}"

# List all JSON files in the folder. os.listdir() returns entries in
# arbitrary order, so sort them to keep the downstream row order reproducible.
json_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".json"))
print(f"📂 Found {len(json_files)} game files")


📂 Found 1348 game files


In [16]:
# Flatten every game whose status is "closed" into one row per player.
# Each row carries the game, team, record and venue fields, followed by the
# player's own attributes, statistics and per-period statistics.
rows = []
# (filename, error message) for every file that could not be processed.
failed_files = []

for filename in json_files:
    file_path = os.path.join(folder_path, filename)

    try:
        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, "r", encoding="utf-8") as f:
            game_data = json.load(f)
        if game_data.get("status") != "closed":
            continue

        # Game-level fields are identical for both sides, so build them once.
        scheduled = game_data.get("scheduled")
        game_fields = {
            "game_id": game_data.get("id"),
            "game_sr_id": game_data.get("sr_id"),
            "scheduled": scheduled,
            "duration": game_data.get("duration"),
            # Date part of the timestamp; None when "scheduled" is missing.
            "game_date": scheduled.split("T")[0] if scheduled else None,
            "status": game_data.get("status"),
            "attendance": game_data.get("attendance"),
            "track_on_court": game_data.get("track_on_court", "not preasent"),
        }

        # `or {}` also covers keys that are present but null.
        venue = game_data.get("venue") or {}
        location = venue.get("location") or {}
        venue_fields = {
            "venue_id": venue.get("id"),
            "venue_name": venue.get("name"),
            "venue_capacity": venue.get("capacity"),
            "venue_address": venue.get("address"),
            "venue_city": venue.get("city"),
            "venue_state": venue.get("state"),
            "venue_zip": venue.get("zip"),
            "venue_country": venue.get("country"),
            "venue_sr_id": venue.get("sr_id"),
            # location details (if present inside venue)
            "venue_lat": location.get("lat"),
            "venue_lon": location.get("lng"),
        }

        # Collect this game's rows separately so that a failure part-way
        # through the file does not leave partial data in `rows`.
        game_rows = []
        for side in ["home", "away"]:
            team = game_data.get(side) or {}
            if not team:
                continue

            record = team.get("record") or {}
            # Key order defines the column order of the resulting DataFrame:
            # game fields, team fields, team record, venue fields.
            common_info = {
                **game_fields,
                "team_id": team.get("id"),
                "team_sr_id": team.get("sr_id"),
                "team_name": team.get("name"),
                "team_alias": team.get("alias"),
                "team_market": team.get("market"),
                "points": team.get("points"),
                "bonus": team.get("bonus"),
                "timeouts_remaining": team.get("remaining_timeouts"),
                "record_wins": record.get("wins"),
                "record_losses": record.get("losses"),
                **venue_fields,
            }

            for player in team.get("players") or []:
                row = common_info.copy()

                # basic info (everything except the nested statistics)
                for k, v in player.items():
                    if k != "statistics":
                        row[f"player_{k}"] = v

                # Player statistics are copied under their own names,
                # including the raw "periods" list that is expanded below.
                # NOTE(review): because the names are unprefixed, a statistic
                # called e.g. "points" overwrites the team-level value of the
                # same name set above - confirm this is intended.
                player_statistics = player.get("statistics") or {}
                row.update(player_statistics)

                # per-period statistics -> "<number>th_period_<stat>" columns
                for p in player_statistics.get("periods") or []:
                    suffix = f"{p.get('number')}th_period"
                    for stat_name, value in p.items():
                        if stat_name != "number":
                            row[f"{suffix}_{stat_name}"] = value
                game_rows.append(row)

        rows.extend(game_rows)

    except Exception as e:
        # Record the failure and carry on with the next file; stopping here
        # would silently drop every file that has not been read yet.
        print(f"❌ Failed {filename}: {e}")
        failed_files.append((filename, str(e)))

if failed_files:
    print(f"⚠️ Skipped {len(failed_files)} file(s) that could not be processed")


In [17]:
# Debugging aid: re-display the traceback of the most recent exception, if any.
%tb

No traceback available to show.


In [18]:
# Build the player-level table. The raw "periods" list copied from each
# player's statistics is dropped because its contents were already expanded
# into "<n>th_period_<stat>" columns. errors="ignore" keeps this cell working
# when no row carried a "periods" entry.
df = pd.DataFrame(rows).drop(columns=["periods"], errors="ignore")
# One row per player per game: composite key = (game_id, player_id)
print(f"✅ Processed {len(df)} rows from {len(json_files)} files")

output_path = f"../../data/nba/api_data/player_stats/nba_player_stats_{year}.csv"
# Create the destination folder if needed so that to_csv() does not fail.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"📄 Saved CSV → {output_path}")


✅ Processed 46193 rows from 1348 files
📄 Saved CSV → ../../data/nba/api_data/player_stats/nba_player_stats_2023.csv


In [19]:
# Reload the CSV that was just written and show its shape.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type inference on this wide
# table.
df = pd.read_csv(output_path, low_memory=False)
df.shape

  df = pd.read_csv(output_path)


(46193, 347)

In [20]:
# Count the distinct players that appear in each game (rows of both teams
# are grouped together because the grouping key is game_id only).
players_by_game = df.groupby("game_id")["player_id"]
unique_players_per_game = players_by_game.nunique()
print(unique_players_per_game)

# Plain-dict view of the same counts, keyed by game_id.
unique_players_dict = unique_players_per_game.to_dict()


game_id
00aca98b-9680-4900-8dfd-277e7767b696    34
00cd35d2-a20a-445c-9c4b-4eb73e78d2ff    36
00d9697d-e3f1-40dc-b914-c0bc9c5de45c    36
01e3aa9e-ed9b-4862-adae-144a861fe035    36
01e3abdc-1b56-4d58-8cf0-6498a97698ac    35
                                        ..
fe57a1fd-a1bc-42b8-bdcd-84c2d3263ab5    35
fe96e30d-fbdc-4e9c-8fca-0156e5aed851    35
fedaa6b6-196b-4ad2-af51-2b52350bca68    35
ff3c2cfe-dee6-45af-b66c-5208aee4e6b2    30
ff739695-a74f-40f7-a0df-1a6248fbf1ab    34
Name: player_id, Length: 1323, dtype: int64


In [21]:
# Verify that (game_id, player_id) identifies a row uniquely.
# keep=False flags every member of a duplicated group, not only the repeats.
duplicate_pairs = df.duplicated(subset=["game_id", "player_id"], keep=False)
if not duplicate_pairs.any():
    print("All game_id + player_id pairs are unique.")
else:
    print("There are duplicate game_id + player_id pairs:")
    print(df.loc[duplicate_pairs])


All game_id + player_id pairs are unique.


In [22]:
# Check that every game_id is associated with exactly two distinct team_ids.
# Note: despite its name, games_per_team holds the number of distinct teams
# per game in this cell.
games_per_team = df.groupby("game_id")["team_id"].nunique()
print(games_per_team)

invalid_games = games_per_team.loc[games_per_team.ne(2)]
if invalid_games.empty:
    print("All games have exactly 2 teams.")
else:
    print("Games with teams other than 2:")
    print(invalid_games)


game_id
00aca98b-9680-4900-8dfd-277e7767b696    2
00cd35d2-a20a-445c-9c4b-4eb73e78d2ff    2
00d9697d-e3f1-40dc-b914-c0bc9c5de45c    2
01e3aa9e-ed9b-4862-adae-144a861fe035    2
01e3abdc-1b56-4d58-8cf0-6498a97698ac    2
                                       ..
fe57a1fd-a1bc-42b8-bdcd-84c2d3263ab5    2
fe96e30d-fbdc-4e9c-8fca-0156e5aed851    2
fedaa6b6-196b-4ad2-af51-2b52350bca68    2
ff3c2cfe-dee6-45af-b66c-5208aee4e6b2    2
ff739695-a74f-40f7-a0df-1a6248fbf1ab    2
Name: team_id, Length: 1323, dtype: int64
All games have exactly 2 teams.


In [23]:
# Per-team summary: number of distinct games and distinct players per team_id.
team_groups = df.groupby("team_id")
games_per_team = team_groups["game_id"].nunique()
players_per_team = team_groups["player_id"].nunique()

team_summary = pd.DataFrame(
    {"total_games": games_per_team, "total_players": players_per_team}
).reset_index()

print(team_summary)


                                 team_id  total_games  total_players
0   2674a061-2cb1-4a0b-b0b6-e237ff267f45            1              8
1   583ec5fd-fb46-11e1-82cb-f4ce4684ea4c           84             20
2   583ec70e-fb46-11e1-82cb-f4ce4684ea4c           95             28
3   583ec773-fb46-11e1-82cb-f4ce4684ea4c           94             21
4   583ec7cd-fb46-11e1-82cb-f4ce4684ea4c          100             23
5   583ec825-fb46-11e1-82cb-f4ce4684ea4c           83             18
6   583ec87d-fb46-11e1-82cb-f4ce4684ea4c           89             30
7   583ec8d4-fb46-11e1-82cb-f4ce4684ea4c           82             27
8   583ec928-fb46-11e1-82cb-f4ce4684ea4c           82             32
9   583ec97e-fb46-11e1-82cb-f4ce4684ea4c           82             27
10  583ec9d6-fb46-11e1-82cb-f4ce4684ea4c           82             23
11  583eca2f-fb46-11e1-82cb-f4ce4684ea4c           98             20
12  583eca88-fb46-11e1-82cb-f4ce4684ea4c           82             35
13  583ecae2-fb46-11e1-82cb-f4ce46