In [6]:
import os
import json
import pandas as pd
import requests
from dotenv import load_dotenv

In [7]:
# Season whose per-game JSON dumps are flattened by this notebook.
year = 2024  # adjust as needed
folder_path = f"../../data/ncaamb/api_data/match_stats/{year}"

# List all JSON files in the folder.
# os.listdir() returns entries in arbitrary order, so sort the names to make
# the processing order (and therefore the CSV row order) reproducible.
json_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".json"))
print(f"📂 Found {len(json_files)} game files")


📂 Found 6298 game files


In [8]:
# Flatten every closed game file into one record per (game, team, player).
# Each record carries the game, team, record and venue fields followed by the
# player's own fields and statistics.
rows = []

for filename in json_files:
    file_path = os.path.join(folder_path, filename)

    try:
        with open(file_path, "r", encoding="utf-8") as f:
            game_data = json.load(f)

        # Only finished games are exported.
        if game_data.get("status") != "closed":
            continue

        # `or {}` (rather than a .get default) also covers keys that are
        # present but explicitly null in the JSON.
        scheduled = game_data.get("scheduled")
        venue = game_data.get("venue") or {}
        location = venue.get("location") or {}

        # game details (identical for both teams of the game)
        game_fields = {
            "game_id": game_data.get("id"),
            "game_sr_id": game_data.get("sr_id"),
            "scheduled": scheduled,
            "duration": game_data.get("duration"),
            # Date part of the timestamp; None when the game has no
            # `scheduled` value instead of raising on None.split().
            "game_date": scheduled.split("T")[0] if scheduled else None,
            "status": game_data.get("status"),
            "attendance": game_data.get("attendance"),
            "track_on_court": game_data.get("track_on_court", "not preasent"),
        }

        # venue details, including the location nested inside the venue
        venue_fields = {
            "venue_id": venue.get("id"),
            "venue_name": venue.get("name"),
            "venue_capacity": venue.get("capacity"),
            "venue_address": venue.get("address"),
            "venue_city": venue.get("city"),
            "venue_state": venue.get("state"),
            "venue_zip": venue.get("zip"),
            "venue_country": venue.get("country"),
            "venue_sr_id": venue.get("sr_id"),
            "venue_lat": location.get("lat"),
            "venue_lon": location.get("lng"),
        }

        # Loop for home & away
        for side in ("home", "away"):
            team = game_data.get(side) or {}
            if not team:
                continue

            record = team.get("record") or {}

            # Fields shared by every player of this team. Key order matches
            # the original column order: game, team, record, venue.
            base_row = {
                **game_fields,
                # team details
                "team_id": team.get("id"),
                "team_sr_id": team.get("sr_id"),
                "team_name": team.get("name"),
                "team_alias": team.get("alias"),
                "team_market": team.get("market"),
                "points": team.get("points"),
                "bonus": team.get("bonus"),
                "timeouts_remaining": team.get("remaining_timeouts"),
                # team records
                "record_wins": record.get("wins"),
                "record_losses": record.get("losses"),
                **venue_fields,
            }

            for player in team.get("players") or []:
                # Start from a fresh copy for every player. Appending one
                # shared dict would make all of a team's rows alias the same
                # object, so they would all show the last player's values.
                row = dict(base_row)

                # basic info (everything except the nested statistics)
                for k, v in player.items():
                    if k != "statistics":
                        row[f"player_{k}"] = v

                # player statistics
                # NOTE(review): stat names are written unprefixed, so a stat
                # sharing a name with a field above (e.g. "points") replaces
                # that value in this row — confirm this is intended.
                # The raw "periods" list is copied as well; the notebook
                # drops that column after building the DataFrame.
                player_statistics = player.get("statistics") or {}
                for stat_name, stat_value in player_statistics.items():
                    row[stat_name] = stat_value

                # player periodic statistics, one column group per period
                for period in player_statistics.get("periods") or []:
                    suffix = f"{period.get('number')}th_period"
                    for stat_name, value in period.items():
                        if stat_name != "number":
                            row[f"{suffix}_{stat_name}"] = value

                rows.append(row)

    except Exception as e:
        # Report the bad file and keep going. Stopping at the first failure
        # would leave `rows` silently truncated for the export that follows.
        print(f"❌ Failed {filename}: {e}")
        continue


In [9]:
# Build the player-level table from the flattened records.
df = pd.DataFrame(rows)

# The raw nested "periods" list is redundant once its contents have been
# flattened into per-period columns. errors="ignore" keeps this cell working
# when no record carried a "periods" key (e.g. an empty `rows`).
df = df.drop(columns=["periods"], errors="ignore")

# Rows are appended once per player, so (game_id, team_id) repeats across
# rows and is not a unique key on its own.
print(f"✅ Processed {len(df)} rows from {len(json_files)} files")

output_path = f"../../data/ncaamb/api_data/player_stats/ncaamb_player_stats_{year}.csv"
# to_csv does not create missing parent directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df.to_csv(output_path, index=False)
print(f"📄 Saved CSV → {output_path}")


✅ Processed 202101 rows from 6298 files
📄 Saved CSV → ../../data/ncaamb/api_data/player_stats/ncaamb_player_stats_2024.csv


In [10]:
# Read the exported CSV back as a sanity check on its shape.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns on this wide,
# sparsely populated table.
df = pd.read_csv(output_path, low_memory=False)
df.shape

  df = pd.read_csv(output_path)


(202101, 259)