## 1. Introduction

### This is portfolio project where I explore Euroleague data and analyse it to gather insights beyond of what is possible with current easily accessible resources. I try to utilize best data visualisation techniques for different types of data. 

In [37]:
import re
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import json
from datetime import datetime
from time import sleep
import os, glob, json
import pandas as pd
from pathlib import Path

## 2. Data Collection

### Fetching data from the Euroleague API.

API (JSON) → Python(DataFrame) → SQL → Analysi

In [2]:
uri_dict = {
    'Header': 'Game metadata (teams, date, location, scores)',
    'BoxScore': 'Detailed player statistics per game',
    'Comparison': 'Team comparison stats (rebounds, assists, etc.)',
    'PlayByPlay': 'Timeline of in-game events (fouls, points, substitutions)',
    'Points': 'Aggregated scoring statistics',
    'ShootingGraphic': 'Shot chart data (location and success)',
    'Evolution': 'Score evolution over quarters and OT'
}

In [3]:
#Settings
max_game_code = 400
base_url = "https://live.euroleague.net/api"
output_dir = "data"
# Generate last 4 Euroleague season codes based on current year
current_year = datetime.now().year
last_4_seasons = [f"E{year}" for year in range(current_year - 1, current_year - 5, -1)]
last_4_seasons

['E2024', 'E2023', 'E2022', 'E2021']

In [4]:
# #test
# import requests
# print(requests.get("https://live.euroleague.net/api/Header?gamecode=1&seasoncode=E2024").json())

In [5]:
# Create folder structure
for season in last_4_seasons:
    for uri in uri_dict.keys():
        path = os.path.join(output_dir, season, uri)
        os.makedirs(path, exist_ok=True)

In [48]:
def fetch_and_save(uri, gamecode, seasoncode):
    url = f"{base_url}/{uri}?gamecode={gamecode}&seasoncode={seasoncode}"
    try:
        response = requests.get(url)
        if response.status_code == 200 and response.content.strip() not in [b"", b"null"]:
            data = response.json()
            filename = os.path.join(output_dir, seasoncode, uri, f"game_{gamecode}.json")
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"✅ Saved: {uri} for game {gamecode}, {seasoncode}")
            return True
        else:
            print(f"❌ Skipped (empty or error): {uri} for game {gamecode}, {seasoncode}")
            return False
    except Exception as e:
        print(f"❌ Exception: {uri} game {gamecode}, {seasoncode} — {e}")
        return False


In [49]:
def get_downloaded_gamecodes(root_dir="data"):
    downloaded = {}  # {season: {uri: set(gamecodes)}}

    root = Path(root_dir)
    for season_path in root.iterdir():
        if not season_path.is_dir():
            continue
        season = season_path.name
        downloaded[season] = {}
        
        for uri_path in season_path.iterdir():
            if not uri_path.is_dir():
                continue
            uri = uri_path.name
            gamecodes = set()

            for file in uri_path.glob("game_*.json"):
                match = re.match(r"game_(\d+)\.json", file.name)
                if match:
                    gamecodes.add(int(match.group(1)))
            
            downloaded[season][uri] = gamecodes
    return downloaded


In [52]:
def update_euroleague_data(last_4_seasons, max_game_code, uri_dict, root_dir="data", consecutive_skip_limit=5):
    downloaded = get_downloaded_gamecodes(root_dir)
    fetched_count = 0
    skipped_count = 0
    failed_count = 0

    for season in last_4_seasons:
        print(f"\n📅 Season: {season}")
        consecutive_skips_or_fails = 0

        for gamecode in range(1, max_game_code + 1):
            skip_round = True  # Assume we will skip or fail every URI unless one succeeds

            for uri in uri_dict:
                already_downloaded = downloaded.get(season, {}).get(uri, set())
                if gamecode in already_downloaded:
                    print(f"⏩ Skipped {season} {uri} game {gamecode:03d}")
                    skip_round = False
                    consecutive_skips_or_fails = 0
                    continue

                try:
                    success = fetch_and_save(uri, gamecode, season)
                    if success:
                        fetched_count += 1
                        consecutive_skips_or_fails = 0
                        skip_round = False
                    else:
                        failed_count += 1
                except Exception as e:
                    print(f"❌ Failed {season} {uri} game {gamecode:03d} — {e}")
                    failed_count += 1

            if skip_round:
                consecutive_skips_or_fails += 1
                print(f"⚠️ Skipped or failed all URIs for game {gamecode:03d} ({consecutive_skips_or_fails}/{consecutive_skip_limit})")
                if consecutive_skips_or_fails >= consecutive_skip_limit:
                    print(f"🚪 Exiting season {season} early after {consecutive_skip_limit} consecutive skips/fails.")
                    break

    print("\n=== Update Summary ===")
    print(f"✅ Fetched: {fetched_count}")
    print(f"❌ Failed:  {failed_count}")
    print(f"🕒 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [53]:
update_euroleague_data(last_4_seasons, max_game_code, uri_dict, root_dir="data")


📅 Season: E2024
⏩ Skipped E2024 Header game 001
⏩ Skipped E2024 BoxScore game 001
⏩ Skipped E2024 Comparison game 001
⏩ Skipped E2024 PlayByPlay game 001
⏩ Skipped E2024 Points game 001
⏩ Skipped E2024 ShootingGraphic game 001
⏩ Skipped E2024 Evolution game 001
⏩ Skipped E2024 Header game 002
⏩ Skipped E2024 BoxScore game 002
⏩ Skipped E2024 Comparison game 002
⏩ Skipped E2024 PlayByPlay game 002
⏩ Skipped E2024 Points game 002
⏩ Skipped E2024 ShootingGraphic game 002
⏩ Skipped E2024 Evolution game 002
⏩ Skipped E2024 Header game 003
⏩ Skipped E2024 BoxScore game 003
⏩ Skipped E2024 Comparison game 003
⏩ Skipped E2024 PlayByPlay game 003
⏩ Skipped E2024 Points game 003
⏩ Skipped E2024 ShootingGraphic game 003
⏩ Skipped E2024 Evolution game 003
⏩ Skipped E2024 Header game 004
⏩ Skipped E2024 BoxScore game 004
⏩ Skipped E2024 Comparison game 004
⏩ Skipped E2024 PlayByPlay game 004
⏩ Skipped E2024 Points game 004
⏩ Skipped E2024 ShootingGraphic game 004
⏩ Skipped E2024 Evolution game 004

## Data Exploration

### Inspecting data from different URI’s. Identifying structure and key fields.

In [8]:
# Load a 'header' sample file
with open('data/E2024/Header/game_1.json', 'r', encoding='utf-8') as f:
    data_h = json.load(f)

# Preview the top-level keys and structure
print(json.dumps(data_h, indent=2))


{
  "Live": false,
  "Round": "1",
  "Date": "03/10/2024",
  "Hour": "18:45 ",
  "Stadium": "UBER ARENA",
  "Capacity": "11856",
  "TeamA": "ALBA BERLIN",
  "TeamB": "PANATHINAIKOS AKTOR ATHENS",
  "CodeTeamA": "BER",
  "TVCodeA": "BER",
  "CodeTeamB": "PAN",
  "TVCodeB": "PAO",
  "imA": "BER       ",
  "imB": "PAN       ",
  "ScoreA": "77",
  "ScoreB": "87",
  "CoachA": "GONZALEZ, ISRAEL",
  "CoachB": "ATAMAN, ERGIN",
  "GameTime": "40:00",
  "RemainingPartialTime": "00:00",
  "wid": "80",
  "Quarter": "",
  "FoultsA": "16",
  "FoultsB": "11",
  "TimeoutsA": "3",
  "TimeoutsB": "2",
  "ScoreQuarter1A": 17,
  "ScoreQuarter2A": 36,
  "ScoreQuarter3A": 57,
  "ScoreQuarter4A": 77,
  "ScoreExtraTimeA": 0,
  "ScoreQuarter1B": 26,
  "ScoreQuarter2B": 45,
  "ScoreQuarter3B": 62,
  "ScoreQuarter4B": 87,
  "ScoreExtraTimeB": 0,
  "Phase": "REGULAR SEASON",
  "PhaseReducedName": "R S ",
  "Competition": "EUROLEAGUE 2024-25",
  "CompetitionReducedName": "E2024     ",
  "pcom": "E2024     ",
  "Re

Header URI – Game metadata (teams, date, location, scores, referees). '2024' in E2024 indicates first year of the season. E2023 indicates 2023-2024; E2024 indicates 2024-2025 season, etc.

In [9]:
# Load a 'boxscore' sample file
with open('data/E2024/boxscore/game_300.json', 'r', encoding='utf-8') as f:
    data_b = json.load(f)

# Preview the top-level keys and structure
print(json.dumps(data_b, indent=2))


{
  "Live": false,
  "Referees": "MOGULKOC, EMIN, RYZHYK, BORYS, SUKYS, ARTURAS",
  "Attendance": "8548",
  "ByQuarter": [
    {
      "Team": "EA7 EMPORIO ARMANI MILAN",
      "Quarter1": 31,
      "Quarter2": 31,
      "Quarter3": 20,
      "Quarter4": 29
    },
    {
      "Team": "BASKONIA VITORIA-GASTEIZ",
      "Quarter1": 18,
      "Quarter2": 17,
      "Quarter3": 26,
      "Quarter4": 28
    }
  ],
  "EndOfQuarter": [
    {
      "Team": "EA7 EMPORIO ARMANI MILAN",
      "Quarter1": 31,
      "Quarter2": 62,
      "Quarter3": 82,
      "Quarter4": 111
    },
    {
      "Team": "BASKONIA VITORIA-GASTEIZ",
      "Quarter1": 18,
      "Quarter2": 35,
      "Quarter3": 61,
      "Quarter4": 89
    }
  ],
  "Stats": [
    {
      "Team": "EA7 EMPORIO ARMANI MILAN",
      "Coach": "MESSINA, ETTORE",
      "PlayersStats": [
        {
          "Player_ID": "P011064   ",
          "IsStarter": 1,
          "IsPlaying": 0,
          "Team": "MIL",
          "Dorsal": "2",
          "P

Boxscore data seems to be the most useful for this project. It shows all of the main stats of the players in that game. Accumulated boxscore statistics  for all of the games throughout the season may reveal interesting information. All columns are readable as-is.

In [10]:
# Load a 'Comparison' sample file
with open('data/E2024/comparison/game_300.json', 'r', encoding='utf-8') as f:
    data_c = json.load(f)

# Preview the top-level keys and structure
print(json.dumps(data_c, indent=2))

{
  "DefensiveReboundsA": 21,
  "OffensiveReboundsB": 15,
  "OffensiveReboundsA": 15,
  "DefensiveReboundsB": 23,
  "TurnoversStartersA": 0,
  "TurnoversBenchA": 3,
  "TurnoversStartersB": 4,
  "TurnoversBenchB": 11,
  "StealsStartersA": 4,
  "StealsBenchA": 3,
  "StealsStartersB": 0,
  "StealsBenchB": 3,
  "AssistsStartersA": 20,
  "AssistsBenchA": 12,
  "AssistsStartersB": 8,
  "AssistsBenchB": 10,
  "PointsStartersA": 70,
  "PointsBenchA": 41,
  "PointsStartersB": 37,
  "PointsBenchB": 52,
  "maxA": 14,
  "minutePrevA": 12,
  "prevA": "36-23",
  "minuteStrA": 15,
  "strA": "50-23",
  "maxB": 7,
  "minutePrevB": 25,
  "prevB": "77-44",
  "minuteStrB": 26,
  "strB": "77-51",
  "maxLeadA": 33,
  "maxLeadB": 2,
  "minuteMaxLeadA": 25,
  "minuteMaxLeadB": 1,
  "puntosMaxLeadA": "77-44",
  "puntosMaxLeadB": "0-2",
  "minutoActual": 1,
  "isLive": false
}


There is no indication of team names, so only readable in tandem with data from another URI. Not all columns are easily readable.
Explanation of harder to read columns:

Field	Explanation
maxA, maxB	Largest scoring run by Team A and Team B, respectively (i.e. most points scored without interruption).
minutePrevA, minutePrevB	Minute when the last significant run ended for Team A and B.
prevA, prevB	Score at the end of the previous significant run for Team A and B.
minuteStrA, minuteStrB	Minute when the strongest run of the game started for Team A and B.
strA, strB	Score at the start of the strongest run for Team A and B.
maxLeadA, maxLeadB	Maximum lead (in points) achieved by Team A and B.
minuteMaxLeadA, minuteMaxLeadB	Minute at which Team A and B reached their maximum lead.
puntosMaxLeadA, puntosMaxLeadB	Scoreline at the time of maximum lead for Team A and B.
minutoActual	Current minute (useful during live games — static for past games).
isLive	Boolean flag for whether the game is ongoing (true) or finished (false).

In [11]:
# Load a 'Points' sample file
with open('data/E2024/points/game_250.json', 'r', encoding='utf-8') as f:
    data_p = json.load(f)

# Preview the top-level keys and structure
print(json.dumps(data_p, indent=2))

{
  "Rows": [
    {
      "NUM_ANOT": 6,
      "TEAM": "ASV       ",
      "ID_PLAYER": "P007027   ",
      "PLAYER": "MALEDON, THEO",
      "ID_ACTION": "3FGA",
      "ACTION": "Missed Three Pointer",
      "POINTS": 0,
      "COORD_X": 677,
      "COORD_Y": 62,
      "ZONE": "I",
      "FASTBREAK": "0",
      "SECOND_CHANCE": "0",
      "POINTS_OFF_TURNOVER": "0",
      "MINUTE": 1,
      "CONSOLE": "09:39",
      "POINTS_A": 0,
      "POINTS_B": 0,
      "UTC": "20250307190227"
    },
    {
      "NUM_ANOT": 8,
      "TEAM": "TEL       ",
      "ID_PLAYER": "P011219   ",
      "PLAYER": "SORKIN, ROMAN",
      "ID_ACTION": "2FGM",
      "ACTION": "Two Pointer",
      "POINTS": 2,
      "COORD_X": -43,
      "COORD_Y": 94,
      "ZONE": "B",
      "FASTBREAK": "1",
      "SECOND_CHANCE": "0",
      "POINTS_OFF_TURNOVER": "0",
      "MINUTE": 1,
      "CONSOLE": "09:29",
      "POINTS_A": 0,
      "POINTS_B": 2,
      "UTC": "20250307190236"
    },
    {
      "NUM_ANOT": 10,
      "TE

Points - individual scoring breakdown (including missed shots). Each scoring action documented.

Unknown column explanation:

Field	Explanation
NUM_ANOT	Sequential event number (i.e., this is the 17th recorded scoring event).

ID_PLAYER	Unique player ID (internal Euroleague code, not standardized globally).

ID_ACTION	Type of action that led to points. Common codes: FTM (Free Throw Made), 2PM (2pt Made), 3PM (3pt Made), etc.

COORD_X, COORD_Y	Shot coordinates on the court. -1 means coordinates not recorded (e.g., for free throws). 

ZONE	Court zone. Often blank (" ") if not explicitly recorded. 🏀 Common "ZONE" values and likely meanings:

Zone Code	Likely Area on Court
"A"	Under the basket (paint / restricted area)
"B"	Mid-range, baseline
"C"	Corner 3-point area
"D"	Wing 3-point area
"E"	Top of the key / above the arc
"F"	Near free-throw line / elbow area
"G"	Long-range 3-pointers (deep shots)

FASTBREAK	1 if this was a fast-break basket, 0 otherwise.

SECOND_CHANCE	1 if points came from an offensive rebound (second chance), 0 otherwise.

POINTS_OFF_TURNOVER	1 if points were a direct result of a turnover, 0 otherwise.

MINUTE	Game minute during which the scoring event happened.

CONSOLE	Timestamp in game time format — MM:SS remaining in quarter.

POINTS_A, POINTS_B	Updated score after the basket: Team A’s and Team B’s score after this event.

UTC	UTC timestamp of when the event occurred (YYYYMMDDHHMMSS). Helpful for ordering.

In [31]:
# Load a 'PlayByPlay' sample file
with open('data/E2024/playbyplay/game_250.json', 'r', encoding='utf-8') as f:
    data_pbp = json.load(f)

# # Preview the top-level keys and structure
# print(json.dumps(data_pbp, indent=2))

In [13]:
# Load a 'ShootingGraphic' sample file
with open('data/E2024/shootinggraphic/game_30.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Preview the top-level keys and structure
print(json.dumps(data, indent=2))

{
  "FastbreakPointsA": 9,
  "FastbreakPointsB": 13,
  "TurnoversPointsA": 18,
  "TurnoversPointsB": 15,
  "SecondChancePointsA": 8,
  "SecondChancePointsB": 21
}


Missleading URI name. This data is an indicator on how well the team is getting extra points (points of TO, 2nd chances, fastbreak points)

In [32]:
# Load a 'Evolution' sample file
with open('data/E2024/evolution/game_30.json', 'r', encoding='utf-8') as f:
    data_e = json.load(f)

# # Preview the top-level keys and structure
# print(json.dumps(data_e, indent=2))

PointsList
A list of two lists: PointsList[0] = team A's cumulative score at each minute, PointsList[1] = team B’s score.

Each index corresponds to a minute in MinutesList.

MinutesList
An array of game minutes (0 to 40). This aligns with indices in PointsList, ScoreDiffPerMinute, etc.

ScoreDiffPerMinute
A list of two lists, just like PointsList.

ScoreDiffPerMinute[0] → how much team A led by at each minute (positive if leading, negative if trailing).

ScoreDiffPerMinute[1] → how much team B led by at each minute.

Values are null if no score change occurred or not applicable.

LargestDifference
Shows the largest point difference in favor of each team.

Format: [[teamA_largest_lead, null], [null, teamB_largest_lead]]

So:
"[[11, null], [null, 4]]"
Means:

Team A’s biggest lead: 11 points

Team B’s biggest lead: 4 points

MinuteMaxA, MinuteMaxB
The minute at which each team had their largest lead.

ScoreMaxA, ScoreMaxB
The score snapshot when each team had their largest lead.

"ScoreMaxA": "58 - 47" → Team A led by 11 at minute 21.

(Optional) difp, dA, dB
These are sometimes used for additional derived stats, but are often null or inconsistently populated — you can usually ignore these.

Use Case Ideas:
Plot score evolution graphs (line charts)

Identify turning points or key momentum shifts

Create game summaries (e.g., "Team A took control in Q2 with an 11-point lead at minute 21.")

In [15]:
#Validating if efficiency in this dataset is calculated the same way as official Euroleague rulebook.
#(Points + Rebounds + Assists + Steals + Blocks + Fouls Drawn) - (Missed Field Goals + Missed Free Throws + Turnovers + Shots Rejected + Fouls Committed).
Leday_efficiency_test=(32+5+2+2)-(8+1+1)
print(Leday_efficiency_test)
          # "Player": "LEDAY, ZACH",

          # "Points": 32,
          # "FieldGoalsMade2": 6,
          # "FieldGoalsAttempted2": 10,
          # "FieldGoalsMade3": 6,
          # "FieldGoalsAttempted3": 10,
          # "FreeThrowsMade": 2,
          # "FreeThrowsAttempted": 3,
          # "OffensiveRebounds": 0,
          # "DefensiveRebounds": 5,
          # "TotalRebounds": 5,
          # "Assistances": 2,
          # "Steals": 0,
          # "Turnovers": 0,
          # "BlocksFavour": 0,
          # "BlocksAgainst": 0,
          # "FoulsCommited": 1,
          # "FoulsReceived": 2,
          # "Valuation": 31,
          # "Plusminus": 14

31


In [16]:
hall_efficiency_test=(10+1+1)-(1+1+1)
print(hall_efficiency_test)
# "Player": "HALL, DONTA",
#           "Minutes": "18:16",
#           "Points": 10,
#           "FieldGoalsMade2": 5,
#           "FieldGoalsAttempted2": 6,
#           "FieldGoalsMade3": 0,
#           "FieldGoalsAttempted3": 0,
#           "FreeThrowsMade": 0,
#           "FreeThrowsAttempted": 0,
#           "OffensiveRebounds": 1,
#           "DefensiveRebounds": 0,
#           "TotalRebounds": 1,
#           "Assistances": 1,
#           "Steals": 0,
#           "Turnovers": 1,
#           "BlocksFavour": 0,
#           "BlocksAgainst": 0,
#           "FoulsCommited": 1,
#           "FoulsReceived": 0,
#           "Valuation": 9,

9


Seems that PIR rating is calculated correctly in this dataset

## Data Preprocessing & Cleaning

In [17]:
def convert_minutes_str_to_float(minutes_str):
    if isinstance(minutes_str, str):
        minutes_str = minutes_str.strip().upper()
        if minutes_str in ["DNP", ""]:
            return 0.0
        try:
            minutes, seconds = map(int, minutes_str.split(":"))
            return round(minutes + seconds / 60, 2)
        except Exception:
            return 0.0
    return 0.0


In [18]:
def load_euroleague_boxscores(root_dir="data"):
    records = []
    pattern = os.path.join(root_dir, "*", "BoxScore", "game_*.json")
    
    for filepath in glob.glob(pattern, recursive=True):
        # Extract season and game code from the path
        p = Path(filepath)
        seasoncode = p.parents[1].name   # e.g. "2023-24"
        gamecode = p.stem.replace("game_", "")
        
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception:
            continue  # Skip malformed JSON files

        try:
            # Get final scores from "totr" block inside "Stats"
            team_scores = {}
            for team in data.get("Stats", []):
                team_name = team.get("Team", "")
                totr = team.get("totr", {})
                points = int(totr.get("Points", 0))
                team_scores[team_name] = points

            if len(team_scores) != 2:
                continue  # Skip if both teams' scores are not found

            scores = list(team_scores.values())
            max_score = max(scores)
            min_score = min(scores)

            # Flatten each team's player stats
            for team in data.get("Stats", []):
                team_name = team.get("Team", "")
                team_score = team_scores.get(team_name, 0)
                team_win = int(team_score == max_score)

                for player in team.get("PlayersStats", []):
                    rec = dict(player)  # copy player stats dict
                    rec["team"] = team_name
                    rec["seasoncode"] = seasoncode
                    rec["gamecode"] = gamecode
                    rec["teamwin"] = team_win
                    records.append(rec)

        except Exception:
            continue  # Skip games with unexpected structure

    # Build DataFrame if any records were collected
    if not records:
        return pd.DataFrame()  # Return empty DataFrame if nothing parsed

    df = pd.DataFrame(records)

    # Convert minutes to float if present
    if 'Minutes' in df.columns:
        df['Minutes'] = df['Minutes'].apply(convert_minutes_str_to_float)

    return df

In [19]:
df = load_euroleague_boxscores()

In [20]:
# Show all columns
pd.set_option('display.max_columns', None)

# Optionally, also control the width of the display to avoid line-wrapping
pd.set_option('display.width', None)


print(df.head(16))



     Player_ID  IsStarter  IsPlaying Team Dorsal                   Player  \
0   P011225             0          0  MCO      5               LEE, PARIS   
1   P002543             1          0  MCO      9          WESTERMANN, LEO   
2   P005856             0          1  MCO     10             THOMAS, WILL   
3   P011226             0          1  MCO     11            DIALLO, ALPHA   
4   P006599             0          0  MCO     12             MOTUM, BROCK   
5   P008923             0          0  MCO     16         BOUTSIELE, JERRY   
6   PLCZ                1          1  MCO     20      MOTIEJUNAS, DONATAS   
7   P004197             1          0  MCO     24         OUATTARA, YAKUBA   
8   P010428             1          0  MCO     28           FAYE, IBRAHIMA   
9   P000956             0          1  MCO     33         ANDJUSIC, DANILO   
10  P011196             0          0  MCO     45              HALL, DONTA   
11  P005985             1          1  MCO     55              JAMES, MIKE   

## 4. Exploratory Data Analysis (EDA) & Feature Engineering

In [21]:
def calculate_fantasy_points(row):
    try:
        points = row.get('Points', 0)
        dreb = row.get('DefensiveRebounds', 0)
        oreb = row.get('OffensiveRebounds', 0)
        assists = row.get('Assistances', 0)
        steals = row.get('Steals', 0)
        blocks = row.get('BlocksFavour', 0)
        drawn_fouls = row.get('FoulsReceived', 0)
        team_win = row.get('TeamWin', False)  # Boolean or can be calculated externally
        team_loss = not team_win

        missed_shots = (row.get('FieldGoalsAttempted', 0) - row.get('FieldGoalsMade', 0)) \
                     + (row.get('ThreePointsAttempted', 0) - row.get('ThreePointsMade', 0))
        missed_free_throws = row.get('FreeThrowsAttempted', 0) - row.get('FreeThrowsMade', 0)
        turnovers = row.get('Turnovers', 0)
        block_against = row.get('BlocksAgainst', 0)
        fouled_out = row.get('FoulsCommited', 0) >= 5  # Assuming 5 fouls = disqualification

        total_rebounds = dreb + oreb

        # Bonuses
        double_double = sum(x >= 10 for x in [points, total_rebounds, assists, steals, blocks]) >= 2
        triple_double = sum(x >= 10 for x in [points, total_rebounds, assists, steals, blocks]) >= 3
        quadruple_double = sum(x >= 10 for x in [points, total_rebounds, assists, steals, blocks]) >= 4

        score = 0
        score += points
        score += dreb * 1
        score += oreb * 1.5
        score += assists * 1.5
        score += steals * 1.5
        score += blocks * 1
        score += drawn_fouls * 1
        score += 1.5 if team_win else -1.5
        score -= missed_shots * 1
        score -= missed_free_throws * 1
        score -= turnovers * 1.5
        score -= block_against * 0.5
        score -= 5 if fouled_out else 0
        score += 10 if double_double else 0
        score += 30 if triple_double else 0
        score += 100 if quadruple_double else 0

        return round(score, 2)
    except Exception:
        return 0.0


In [22]:
# Effective Field Goal Percentage
def calculate_efg(df):
    df['efg'] = (df['fg_made'] + 0.5 * df['three_pt_made']) / df['fg_attempted'].replace(0, np.nan)
    return df

# True Shooting Percentage
def calculate_ts(df):
    df['ts'] = df['points'] / (2 * (df['fg_attempted'] + 0.44 * df['ft_attempted'])).replace(0, np.nan)
    return df

# Usage Rate (approximate version)
def calculate_usage_rate(df):
    df['usage'] = (
        (df['fg_attempted'] + 0.44 * df['ft_attempted'] + df['turnovers']) *
        df['team_minutes']
    ) / (df['minutes_played'] * df['team_possessions']).replace(0, np.nan)
    return df

# Per-minute or per-possession stats (optional)
def calculate_per_minute(df, stat_col):
    df[f'{stat_col}_per_min'] = df[stat_col] / df['minutes_played'].replace(0, np.nan)
    return df

# Recent form (rolling average over 3 games)
def add_rolling_average(df, player_col='player_id', sort_cols=['date'], target_col='fantasy_score', window=3):
    df = df.sort_values(by=sort_cols)
    df[f'{target_col}_rolling_{window}'] = (
        df.groupby(player_col)[target_col].transform(lambda x: x.rolling(window, min_periods=1).mean())
    )
    return df



In [23]:

def plot_scores(df):
    plt.figure(figsize=(10,5))
    sns.barplot(x=['TeamA', 'TeamB'], y=[df['ScoreA'][0], df['ScoreB'][0]])
    plt.title("Game Score Comparison")
    plt.show()

In [24]:


def add_features(df):
    df['Score_Difference'] = abs(df['ScoreA'] - df['ScoreB'])
    return df



In [26]:
df['fantasy_points'] = df.apply(calculate_fantasy_points, axis=1)

In [29]:
#which player benefits the most/the least from the fantasy scoring system?
def calculate_scoring_system(df):
    df['scoring_system_advantage']=df['fantasy_points']-df['Valuation']
    return df

In [30]:
calculate_scoring_system(df)

Unnamed: 0,Player_ID,IsStarter,IsPlaying,Team,Dorsal,Player,Minutes,Points,FieldGoalsMade2,FieldGoalsAttempted2,FieldGoalsMade3,FieldGoalsAttempted3,FreeThrowsMade,FreeThrowsAttempted,OffensiveRebounds,DefensiveRebounds,TotalRebounds,Assistances,Steals,Turnovers,BlocksFavour,BlocksAgainst,FoulsCommited,FoulsReceived,Valuation,Plusminus,team,seasoncode,gamecode,teamwin,fantasy_points,scoring_system_advantage
0,P011225,0,0,MCO,5,"LEE, PARIS",18.55,10,1,3,2,4,2,2,1,0,1,5,1,0,0,0,2,3,14,13,AS MONACO,E2021,1,1,22.0,8.0
1,P002543,1,0,MCO,9,"WESTERMANN, LEO",21.98,8,1,2,1,1,3,3,0,2,2,4,1,2,1,1,2,1,11,8,AS MONACO,E2021,1,1,14.5,3.5
2,P005856,0,1,MCO,10,"THOMAS, WILL",22.47,5,1,4,1,4,0,0,1,3,4,0,1,0,0,1,1,1,3,7,AS MONACO,E2021,1,1,10.0,7.0
3,P011226,0,1,MCO,11,"DIALLO, ALPHA",22.78,2,0,2,0,1,2,2,1,3,4,0,0,1,0,0,1,1,2,2,AS MONACO,E2021,1,1,4.5,2.5
4,P006599,0,0,MCO,12,"MOTUM, BROCK",2.68,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,-3,1,AS MONACO,E2021,1,1,-2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30012,P010106,0,1,BAR,17,"NUNEZ, JUAN",20.42,5,2,3,0,0,1,2,1,1,2,7,1,3,0,0,3,4,11,-11,FC BARCELONA,E2024,99,0,16.5,5.5
30013,P006568,0,1,BAR,19,"FALL, YOUSSOUPHA",8.40,3,1,1,0,0,1,2,0,4,4,0,0,0,1,0,1,1,7,-1,FC BARCELONA,E2024,99,0,6.5,-0.5
30014,P003478,1,0,BAR,21,"ABRINES, ALEX",11.90,0,0,0,0,1,0,0,0,1,1,2,0,0,0,0,0,2,4,-5,FC BARCELONA,E2024,99,0,4.5,0.5
30015,P012745,1,1,BAR,22,"PARKER, JABARI",27.05,26,8,8,3,5,1,2,0,3,3,2,2,3,0,0,0,2,29,-14,FC BARCELONA,E2024,99,0,30.0,1.0


In [28]:
print(df.columns)

Index(['Player_ID', 'IsStarter', 'IsPlaying', 'Team', 'Dorsal', 'Player',
       'Minutes', 'Points', 'FieldGoalsMade2', 'FieldGoalsAttempted2',
       'FieldGoalsMade3', 'FieldGoalsAttempted3', 'FreeThrowsMade',
       'FreeThrowsAttempted', 'OffensiveRebounds', 'DefensiveRebounds',
       'TotalRebounds', 'Assistances', 'Steals', 'Turnovers', 'BlocksFavour',
       'BlocksAgainst', 'FoulsCommited', 'FoulsReceived', 'Valuation',
       'Plusminus', 'team', 'seasoncode', 'gamecode', 'teamwin',
       'fantasy_points'],
      dtype='object')


In [None]:
# 6. Modeling (Optional)
# ----------------------
# Build predictive models if needed.

# 7. Insights & Conclusions
# -------------------------
# Summarize key findings and potential applications.

# Save cleaned data to CSV
# df.to_csv("euroleague_cleaned.csv", index=False)

In [25]:
    # # Convert only relevant numeric columns
    # numeric_columns = [col for col in df.columns if col.lower() in [
    #     'points', 'rebounds', 'assists', 'blocks', 'steals', 'turnovers',
    #     'minutes', 'seconds', 'fieldgoalsmade', 'fieldgoalsattempted',
    #     'threepointsmade', 'freethrowsmade', 'performanceindexrating', 'plusminus'
    #     # Add more numeric columns as needed
    # ]]
     
    
    # for col in numeric_columns:
    #     if col in df.columns:
    #         df[col] = df[col].astype(str).str.replace(',', '.')  # optional
    #         df[col] = pd.to_numeric(df[col], errors='coerce')
    #     return df
    # else:
    #     # No valid data found; return empty DataFrame
    #     return pd.DataFrame()