ORIGINAL DATASET GENERATION

In [13]:
# ===================================================================
# Synthetic Football Monitoring Dataset Generator (Final Calendar)
# ===================================================================

import pandas as pd
import numpy as np
from datetime import datetime, timedelta

np.random.seed(42)

# ----------------------------
# Settings
# ----------------------------
TEAMS = ["First Team", "U23", "U21", "U19"]
PLAYERS_PER_TEAM = 20
WEEKS = 24
START_DATE = datetime(2025, 1, 1)
WELLNESS_ITEMS = ["sleep_quality", "fatigue", "muscle_soreness", "stress", "motivation"]
FITNESS_TEST_TYPES = [
    "Sprint20m",
    "CMJ",
    "Submaximal_HR_Lactate",
    "Squat1RM",
    "SquatVelocity60%",
    "SquatLoad60%",
    "IsometricStrength"
]

# ----------------------------
# Helper Functions
# ----------------------------
def team_quality_factor(team):
    factors = {"First Team": 1.00, "U23": 0.95, "U21": 0.90, "U19": 0.85}
    return factors.get(team, 1.0)

def generate_players():
    players = []
    player_id = 1
    for team in TEAMS:
        for i in range(PLAYERS_PER_TEAM):
            players.append({
                "player_id": player_id,
                "name": f"Player_{player_id:03d}",
                "position": np.random.choice(["GK","DEF","MID","FWD"], p=[0.1,0.3,0.35,0.25]),
                "age": np.random.randint(17,31),
                "height_cm": int(np.random.normal(178,7)),
                "weight_kg": int(np.random.normal(74,6)),
                "team": team,
                "dominant_leg": np.random.choice(["Right","Left"]),
                "historical_max_speed_kmh": round(np.random.normal(33.0,1.5),1)
            })
            player_id +=1
    return pd.DataFrame(players)

# ----------------------------
# Generate Session Metrics with Calendar
# ----------------------------
def generate_session_metrics(players_df):
    rows = []
    current_date = START_DATE
    for week in range(WEEKS):
        # Choosing randomly between sunday and saturday
        md_weekday = int(np.random.choice([5,6]))
        md_date = current_date + timedelta(days=int(week*7 + md_weekday))

        # Assign training days
        day_mapping = {
            "MD": md_date,
            "MD-1": md_date - timedelta(days=1),
            "MD-2": md_date - timedelta(days=2),
            "MD-3": md_date - timedelta(days=3),
            "MD-4": md_date - timedelta(days=4),
            "MD+1": md_date + timedelta(days=1),
            "MD+2": md_date + timedelta(days=2)
        }

        for day_label, date in day_mapping.items():
            for team in TEAMS:
                team_players = players_df[players_df.team==team]

                # Define session types
                if day_label == "MD":
                    session_type = "Match"
                elif day_label in ["MD+1","MD+2"]:
                    session_type = np.random.choice(["Training","Rest"], p=[0.7,0.3])
                else:  # MD-1 to MD-4
                    session_type = "Training"

                for _, player in team_players.iterrows():
                    if session_type == "Training":
                        duration_min = int(np.random.normal(75,10))
                    elif session_type == "Match":
                        duration_min = int(np.random.normal(90,5))
                    else:  # Rest
                        duration_min = int(np.random.uniform(0,10))

                    total_distance_m = max(0,np.random.normal(7000,1500)) if session_type in ["Training","Match"] else 0
                    high_speed_distance_m = total_distance_m * np.random.uniform(0.05,0.10)
                    sprint_distance_m = total_distance_m * np.random.uniform(0.01,0.03) if np.random.rand()>0.3 else 0
                    accel_count = int(np.random.poisson(12)) if total_distance_m>0 else 0
                    decel_count = int(np.random.poisson(11)) if total_distance_m>0 else 0
                    mean_accel_intensity = round(np.random.uniform(2.5,4.0),2) if accel_count>0 else 0
                    mean_decel_intensity = round(np.random.uniform(2.5,4.0),2) if decel_count>0 else 0
                    max_speed_kmh = round(np.random.uniform(0.7,1.0)*player.historical_max_speed_kmh,1) if total_distance_m>0 else 0
                    perc_max_speed = round(max_speed_kmh/player.historical_max_speed_kmh*100,1)
                    distance_at_max_speed_m = round(np.random.uniform(0.03,0.10)*total_distance_m,1) if total_distance_m>0 else 0
                    rpe = round(np.random.uniform(3,8),1) if session_type in ["Training","Match"] else round(np.random.uniform(1,4),1)
                    hr_mean = round(np.random.uniform(120,180),1) if total_distance_m>0 else 60

                    # HR zones
                    hr_zone_total = duration_min if duration_min>0 else 1
                    hr_zone_5 = max(0,int(np.random.normal(5,2)))
                    remaining = hr_zone_total - hr_zone_5
                    hr_zone_1 = round(remaining*np.random.uniform(0.2,0.3))
                    hr_zone_2 = round(remaining*np.random.uniform(0.2,0.3))
                    hr_zone_3 = round(remaining*np.random.uniform(0.2,0.3))
                    hr_zone_4 = remaining - (hr_zone_1 + hr_zone_2 + hr_zone_3)
                    internal_load_rpe = rpe*duration_min
                    internal_load_hr = hr_zone_1*1 + hr_zone_2*2 + hr_zone_3*3 + hr_zone_4*4 + hr_zone_5*5

                    rows.append({
                        "date": date.date(),
                        "day_label": day_label,
                        "session_type": session_type,
                        "player_id": player.player_id,
                        "team": team,
                        "duration_min": duration_min,
                        "total_distance_m": total_distance_m,
                        "high_speed_distance_m": high_speed_distance_m,
                        "sprint_distance_m": sprint_distance_m,
                        "accel_count": accel_count,
                        "decel_count": decel_count,
                        "mean_accel_intensity": mean_accel_intensity,
                        "mean_decel_intensity": mean_decel_intensity,
                        "max_speed_kmh": max_speed_kmh,
                        "%_max_speed": perc_max_speed,
                        "distance_at_max_speed_m": distance_at_max_speed_m,
                        "rpe": rpe,
                        "hr_mean": hr_mean,
                        "hr_zone_1_min": hr_zone_1,
                        "hr_zone_2_min": hr_zone_2,
                        "hr_zone_3_min": hr_zone_3,
                        "hr_zone_4_min": hr_zone_4,
                        "hr_zone_5_min": hr_zone_5,
                        "internal_load_rpe": internal_load_rpe,
                        "internal_load_hr": internal_load_hr
                    })
    return pd.DataFrame(rows)

# ----------------------------
# Wellness (1-5)
# ----------------------------
def generate_wellness(session_df, players_df):
    rows = []
    for _, player in players_df.iterrows():
        q = team_quality_factor(player.team)
        player_sessions = session_df[session_df.player_id==player.player_id].sort_values("date")
        for date in player_sessions.date.unique():
            last3 = player_sessions[player_sessions.date < date].tail(3)
            recent_load = last3.internal_load_rpe.mean() if not last3.empty else 300
            sleep_quality = np.clip(np.random.normal(3*q - recent_load/1000,0.4),1,5)
            fatigue = np.clip(np.random.normal(2 + recent_load/1000,0.5),1,5)
            muscle_soreness = np.clip(np.random.normal(2 + recent_load/1200,0.5),1,5)
            stress = np.clip(np.random.normal(2 + recent_load/1000,0.5),1,5)
            motivation = np.clip(np.random.normal(3*q - recent_load/1500,0.5),1,5)
            rows.append({
                "date": date,
                "player_id": player.player_id,
                "sleep_quality": round(sleep_quality,1),
                "fatigue": round(fatigue,1),
                "muscle_soreness": round(muscle_soreness,1),
                "stress": round(stress,1),
                "motivation": round(motivation,1)
            })
    return pd.DataFrame(rows)

# ----------------------------
# Fitness Tests
# ----------------------------
def generate_fitness_tests(players_df):
    rows = []
    for _, player in players_df.iterrows():
        q = team_quality_factor(player.team)
        test_weeks = np.arange(0,WEEKS,np.random.randint(4,7))
        for week in test_weeks:
            test_date = START_DATE + timedelta(days=int(week*7))
            for test_type in FITNESS_TEST_TYPES:
                if test_type == "Sprint20m":
                    base_val = np.random.normal(3.3,0.15)/q
                    unit = "s"
                elif test_type == "CMJ":
                    base_val = np.random.normal(40*q,5)
                    unit = "cm"
                elif test_type == "Submaximal_HR_Lactate":
                    base_val = np.random.normal(2.2/q,0.5)
                    unit = "mmol/L"
                elif test_type == "Squat1RM":
                    base_val = np.random.normal(130*q,20)
                    unit = "kg"
                elif test_type == "SquatVelocity60%":
                    base_val = np.random.normal(0.9*q,0.1)
                    unit = "m/s"
                elif test_type == "SquatLoad60%":
                    base_val = np.random.normal(0.6*130*q,10)
                    unit = "kg"
                elif test_type == "IsometricStrength":
                    base_val = np.random.normal(400*q,50)
                    unit = "N"
                else:
                    base_val, unit = np.nan, ""
                for attempt in range(1,4):
                    val = round(base_val*np.random.uniform(0.97,1.03),2)
                    rows.append({
                        "player_id": player.player_id,
                        "test_date": test_date.date(),
                        "test_type": test_type,
                        "attempt": attempt,
                        "result": val,
                        "unit": unit
                    })
    return pd.DataFrame(rows)

# ----------------------------
# Generate CSVs
# ----------------------------
print("Generating players...")
players_df = generate_players()
players_df.to_csv("players.csv", index=False)

print("Generating session metrics...")
session_df = generate_session_metrics(players_df)
session_df.to_csv("session_metrics.csv", index=False)

print("Generating wellness data...")
wellness_df = generate_wellness(session_df, players_df)
wellness_df.to_csv("wellness.csv", index=False)

print("Generating fitness tests...")
fitness_df = generate_fitness_tests(players_df)
fitness_df.to_csv("fitness_tests.csv", index=False)

print("All CSV files generated successfully.")


Generating players...
Generating session metrics...
Generating wellness data...
Generating fitness tests...
All CSV files generated successfully.


DAILY SESSION ACTUALIZATION

In [18]:
import pandas as pd
import numpy as np
from datetime import timedelta

np.random.seed(123)

# ----------------------------
# LOAD HISTORICAL DATA
# ----------------------------
players_df = pd.read_csv("players.csv")
session_df = pd.read_csv("session_metrics.csv")
wellness_df = pd.read_csv("wellness.csv")
fitness_df = pd.read_csv("fitness_tests.csv")

# ----------------------------
# DETERMINE NEXT UPDATE DATE
# ----------------------------
last_date = pd.to_datetime(session_df['date']).max()
UPDATE_DATE = last_date + timedelta(days=1)
print("Generating daily update for:", UPDATE_DATE.date())

# ----------------------------
# DAILY WELLNESS
# ----------------------------
def generate_daily_wellness(player_id):
    last_days = wellness_df[wellness_df['player_id']==player_id].sort_values('date').tail(3)
    base_sleep = 3 if last_days.empty else last_days['sleep_quality'].mean()
    base_fatigue = 2 if last_days.empty else last_days['fatigue'].mean()
    base_muscle = 2 if last_days.empty else last_days['muscle_soreness'].mean()
    base_stress = 2 if last_days.empty else last_days['stress'].mean()
    base_motivation = 3 if last_days.empty else last_days['motivation'].mean()
    return {
        "sleep_quality": float(np.clip(base_sleep + np.random.normal(0,0.3),1,5)),
        "fatigue": float(np.clip(base_fatigue + np.random.normal(0,0.3),1,5)),
        "muscle_soreness": float(np.clip(base_muscle + np.random.normal(0,0.3),1,5)),
        "stress": float(np.clip(base_stress + np.random.normal(0,0.3),1,5)),
        "motivation": float(np.clip(base_motivation + np.random.normal(0,0.3),1,5))
    }

wellness_update = []
for pid in players_df['player_id']:
    wellness_data = generate_daily_wellness(pid)
    wellness_data.update({
        "player_id": pid,
        "date": UPDATE_DATE.date().isoformat()
    })
    wellness_update.append(wellness_data)

wellness_update_df = pd.DataFrame(wellness_update)

# ----------------------------
# DAILY SESSION METRICS
# ----------------------------
session_types = ["Training","Match","Rest"]
sessions_update = []

for pid in players_df['player_id']:
    session_type = np.random.choice(session_types, p=[0.6,0.1,0.3])
    if session_type == "Rest":
        duration_min = 0
        total_distance = 0
    elif session_type == "Training":
        duration_min = int(np.random.normal(75,10))
        total_distance = max(0,np.random.normal(6000,1000))
    else:  # Match
        duration_min = int(np.random.normal(90,5))
        total_distance = max(0,np.random.normal(8000,1500))

    high_speed_distance = total_distance*np.random.uniform(0.03,0.08) if total_distance>0 else 0
    sprint_distance = total_distance*np.random.uniform(0.01,0.03) if total_distance>0 and np.random.rand()>0.3 else 0

    historical_max = players_df.loc[players_df['player_id']==pid,'historical_max_speed_kmh'].values[0]
    max_speed = round(np.random.uniform(0.7,1.0)*historical_max,1) if total_distance>0 else 0
    perc_max_speed = round(max_speed/historical_max*100,1) if historical_max>0 else 0

    # HR zones
    hr_zone_5 = max(0,int(np.random.normal(5,2)))
    remaining = duration_min - hr_zone_5
    hr_zone_1 = round(remaining*np.random.uniform(0.2,0.3)) if remaining>0 else 0
    hr_zone_2 = round(remaining*np.random.uniform(0.2,0.3)) if remaining>0 else 0
    hr_zone_3 = round(remaining*np.random.uniform(0.2,0.3)) if remaining>0 else 0
    hr_zone_4 = remaining - (hr_zone_1 + hr_zone_2 + hr_zone_3) if remaining>0 else 0

    sessions_update.append({
        "player_id": pid,
        "date": UPDATE_DATE.date().isoformat(),
        "session_type": session_type,
        "duration_min": duration_min,
        "total_distance_m": total_distance,
        "high_speed_distance_m": high_speed_distance,
        "sprint_distance_m": sprint_distance,
        "max_speed_kmh": max_speed,
        "%_max_speed": perc_max_speed,
        "hr_zone_1_min": hr_zone_1,
        "hr_zone_2_min": hr_zone_2,
        "hr_zone_3_min": hr_zone_3,
        "hr_zone_4_min": hr_zone_4,
        "hr_zone_5_min": hr_zone_5
    })

session_update_df = pd.DataFrame(sessions_update)

# ----------------------------
# DAILY FITNESS TESTS (optional every 28 days)
# ----------------------------
fitness_update = []
if UPDATE_DATE.day % 28 == 0:
    test_types = ["Sprint20m","CMJ","Submaximal_HR_Lactate","Squat1RM","SquatVelocity60%","SquatLoad60%","IsometricStrength"]
    for pid in players_df['player_id']:
        for ttype in test_types:
            results = np.round(np.random.uniform(0.97,1.03,3),2)  # aplicar a base seg√∫n necesidad
            fitness_update.append({
                "player_id": pid,
                "date": UPDATE_DATE.date().isoformat(),
                "test_type": ttype,
                "attempt1": results[0],
                "attempt2": results[1],
                "attempt3": results[2]
            })
fitness_update_df = pd.DataFrame(fitness_update)

# ----------------------------
# SAVE UPDATE CSVS
# ----------------------------
wellness_update_df.to_csv("wellness_update.csv", index=False)
session_update_df.to_csv("session_metrics_update.csv", index=False)
fitness_update_df.to_csv("fitness_tests_update.csv", index=False)
print("Daily update CSVs generated for", UPDATE_DATE.date().isoformat())


Generating daily update for: 2025-06-19
Daily update CSVs generated for 2025-06-19
