In [5]:
import pandas as pd 
import os
import numpy as np
import re

In [6]:
import os
import numpy as np
import pandas as pd

def sample_data(n_data=100, seed=None, unique_pairs=True, min_rows=1):
    rng = np.random.default_rng(seed)
    root = "../data/Fantasy-Premier-League/data/2024-25/players"
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]

    rows = []
    seen = set()  # for unique (player, fixture) pairs
    tries = 0
    max_tries = n_data * 20  # safety to avoid infinite loops

    while len(rows) < n_data and tries < max_tries:
        tries += 1
        player = rng.choice(dirs)
        player_path = os.path.join(root, player, "gw.csv")
        if not os.path.exists(player_path):
            continue

        df = pd.read_csv(player_path)
        if df.empty or len(df) < min_rows:
            continue

        # pick a random row
        rand_idx = rng.integers(0, len(df))
        row = df.iloc[rand_idx]

        # pick a column name that identifies the fixture/round in this repo
        # (commonly 'round'; sometimes people rename to 'fixture')
        key_col = 'fixture' if 'fixture' in df.columns else ('round' if 'round' in df.columns else None)
        pair_key = (player, None if key_col is None else row[key_col])

        if unique_pairs and pair_key in seen:
            continue

        row_dict = row.to_dict()
        row_dict["player_dir"] = player  # keep folder name for traceability
        rows.append(row_dict)
        if unique_pairs:
            seen.add(pair_key)

    return pd.DataFrame(rows)

# Example:
# df = sample_data(n_data=1000, seed=42, unique_pairs=True)
# df.to_csv("random_player_fixtures.csv", index=False)


In [7]:
sample_data().columns

Index(['assists', 'bonus', 'bps', 'clean_sheets', 'creativity', 'element',
       'expected_assists', 'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'fixture', 'goals_conceded', 'goals_scored',
       'ict_index', 'influence', 'kickoff_time', 'minutes', 'mng_clean_sheets',
       'mng_draw', 'mng_goals_scored', 'mng_loss', 'mng_underdog_draw',
       'mng_underdog_win', 'mng_win', 'modified', 'opponent_team', 'own_goals',
       'penalties_missed', 'penalties_saved', 'red_cards', 'round', 'saves',
       'selected', 'starts', 'team_a_score', 'team_h_score', 'threat',
       'total_points', 'transfers_balance', 'transfers_in', 'transfers_out',
       'value', 'was_home', 'yellow_cards', 'player_dir'],
      dtype='object')

In [8]:
fixture_data = pd.read_csv('../data/Fantasy-Premier-League/data/2024-25/fixtures.csv')


In [98]:
def sample_random(year=None):
    if year is None:
        years = ["2019-20", "2020-21", "2021-22", "2022-23", "2023-24", "2024-25"]
    else:
        years = [year]
    year = np.random.choice(years)
    root = f"../data/Fantasy-Premier-League/data/{year}/players"
    dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
    player = np.random.choice(dirs)
    player_path = os.path.join(root, player, "gw.csv")
    data = load_csv(player_path)
    data["kickoff_time"] = pd.to_datetime(data["kickoff_time"])
    avail_gws = (pd.to_numeric(data["round"], errors="coerce")
                   .dropna().astype(int).unique())
    gw = 38
    return str(year), gw, str(player), data

def norm_name(name_path):
    """
    Normalize player names by removing special characters and converting to lowercase.
    """
    name = os.path.basename(name_path).replace(".csv", "")
    name = name.replace("_", " ")
    name = re.sub(r"\d*", "", name)
    return name.strip()

def load_csv(path):
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin-1"):
        try:
            return pd.read_csv(
                path,
                engine="python",       # more tolerant
                on_bad_lines="skip",   # or "skip"
                encoding=enc
            )
        except UnicodeDecodeError:
            continue
    # last-resort: decode with utf-8 replacing bad bytes (keeps data length)
    return pd.read_csv(
        path,
        engine="python",
        on_bad_lines="skip",
        encoding="utf-8",
        encoding_errors="replace"
    )


def calc_form(gw: int, df: pd.DataFrame) -> float:
    """
    Return the average total_points in the 30 days strictly before GW `gw`.
    Assumes `df` is a single player's gw.csv with columns:
      - 'round' (GW number)
      - 'kickoff_time' (ISO timestamp)
      - 'total_points'
    """

    # Reference time = earliest kickoff of the target GW (handles double GWs)
    ref_time = df.loc[df["round"] == gw, "kickoff_time"].min()
    if pd.isna(ref_time):
        raise ValueError(f"GW {gw} not found in dataframe")

    window_start = ref_time - pd.Timedelta(days=30)
    prior = df.loc[(df["kickoff_time"] >= window_start) & (df["kickoff_time"] < ref_time), "total_points"]

    return float(prior.mean()) if len(prior) else 0.0


def get_team_name(year, gw, season_data, fixture_df, opp):
    teams = load_csv(f"../data/Fantasy-Premier-League/data/{year}/teams.csv")
    was_home = season_data[season_data["round"] == gw]["was_home"].values[0]
    fixture = fixture_df[fixture_df["id"] == season_data[season_data["round"] == gw]["fixture"].values[0]]
    if opp:
        team_id = fixture["team_h"].values[0] if not was_home else fixture["team_a"].values[0]
    else:
        team_id = fixture["team_h"].values[0] if was_home else fixture["team_a"].values[0]
    date = pd.to_datetime(fixture["kickoff_time"]).values[0]
    team_name = teams[teams["id"] == team_id]["name"].values[0]
    return team_name
    


def get_player_data(year, id):
    data = load_csv(f"../data/Fantasy-Premier-League/data/{year}/players_raw.csv")
    player = data[data["id"] == id]
    if player.empty:
        raise ValueError(f"Player with ID {id} not found in {year} data.")
    result = {
        "form": player["form"].values[0],
        "position": player["element_type"].values[0],
        "creativity": player["creativity"].values[0],
        "influence": player["influence"].values[0],
        "threat": player["threat"].values[0],
        "selected": player["selected_by_percent"].values[0],
        "playing_chance": player["chance_of_playing_this_round"].values[0],
        "birth_date": player["birth_date"].values[0] if "birth_date" in player.columns else None,
        "corners_and_free_kicks_order": player["corners_and_indirect_freekicks_order"].fillna(0).values[0],
        "penalties_order": player["penalties_order"].fillna(0).values[0],
    }
    need_to_norm = {
        "team_join_date": player["team_join_date"].values[0] if "team_join_date" in player.columns else None,
        "bps": player["bps"],
        "minutes_total": player["minutes"].values[0],
        "yellow_cards": player["yellow_cards"].values[0],
        "red_cards": player["red_cards"].values[0],
    }
    return result, need_to_norm

def get_team_data(team_name, teams_df, home):
    team_r = teams_df[teams_df["name"] == team_name]
    if home:
        team_def = team_r["strength_defence_away"].values[0]
        team_att = team_r["strength_attack_away"].values[0]
    else:
        team_def = team_r["strength_defence_away"].values[0]
        team_att = team_r["strength_attack_away"].values[0]
    strength = team_r["strength"].values[0]
    return strength, team_att, team_def

In [99]:
year, gw, player, data = sample_random()
print(f"Sampled: {year}, GW {gw}, Player: {player}")
data[data["round"] == gw]["fixture"].values[0]
fixture = int(data[data["round"] == gw]["fixture"].values[0])
fixture_df = load_csv(f"../data/Fantasy-Premier-League/data/{year}/fixtures.csv")
print(fixture_df.columns)
print("playing for:", get_team_name(year, gw, data, fixture_df, opp=False))
print("playing against:", get_team_name(year, gw, data, fixture_df, opp=True))


Sampled: 2021-22, GW 38, Player: Taylor_Richards_537
Index(['code', 'event', 'finished', 'finished_provisional', 'id',
       'kickoff_time', 'minutes', 'provisional_start_time', 'started',
       'team_a', 'team_a_score', 'team_h', 'team_h_score', 'stats',
       'team_h_difficulty', 'team_a_difficulty', 'pulse_id'],
      dtype='object')
playing for: Brighton
playing against: West Ham


In [100]:
def generate_data():
    results = []
    years = ["2023-24", "2024-25"]
    for year in years:
        root = f"../data/Fantasy-Premier-League/data/{year}/players"
        dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
        gw = 38
        for player in dirs:
            player_path = os.path.join(root, player)
            season_data = load_csv(os.path.join(player_path, "gw.csv"))
            season_data["kickoff_time"] = pd.to_datetime(season_data["kickoff_time"])
            player_name = norm_name(player)
            gw_player_data = season_data[season_data["round"] == gw]
            if gw_player_data.empty:
                continue
            fixture = int(gw_player_data["fixture"].values[0])
            fixture_df = load_csv(f"../data/Fantasy-Premier-League/data/{year}/fixtures.csv")
            id = gw_player_data["element"].values[0]
            team_name = get_team_name(year, gw, season_data, fixture_df, opp=False)
            opposition_name = get_team_name(year, gw, season_data, fixture_df, opp=True)
            result = ({
                "year": year,
                "gw": gw,
                "player": player_name,
                "id": id,
                "points_scored": gw_player_data["total_points"].values[0],
                "fixture_id": fixture,
                "team_name": team_name,
                "opposition_name": opposition_name
            })
            player_data, need_to_norm = get_player_data(year, id, season_data["kickoff_time"].values[0])
            for key, value in player_data.items():
                result[key] = value

            if need_to_norm["team_join_date"] is not None:
                join_dt = pd.to_datetime(need_to_norm["team_join_date"], utc=True, errors="coerce")
                mask_window = (season_data["round"] < gw) & (
                    season_data["kickoff_time"] >= join_dt if pd.notna(join_dt) else True
                )
            else:
                hist = pd.read_csv(os.path.join(player_path, "history.csv"))
                if hist.empty:
                    continue
                mask_window = (season_data["round"] < gw)

            possible = season_data[mask_window] ### Subset of fixtures the player was eligible for at this club, doesnt consider injuries annoyingly
            played = possible[possible["minutes"] > 0] ### This is the subset of fixtures where the player actually played (minutes > 0)
            mins = float(possible["minutes"].sum()) / 90
            
            result["form"] = calc_form(gw, season_data)
            result["yellow_card_90"] = (played["yellow_cards"].sum() / mins) if mins > 0 else 0.0
            result["red_card_90"] = (played["red_cards"].sum()    / mins) if mins > 0 else 0.0
            result["bps_90"] = (played["bps"].sum() / mins) if mins > 0 else 0.0
            result["game_time_percent"] = (float(played["minutes"].sum()) / possible["minutes"].sum()) if possible["minutes"].sum() > 0 else 0.0
            result["goals_90"] = (played["goals_scored"].sum() / mins) if mins > 0 else 0.0
            result["assists_90"] = (played["assists"].sum() / mins) if mins > 0 else 0.0
            result["goals_conceded_90"] = (played["goals_conceded"].sum() / mins) if mins > 0 else 0.0
            result["starts_90"] = (played["starts"].sum() / mins) if mins > 0 else 0.0
            result["saves_90"] = (played["saves"].sum() / mins) if mins > 0 else 0.0
            result["clean_sheets_90"] = (played["clean_sheets"].sum() / mins) if mins > 0 else 0.0

            #### Get team stats (defence score etc.)
            teams = load_csv(f"../data/Fantasy-Premier-League/data/{year}/teams.csv")
            team_data = get_team_data(team_name, teams, season_data[season_data["round"] == gw]["was_home"].values[0])
            result["team_strength"] = team_data[0]
            result["team_strength_attack"] = team_data[1]
            result["team_strength_defence"] = team_data[2]
            opp = get_team_data(opposition_name, teams, not season_data[season_data["round"] == gw]["was_home"].values[0])
            result["opposition_strength"] = opp[0]
            result["opposition_strength_attack"] = opp[1]
            result["opposition_strength_defence"] = opp[2]

            results.append(result)
    return pd.DataFrame(results)

In [101]:
result = generate_data()

TypeError: get_player_data() takes 2 positional arguments but 3 were given

In [None]:
result.to_csv("../data/processed_data.csv", index=False)

In [None]:
def generate_data2():
    results = []
    years = ["2023-24", "2024-25"]
    for year in years:
        root = f"../data/Fantasy-Premier-League/data/{year}/players"
        dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]
        fixture_df = load_csv(f"../data/Fantasy-Premier-League/data/{year}/fixtures.csv")
        teams = load_csv(f"../data/Fantasy-Premier-League/data/{year}/teams.csv")
        for player in dirs:
            player_path = os.path.join(root, player)
            season_data = load_csv(os.path.join(player_path, "gw.csv"))
            season_data["kickoff_time"] = pd.to_datetime(season_data["kickoff_time"])
            played_games = season_data[season_data["minutes"] > 0]
            player_name = norm_name(player)
            for gw in range(1, 39):
                played_games_prior = played_games[played_games["round"] < gw]
                result = {"gw": gw, "year": year, "player": player_name}
                gw_player_data = season_data[season_data["round"] == gw]
                if gw_player_data.empty:
                    continue
                if gw_player_data["minutes"].sum() == 0:
                    continue
                id = gw_player_data["element"].values[0]
                n_games = len(gw_player_data)
                total_points = gw_player_data["total_points"].sum()
                result["points_scored"] = total_points
                fixtures = gw_player_data["fixture"].values
                was_home = gw_player_data["was_home"].values
                playing_against_difficulty = 0
                playing_against_defence = 0
                playing_against_attack = 0
                for i, fixture in enumerate(list(fixtures)):
                    fixture = int(fixture)
                    home = was_home[i]
                    fixture_df_row = fixture_df[fixture_df["id"] == fixture]
                    team = fixture_df_row["team_h"].values[0] if home else fixture_df_row["team_a"].values[0]
                    result["playing_for_difficulty"] = fixture_df_row["team_h_difficulty"].values[0] if home else fixture_df_row["team_a_difficulty"].values[0]
                    result["playing_for_defence"]= teams[teams["name"] == team]["strength_defence_home"].values[0] if home else fixture_df_row["strength_defence_away"].values[0]
                    result["playing_for_attack"] = teams[teams["name"] == team]["strength_attack_home"].values[0] if home else fixture_df_row["strength_attack_away"].values[0]
                    opp = fixture_df_row["team_a"].values[0] if home else fixture_df_row["team_h"].values[0]
                    opp_name = teams[teams["id"] == opp]["name"].values[0]
                    if home:
                        playing_against_difficulty += fixture_df_row["team_a_difficulty"].values[0]
                        playing_against_defence += teams[teams["name"] == opp_name]["strength_defence_home"].values[0]
                        playing_against_attack += teams[teams["name"] == opp_name]["strength_attack_home"].values[0]
                    else:
                        playing_against_difficulty += fixture_df_row["team_h_difficulty"].values[0]
                        playing_against_defence += teams[teams["name"] == opp_name]["strength_defence_away"].values[0]
                        playing_against_attack += teams[teams["name"] == opp_name]["strength_attack_away"].values[0]
                result["playing_against_mean_difficulty"] = playing_against_difficulty / n_games
                result["playing_against_mean_defence"] = playing_against_defence / n_games
                result["playing_against_mean_attack"] = playing_against_attack / n_games
                player_data, _ = get_player_data(year, id)
                for key, value in player_data.items():
                    result[key] = value
                total_minutes = played_games_prior["minutes"].sum()
                result["avg_minutes_when_playing"] = total_minutes / len(played_games_prior) if len(played_games_prior) > 0 else 0.0
                result["avg_points_when_playing"] = played_games_prior["total_points"].sum() / len(played_games_prior) if len(played_games_prior) > 0 else 0.0
                result["form"] = calc_form(gw, season_data)
                result["avg_yellows_p_game_when_playing"] = (played_games_prior["yellow_cards"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_reds_p_game_when_playing"] = (played_games_prior["red_cards"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_bps_p_game_when_playing"] = (played_games_prior["bps"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_goals_p_game_when_playing"] = (played_games_prior["goals_scored"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_assists_p_game_when_playing"] = (played_games_prior["assists"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_clean_sheets_when_playing"] = (played_games_prior["clean_sheets"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_goals_conceded_when_playing"] = (played_games_prior["goals_conceded"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_starts_when_playing"] = (played_games_prior["starts"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                result["avg_saves_when_playing"] = (played_games_prior["saves"].sum() / len(played_games_prior)) if len(played_games_prior) > 0 else 0.0
                results.append(result)
    return pd.DataFrame(results)


In [None]:
generate_data2()

IndexError: index 0 is out of bounds for axis 0 with size 0

In [108]:
def generate_data2():
    results = []
    years = ["2021-22", "2022-23", "2023-24", "2024-25"]
    for year in years:
        root = f"../data/Fantasy-Premier-League/data/{year}/players"
        dirs = [d for d in os.listdir(root) if os.path.isdir(os.path.join(root, d))]

        fixture_df = load_csv(f"../data/Fantasy-Premier-League/data/{year}/fixtures.csv").copy()
        teams = load_csv(f"../data/Fantasy-Premier-League/data/{year}/teams.csv").copy()

        # Index for fast, reliable lookups
        fixtures_idx = fixture_df.set_index("id", drop=True)
        teams_idx = teams.set_index("id", drop=True)

        for player in dirs:
            player_path = os.path.join(root, player)
            season_data = load_csv(os.path.join(player_path, "gw.csv")).copy()

            # Robust types
            if "kickoff_time" in season_data.columns:
                season_data["kickoff_time"] = pd.to_datetime(season_data["kickoff_time"], errors="coerce")
            for col in ["minutes", "round", "fixture", "team", "opponent_team", "bps",
                        "yellow_cards", "red_cards", "goals_scored", "assists",
                        "clean_sheets", "goals_conceded", "saves", "total_points", "starts"]:
                if col in season_data.columns:
                    season_data[col] = pd.to_numeric(season_data[col], errors="coerce").fillna(0)

            played_games = season_data[season_data["minutes"] > 0]
            player_name = norm_name(player)

            for gw in range(1, 39):
                played_games_prior = played_games[played_games["round"] < gw]

                gw_player_data = season_data[season_data["round"] == gw]
                if gw_player_data.empty:
                    continue
                if gw_player_data["minutes"].sum() == 0:
                    continue

                # Basic row
                result = {"gw": gw, "year": year, "player": player_name}

                # Player ID from gw.csv (FPL element id)
                if "element" in gw_player_data.columns:
                    id_ = int(gw_player_data["element"].iloc[0])
                else:
                    id_ = None

                n_games = len(gw_player_data)
                result["n_games_in_gw"] = n_games
                result["points_scored"] = float(gw_player_data["total_points"].sum())

                fixtures = gw_player_data["fixture"].astype(int).tolist()
                was_home_flags = gw_player_data["was_home"].astype(bool).tolist() if "was_home" in gw_player_data.columns else [True]*n_games

                # Accumulators across (possible) multiple fixtures
                playing_against_difficulty = 0.0
                playing_against_defence = 0.0
                playing_against_attack = 0.0

                # We'll also record the *player's* team context for the first fixture of the GW
                playing_for_difficulty = None
                playing_for_defence = None
                playing_for_attack = None

                for i, fixture_id in enumerate(fixtures):
                    if fixture_id not in fixtures_idx.index:
                        # Skip if fixture not found
                        continue
                    frow = fixtures_idx.loc[fixture_id]
                    home = bool(was_home_flags[i])

                    # Determine our team_id and opponent team_id from the fixture + home flag
                    team_id = int(frow["team_h"]) if home else int(frow["team_a"])
                    opp_id  = int(frow["team_a"]) if home else int(frow["team_h"])

                    # Difficulties from fixtures.csv
                    if home:
                        playing_against_difficulty += float(frow["team_a_difficulty"])
                        pf_diff = float(frow["team_h_difficulty"])
                    else:
                        playing_against_difficulty += float(frow["team_h_difficulty"])
                        pf_diff = float(frow["team_a_difficulty"])

                    # Strengths from teams.csv (be careful with home/away columns)
                    if team_id in teams_idx.index:
                        if home:
                            pf_def = float(teams_idx.at[team_id, "strength_defence_home"])
                            pf_att = float(teams_idx.at[team_id, "strength_attack_home"])
                        else:
                            pf_def = float(teams_idx.at[team_id, "strength_defence_away"])
                            pf_att = float(teams_idx.at[team_id, "strength_attack_away"])
                    else:
                        pf_def = pf_att = float("nan")

                    if opp_id in teams_idx.index:
                        if home:
                            # opponent is away
                            playing_against_defence += float(teams_idx.at[opp_id, "strength_defence_away"])
                            playing_against_attack  += float(teams_idx.at[opp_id, "strength_attack_away"])
                        else:
                            # opponent is home
                            playing_against_defence += float(teams_idx.at[opp_id, "strength_defence_home"])
                            playing_against_attack  += float(teams_idx.at[opp_id, "strength_attack_home"])
                    else:
                        # If unknown, skip contributions
                        pass

                    # Save "playing_for_*" once (first fixture context)
                    if playing_for_difficulty is None:
                        playing_for_difficulty = pf_diff
                        playing_for_defence = pf_def
                        playing_for_attack = pf_att

                # Averages over the number of fixtures in the GW
                if n_games > 0:
                    result["playing_against_mean_difficulty"] = playing_against_difficulty / n_games
                    result["playing_against_mean_defence"] = playing_against_defence / n_games
                    result["playing_against_mean_attack"] = playing_against_attack / n_games
                else:
                    result["playing_against_mean_difficulty"] = 0.0
                    result["playing_against_mean_defence"] = 0.0
                    result["playing_against_mean_attack"] = 0.0

                result["playing_for_difficulty"] = playing_for_difficulty
                result["playing_for_defence"] = playing_for_defence
                result["playing_for_attack"] = playing_for_attack

                # Optional: enrich with your player metadata
                if id_ is not None:
                    player_data, _ = get_player_data(year, id_)
                    for key, value in player_data.items():
                        result[key] = value

                # Prior-form features (when played)
                denom = max(len(played_games_prior), 1)
                total_minutes = float(played_games_prior["minutes"].sum())
                result["avg_minutes_when_playing"] = total_minutes / denom if len(played_games_prior) > 0 else 0.0
                result["avg_points_when_playing"] = float(played_games_prior["total_points"].sum()) / denom if len(played_games_prior) > 0 else 0.0
                result["form"] = calc_form(gw, season_data)

                # Rates from prior games
                def avg(col):
                    return float(played_games_prior[col].sum()) / denom if len(played_games_prior) > 0 else 0.0

                for col, out in [
                    ("yellow_cards", "avg_yellows_p_game_when_playing"),
                    ("red_cards", "avg_reds_p_game_when_playing"),
                    ("bps", "avg_bps_p_game_when_playing"),
                    ("goals_scored", "avg_goals_p_game_when_playing"),
                    ("assists", "avg_assists_p_game_when_playing"),
                    ("clean_sheets", "avg_clean_sheets_when_playing"),
                    ("goals_conceded", "avg_goals_conceded_when_playing"),
                    ("starts", "avg_starts_when_playing"),
                    ("saves", "avg_saves_when_playing"),
                ]:
                    if col in played_games_prior.columns:
                        result[out] = avg(col)
                    else:
                        result[out] = 0.0

                results.append(result)

    return pd.DataFrame(results)


In [109]:
data = generate_data2()

In [110]:
data.to_csv("../data/stats_by_game_played.csv", index=False)