In [19]:
import pandas as pd 
from typing import Dict

In [20]:
def load_csv(path):
    """Read a CSV file into a DataFrame, guessing its text encoding.

    The candidate encodings are tried in order; the first one that does not
    raise ``UnicodeDecodeError`` is used. Every attempt uses the Python
    parser engine and silently drops malformed rows
    (``on_bad_lines="skip"``). If no candidate succeeds, the file is read as
    UTF-8 with undecodable bytes replaced.

    Args:
        path: Location of the CSV file; forwarded unchanged to ``pd.read_csv``.

    Returns:
        pd.DataFrame: The parsed table.
    """
    # Parser options common to every attempt: tolerant engine, drop bad rows.
    tolerant_opts = {"engine": "python", "on_bad_lines": "skip"}

    candidate_encodings = ("utf-8", "utf-8-sig", "cp1252", "latin-1")
    for candidate in candidate_encodings:
        try:
            frame = pd.read_csv(path, encoding=candidate, **tolerant_opts)
        except UnicodeDecodeError:
            # Wrong guess for this file; fall through to the next encoding.
            continue
        return frame

    # Last resort: force UTF-8 and substitute anything that cannot be decoded.
    return pd.read_csv(
        path,
        encoding="utf-8",
        encoding_errors="replace",
        **tolerant_opts,
    )

def get_path_from_row(root:str, row: pd.Series) -> str:
    """Build the path of a player's data directory under ``root``.

    The directory name is ``<first_name>_<second_name>_<id>``, read from the
    given row; any of the three fields missing from the row is rendered as
    an empty string.

    Args:
        root: Base directory that contains the ``players`` folder.
        row: A row holding ``first_name``, ``second_name`` and ``id``.

    Returns:
        The path ``{root}/players/{first_name}_{second_name}_{id}``.
    """
    given_name = row.get("first_name", "")
    family_name = row.get("second_name", "")
    player_id = row.get("id", "")
    folder_name = f"{given_name}_{family_name}_{player_id}"
    return f"{root}/players/{folder_name}"

In [21]:
def calculate_weighted_field(player_df, field_name, upcoming_gw, look_back=5):
    """Recency-weighted average of one column over a player's recent fixtures.

    Rows whose ``round`` is strictly before ``upcoming_gw`` are ordered from
    most to least recent and the first ``look_back`` of them are kept. The
    most recent row gets weight ``look_back``, the next ``look_back - 1``,
    and so on; the result is the weighted sum divided by the sum of the
    weights actually used.

    Args:
        player_df: Per-fixture rows for one player; must have a ``round``
            column and the column named by ``field_name``.
        field_name: Column to average (e.g. points scored, xG, xA).
        upcoming_gw: Game week being predicted; only earlier rounds count.
        look_back: Maximum number of past fixture rows to include.

    Returns:
        The weighted average, or ``0`` when there are no earlier fixtures.
    """
    earlier = player_df[player_df["round"] < upcoming_gw]
    recent = earlier.sort_values(by="round", ascending=False).head(look_back)

    # Weights count down from look_back, one per retained row.
    weights = range(look_back, look_back - len(recent), -1)

    numerator = 0
    denominator = 0
    for weight, (_, row) in zip(weights, recent.iterrows()):
        numerator += row[field_name] * weight
        denominator += weight

    if denominator != 0:
        return numerator / denominator
    return 0


In [27]:
# Per-fixture columns that get a recency-weighted average as model features.
# The order here determines the order of the leading feature columns.
fields_to_weight = [
    "assists",
    "goals_scored",
    "minutes",
    "goals_conceded",
    "influence",
    "creativity",
    "threat",
    "ict_index",
    "bonus",
    "yellow_cards",
    "saves",
    "expected_assists",
    "expected_goal_involvements",
    "expected_goals",
    "expected_goals_conceded",
]


In [28]:
def get_team_and_fixture_info(player_df, teams_df, upcoming_gw):
    """Collect target points and opponent strength for each fixture in a game week.

    One dict is produced per row of ``player_df`` whose ``round`` equals
    ``upcoming_gw``, so a double game week yields two entries and a blank
    game week yields an empty list.

    The opponent's attack/defence strength is taken for the venue the
    *opponent* played at: when the player was at home the opponent was away,
    so the opponent's ``*_away`` ratings are used, and vice versa.
    NOTE(review): this assumes ``was_home`` describes the player's own team,
    as in the FPL per-player ``gw.csv`` export — confirm against the data.

    Args:
        player_df: Per-fixture rows for one player with ``round``,
            ``total_points``, ``opponent_team`` and ``was_home`` columns.
        teams_df: Team table with ``id``, ``strength`` and the four
            ``strength_{attack,defence}_{home,away}`` columns.
        upcoming_gw: Game week (round) to extract.

    Returns:
        list[dict]: One dict per fixture with keys ``points``,
        ``opposition_strength``, ``opponent_strength_attack``,
        ``opponent_strength_defense`` and ``was_home`` (0/1).

    Raises:
        IndexError: If a fixture's ``opponent_team`` id is not in ``teams_df``.
    """
    gw_fixtures = player_df[player_df["round"] == upcoming_gw]
    gw_data = []  # one dict per fixture in the game week (handles double GWs)
    for _, fixture in gw_fixtures.iterrows():
        opponent_team_id = fixture["opponent_team"]
        was_home = fixture["was_home"]

        opponent_row = teams_df[teams_df["id"] == opponent_team_id]
        if opponent_row.empty:
            # Same exception type the bare `.values[0]` lookup would raise,
            # but with enough context to find the bad row.
            raise IndexError(
                f"opponent team id {opponent_team_id!r} not found in teams_df "
                f"(game week {upcoming_gw})"
            )

        # The opponent plays at the opposite venue to the player.
        opponent_venue = "away" if was_home else "home"

        gw_data.append({
            "points": fixture["total_points"],
            "opposition_strength": opponent_row["strength"].values[0],
            "opponent_strength_attack": opponent_row[f"strength_attack_{opponent_venue}"].values[0],
            "opponent_strength_defense": opponent_row[f"strength_defence_{opponent_venue}"].values[0],
            "was_home": int(was_home),
        })

    return gw_data


In [29]:
def position_dict(position:int) -> Dict:
    """One-hot encode a numeric position code.

    Codes 1 to 4 map, in order, to goalkeeper, defender, midfielder and
    forward. Any other code yields all zeros.

    Args:
        position: Numeric position code.

    Returns:
        Dict with keys ``goalkeeper``, ``defender``, ``midfielder`` and
        ``forward``, each 0 or 1.
    """
    labels_in_code_order = ("goalkeeper", "defender", "midfielder", "forward")
    return {
        label: int(position == code)
        for code, label in enumerate(labels_in_code_order, start=1)
    }
    

In [30]:
# Build one feature row per (season, player, game week, fixture).
SEASONS = ["2022-23", "2023-24", "2024-25"]
# Game weeks to generate rows for, inclusive.
# NOTE(review): starting at 5 presumably leaves some history for the
# look-back features — confirm this is intentional.
FIRST_GW = 5
LAST_GW = 38

all_data = []
for season in SEASONS:
    print("Processing season:", season)
    root = f"../../data/Fantasy-Premier-League/data/{season}"
    players_raw_df = load_csv(f"{root}/players_raw.csv")
    teams_df = load_csv(f"{root}/teams.csv")
    for _, player_row in players_raw_df.iterrows():
        # Players with zero minutes in players_raw.csv contribute no rows,
        # so skip them before reading their per-fixture file from disk.
        if player_row["minutes"] == 0:
            continue

        player_path = get_path_from_row(root, player_row)
        player_df = load_csv(f"{player_path}/gw.csv")
        position = player_row["element_type"]
        position_flags = position_dict(position)

        for game_week in range(FIRST_GW, LAST_GW + 1):
            fixture_details = get_team_and_fixture_info(player_df, teams_df, game_week)
            if not fixture_details:
                # No fixture this game week: nothing would be emitted, so
                # don't spend time computing the weighted features.
                continue

            weighted_features = {
                key: calculate_weighted_field(player_df, key, game_week)
                for key in fields_to_weight
            }
            for fixture_dict in fixture_details:
                combined_data = {**weighted_features, **fixture_dict}
                combined_data["player_id"] = player_row["id"]
                combined_data.update(position_flags)
                combined_data["game_week"] = game_week
                combined_data["season"] = season
                all_data.append(combined_data)


Processing season: 2022-23
Processing season: 2023-24
Processing season: 2024-25


In [31]:
# Turn the accumulated row dicts into a table and persist it without the index.
processed_csv_path = "../data/processed_player_data.csv"
df = pd.DataFrame(all_data)
df.to_csv(processed_csv_path, index=False)

In [32]:
# Show the column labels of the assembled frame as a quick schema check.
df.columns

Index(['assists', 'goals_scored', 'minutes', 'goals_conceded', 'influence',
       'creativity', 'threat', 'ict_index', 'bonus', 'yellow_cards', 'saves',
       'expected_assists', 'expected_goal_involvements', 'expected_goals',
       'expected_goals_conceded', 'points', 'opposition_strength',
       'opponent_strength_attack', 'opponent_strength_defense', 'was_home',
       'player_id', 'goalkeeper', 'defender', 'midfielder', 'forward',
       'game_week', 'season'],
      dtype='object')