In [15]:
# Import required libraries
import requests
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import time

In [16]:
# Function to get data from FPL API
def get_fpl_data(endpoint, timeout=10):
    """Fetch one endpoint of the Fantasy Premier League API as decoded JSON.

    Args:
        endpoint: Path below ``/api/``, e.g. ``"bootstrap-static"``. Leading and
            trailing slashes are stripped so the final URL never contains ``//``.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON payload, or an empty dict when the request fails,
        the status code is not 200, or the body is not valid JSON.
    """
    path = str(endpoint).strip("/")
    url = f"https://fantasy.premierleague.com/api/{path}/"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts take the same "no data" path as a bad status.
        print(f"Failed to retrieve data from {url} ({exc})")
        return {}
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return {}
    try:
        return response.json()
    except ValueError:
        # A 200 response with a non-JSON body (e.g. an HTML error page).
        print(f"Failed to retrieve data from {url} (invalid JSON)")
        return {}

# Download the general game information (the "bootstrap-static" endpoint);
# get_fpl_data returns an empty dict if the request fails.
bootstrap_data = get_fpl_data(endpoint="bootstrap-static")

In [17]:
# Turn each top-level section of the bootstrap payload into its own DataFrame.
# A missing section falls back to an empty list, i.e. an empty DataFrame.
players_df, teams_df, positions_df, events_df = (
    pd.DataFrame(bootstrap_data.get(section, []))
    for section in ('elements', 'teams', 'element_types', 'events')
)

In [18]:
# Function to get historical data for each player (element-summary)
def get_player_history(player_id, delay=0.1):
    """Fetch the element-summary payload for one player.

    Args:
        player_id: Player id used in the ``element-summary/<id>`` endpoint.
        delay: Seconds to sleep before the request, to avoid hitting API limits.

    Returns:
        Whatever ``get_fpl_data`` returns for the endpoint (an empty dict when
        the request fails).
    """
    time.sleep(delay)
    # No trailing slash here: get_fpl_data appends one itself, and passing
    # "element-summary/<id>/" produced a URL ending in "//".
    return get_fpl_data(f"element-summary/{player_id}")

# Extracting historical data for each player
player_histories = []
for player_id in players_df['id'].unique():
    player_hist = get_player_history(player_id)
    history = pd.DataFrame(player_hist.get('history', []))
    if history.empty:
        # Failed request (empty dict) or a payload without history rows:
        # nothing to learn from, and empty frames only pollute the concat.
        continue
    history['player_id'] = player_id
    player_histories.append(history)

# Combine all player histories into one DataFrame
if not player_histories:
    # pd.concat([]) fails with an unhelpful "No objects to concatenate";
    # say what actually went wrong instead.
    raise RuntimeError("No player history could be retrieved from the FPL API")
player_hist_df = pd.concat(player_histories, ignore_index=True)

In [19]:
# Enrich the player table in one chain:
#   1. team lookup on players_df['team'] -> teams_df['id'] (adds the team 'name')
#   2. position lookup on players_df['element_type'] -> positions_df['id']
#      (adds 'singular_name')
# Clashing right-hand columns receive the '_team' / '_position' suffix.
players_df = (
    players_df
    .merge(
        teams_df[['id', 'name']],
        left_on='team',
        right_on='id',
        suffixes=('', '_team'),
    )
    .merge(
        positions_df[['id', 'singular_name']],
        left_on='element_type',
        right_on='id',
        suffixes=('', '_position'),
    )
)

In [20]:
# Example current team performance table, one row per team:
#   (team, average goals scored in last 5 games, average goals conceded in last 5 games)
# NOTE(review): these names are later left-merged against the team 'name'
# column of players_df; any spelling difference yields NaN — verify they match.
team_performance_df = pd.DataFrame(
    [
        ('Arsenal', 2.0, 1.0),
        ('Aston Villa', 1.5, 2.5),
        ('Bournemouth', 1.2, 2.0),
        ('Brentford', 1.0, 1.8),
        ('Brighton', 1.8, 1.5),
        ('Chelsea', 1.3, 2.2),
        ('Crystal Palace', 0.8, 1.4),
        ('Everton', 1.0, 2.0),
        ('Fulham', 1.5, 1.7),
        ('Liverpool', 2.2, 1.0),
        ('Luton Town', 1.1, 2.1),
        ('Manchester City', 2.5, 0.9),
        ('Manchester United', 1.7, 1.5),
        ('Newcastle', 2.0, 1.2),
        ('Nottingham Forest', 1.4, 2.0),
        ('Sheffield United', 1.0, 1.8),
        ('Tottenham', 1.8, 1.0),
        ('West Ham', 2.0, 1.5),
        ('Wolverhampton', 1.6, 1.2),
    ],
    columns=['team', 'last_5_avg_goals', 'last_5_avg_conceded'],
)

In [21]:
# Example injury table, one row per team:
#   (team, number of starting players injured, names of the injured players)
# NOTE(review): injury_count and the length of the names list disagree for
# Crystal Palace (3 vs 1), Luton Town (2 vs 1), Tottenham (0 vs 1) and
# Wolverhampton (2 vs 1) — confirm which of the two is intended.
injury_data_df = pd.DataFrame(
    [
        ('Arsenal', 1, ['Player A']),
        ('Aston Villa', 2, ['Player B', 'Player C']),
        ('Bournemouth', 0, []),
        ('Brentford', 1, ['Player D']),
        ('Brighton', 2, ['Player E', 'Player F']),
        ('Chelsea', 0, []),
        ('Crystal Palace', 3, ['Player G']),
        ('Everton', 1, ['Player H']),
        ('Fulham', 0, []),
        ('Liverpool', 1, ['Player I']),
        ('Luton Town', 2, ['Player J']),
        ('Manchester City', 0, []),
        ('Manchester United', 1, ['Player K']),
        ('Newcastle', 0, []),
        ('Nottingham Forest', 1, ['Player L']),
        ('Sheffield United', 0, []),
        ('Tottenham', 0, ['Player M']),
        ('West Ham', 1, ['Player N']),
        ('Wolverhampton', 2, ['Player O']),
    ],
    columns=['team', 'injury_count', 'injured_player_names'],
)

In [22]:
# Merge team performance and injury data onto the player table
def _merge_team_features(players, team_table):
    """Left-join a per-team feature table onto the player table by team name.

    ``players['name']`` holds the team name and ``team_table['team']`` is the
    matching key.  The key is renamed before merging so the players' own
    ``team`` column is kept as-is instead of being split into
    ``team_x``/``team_y`` by the column-name clash.  Feature columns left over
    from an earlier run of this cell are dropped first, so re-running the cell
    does not pile up suffixed duplicates.

    Args:
        players: Player table with a ``name`` column (team name).
        team_table: One row per team, keyed by a ``team`` column.

    Returns:
        A new DataFrame: ``players`` plus the feature columns of ``team_table``.

    Raises:
        pandas.errors.MergeError: if ``team_table`` lists a team more than once
            (which would otherwise silently duplicate player rows).
    """
    extra = team_table.rename(columns={'team': 'name'})
    stale = [col for col in extra.columns if col != 'name' and col in players.columns]
    merged = players.drop(columns=stale).merge(
        extra, how='left', on='name', validate='many_to_one'
    )
    # A left join turns every unmatched team into NaN features; report them so
    # a spelling mismatch between the two sources does not go unnoticed.
    unmatched = sorted(set(players['name'].dropna()) - set(extra['name']))
    if unmatched:
        print(f"No team-level data for: {unmatched}")
    return merged


# Merge team performance data
players_df = _merge_team_features(players_df, team_performance_df)

# Merge injury data
players_df = _merge_team_features(players_df, injury_data_df)

In [23]:
# Narrow the per-fixture history down to the columns used for training
player_hist_df = player_hist_df.loc[:, [
    'player_id',
    'total_points',
    'minutes',
    'goals_scored',
    'assists',
    'clean_sheets',
    'goals_conceded',
    'saves',
    'yellow_cards',
    'red_cards',
    'bonus',
    'bps',
    'round',
]]

# Attach player-level info to every history row (inner join on the player id).
# first_name and second_name are carried along so players can be named later.
train_df = pd.merge(
    player_hist_df,
    players_df[[
        'id',
        'name',
        'singular_name',
        'first_name',
        'second_name',
        'last_5_avg_goals',
        'last_5_avg_conceded',
        'injury_count',
    ]],
    left_on='player_id',
    right_on='id',
)

In [24]:
# Replace every missing value in the training table with 0
train_df = train_df.fillna(0)

In [25]:
# Numeric position code: GK=1, DEF=2, MID=3, FWD=4.
# Any 'singular_name' value not listed in the mapping becomes NaN.
train_df = train_df.assign(
    position_encoded=train_df['singular_name'].map({
        'Goalkeeper': 1,
        'Defender': 2,
        'Midfielder': 3,
        'Forward': 4,
    })
)

# Model inputs (order matters: it is the column order the model is fitted on)
features = [
    # per-fixture player stats
    'minutes', 'goals_scored', 'assists', 'clean_sheets', 'goals_conceded',
    'saves', 'yellow_cards', 'red_cards', 'bonus', 'bps',
    # position
    'position_encoded',
    # team performance and injuries
    'last_5_avg_goals', 'last_5_avg_conceded', 'injury_count',
]
# Column the model learns to predict
target = 'total_points'

In [26]:
# Keep only rows whose 'round' is below 24, then make a random 80/20
# train/test split of what remains (this is not a split by season).
# NOTE(review): 'round' is presumably the gameweek number — confirm.
# NOTE(review): the features look like stats from the same fixture as
# total_points (goals_scored, bonus, bps, ...), which would make the score
# below optimistic for forecasting — verify.
train_df = train_df.loc[train_df['round'] < 24]
X_train, X_test, y_train, y_test = train_test_split(
    train_df[features],
    train_df[target],
    test_size=0.2,
    random_state=42,
)

# Fit a random forest on the training part (fit returns the estimator itself)
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out part
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the evaluation metrics
print("Mean Squared Error:", mse)
print("R^2 Score:", r2)

Mean Squared Error: 0.15228514851485145
R^2 Score: 0.9762503095775386


In [27]:
# Recommend top players per position for the upcoming gameweeks with aggregation
def recommend_top_players(model, data, position, top_n=5, feature_cols=None):
    """Rank the players of one position by their mean predicted points.

    Args:
        model: Fitted estimator exposing ``predict``.
        data: DataFrame with ``position_encoded``, ``first_name``,
            ``second_name`` and the feature columns; it is not modified.
        position: One of ``'Goalkeeper'``, ``'Defender'``, ``'Midfielder'``,
            ``'Forward'``.
        top_n: Number of players to return.
        feature_cols: Columns passed to ``model.predict``. Defaults to the
            notebook-level ``features`` list.

    Returns:
        DataFrame with columns ``full_name`` and ``predicted_points``, sorted by
        descending prediction; empty when ``data`` has no row for the position.

    Raises:
        KeyError: if ``position`` is not one of the four known names.
    """
    position_codes = {'Goalkeeper': 1, 'Defender': 2, 'Midfielder': 3, 'Forward': 4}
    position_encoded = position_codes[position]
    columns = features if feature_cols is None else list(feature_cols)

    position_data = data[data['position_encoded'] == position_encoded].copy()
    if position_data.empty:
        # Predicting on zero rows would raise inside the estimator; an empty
        # ranking is the natural answer.
        return pd.DataFrame(columns=['full_name', 'predicted_points'])

    # Predict points for every row of the position
    position_data['predicted_points'] = model.predict(position_data[columns])

    # A player has one row per fixture: average the predictions per player
    aggregated_data = (
        position_data
        .groupby(['first_name', 'second_name'], as_index=False)['predicted_points']
        .mean()
    )
    aggregated_data['full_name'] = (
        aggregated_data['first_name'] + ' ' + aggregated_data['second_name']
    )

    # Sort and get top N players
    top_players = aggregated_data.sort_values('predicted_points', ascending=False).head(top_n)
    return top_players[['full_name', 'predicted_points']]

# Top 5 players (the function's default top_n) for each of the four positions,
# evaluated in the order goalkeepers, defenders, midfielders, forwards
top_goalkeepers, top_defenders, top_midfielders, top_forwards = (
    recommend_top_players(model, train_df, position)
    for position in ('Goalkeeper', 'Defender', 'Midfielder', 'Forward')
)

In [28]:
# Team Rating Calculation
def calculate_team_rating(team_performance_df, injury_data_df, top_n=5):
    """Rate teams as goals scored minus goals conceded minus injury count.

    Injury counts are looked up by team name, so the two tables do not have to
    list the teams in the same order. Neither input frame is modified.

    Args:
        team_performance_df: Columns ``team``, ``last_5_avg_goals`` and
            ``last_5_avg_conceded``.
        injury_data_df: Columns ``team`` and ``injury_count``; each team may
            appear only once.
        top_n: Number of best-rated teams to return.

    Returns:
        DataFrame with columns ``team`` and ``performance_rating``, best first.
        A team absent from ``injury_data_df`` gets a NaN rating and sorts last.
    """
    injuries_by_team = injury_data_df.set_index('team')['injury_count']
    performance_rating = (
        team_performance_df['last_5_avg_goals']
        - team_performance_df['last_5_avg_conceded']
        # Align on the team name rather than on the row index.
        - team_performance_df['team'].map(injuries_by_team)
    )

    # Build the result on a fresh frame so the caller's table stays untouched
    rated = team_performance_df[['team']].assign(performance_rating=performance_rating)
    return rated.sort_values('performance_rating', ascending=False).head(top_n)

# Rank the teams and keep the five best-rated ones
top_teams = calculate_team_rating(
    team_performance_df=team_performance_df,
    injury_data_df=injury_data_df,
)

In [29]:
# Print each ranking under its heading; sep="\n" puts the table on the line
# after the heading, exactly like two consecutive print calls.
print("Top Goalkeepers:", top_goalkeepers, sep="\n")
print("\nTop Defenders:", top_defenders, sep="\n")
print("\nTop Midfielders:", top_midfielders, sep="\n")
print("\nTop Forwards:", top_forwards, sep="\n")
print("\nTop 5 Teams Based on Performance Rating:", top_teams, sep="\n")

Top Goalkeepers:
                full_name  predicted_points
7             André Onana          5.355714
21      David Raya Martin          5.102857
55         Robert Sánchez          4.997143
4   Alisson Ramses Becker          4.987143
50              Nick Pope          4.260000

Top Defenders:
                        full_name  predicted_points
202        Trent Alexander-Arnold          6.012857
65                Ibrahima Konaté          5.332857
210               Virgil van Dijk          5.134286
46           Diogo Dalot Teixeira          5.121429
57   Gabriel dos Santos Magalhães          5.068571

Top Midfielders:
         full_name  predicted_points
59     Cole Palmer          9.320000
217  Mohamed Salah          8.498571
43     Bukayo Saka          8.450000
42    Bryan Mbeumo          7.841429
186      Luis Díaz          7.077143

Top Forwards:
          full_name  predicted_points
23   Erling Haaland         10.048571
54  Nicolas Jackson          6.211429
43      Kai Havertz   