In [1]:
import collections
import warnings
import ScraperFC as sfc  # LEAVE THIS!
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import poisson
import math
from tqdm import tqdm
from helpers import *
import os
import time
import logging

In [2]:
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')

In [3]:
# Specify file paths
player_elo_csv = 'player_elo_data_updated.csv'
team_elo_csv = 'team_elo_data_updated.csv'

# Ensure output directories exist
output_dir_players = 'playerperformancesupdated'
os.makedirs(output_dir_players, exist_ok=True)

output_dir_matches = 'matchstatsupdated'
os.makedirs(output_dir_matches, exist_ok=True)

# Load or initialize player ELO DataFrame
if os.path.exists(player_elo_csv):
    player_elo_df = pd.read_csv(player_elo_csv, index_col='player_id')
else:
    # Initialize an empty DataFrame with 'player_id' as the index
    player_elo_df = pd.DataFrame(columns=['player_name', 'team_id', 'elo', 'last_updated'])
    player_elo_df.index.name = 'player_id'  # Set the index name to 'player_id'
    player_elo_df.to_csv(player_elo_csv)  # Save with default index=True

# Load or initialize team ELO DataFrame
if os.path.exists(team_elo_csv):
    team_elo_df = pd.read_csv(team_elo_csv, index_col='team_id')  # Load with 'team_id' as index
else:
    team_elo_df = pd.DataFrame(columns=['team_name', 'elo', 'last_updated'])
    team_elo_df.index.name = 'team_id'  # Set 'team_id' as the index name


In [4]:
ss = sfc.Sofascore()

In [5]:
elo_ratings_mov = {}
total_home_wins, total_away_wins, total_matches = 0,0,0
matches_data = []

In [7]:
# Define the seasons and leagues
seasons = ['21/22','22/23','23/24', '24/25']
calendar_year_seasons = [str(year) for year in range(2022, 2025)]  # Use calendar years for MLS, Brasileirao

leagues = ['EPL', 'La Liga', 'liga-portugal-betclic', 'Ligue 1', 'Bundesliga', 'Serie A', 'Champions League', 'MLS', 'brasileirao-serie-a', 'championship', 'Europa League']

# Initialize a list to store all the match events
all_events = []

# Loop through each league
for league in leagues:
    try:
        # Check if the league uses calendar years
        if league in ['MLS', 'brasileirao-serie-a']:
            # Fetch match data using calendar year format
            for calendar_season in calendar_year_seasons:
                events = ss.get_match_dicts(calendar_season, league)
                all_events.extend(events)
                print(f"Successfully fetched data for {league} - {calendar_season}")
        else:
            # Fetch match data using 'XX/XX' format for European leagues
            for season in seasons:
                events = ss.get_match_dicts(season, league)
                all_events.extend(events)
                print(f"Successfully fetched data for {league} - {season}")

    except Exception as e:
        # Handle any exceptions that may occur during the fetching process
        print(f"Error fetching data for {league}: {e}")

# Now 'all_events' contains all the match data for the specified leagues and seasons


Error fetching data for EPL: Expecting value: line 2 column 1 (char 1)
Error fetching data for La Liga: Expecting value: line 2 column 1 (char 1)
Error fetching data for liga-portugal-betclic: Expecting value: line 2 column 1 (char 1)
Error fetching data for Ligue 1: Expecting value: line 2 column 1 (char 1)
Error fetching data for Bundesliga: Expecting value: line 2 column 1 (char 1)
Error fetching data for Serie A: Expecting value: line 2 column 1 (char 1)
Error fetching data for Champions League: Expecting value: line 2 column 1 (char 1)
Error fetching data for MLS: Expecting value: line 2 column 1 (char 1)
Error fetching data for brasileirao-serie-a: Expecting value: line 2 column 1 (char 1)
Error fetching data for championship: Expecting value: line 2 column 1 (char 1)
Error fetching data for Europa League: Expecting value: line 2 column 1 (char 1)


In [23]:
# Sort the events list based on 'startTimestamp'
events_sorted = sorted(all_events, key=lambda x: x.get('startTimestamp', 0))

In [56]:
def initialize_team_elo(team_id, team_name, team_elo_df, elo=1500, date=None):
    """
    Initialize or update a team's ELO rating in the DataFrame.

    Parameters:
    - team_id (int/str): Unique identifier for the team.
    - team_name (str): Name of the team.
    - team_elo_df (DataFrame): DataFrame containing team ELO information.
    - elo (float): Current or new ELO rating of the team (default is 1500).
    - date (str): Date when the ELO was last updated (YYYY-MM-DD).
    """
    if team_id in team_elo_df.index:
        # Update existing team's ELO and last_updated date
        team_elo_df.at[team_id, 'elo'] = elo
        team_elo_df.at[team_id, 'last_updated'] = date
        logging.debug(f"Updated ELO for team ID: {team_id} to {elo}.")
    else:
        # Add new team to the DataFrame
        team_elo_df.loc[team_id] = {
            'team_name': team_name,
            'elo': elo,
            'last_updated': date
        }
        logging.info(f"Added new team to DataFrame: {team_id} - {team_name} with ELO {elo}.")


In [None]:
all_match_player_data = []
all_match_data = []

In [None]:
events = events_sorted
count = 0
rating_sum = 0
rating_list = []
M_rating_list = []
F_rating_list = []
D_rating_list = []
G_rating_list = []


for event in tqdm(events, desc="Processing Events"):
        try:
            # Extract event details
            event_id = event.get("id", None)
            unique_tournament = event.get('tournament', {}).get('uniqueTournament', {})
            home_team = event.get('homeTeam', {}).get('name', 'Unknown')
            home_team_id = event.get('homeTeam', {}).get('id', 'Unknown')
            away_team = event.get('awayTeam', {}).get('name', 'Unknown')
            away_team_id = event.get('awayTeam', {}).get('id', 'Unknown')
            home_score = event.get('homeScore', {}).get('display', 0)
            away_score = event.get('awayScore', {}).get('display', 0)
            gd = home_score - away_score


            # Convert timestamp to readable date and time
            start_timestamp = event.get("startTimestamp")
            if start_timestamp:
                match_datetime = datetime.fromtimestamp(start_timestamp)
                match_date = match_datetime.strftime("%Y-%m-%d")
                match_time = match_datetime.strftime("%H:%M:%S")
            else:
                match_date = None
                match_time = None


            if home_team_id not in team_elo_df.index:
                initialize_team_elo(home_team_id, home_team, team_elo_df, elo=1500, date=match_date)

            if away_team_id not in team_elo_df.index:
                initialize_team_elo(away_team_id, away_team, team_elo_df, elo=1500, date=match_date)

            # Scrape player match stats
            
            match_player_data = ss.scrape_player_match_stats(event_id)

            # Aggregate team ELOs


            # Extract all player IDs in the match
           # Extract all player IDs in the match
            player_ids = match_player_data['id'].unique()

            # Identify missing player IDs
            missing_player_ids = [pid for pid in player_ids if pid not in player_elo_df.index]

            if missing_player_ids:
                # Extract details of missing players
                missing_players = match_player_data[match_player_data['id'].isin(missing_player_ids)]
                
                for _, player in missing_players.iterrows():
                    player_id = player['id']
                    player_name = player['name']
                    player_position = player['position']
                    team_id = player['teamId']
                    # Initialize missing player with default ELO of 1300
                    initialize_player_elo(player_id, player_name, team_id, player_position, elo=1300, date=match_date, player_elo_df=player_elo_df)

                logging.info(f"Initialized {len(missing_player_ids)} new players with default ELO of 1300.")

            agg_home_elo = aggregate_team_elo(home_team_id, match_player_data, player_elo_df)
            agg_away_elo = aggregate_team_elo(away_team_id, match_player_data, player_elo_df)
            
            # Iterate over each player to update their ELO
            for index, player in match_player_data.iterrows():
                player_id = player['id']
                player_name = player['name']
                player_rating = player['rating']
                player_position = player['position']
                player_position = str(player_position.iloc[0])
                minutes_played = player['minutesPlayed']
                team_id = player['teamId']
                

                if pd.notna(player_rating):
                    rating_list.append(player_rating)
                    
                    if player_position =='F':
                        F_rating_list.append(player_rating)
                    elif player_position =='D':
                        D_rating_list.append(player_rating)
                    elif player_position =='M':
                        M_rating_list.append(player_rating)
                    else:
                        G_rating_list.append(player_rating)

                # Update ELO based on team
                if team_id == home_team_id:
                    update_player_elo(
                        agg_elo_home=agg_home_elo,
                        agg_elo_away=agg_away_elo,
                        total_home_wins=total_home_wins,
                        total_away_wins=total_away_wins,
                        total_matches=total_matches,
                        gd=gd,
                        player_id=player_id,
                        rating=player_rating,
                        minutes_played=minutes_played,
                        position = player_position,
                        player_elo_df = player_elo_df,
                        U=0.8,
                        K=30,
                        sc=600,
                        away=False
                    )
                elif team_id == away_team_id:
                    update_player_elo(
                        agg_elo_home=agg_home_elo,
                        agg_elo_away=agg_away_elo,
                        total_home_wins=total_home_wins,
                        total_away_wins=total_away_wins,
                        total_matches=total_matches,
                        gd=gd,
                        player_id=player_id,
                        rating=player_rating,
                        minutes_played=minutes_played,
                        position = player_position,
                        player_elo_df = player_elo_df,
                        U=0.8,
                        K=30,
                        sc=600,
                        away=True
                    )

            # Update team ELOs and match outcomes
            if home_team not in elo_ratings_mov:
                elo_ratings_mov[home_team] = 1500  # Starting ELO rating
            if away_team not in elo_ratings_mov:
                elo_ratings_mov[away_team] = 1500  # Starting ELO rating

            homeElo, awayElo = davidson_mov(
                home_team,
                away_team,
                gd,
                elo_ratings_mov,
                total_home_wins,
                total_away_wins,
                total_matches
            )

            # Determine match outcome and update wins
            if gd == 0:
                FTR = 3
                home_outcome = 0  # Draw
                away_outcome = 0  # Draw
            elif gd > 0:
                FTR = 1
                total_home_wins += 1
                home_outcome = 1  # Home team won
                away_outcome = -1  # Away team lost
            elif gd < 0:
                FTR = 2
                home_outcome = -1  # Home team lost
                away_outcome = 1  # Away team won
                total_away_wins += 1

            total_matches += 1

            # Scrape team match stats
            match_data = ss.scrape_team_match_stats(event_id)

            # Prepare match data for saving
            new_row = {
                "eventId": event_id,
                "tournament":unique_tournament,
                "homeId": home_team_id,
                "homeTeam": home_team,
                "awayId": away_team_id,
                "awayTeam": away_team,
                "homeElo": homeElo,
                "awayElo": awayElo,
                "homeAggElo":agg_home_elo,
                "awayAggElo": agg_away_elo,
                "homeScore": home_score,
                "awayScore": away_score,
                "FTR": FTR,  # Full-Time Result
                "date": match_date,
                "time": match_time
            }

            match_data_filtered = match_data[['key', 'home', 'away']]

            for index, row in match_data_filtered.iterrows():
                key = row['key']
                # Create the new columns for home_{key} and away_{key}
                new_row[f'home{key}'] = row['home']
                new_row[f'away{key}'] = row['away']

            matches_data.append(new_row)

            # Save player match data and team match data to CSV
            filename_players = os.path.join(output_dir_players, f'{event_id}.csv')
            match_player_data.to_csv(filename_players, index=False)
            filename_matches = os.path.join(output_dir_matches, f'{event_id}.csv')
            match_data.to_csv(filename_matches, index=False)

        except Exception as e:
            logging.error(f"Error processing event {event.get('id', 'Unknown')}: {e}", exc_info=True)
            continue  # Skip to the next event in case of an error

        time.sleep(0.5)

# After processing all events, save the updated player ELO DataFrame
player_elo_df.to_csv(player_elo_csv, index_label='player_id')
team_elo_df.to_csv(team_elo_csv, index=False)
logging.info(f"Final player ELO data saved to {player_elo_csv}.")

# Convert matches data to DataFrame and save
matches_df = pd.DataFrame(matches_data)
matches_df.fillna(0, inplace=True)
# Step 5: Apply convert_percentage function to relevant columns
percentage_columns = [col for col in matches_df.columns if 'home' in col or 'away' in col]
matches_df[percentage_columns] = matches_df[percentage_columns].map(convert_percentage)
matches_df.to_csv('matches_data_3_years.csv', index=False)


Processing Events:   0%|          | 0/13014 [00:00<?, ?it/s]ERROR:root:Error processing event 9570863: Expecting value: line 2 column 1 (char 1)
Traceback (most recent call last):
  File "C:\Users\ahmed\AppData\Local\Temp\ipykernel_33900\3584678604.py", line 44, in <module>
    match_player_data = ss.scrape_player_match_stats(event_id)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ahmed\AppData\Local\Programs\Python\Python311\Lib\site-packages\ScraperFC\sofascore.py", line 423, in scrape_player_match_stats
    match_dict = self.get_match_dict(match_id)  # used to get home and away team names and IDs
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ahmed\AppData\Local\Programs\Python\Python311\Lib\site-packages\ScraperFC\sofascore.py", line 188, in get_match_dict
    data = response.json()['event']
           ^^^^^^^^^^^^^^^
  File "c:\Users\ahmed\AppData\Local\Programs\Python\Python311\Lib\site-packages\botasaurus_requests\response.py", 

In [63]:
all_match_player_df = pd.concat([df.reset_index(drop=True) for df in all_match_player_data], ignore_index=True)
# all_match_player_data


InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [67]:
# Remove duplicate columns in each DataFrame
all_match_player_data = [df.loc[:, ~df.columns.duplicated()].reset_index(drop=True) for df in all_match_player_data]


In [69]:
# Get a union of all unique columns
all_columns = list(set(col for df in all_match_player_data for col in df.columns))

# Reindex each DataFrame
all_match_player_data = [df.reindex(columns=all_columns).reset_index(drop=True) for df in all_match_player_data]
all_match_player_df = pd.concat(all_match_player_data, ignore_index=True)


In [71]:
all_match_player_df.to_csv('all_player_match_data.csv', index=False)

In [None]:
# # After processing all events, save the updated player ELO DataFrame
# all_match_player_df = pd.concat(all_match_player_data, ignore_index=True)


# Concatenate all team match data
matches_df = pd.concat(all_match_data, ignore_index=True)
matches_df.to_csv('all_team_match_data.csv', index=False)


# Step 5: Apply convert_percentage function to relevant columns
percentage_columns = [col for col in matches_df.columns if 'home' in col or 'away' in col]
matches_df[percentage_columns] = matches_df[percentage_columns].map(convert_percentage)
matches_df.to_csv('matches_data_3_years.csv', index=False)

AttributeError: 'DataFrame' object has no attribute 'map'

In [72]:
# Apply convert_percentage function to relevant columns individually
percentage_columns = [col for col in matches_df.columns if 'home' in col or 'away' in col]
matches_df[percentage_columns] = matches_df[percentage_columns].applymap(convert_percentage)
matches_df.to_csv('matches_data_3_years.csv', index=False)
