In [2]:
import pandas as pd
import re

# Load the dataset
file_path = 'all_player_performance_data.csv'  # Adjust the file path accordingly
df = pd.read_csv(file_path)

# Name the columns based on the first row and drop that row
df.columns = df.iloc[0]
df = df[1:]

# Define a function to check valid "season" format (YYYY-YYYY)
def is_valid_season(season):
    if isinstance(season, str) and re.match(r"^\d{4}-\d{4}$", season):
        return True
    return False

# Filter the dataframe to keep rows where the "Season" column has a valid format
df_filtered = df[df['Season'].apply(is_valid_season)]

# Save the cleaned dataframe to a new CSV file
df_filtered.to_csv('filtered_player_performance_data.csv', index=False)

# Output the first few rows to confirm the changes
print(df_filtered.head())


0     Season Age    Squad Country                 Comp LgRank  MP Starts  \
1  2015-2016  19  Aalborg  dk DEN  1. Danish Superliga    5th  14      7   
2  2016-2017  20  Aalborg  dk DEN  1. Danish Superliga   10th  11      4   
3  2017-2018  21  Aalborg  dk DEN  1. Danish Superliga    5th  17      9   
4  2018-2019  22  Aalborg  dk DEN  1. Danish Superliga    9th  25     24   
5  2019-2020  23  Aalborg  dk DEN  1. Danish Superliga    5th  11      2   

0   Min   90s  ...  G-PK G+A-PK   xG  xAG xG+xAG npxG npxG+xAG  Matches  \
1   594   6.6  ...  0.15   0.15  NaN  NaN    NaN  NaN      NaN  Matches   
2   451   5.0  ...  0.00   0.00  NaN  NaN    NaN  NaN      NaN  Matches   
3   828   9.2  ...  0.00   0.11  NaN  NaN    NaN  NaN      NaN  Matches   
4  2136  23.7  ...  0.04   0.08  NaN  NaN    NaN  NaN      NaN  Matches   
5   370   4.1  ...  0.24   0.24  NaN  NaN    NaN  NaN      NaN  Matches   

0                NaN   MP  
1  Oliver Abildgaard  NaN  
2  Oliver Abildgaard  NaN  
3  Olive

In [9]:
import pandas as pd

# Load the filtered dataset
file_path = 'filtered_player_performance_data.csv'  # Adjust the file path accordingly
df_filtered = pd.read_csv(file_path)

# Rename the "Player" column (based on the previous inspection, it was unnamed)
df_filtered.rename(columns={df_filtered.columns[-2]: 'Player'}, inplace=True)

# Drop the last column (which is the second "MP") and the "Matches" column
df_filtered = df_filtered.iloc[:, :-1]  # Drop the last column (second "MP")
df_filtered = df_filtered.drop(columns=['Matches'], errors='ignore')

# Only fill missing values for numeric columns
numeric_cols = df_filtered.select_dtypes(include='number').columns

# Group by 'Player' and only fill missing values with the mean for that player
def fill_missing_with_mean(group):
    for col in numeric_cols:
        # Fill missing values in each numeric column with the player's average
        group[col].fillna(group[col].mean(), inplace=True)
    return group

df_filled = df_filtered.groupby('Player').apply(fill_missing_with_mean).reset_index(drop=True)

# Save the resulting dataframe to a new CSV file
df_filled.to_csv('filled_player_performance_data.csv', index=False)

# Output the first few rows to confirm the changes
print(df_filled.head())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  group[col].fillna(group[col].mean(), inplace=True)


      Season  Age     Squad Country        Comp LgRank    MP  Starts     Min  \
0  2022-2023   17    Torino  it ITA  1. Serie A   10th   0.0     0.0     1.0   
1  2023-2024   18    Torino  it ITA  1. Serie A    9th   0.0     0.0     1.0   
2  2024-2025   19    Torino  it ITA  1. Serie A    3rd   1.0     0.0     1.0   
3  2016-2017   19  Espanyol  es ESP  1. La Liga    8th  30.0    28.0  2587.0   
4  2017-2018   20  Espanyol  es ESP  1. La Liga   11th  32.0    31.0  2815.0   

    90s  ...  Ast.1  G+A.1  G-PK.1  G+A-PK      xG.1     xAG.1    xG+xAG  \
0   0.0  ...    0.0    0.0     0.0     0.0  0.000000  0.000000  0.000000   
1   0.0  ...    0.0    0.0     0.0     0.0  0.000000  0.000000  0.000000   
2   0.0  ...    0.0    0.0     0.0     0.0  0.000000  0.000000  0.000000   
3  28.7  ...    0.0    0.0     0.0     0.0  0.025556  0.113333  0.136667   
4  31.3  ...    0.0    0.0     0.0     0.0  0.000000  0.090000  0.090000   

     npxG.1  npxG+xAG.1                 Player  
0  0.000000  

  df_filled = df_filtered.groupby('Player').apply(fill_missing_with_mean).reset_index(drop=True)


In [16]:
import pandas as pd

# Load the filtered dataset
file_path = 'filtered_player_performance_data.csv'  # Adjust the file path accordingly
df_filtered = pd.read_csv(file_path)

# Rename the "Player" column (based on the previous inspection, it was unnamed)
df_filtered.rename(columns={df_filtered.columns[-2]: 'Player'}, inplace=True)

# Merge the two "MP" columns instead of dropping the second one
# Assuming the two columns are named "MP" and "MP.1" (you can adjust this to the actual column names)
df_filtered['MP'] = df_filtered['MP'].combine_first(df_filtered['MP.1'])

# Drop the redundant "MP.1" column (the second "MP")
df_filtered = df_filtered.drop(columns=['MP.1'], errors='ignore')

# Drop the "Matches" column if it exists
df_filtered = df_filtered.drop(columns=['Matches'], errors='ignore')

# Only fill missing values for numeric columns
numeric_cols = df_filtered.select_dtypes(include='number').columns

# Group by 'Player' and only fill missing values with the mean for that player
def fill_missing_with_mean(group):
    for col in numeric_cols:
        # Fill missing values in each numeric column with the player's average
        group[col].fillna(group[col].mean(), inplace=True)
    return group

df_filled = df_filtered.groupby('Player').apply(fill_missing_with_mean).reset_index(drop=True)

# Save the resulting dataframe to a new CSV file
df_filled.to_csv('filled_player_performance_data.csv', index=False)

# Output the first few rows to confirm the changes
print(df_filled.head())


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  group[col].fillna(group[col].mean(), inplace=True)


      Season  Age     Squad Country        Comp LgRank    MP  Starts     Min  \
0  2022-2023   17    Torino  it ITA  1. Serie A   10th   0.0     0.0     1.0   
1  2023-2024   18    Torino  it ITA  1. Serie A    9th   0.0     0.0     1.0   
2  2024-2025   19    Torino  it ITA  1. Serie A    3rd   1.0     0.0     1.0   
3  2016-2017   19  Espanyol  es ESP  1. La Liga    8th  30.0    28.0  2587.0   
4  2017-2018   20  Espanyol  es ESP  1. La Liga   11th  32.0    31.0  2815.0   

    90s  ...  Ast.1  G+A.1  G-PK.1  G+A-PK      xG.1     xAG.1    xG+xAG  \
0   0.0  ...    0.0    0.0     0.0     0.0  0.000000  0.000000  0.000000   
1   0.0  ...    0.0    0.0     0.0     0.0  0.000000  0.000000  0.000000   
2   0.0  ...    0.0    0.0     0.0     0.0  0.000000  0.000000  0.000000   
3  28.7  ...    0.0    0.0     0.0     0.0  0.025556  0.113333  0.136667   
4  31.3  ...    0.0    0.0     0.0     0.0  0.000000  0.090000  0.090000   

     npxG.1  npxG+xAG.1                 Player  
0  0.000000  

  df_filled = df_filtered.groupby('Player').apply(fill_missing_with_mean).reset_index(drop=True)


In [17]:
import pandas as pd

# Load the dataset
file_path = 'filled_player_performance_data.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Step 1: Filter out the 2024-2025 season
df_filtered = df[df['Season'] != '2024-2025']

# Step 2: Identify the numerical columns (dropping categorical columns like 'Season', 'Squad', 'Comp', 'LgRank', 'Player')
numerical_columns = df_filtered.select_dtypes(include=['float64', 'int64']).columns

# Step 3: Define a function to compute averages for past seasons and set the 'target' for the next season
def create_training_data(player_df, target_column):
    rows = []
    for i in range(1, len(player_df)):
        # Get the data for all previous seasons
        previous_data = player_df.iloc[:i]
        
        # Average numerical columns
        avg_data = previous_data[numerical_columns].mean().to_dict()
        
        # Get current season's Squad and Comp, and set the target for prediction
        current_row = player_df.iloc[i]
        avg_data['Player'] = current_row['Player']  # Include the player's name
        avg_data['Squad'] = current_row['Squad']
        avg_data['Comp'] = current_row['Comp']
        
        # Set the target as the value from the target_column (e.g., Gls or Ast) in the current season
        avg_data['target'] = current_row[target_column]
        
        rows.append(avg_data)
    
    return pd.DataFrame(rows)

# Step 4: Generate training data for Goals (Gls) prediction
training_data_gls = df_filtered.groupby('Player').apply(lambda x: create_training_data(x, 'Gls')).reset_index(drop=True)

# Step 5: Generate training data for Assists (Ast) prediction
training_data_ast = df_filtered.groupby('Player').apply(lambda x: create_training_data(x, 'Ast')).reset_index(drop=True)

# Optional: Save both datasets to CSV files
training_data_gls.to_csv('training_data_with_goals_target.csv', index=False)
training_data_ast.to_csv('training_data_with_assists_target.csv', index=False)

# You can now use 'training_data_with_goals_target.csv' and 'training_data_with_assists_target.csv' as your training datasets


  training_data_gls = df_filtered.groupby('Player').apply(lambda x: create_training_data(x, 'Gls')).reset_index(drop=True)
  training_data_ast = df_filtered.groupby('Player').apply(lambda x: create_training_data(x, 'Ast')).reset_index(drop=True)


In [19]:
import pandas as pd

# Load the dataset
file_path = 'filled_player_performance_data.csv'  # Replace with your file path
df = pd.read_csv(file_path)

# Step 1: Filter out rows from seasons other than 2024-2025
df_2024 = df[df['Season'] == '2024-2025']

# Step 2: Identify the numerical columns (dropping categorical columns like 'Season', 'Squad', 'Comp', 'LgRank', 'Player')
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns

# Step 3: Define a function to compute averages for all past seasons for each player and set the target for prediction
def create_prediction_data(player_df):
    # Get the data for all previous seasons (excluding the current season)
    previous_data = player_df[player_df['Season'] != '2024-2025']
    
    # Average numerical columns
    avg_data = previous_data[numerical_columns].mean().to_dict()
    
    # Get current season's Squad, Comp, and Player details (2024-2025)
    current_row = player_df[player_df['Season'] == '2024-2025'].iloc[0]
    avg_data['Player'] = current_row['Player']  # Include the player's name
    avg_data['Squad'] = current_row['Squad']
    avg_data['Comp'] = current_row['Comp']
    
    return pd.Series(avg_data)

# Step 4: Apply this function to generate prediction data for the current 2024-2025 season
prediction_data = df.groupby('Player').apply(create_prediction_data).reset_index(drop=True)

# Optional: Save the prediction dataset to a CSV file
prediction_data.to_csv('prediction_data_2024_2025.csv', index=False)

# You can now use 'prediction_data_2024_2025.csv' as your prediction dataset


  prediction_data = df.groupby('Player').apply(create_prediction_data).reset_index(drop=True)
