In [1]:
import pandas as pd
import numpy as np
import os

# Loading Team Data and Outcomes of Games

In this notebook I will go through the steps I took to familiarise myself with the dataset we have access to. I also do a first attempt at selecting the columns which we include as features input to the random forest. There are 4 sets of data. These are: home team stats (Train_Data\train_home_team_statistics_df.csv), away team stats (Train_Data\train_away_team_statistics_df.csv) and similar things for the players at (Train_Data/train_home_player_statistics_df.csv, Train_Data/train_away_player_statistics_df.csv)

In [4]:
data_folder = os.path.join(os.getcwd(), '..', 'Train_Data')
train_home_team_statistics_df   = pd.read_csv(os.path.join(data_folder, 'train_home_team_statistics_df.csv'), index_col=0)
train_away_team_statistics_df   = pd.read_csv(os.path.join(data_folder, 'train_away_team_statistics_df.csv'), index_col=0)
train_home_player_statistics_df = pd.read_csv(os.path.join(data_folder, 'train_home_player_statistics_df.csv'), index_col=0)
train_away_player_statistics_df = pd.read_csv(os.path.join(data_folder, 'train_away_player_statistics_df.csv'), index_col=0)
train_scores = pd.read_csv(os.path.join(data_folder, 'Y_train.csv'), index_col=0)
# we define index column as column 0 so we can reference this later. Note, index is the game number. e.g., home team at 
# index 7 is the team which plays away team at index 7

I have saved the names of each of the columns in each df at column_names_for_teams.txt, column_names_for_players.txt.

The team stats is a df which is 1x140 for each game. 35 of these are stats for season to date, 35 are std devs for these over the season to date. The remaining 70 is another 35, 35 split of the same stats over the last 5 games.

The player stats is an n by 300 array, where n is the number of players in a game including subs (typically around 20). There are 50 season cumulative stats, e.g. PLAYER_ACCURATE_CROSSES_season_sum, PLAYER_ACCURATE_PASSES_season_sum, 50 stats which are the average of these stats over games played, 50 are std devs of them over the season. The remaining 150 are the analagous metrics for last 5 games. 

Whilst random forest can deal well with this many features, it is best when they each have predictive power. Most of my work has been data processing so that everything gets massaged into a convenient form to use. I have a few unanswered questions about how we will choose the features we use.

In [5]:
train_home_team_statistics_df.head(3)

Unnamed: 0_level_0,LEAGUE,TEAM_NAME,TEAM_SHOTS_TOTAL_season_sum,TEAM_SHOTS_INSIDEBOX_season_sum,TEAM_SHOTS_OFF_TARGET_season_sum,TEAM_SHOTS_ON_TARGET_season_sum,TEAM_SHOTS_OUTSIDEBOX_season_sum,TEAM_PASSES_season_sum,TEAM_SUCCESSFUL_PASSES_season_sum,TEAM_SAVES_season_sum,...,TEAM_YELLOWCARDS_5_last_match_std,TEAM_REDCARDS_5_last_match_std,TEAM_OFFSIDES_5_last_match_std,TEAM_ATTACKS_5_last_match_std,TEAM_PENALTIES_5_last_match_std,TEAM_SUBSTITUTIONS_5_last_match_std,TEAM_BALL_SAFE_5_last_match_std,TEAM_DANGEROUS_ATTACKS_5_last_match_std,TEAM_INJURIES_5_last_match_std,TEAM_GOALS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Ligue 1,Toulouse,3.0,2.0,5.0,2.0,1.0,2.0,2.0,5.0,...,3.0,0.0,6.0,0.0,10.0,8.0,7.0,2.0,4.0,3.0
1,Ligue 2,Brest,6.0,8.0,3.0,6.0,5.0,8.0,7.0,10.0,...,4.0,0.0,4.0,3.0,10.0,0.0,1.0,2.0,8.0,4.0
2,Serie A,Sampdoria,4.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0,...,4.0,5.0,6.0,3.0,6.0,7.0,2.0,3.0,2.0,4.0


We now drop the league and team name columns so we only work with numerical data (I can perhaps imagine a model which would like to know what team is which and use this as another feature. Food for thought.)

In [6]:
# the [:,2:] selects all rows with : and columns 2 onwards with 2:. 
train_home = train_home_team_statistics_df.iloc[:,2:]
train_away = train_away_team_statistics_df.iloc[:,2:]
train_home.head(3)

Unnamed: 0_level_0,TEAM_SHOTS_TOTAL_season_sum,TEAM_SHOTS_INSIDEBOX_season_sum,TEAM_SHOTS_OFF_TARGET_season_sum,TEAM_SHOTS_ON_TARGET_season_sum,TEAM_SHOTS_OUTSIDEBOX_season_sum,TEAM_PASSES_season_sum,TEAM_SUCCESSFUL_PASSES_season_sum,TEAM_SAVES_season_sum,TEAM_CORNERS_season_sum,TEAM_FOULS_season_sum,...,TEAM_YELLOWCARDS_5_last_match_std,TEAM_REDCARDS_5_last_match_std,TEAM_OFFSIDES_5_last_match_std,TEAM_ATTACKS_5_last_match_std,TEAM_PENALTIES_5_last_match_std,TEAM_SUBSTITUTIONS_5_last_match_std,TEAM_BALL_SAFE_5_last_match_std,TEAM_DANGEROUS_ATTACKS_5_last_match_std,TEAM_INJURIES_5_last_match_std,TEAM_GOALS_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3.0,2.0,5.0,2.0,1.0,2.0,2.0,5.0,3.0,6.0,...,3.0,0.0,6.0,0.0,10.0,8.0,7.0,2.0,4.0,3.0
1,6.0,8.0,3.0,6.0,5.0,8.0,7.0,10.0,6.0,8.0,...,4.0,0.0,4.0,3.0,10.0,0.0,1.0,2.0,8.0,4.0
2,4.0,2.0,5.0,2.0,8.0,1.0,1.0,2.0,2.0,7.0,...,4.0,5.0,6.0,3.0,6.0,7.0,2.0,3.0,2.0,4.0


what we now have is 2 arrays, one has the outcomes of the games, the other has properties of each of the two teams that played. Note, these properties are not fixed in time. E.g., for any given team, there are a set of properties related to their last 5 games. These are updated after each match. 

In [7]:
train_scores.head(3)

Unnamed: 0_level_0,HOME_WINS,DRAW,AWAY_WINS
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,1
1,0,1,0
2,0,0,1


# Loading Player Data

Loading in player stats, we see that they give stats for every player on the team. The model will only want to take in one row of inputs per 'ID', so as a first pass I have averaged each of these attirbutues over the entire team. 

In [12]:
player_home_stats = train_home_player_statistics_df.sort_values(by = ['ID', 'POSITION'])
player_away_stats = train_away_player_statistics_df.sort_values(by = ['ID', 'POSITION'])
player_home_stats.head(30)

Unnamed: 0_level_0,LEAGUE,TEAM_NAME,POSITION,PLAYER_NAME,PLAYER_ACCURATE_CROSSES_season_sum,PLAYER_ACCURATE_PASSES_season_sum,PLAYER_AERIALS_WON_season_sum,PLAYER_ASSISTS_season_sum,PLAYER_BIG_CHANCES_CREATED_season_sum,PLAYER_BIG_CHANCES_MISSED_season_sum,...,PLAYER_STARTING_LINEUP_5_last_match_std,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,PLAYER_TACKLES_5_last_match_std,PLAYER_TOTAL_CROSSES_5_last_match_std,PLAYER_TOTAL_DUELS_5_last_match_std,PLAYER_YELLOWCARDS_5_last_match_std,PLAYER_PUNCHES_5_last_match_std,PLAYER_LONG_BALLS_5_last_match_std,PLAYER_LONG_BALLS_WON_5_last_match_std,PLAYER_SHOTS_OFF_TARGET_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,Ligue 1,Toulouse,defender,Agustín Rogel,0.0,8.0,5.0,0.0,0.0,0.0,...,,,,,,,,,,
0,Ligue 1,Toulouse,defender,Mathieu Goncalves,0.0,7.0,4.0,0.0,0.0,0.0,...,63.0,14.0,13.0,0.0,36.0,77.0,,,,
0,Ligue 1,Toulouse,defender,Nicolas Isimat-Mirin,0.0,32.0,46.0,0.0,0.0,0.0,...,0.0,0.0,37.0,0.0,25.0,77.0,,,,
0,Ligue 1,Toulouse,defender,Issiaga Sylla,14.0,52.0,52.0,14.0,8.0,0.0,...,0.0,28.0,55.0,23.0,55.0,77.0,,,,
0,Ligue 1,Toulouse,defender,Steven Moreira,8.0,24.0,6.0,0.0,8.0,0.0,...,77.0,41.0,24.0,24.0,70.0,94.0,,,,
0,Ligue 1,Toulouse,defender,Kelvin Amian,2.0,40.0,20.0,28.0,16.0,0.0,...,0.0,14.0,26.0,16.0,25.0,77.0,,,,
0,Ligue 1,Toulouse,goalkeeper,Baptiste Reynet,0.0,33.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,0.0,,,,
0,Ligue 1,Toulouse,goalkeeper,Mauro Goicoechea,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,,,,
0,Ligue 1,Toulouse,midfielder,Jean-Victor Makengo,8.0,27.0,2.0,0.0,8.0,0.0,...,63.0,0.0,13.0,6.0,17.0,0.0,,,,
0,Ligue 1,Toulouse,midfielder,Ibrahim Sangaré,0.0,44.0,26.0,14.0,8.0,0.0,...,0.0,34.0,53.0,8.0,46.0,77.0,,,,


I cant believe there is a team called Brest. Shall we fix their win % to 100?

lets assume league, team name, player name and position aren't of any use to us. We can drop those columns

In [16]:
trimmed_home_player_stats = player_home_stats.iloc[:,4:]
trimmed_away_player_stats = player_away_stats.iloc[:,4:]
trimmed_home_player_stats

Unnamed: 0_level_0,PLAYER_ACCURATE_CROSSES_season_sum,PLAYER_ACCURATE_PASSES_season_sum,PLAYER_AERIALS_WON_season_sum,PLAYER_ASSISTS_season_sum,PLAYER_BIG_CHANCES_CREATED_season_sum,PLAYER_BIG_CHANCES_MISSED_season_sum,PLAYER_BLOCKED_SHOTS_season_sum,PLAYER_CAPTAIN_season_sum,PLAYER_CLEARANCES_season_sum,PLAYER_CLEARANCE_OFFLINE_season_sum,...,PLAYER_STARTING_LINEUP_5_last_match_std,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,PLAYER_TACKLES_5_last_match_std,PLAYER_TOTAL_CROSSES_5_last_match_std,PLAYER_TOTAL_DUELS_5_last_match_std,PLAYER_YELLOWCARDS_5_last_match_std,PLAYER_PUNCHES_5_last_match_std,PLAYER_LONG_BALLS_5_last_match_std,PLAYER_LONG_BALLS_WON_5_last_match_std,PLAYER_SHOTS_OFF_TARGET_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,8.0,5.0,0.0,0.0,0.0,0.0,,4.0,0.0,...,,,,,,,,,,
0,0.0,7.0,4.0,0.0,0.0,0.0,6.0,,12.0,0.0,...,63.0,14.0,13.0,0.0,36.0,77.0,,,,
0,0.0,32.0,46.0,0.0,0.0,0.0,62.0,,59.0,50.0,...,0.0,0.0,37.0,0.0,25.0,77.0,,,,
0,14.0,52.0,52.0,14.0,8.0,0.0,12.0,,57.0,0.0,...,0.0,28.0,55.0,23.0,55.0,77.0,,,,
0,8.0,24.0,6.0,0.0,8.0,0.0,0.0,,18.0,0.0,...,77.0,41.0,24.0,24.0,70.0,94.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12302,6.0,6.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,...,63.0,41.0,14.0,8.0,23.0,0.0,,,,
12302,0.0,23.0,12.0,0.0,0.0,0.0,27.0,,11.0,0.0,...,63.0,0.0,14.0,0.0,16.0,0.0,,,,
12302,15.0,22.0,14.0,50.0,31.0,28.0,5.0,,2.0,0.0,...,63.0,52.0,27.0,35.0,30.0,77.0,,,,
12302,3.0,17.0,3.0,0.0,6.0,0.0,11.0,,5.0,0.0,...,0.0,28.0,18.0,12.0,6.0,77.0,,,,


In the next section I will average over all of the positions to get these properties as position averaged, e.g. average crosses by the midfeilders. For now, lets assume that this doesnt matter, and we will drop the position information to just get another batch of relevant team data. By averaging over each game (which is each 'ID') we get a first new batch of features which we can include as inputs. 

In [19]:
averaged_trimmed_home_player_stats = trimmed_home_player_stats.groupby(['ID']).mean()
averaged_trimmed_away_player_stats = trimmed_away_player_stats.groupby(['ID']).mean()
averaged_trimmed_home_player_stats

Unnamed: 0_level_0,PLAYER_ACCURATE_CROSSES_season_sum,PLAYER_ACCURATE_PASSES_season_sum,PLAYER_AERIALS_WON_season_sum,PLAYER_ASSISTS_season_sum,PLAYER_BIG_CHANCES_CREATED_season_sum,PLAYER_BIG_CHANCES_MISSED_season_sum,PLAYER_BLOCKED_SHOTS_season_sum,PLAYER_CAPTAIN_season_sum,PLAYER_CLEARANCES_season_sum,PLAYER_CLEARANCE_OFFLINE_season_sum,...,PLAYER_STARTING_LINEUP_5_last_match_std,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,PLAYER_TACKLES_5_last_match_std,PLAYER_TOTAL_CROSSES_5_last_match_std,PLAYER_TOTAL_DUELS_5_last_match_std,PLAYER_YELLOWCARDS_5_last_match_std,PLAYER_PUNCHES_5_last_match_std,PLAYER_LONG_BALLS_5_last_match_std,PLAYER_LONG_BALLS_WON_5_last_match_std,PLAYER_SHOTS_OFF_TARGET_5_last_match_std
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,10.222222,21.333333,15.277778,7.777778,7.222222,6.166667,9.222222,45.500000,16.722222,2.777778,...,36.647059,24.058824,25.352941,14.705882,39.176471,41.764706,,,,
1,9.222222,31.555556,14.555556,16.666667,10.277778,9.000000,11.888889,50.000000,17.166667,2.777778,...,3.500000,10.000000,13.611111,8.666667,22.055556,19.833333,,,,
2,8.434783,16.608696,15.347826,2.826087,3.652174,3.869565,8.826087,3.333333,12.826087,0.000000,...,53.636364,13.000000,23.954545,11.454545,35.181818,36.590909,,,,
3,8.888889,34.055556,15.833333,12.000000,14.444444,8.722222,7.444444,19.750000,19.333333,4.166667,...,19.833333,8.833333,19.111111,17.055556,19.611111,15.555556,,,,
4,9.722222,25.888889,15.111111,8.166667,9.500000,10.666667,9.611111,100.000000,17.555556,0.000000,...,19.352941,19.235294,24.352941,15.411765,22.117647,14.588235,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12298,4.833333,23.888889,11.444444,5.444444,2.000000,3.111111,11.277778,43.500000,15.666667,2.777778,...,46.111111,8.555556,18.166667,11.722222,27.166667,37.333333,,,,
12299,3.833333,15.666667,16.444444,5.388889,5.500000,7.000000,12.277778,25.000000,18.277778,8.333333,...,33.764706,11.294118,19.647059,12.058824,25.294118,44.882353,,,,
12300,6.000000,16.000000,9.777778,7.222222,8.388889,6.000000,15.444444,43.000000,10.833333,0.000000,...,49.388889,20.666667,38.166667,7.611111,24.333333,18.277778,,,,
12301,,,,12.888889,,,,47.500000,,,...,42.388889,,,,,34.611111,,,,


In [11]:
# Through discussions with chat GPTs 'Football Predictor' I was given a list of the most relevant of these stats to determining game outcome.
# We then went though the list of team stats we have, and got rid of any additional stats which are already covered by team stats. The result was this list
relevant_player_stats = [
    "PLAYER_ASSISTS_season_sum",
    "PLAYER_KEY_PASSES_season_sum",
    "PLAYER_DRIBBLED_ATTEMPTS_season_sum",
    "PLAYER_SUCCESSFUL_DRIBBLES_season_sum",
    "PLAYER_ACCURATE_CROSSES_season_sum",
    "PLAYER_TOTAL_CROSSES_season_sum",
    "PLAYER_HIT_WOODWORK_season_sum",
    "PLAYER_BIG_CHANCES_CREATED_season_sum",
    "PLAYER_INTERCEPTIONS_season_sum",
    "PLAYER_CLEARANCES_season_sum",
    "PLAYER_BLOCKED_SHOTS_season_sum",
    "PLAYER_SHOTS_BLOCKED_season_sum",
    "PLAYER_AERIALS_WON_season_sum",
    "PLAYER_DUELS_WON_season_sum",
    "PLAYER_DUELS_LOST_season_sum",
    "PLAYER_FOULS_DRAWN_season_sum",
    "PLAYER_GOALS_CONCEDED_season_sum",
    "PLAYER_SAVES_INSIDE_BOX_season_sum",
    "PLAYER_GOALS_5_last_match_sum",
    "PLAYER_ASSISTS_5_last_match_sum",
    "PLAYER_KEY_PASSES_5_last_match_sum",
    "PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_sum",
    "PLAYER_INTERCEPTIONS_5_last_match_sum",
    "PLAYER_CLEARANCES_5_last_match_sum",
    "PLAYER_MINUTES_PLAYED_season_sum",
    "PLAYER_STARTING_LINEUP_season_sum",
    "PLAYER_CAPTAIN_season_sum",
    "PLAYER_ASSISTS_season_average",
    "PLAYER_KEY_PASSES_season_average",
    "PLAYER_DRIBBLED_ATTEMPTS_season_average",
    "PLAYER_SUCCESSFUL_DRIBBLES_season_average",
    "PLAYER_TACKLES_season_average",
    "PLAYER_INTERCEPTIONS_season_average",
    "PLAYER_DUELS_WON_season_average",
    "PLAYER_AERIALS_WON_season_average",
    "PLAYER_RATING_season_average",
    "PLAYER_GOALS_CONCEDED_5_last_match_average",
    "PLAYER_ASSISTS_5_last_match_average",
    "PLAYER_KEY_PASSES_5_last_match_average",
    "PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_average",
    "PLAYER_INTERCEPTIONS_5_last_match_average",
    "PLAYER_CLEARANCES_5_last_match_average",
    "PLAYER_RATING_5_last_match_average",
    "PLAYER_GOALS_CONCEDED_5_last_match_sum",
    "PLAYER_GOALS_CONCEDED_5_last_match_average",
    "PLAYER_TACKLES_season_sum",
    "PLAYER_FOULS_5_last_match_sum"
]
print(len(relevant_player_stats))

47


This is pretty primative, and obviously just a first pass. Nevertheless, the infrastrucure is now all set up if we get a list of everything we want to inlude in the final model as a feature. 

In [20]:
Stats_to_keep_home = averaged_trimmed_home_player_stats[relevant_player_stats]
Stats_to_keep_away = averaged_trimmed_away_player_stats[relevant_player_stats]

Stats_to_keep_home.columns = 'home_average_' + Stats_to_keep_home.columns
Stats_to_keep_away.columns = 'away_average_' + Stats_to_keep_away.columns

Stats_to_keep_home.to_csv('selected_averaged_home_player_stats.csv', index = True)
Stats_to_keep_away.to_csv('selected_averaged_away_player_stats.csv', index = True)

These are now saved as dfs with 47 columns each in Train_Data/selected_averaged_home_player_stats.csv, Train_Data/selected_averaged_away_player_stats.csv. These are the sets which I used in making the first random forest.

# Positionally Averaged Attirbutes

A perhaps less diluted metric to use is to average by position instead, and then you can have how each component of any given team is doing. The goal is likely to still trim down the list massively, but only include goalkeepers conceded goals, or an attackers successful dribbles. 

In [23]:
# Populating a set of unique positions, just incase it had stuff like 'left wing' etc. 
trimmed_player_home_stats_and_positions = player_home_stats.iloc[:, [2] + list(range(4, len(player_home_stats.columns)))]
trimmed_player_away_stats_and_positions = player_away_stats.iloc[:, [2] + list(range(4, len(player_away_stats.columns)))]

# unique_positions = set()
# for i in range(len(trimmed_player_stats)):
#     if trimmed_player_stats.iloc[i]['POSITION'] not in unique_positions:
#         unique_positions.add(trimmed_player_stats.iloc[i]['POSITION'])
# print(unique_positions)

The idea I had from this is to group the positions by ID for each team , and average for an 'attackers average crosses', 'defender accurate crosses' etc, and so the list of 20 or so can be collapsed into a list of these 5 just placed next to each other. 

In [14]:
# You get the jist, I hope. These last few cells will be very demanding to run locally, so probably dont do that.
grouped_player_stats = trimmed_player_home_stats_and_positions.groupby(['ID', 'POSITION']).mean()

In [15]:
grouped_player_stats.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,PLAYER_ACCURATE_CROSSES_season_sum,PLAYER_ACCURATE_PASSES_season_sum,PLAYER_AERIALS_WON_season_sum,PLAYER_ASSISTS_season_sum,PLAYER_BIG_CHANCES_CREATED_season_sum,PLAYER_BIG_CHANCES_MISSED_season_sum,PLAYER_BLOCKED_SHOTS_season_sum,PLAYER_CAPTAIN_season_sum,PLAYER_CLEARANCES_season_sum,PLAYER_CLEARANCE_OFFLINE_season_sum,...,PLAYER_STARTING_LINEUP_5_last_match_std,PLAYER_SUCCESSFUL_DRIBBLES_5_last_match_std,PLAYER_TACKLES_5_last_match_std,PLAYER_TOTAL_CROSSES_5_last_match_std,PLAYER_TOTAL_DUELS_5_last_match_std,PLAYER_YELLOWCARDS_5_last_match_std,PLAYER_PUNCHES_5_last_match_std,PLAYER_LONG_BALLS_5_last_match_std,PLAYER_LONG_BALLS_WON_5_last_match_std,PLAYER_SHOTS_OFF_TARGET_5_last_match_std
ID,POSITION,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,defender,4.0,27.166667,22.166667,7.0,5.333333,0.0,17.5,,34.833333,8.333333,...,28.0,19.4,31.0,12.6,42.2,80.4,,,,
0,goalkeeper,0.0,16.5,1.0,0.0,0.0,0.0,0.0,,8.0,0.0,...,0.0,0.0,0.0,0.0,1.5,0.0,,,,
0,midfielder,24.5,23.333333,9.166667,9.333333,12.333333,4.166667,8.166667,45.5,8.333333,0.0,...,33.833333,41.666667,40.0,24.333333,48.166667,25.666667,,,,
1,defender,5.6,57.2,32.4,8.0,2.4,3.2,26.4,100.0,48.0,10.0,...,0.0,9.4,10.0,7.6,23.6,30.8,,,,
1,goalkeeper,0.0,14.0,3.0,0.0,0.0,0.0,0.0,,3.5,0.0,...,0.0,0.0,0.0,0.0,1.5,0.0,,,,
1,midfielder,15.75,27.125,6.875,17.5,13.875,6.0,9.25,0.0,4.125,0.0,...,7.875,13.75,22.875,12.875,27.5,25.375,,,,
2,attacker,5.0,5.25,6.75,1.75,4.5,10.0,0.0,,1.25,0.0,...,51.333333,5.666667,5.666667,9.333333,35.0,0.0,,,,
2,defender,8.857143,22.714286,20.571429,1.0,1.714286,4.0,19.714286,,26.428571,0.0,...,64.714286,14.285714,25.428571,16.285714,34.857143,42.0,,,,
2,goalkeeper,0.0,11.5,2.0,0.0,0.0,0.0,0.0,,11.0,0.0,...,0.0,0.0,0.0,0.0,2.0,0.0,,,,
2,midfielder,11.2,17.9,17.8,5.1,5.4,2.1,6.5,3.333333,8.3,0.0,...,57.3,16.9,33.2,11.0,42.1,51.1,,,,


In [16]:
# here we make an array with 4x the number of columns with each of them next to eachother. Again, this is because random forest
# wants to take in one big long vector. Perhaps it is from this type of set that you would draw the final features with the most predictive  power. 
positions = ['defender', 'goalkeeper', 'midfielder', 'attacker']
new_columns = [f'{pos}_{col}' for pos in positions for col in trimmed_player_stats.columns if col not in ['ID', 'POSITION']]
result_df = pd.DataFrame(index=grouped_player_stats.index.levels[0], columns=new_columns)

# Iterate through each group and populate the result dataframe
for idx, row in grouped_player_stats.iterrows():
    id_value, position = idx
    pos_col_prefix = f'{position}_'
    for col in trimmed_player_stats.columns:
        if col not in ['ID', 'POSITION']:
            new_col_name = f'{pos_col_prefix}{col}'
            result_df.loc[id_value, new_col_name] = row[col]

# Fill any missing values with NaN (if some groups have fewer columns)
result_df = result_df.fillna(np.nan)

# Display the resulting dataframe
result_df.shape

KeyboardInterrupt: 

In [None]:
result_df.to_csv('training_home_player_stats.csv', index = True)
