In [2]:
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from mplsoccer import VerticalPitch, Pitch
from highlight_text import ax_text, fig_text
import matplotlib.pyplot as plt

from joblib import Parallel, delayed
from tqdm import tqdm

In [3]:
# Load the event table from the CSV file that sits next to the notebook.
matches = pd.read_csv(filepath_or_buffer="WhoScored_ENG-Premier League.csv")

In [13]:
# Keep only the rows of a single game to experiment with.
# Other game ids noted by the author: 1729340, 1729483, 615168
example_match = matches.loc[matches["game_id"] == 410988]

In [5]:
# Show which event types and outcome labels occur in the example match.
for column_name in ("type", "outcome_type"):
    print(example_match[column_name].unique())

['End' 'FormationSet' 'Start' 'Pass' 'Interception' 'Aerial' 'Clearance'
 'BallRecovery' 'Challenge' 'TakeOn' 'Foul' 'MissedShots' 'BallTouch'
 'Dispossessed' 'Tackle' 'CornerAwarded' 'GoodSkill' 'SavedShot' 'Save'
 'KeeperPickup' 'OffsideProvoked' 'OffsidePass' 'OffsideGiven' 'Claim'
 'ShieldBallOpp' 'SubstitutionOn' 'SubstitutionOff' 'Card' 'Goal'
 'FormationChange']
['Successful' 'Unsuccessful']


In [29]:
# Event types that are turned into per-zone count features.
# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python into a single (non-existent) event type.
_FEATURE_EVENT_TYPES = (
    'Pass', 'TakeOn', 'Tackle', 'BallRecovery',
    'Interception', 'BlockedPass', 'Clearance', 'CornerAwarded', 'Aerial', 'Foul',
    'Dispossessed', 'BallTouch', 'ShieldBallOpp', 'Challenge', 'MissedShots',
    'SavedShot', 'Save', 'KeeperPickup', 'Goal', 'KeeperSweeper', 'Card', 'Smother',
    'OffsideProvoked', 'OffsidePass', 'OffsideGiven', 'Punch', 'Error',
)

# Outcome labels, in the order their columns are written for each bin.
_OUTCOME_LABELS = ("Successful", "Unsuccessful")


def _player_feature_row(player, player_events, pitch, bins_x, bins_y):
    """Build one feature row (a dict) of per-zone event counts for a player."""
    row = {"Player": player}
    for fea in _FEATURE_EVENT_TYPES:
        events_of_type = player_events[player_events["type"] == fea]

        # One flattened grid of counts per outcome label.
        counts = {}
        for outcome in _OUTCOME_LABELS:
            subset = events_of_type[events_of_type["outcome_type"] == outcome]
            binned = pitch.bin_statistic(
                subset.x, subset.y, statistic="count",
                bins=(bins_x, bins_y), normalize=False)
            counts[outcome] = binned["statistic"].flatten()

        # Interleave Successful/Unsuccessful columns bin by bin.
        paired = zip(counts["Successful"], counts["Unsuccessful"])
        for i, (n_successful, n_unsuccessful) in enumerate(paired):
            row[f"{fea}_Successful_bin_{i}"] = n_successful
            row[f"{fea}_Unsuccessful_bin_{i}"] = n_unsuccessful
    return row


def _team_feature_frame(history, players, pitch, bins_x, bins_y):
    """Stack the feature rows of all given players into one DataFrame."""
    rows = [
        _player_feature_row(
            player, history[history["player"] == player], pitch, bins_x, bins_y)
        for player in players
    ]
    return pd.DataFrame(rows)


def get_features_individuales(df_matches, match, bins_x, bins_y):
    """Build per-player historical zone-count features for both teams of a match.

    For every player that appears in ``match``, all events of that player in
    ``df_matches`` dated strictly before the match are counted per event type,
    per outcome ("Successful"/"Unsuccessful") and per pitch zone.

    Parameters
    ----------
    df_matches : pandas.DataFrame
        Event table; the columns "fecha", "player", "type", "outcome_type",
        "x" and "y" are read.
    match : pandas.DataFrame
        Events of the single match of interest; the columns "fecha", "game",
        "team", "player" and "type" are read. The first "game" value must
        look like "<prefix> <team1>-<team2>": everything after the first
        space is split on "-" to get the two team names, so a team name that
        itself contains "-" will be parsed incorrectly.
    bins_x, bins_y : int
        Number of zones along each pitch axis, passed to
        ``Pitch.bin_statistic`` as ``bins=(bins_x, bins_y)``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        One frame per team (team1 first), one row per player, with columns
        "Player", "<type>_<outcome>_bin_<i>" for every feature type, outcome
        and flattened bin index, plus "Team", "Date" and "Goals".
        "Goals" is the number of rows in ``match`` with type "Goal" whose
        "team" is that team (how own goals are labelled is not visible
        here - TODO confirm against the data source).
    """
    date = match["fecha"].unique()[0]

    # Only use events from before the match so no information from the match
    # itself (or later ones) ends up in the features.
    df_matches_limited = df_matches[df_matches["fecha"] < date]

    fixture_parts = " ".join(match["game"].iloc[0].split(" ")[1:]).split("-")
    str_team1 = fixture_parts[0]
    str_team2 = fixture_parts[1]

    # The pitch object is only used for binning, so one instance serves all
    # players of both teams.
    pitch = Pitch(pitch_type='opta', pitch_color='white',
                  line_color='black', line_zorder=2)

    team_frames = []
    for team_name in (str_team1, str_team2):
        is_team_row = match["team"] == team_name
        players = match.loc[is_team_row, "player"].dropna().unique()

        team_frame = _team_feature_frame(
            df_matches_limited, players, pitch, bins_x, bins_y)

        # Team name, match date and the team's goal count in this match.
        team_frame["Team"] = team_name
        team_frame["Date"] = date
        team_frame["Goals"] = len(match[(match["type"] == "Goal") & is_team_row])
        team_frames.append(team_frame)

    return team_frames[0], team_frames[1]

# Build both teams' feature tables for the example match on a 6x6 grid.
df_all_players_team1, df_all_players_team2 = get_features_individuales(
    df_matches=matches,
    match=example_match,
    bins_x=6,
    bins_y=6,
)

In [32]:
# Grid resolution for the zone counts and number of joblib workers.
bins_x, bins_y = 6, 6
n_workers = 12

# Every distinct game id in the event table, and the list that collects results.
matches_id = matches["game_id"].unique()
resultado = []

# Serial run (disabled)
# for id in tqdm(matches_id):
#     data = get_features_individuales(matches, matches[matches["game_id"] == id], bins_x, bins_y)
#     resultado.append(data)

# Parallel run (disabled)
# resultado = Parallel(n_jobs=n_workers, verbose=0)(
#     delayed(get_features_individuales)(matches[matches["fecha"] <= matches[matches["game_id"] == id]["fecha"].unique()[0]], matches[matches["game_id"] == id], bins_x, bins_y) for id in tqdm(matches_id))

  4%|▍         | 191/4810 [01:08<26:53,  2.86it/s]  

KeyboardInterrupt: 