In [2]:
import soccerdata as sd
from joblib import Parallel, delayed
import pandas as pd
from tqdm import tqdm

In [None]:
# List the league identifiers that the WhoScored reader accepts.
print(sd.WhoScored.available_leagues())

In [None]:
# Serial download: fetch every event of one league in a single process.
leagues = sd.WhoScored.available_leagues()[2]  # third league in the available-leagues list
seasons = list(range(2010, 2024))  # 2010 through 2023 inclusive

ws = sd.WhoScored(leagues=leagues, seasons=seasons, no_cache=True)
epl_schedule = ws.read_schedule().reset_index()
game_ids = epl_schedule["game_id"]

# Plain-list copy of the ids; it is not consumed within this cell.
trabajo = list(game_ids)

events = ws.read_events(match_id=game_ids)

In [None]:
# Save the serially downloaded events to a per-league CSV.
# `leagues` holds a single league-name string, so indexing it with [2] would
# yield only its third character; use the whole name in the file name.
events.to_csv(f"WhoScored_{leagues}.csv", header=True)

In [None]:
# Parallel download: settings and schedule for the target league.
league = "FRA-Ligue 1"
seasons = list(range(2010, 2024))  # 2010 through 2023 inclusive

ws = sd.WhoScored(leagues=league, seasons=seasons, no_cache=True)
schedule = ws.read_schedule().reset_index()
game_ids = schedule["game_id"]

# Split the game ids into `workers` consecutive chunks of equal size.
trabajo = list(game_ids)
workers = 36
tamaño_sublistas, resto = divmod(len(trabajo), workers)
division_work = [
    trabajo[k * tamaño_sublistas:(k + 1) * tamaño_sublistas]
    for k in range(workers)
]
# The remainder (fewer than `workers` ids at the tail of the list) is appended
# to the first chunk, last id first.
division_work[0].extend(trabajo[len(trabajo) - resto:][::-1])


def read_parallel(game_ids, leagues, seasons):
    """Download the events of one batch of games with its own WhoScored reader.

    A new reader is created on every call, so each call is independent of the
    reader objects used elsewhere in the notebook.

    Args:
        game_ids: Game ids to fetch; forwarded as `match_id` to `read_events`.
        leagues: League selection forwarded to `sd.WhoScored`.
        seasons: Season selection forwarded to `sd.WhoScored`.

    Returns:
        Whatever `read_events` returns for the batch, or an empty
        `pandas.DataFrame` if creating the reader or reading the events
        raised an exception.
    """
    try:
        ws = sd.WhoScored(leagues=leagues, seasons=seasons, no_cache=True)
        return ws.read_events(match_id=game_ids)
    except Exception as exc:
        # Best-effort by design: one failing batch must not abort the others.
        # Catch `Exception` rather than using a bare `except` so that
        # KeyboardInterrupt/SystemExit still stop the run, and report the
        # error so an empty result can be traced back to its cause.
        print(f"read_parallel: batch failed with {exc!r}; returning an empty DataFrame")
        return pd.DataFrame()

# Dispatch one `read_parallel` task per chunk to a pool of joblib workers.
n_workers = 12
pool = Parallel(n_jobs=n_workers, verbose=10)
resultado = pool(
    delayed(read_parallel)(chunk, league, seasons) for chunk in division_work
)


In [None]:
# Save: merge the per-chunk frames into one table and write it to disk.
# Concatenate once instead of growing `df` inside a loop, which re-copies all
# accumulated rows on every iteration. Empty frames (the placeholder returned
# for a failed chunk) carry no rows, so they are skipped.
frames = [frame for frame in resultado if not frame.empty]
df = pd.concat(frames) if frames else pd.DataFrame()
df = df.reset_index()
df.to_csv(f"WhoScored_{league}.csv", header=True)

In [None]:
# Sanity check: print the number of event rows each chunk returned.
# Iterate over `resultado` itself rather than a hard-coded count, so the check
# keeps working if the number of chunks changes.
for i, chunk_events in enumerate(resultado):
    print(f"Id: {i}, {len(chunk_events)}")

In [None]:
# Display the raw per-chunk results collected in `resultado`.
resultado

In [None]:
# Preview the first rows of `df`.
df.head()

In [None]:
# Fixing errors: re-read, in a single process, the games of selected chunks.

# Part of the fix: chunks 9 and 20 are retried (presumably the ones that came
# back empty) while leaving out the game id listed as problematic.
set_problematicos = [9, 20]
ids_con_problmas = [1076372]
ids_excluidos = set(ids_con_problmas)

# Flattened list of the remaining ids; each chunk is de-duplicated via a set.
problem_division_work = [
    game_id
    for chunk_index in set_problematicos
    for game_id in set(division_work[chunk_index]) - ids_excluidos
]

ws = sd.WhoScored(leagues=league, seasons=seasons, no_cache=True)
events = ws.read_events(match_id=problem_division_work)


In [None]:
# Inspect the `events` frame.
events

In [None]:
# Merge the re-read events into the table previously saved for this league.
# index_col=0: the CSV was written with its row index as an unnamed first
# column. Reading it back as the index keeps it from becoming an
# "Unnamed: 0" data column that would be duplicated on the next save.
df = pd.read_csv(f"WhoScored_{league}.csv", index_col=0)
# `events` is not rebound, so re-running this cell does not stack extra
# index columns onto it. ignore_index gives the merged table a fresh,
# non-repeating row index.
df = pd.concat([df, events.reset_index()], ignore_index=True)

In [None]:
# Overwrite the league CSV with the current contents of `df`.
df.to_csv(f"WhoScored_{league}.csv", header=True)

In [3]:
# Before manipulating the data further, check that everything is correct.
leagues_to_sort = ["ENG-Premier League", "ESP-La Liga",
                   "FRA-Ligue 1", "GER-Bundesliga", "ITA-Serie A"]

# Load the saved CSV of the first league in the list.
df = pd.read_csv(f"WhoScored_{leagues_to_sort[0]}.csv")

In [33]:
leagues_to_sort = [
    "ENG-Premier League",
    "ESP-La Liga",
    "FRA-Ligue 1",
    "GER-Bundesliga",
    "ITA-Serie A",
]

# Rewrite each league file sorted by date, game id, minute and second.
for league_name in leagues_to_sort:
    csv_path = f"WhoScored_{league_name}.csv"
    df = pd.read_csv(csv_path)

    # The first 10 characters of 'game' are used as the date ('fecha').
    df['fecha'] = df['game'].str[:10]

    # Put 'fecha' first and discard the index column saved by the previous write.
    other_columns = [column for column in df.columns if column != 'fecha']
    df = df[['fecha', *other_columns]].drop(columns='Unnamed: 0')

    df = df.sort_values(
        by=["fecha", "game_id", "minute", "second"],
        ignore_index=True,
    )
    df.to_csv(csv_path, header=True)


In [34]:
# Display `df`; after the sorting loop it holds the last league processed.
df

Unnamed: 0,fecha,league,season,game,id,game_id,period,minute,second,expanded_minute,...,goal_mouth_z,blocked_x,blocked_y,qualifiers,is_touch,is_shot,is_goal,card_type,related_event_id,related_player_id
0,2010-08-28,ITA-Serie A,1011,2010-08-28 Udinese-Genoa,7.495863e+07,443471,FirstHalf,0,0.0,0,...,,,,[],False,,,,,
1,2010-08-28,ITA-Serie A,1011,2010-08-28 Udinese-Genoa,1.545439e+08,443471,PreMatch,0,0.0,0,...,,,,[{'type': {'displayName': 'TeamPlayerFormation...,False,,,,,
2,2010-08-28,ITA-Serie A,1011,2010-08-28 Udinese-Genoa,2.607548e+08,443471,FirstHalf,0,0.0,0,...,,,,[],False,,,,,
3,2010-08-28,ITA-Serie A,1011,2010-08-28 Udinese-Genoa,9.352677e+08,443471,PreMatch,0,0.0,0,...,,,,"[{'type': {'displayName': 'InvolvedPlayers', '...",False,,,,,
4,2010-08-28,ITA-Serie A,1011,2010-08-28 Udinese-Genoa,1.450992e+09,443471,PostGame,0,0.0,7,...,,,,[],False,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7509774,2024-02-23,ITA-Serie A,2324,2024-02-23 Bologna-Verona,2.649815e+09,1746291,SecondHalf,92,56.0,100,...,,,,[],False,,,,,
7509775,2024-02-23,ITA-Serie A,2324,2024-02-23 Bologna-Verona,2.649815e+09,1746291,SecondHalf,92,57.0,100,...,,,,"[{'type': {'displayName': 'Zone', 'value': 56}...",True,,,,,
7509776,2024-02-23,ITA-Serie A,2324,2024-02-23 Bologna-Verona,2.649815e+09,1746291,SecondHalf,92,58.0,100,...,,,,"[{'type': {'displayName': 'StandingSave', 'val...",True,,,,,
7509777,2024-02-23,ITA-Serie A,2324,2024-02-23 Bologna-Verona,2.649815e+09,1746291,SecondHalf,93,3.0,101,...,,,,[],False,,,,,
