In [3]:
import duckdb
import pandas as pd
import xml.etree.ElementTree as ET

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
#tabla paises 
pais = duckdb.sql('''SELECT name AS Nombre FROM enunciado_paises.csv''')
df_pais = pais.df()
df_pais

Unnamed: 0,Nombre
0,Belgium
1,England
2,France
3,Germany
4,Italy
5,Netherlands
6,Poland
7,Portugal
8,Scotland
9,Spain


In [4]:
#tabla jugador
jugador = duckdb.sql('''SELECT player_api_id AS ID, player_name AS Nombre FROM enunciado_jugadores.csv''')
df_jugador = jugador.df()
df_jugador

Unnamed: 0,ID,Nombre
0,505942,Aaron Appindangoye
1,155782,Aaron Cresswell
2,162549,Aaron Doran
3,30572,Aaron Galindo
4,23780,Aaron Hughes
...,...,...
11055,26357,Zoumana Camara
11056,111182,Zsolt Laczko
11057,36491,Zsolt Low
11058,35506,Zurab Khizanishvili


In [4]:
#tabla temporada, el id fue creado con pandas
temporada = duckdb.sql('''SELECT DISTINCT season AS Temporada FROM enunciado_partidos.csv''')
df_temporada = temporada.df()         # paso el objeto duckdb a df de pandas
df_temporada['ID'] = range(len(df_temporada),0,-1) # para cada valor de temporada en la nueva columna pongo ese id
df_temporada.to_csv('tabla_temporada', index=False)

In [5]:
# tabla liga, EL ID ESTA MAL, debería ser uno de estos [1,  1729,  4769,  7809, 10257, 13274, 15722, 17642, 19694,21518, 24558]
# pero como saber cual es cual? YA SE, gracias al country_id del partido
liga = duckdb.sql('''
                  SELECT DISTINCT l.name AS Nombre_liga, p.name AS Nombre_pais, part.league_id AS ID
                  FROM enunciado_liga.csv AS l
                  INNER JOIN enunciado_paises.csv AS p ON l.country_id = p.id
                  INNER JOIN enunciado_partidos.csv AS part ON l.country_id = part.country_id
                  ''')

df_liga = liga.df()
df_liga


Unnamed: 0,Nombre_liga,Nombre_pais,ID
0,Netherlands Eredivisie,Netherlands,13274
1,Belgium Jupiler League,Belgium,1
2,Spain LIGA BBVA,Spain,21518
3,Scotland Premier League,Scotland,19694
4,England Premier League,England,1729
5,Italy Serie A,Italy,10257
6,Portugal Liga ZON Sagres,Portugal,17642
7,France Ligue 1,France,4769
8,Germany 1. Bundesliga,Germany,7809
9,Switzerland Super League,Switzerland,24558


In [3]:
# Tabla Equipos
equipo = duckdb.sql('''
                    SELECT DISTINCT eq.team_api_id AS ID_equipo, eq.team_long_name AS Nombre, par.league_id AS ID_liga
                    FROM enunciado_equipos.csv AS eq
                    INNER JOIN enunciado_partidos.csv AS par ON eq.team_api_id = par.home_team_api_id
                    ''')

df_equipo = equipo.df()
#EXPLICACION: basicamente hice el join en base a la liga donde ese equipo jugó de local, podria pasar que no hay jugado de local?
# según lo que estuve viendo no, porque me unió a todos los equipos
df_equipo.to_csv('tabla_equipo', index=False)


In [None]:
goleador = duckdb.sql('''
                      SELECT DISTINCT match_api_id AS ID_partido, goal AS Goles
                      FROM enunciado_partidos.csv
                      ''')

df_goleador = goleador.df().dropna(subset=['Goles'])
goles = df_goleador["Goles"]
#print(f"Hay {len(goles)} filas")
columna_final = []

for gol in goles:
    # Parsear el string XML
    root = ET.fromstring(gol)
    # Función para convertir XML a diccionario
    def xml_to_dict(element):
        result = {}
        if element.tag == "goal":
            result["values"] = []  # Inicializa la lista de valores
            for value in element.findall('value'):
                value_dict = xml_to_dict(value)  # Convierte cada valor a dict
                result["values"].append(value_dict)  # Agrega el dict a la lista
        else:
            for child in element:
                # Llamar recursivamente si hay más hijos
                result[child.tag] = xml_to_dict(child) if len(child) > 0 else child.text
        return result
    # Convertir el XML a diccionario
    goal_dict = xml_to_dict(root)
    # Appendeamos
    columna_final.append(goal_dict)
    



In [None]:
for goles_en_partido in columna_final:
    for gol in goles_en_partido['values']:

        print(gol.keys())
        gol['player1'] # id_jugador
        print(gol['stats'])
        if gol['stats']['goals'] != 1:
            print(f"Hay {gol['stats']['goals']} goles en una llamada.")

In [14]:
#Tabla Plantel
# Acá hice un cross join, suponiendo que todos los equipos jugaron todas las temporadas...
plantel = duckdb.sql('''
                     SELECT t.Temporada AS Temporada, eq.ID_equipo
                     FROM tabla_equipo.csv AS eq
                     CROSS JOIN tabla_temporada.csv AS t
                     ''')

df_plantel = plantel.df()
df_plantel['ID_plantel'] = range(0,len(df_plantel),1)
df_plantel.to_csv('plantel.csv', index=False)

In [29]:
# Conformación plantel
conf_plantel = duckdb.sql('''
                          SELECT DISTINCT ID_plantel, home_player_1 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_2 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_3 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_4 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_5 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_6 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_7 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_8 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_9 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_10 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, home_player_11 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND home_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_1 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_2 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_3 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_4 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_5 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_6 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_7 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_8 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_9 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_10 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          UNION
                          SELECT DISTINCT ID_plantel, away_player_11 AS ID_jugador
                          FROM enunciado_partidos.csv, plantel.csv
                          WHERE season = Temporada AND away_team_api_id = ID_equipo
                          ''')

# una vez que hago todos estos simplemente hago un join y listo
conformacion_plantel = conf_plantel.df()
conformacion_plantel = conformacion_plantel.dropna(subset=['ID_jugador']) # acá borré los nulls para poder aplicar la funcion para sacar ese id.0 que aparece en todos los jugadores de partidos
conformacion_plantel['ID_jugador'] = conformacion_plantel['ID_jugador'].apply(lambda x: int(x)) # saco esos .0
conformacion_plantel.to_csv('conformacion_plantel.csv',index=False)


# agarro la tabla plantel y digo: si en partidos coincide el season con id_temporada Y local con equipo SELECT id_plantel con id_home player

In [12]:
# Tabla Partidos
partidos = duckdb.sql('''
                      SELECT DISTINCT match_api_id AS ID_partido, date AS Fecha, home_team_api_id AS ID_local, away_team_api_id AS ID_visitante, home_team_goal AS Goles_local, away_team_goal AS Goles_visitante,
                      CASE WHEN home_team_goal > away_team_goal THEN 'Ganado'
                      WHEN away_team_goal > home_team_goal THEN 'Perdido'
                      ELSE 'Empatado'
                      END AS Resultado
                      FROM enunciado_partidos.csv, plantel.csv
                    
                      ''')
partidos = partidos.df()
partidos.to_csv('tabla_partidos.csv')

Unnamed: 0,ID_partido,Fecha,ID_local,ID_visitante,Goles_local,Goles_visitante,Resultado
0,839878,2010-10-03,8456,10261,2,1,Ganado
1,839881,2010-10-02,10194,8655,1,0,Ganado
2,1024675,2011-10-29,8668,10260,0,1,Perdido
3,1024753,2011-11-05,8650,10003,0,0,Empatado
4,1024765,2011-11-05,10261,8668,2,1,Ganado
...,...,...,...,...,...,...,...
25974,530466,2009-02-01,8371,8661,1,0,Ganado
25975,530476,2009-02-08,8634,9869,3,1,Ganado
25976,530487,2009-02-15,9865,10281,3,2,Ganado
25977,530493,2009-02-21,8634,8558,1,2,Perdido
