In [1]:
import numpy as np
import polars as pl
import os

In [43]:
seasons = [str(x) + '-' + str(x+1) for x in range(2003, 2023)]
seasons_path = ['datasets/' + season + '.csv' for season in seasons]

seasons_dict = dict(zip(seasons, seasons_path))

for season, season_path in seasons_dict.items():
    seasons_dict[season] = pl.read_csv(season_path, truncate_ragged_lines=True, encoding='ISO-8859-1')

In [44]:
# Obtener lista de listas de columnas de cada dataset:
datasets_columns_list = []

for season_df in seasons_dict.values():
    datasets_columns_list.append(season_df.columns)

# Convertir cada lista de columnas en un conjunto: 
list_of_columns_sets = map(set, datasets_columns_list)

# Obtener listado de columnas comunes en todos los datasets:
from functools import reduce
common_columns_list = reduce(set.intersection, list_of_columns_sets)

# Ordenar lista de columnas basado en uno de los dataframes:
common_columns_list = sorted(common_columns_list, key=lambda x: seasons_dict[seasons[0]].columns.index(x))

In [58]:
for list in datasets_columns_list:
    print(len(list))

57
57
68
68
71
71
71
71
71
74
68
68
65
65
65
62
106
106
106
106


In [57]:
# Crear una lista de los datasets a unir verticalmente:
common_columns_datasets_dict = {season:season_df[common_columns_list] for season, season_df in seasons_dict.items()}

for season, season_df in common_columns_datasets_dict.items():
    print(season, ': ', season_df.dtypes)

2003-2004 :  [Utf8, Utf8, Utf8, Utf8, Int64, Int64, Utf8, Int64, Int64, Utf8, Utf8, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64]
2004-2005 :  [Utf8, Utf8, Utf8, Utf8, Int64, Int64, Utf8, Int64, Int64, Utf8, Utf8, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64]
2005-2006 :  [Utf8, Utf8, Utf8, Utf8, Int64, Int64, Utf8, Int64, Int64, Utf8, Utf8, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64]
2006-2007 :  [Utf8, Utf8, Utf8, Utf8, Int64, Int64, Utf8, Int64, Int64, Utf8, Utf8, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64]
2007

In [55]:
seasons_dict

{'2003-2004': shape: (380, 57)
 ┌─────┬──────────┬────────────┬───────────────┬───┬───────┬─────────┬─────────┬────────┐
 │ Div ┆ Date     ┆ HomeTeam   ┆ AwayTeam      ┆ … ┆ LBAH  ┆ B365AHH ┆ B365AHA ┆ B365AH │
 │ --- ┆ ---      ┆ ---        ┆ ---           ┆   ┆ ---   ┆ ---     ┆ ---     ┆ ---    │
 │ str ┆ str      ┆ str        ┆ str           ┆   ┆ f64   ┆ f64     ┆ f64     ┆ f64    │
 ╞═════╪══════════╪════════════╪═══════════════╪═══╪═══════╪═════════╪═════════╪════════╡
 │ E0  ┆ 16/08/03 ┆ Arsenal    ┆ Everton       ┆ … ┆ -1.25 ┆ 1.975   ┆ 1.925   ┆ -1.25  │
 │ E0  ┆ 16/08/03 ┆ Birmingham ┆ Tottenham     ┆ … ┆ 0.0   ┆ 2.1     ┆ 1.8     ┆ -0.25  │
 │ E0  ┆ 16/08/03 ┆ Blackburn  ┆ Wolves        ┆ … ┆ -0.75 ┆ 1.95    ┆ 1.95    ┆ -0.75  │
 │ E0  ┆ 16/08/03 ┆ Fulham     ┆ Middlesbrough ┆ … ┆ 0.0   ┆ 2.1     ┆ 1.8     ┆ -0.25  │
 │ …   ┆ …        ┆ …          ┆ …             ┆ … ┆ …     ┆ …       ┆ …       ┆ …      │
 │ E0  ┆ 15/05/04 ┆ Liverpool  ┆ Newcastle     ┆ … ┆ null  ┆ 2.1     

In [46]:
common_columns_datasets_dict

{'2022-2023': shape: (380, 32)
 ┌─────┬────────────┬────────────────┬─────────────┬───┬──────┬──────┬──────┬──────┐
 │ Div ┆ Date       ┆ HomeTeam       ┆ AwayTeam    ┆ … ┆ IWA  ┆ WHH  ┆ WHD  ┆ WHA  │
 │ --- ┆ ---        ┆ ---            ┆ ---         ┆   ┆ ---  ┆ ---  ┆ ---  ┆ ---  │
 │ str ┆ str        ┆ str            ┆ str         ┆   ┆ f64  ┆ f64  ┆ f64  ┆ f64  │
 ╞═════╪════════════╪════════════════╪═════════════╪═══╪══════╪══════╪══════╪══════╡
 │ E0  ┆ 05/08/2022 ┆ Crystal Palace ┆ Arsenal     ┆ … ┆ 1.85 ┆ 4.4  ┆ 3.5  ┆ 1.83 │
 │ E0  ┆ 06/08/2022 ┆ Fulham         ┆ Liverpool   ┆ … ┆ 1.27 ┆ 12.0 ┆ 5.5  ┆ 1.27 │
 │ E0  ┆ 06/08/2022 ┆ Bournemouth    ┆ Aston Villa ┆ … ┆ 2.05 ┆ 3.75 ┆ 3.3  ┆ 2.05 │
 │ E0  ┆ 06/08/2022 ┆ Leeds          ┆ Wolves      ┆ … ┆ 3.15 ┆ 2.25 ┆ 3.3  ┆ 3.2  │
 │ …   ┆ …          ┆ …              ┆ …           ┆ … ┆ …    ┆ …    ┆ …    ┆ …    │
 │ E0  ┆ 28/05/2023 ┆ Leeds          ┆ Tottenham   ┆ … ┆ 2.45 ┆ 2.6  ┆ 3.5  ┆ 2.2  │
 │ E0  ┆ 28/05/2023 ┆ Leicester   

In [47]:

# Realiza la unión vertical de los DataFrames usando extend
df = pl.DataFrame(schema=common_columns_list)
for season_df in common_columns_datasets_dict.values():
    df = df.extend(season_df)
df

ShapeError: unable to vstack, dtypes for column "Div" don't match: `f32` and `str`

In [None]:
# Crear un DataFrame vacío para almacenar los datos combinados
df = pl.DataFrame(None)

# Iterar a través de la lista de archivos CSV y cargarlos en DataFrames
for file_path in seasons_data:
    season_data = pl.read_csv(file_path, truncate_ragged_lines=True)
    
    # Unir el DataFrame actual con el DataFrame combinado
    if df is None:
        df = season_data
    else:
        df = df.extend(season_data)
