In [1]:
import pandas as pd
import os
import sys
sys.path.append('../src')
import core as c

#### Dataset parameters

In [2]:
# Supported leagues and the code used in their raw file names:
#   Premier_league -> E0, La_liga -> SP1, Ligue_1 -> F1, Serie_A -> I1
league_name = "Premier_league"
code = "E0"

# Season start years; zero-padded to two digits when the raw file names are built.
# The range is half-open: the last season processed starts in LAST_SEASON_YEAR - 1.
FIRST_SEASON_YEAR = 5
LAST_SEASON_YEAR = 23

#### Number of last seasons used to calculate new features

In [3]:
# Window size: forwarded as `window=N` to c.data_transformation and embedded as the
# suffix of the derived feature column names (e.g. HPTS_avg_{N}).
# NOTE(review): the heading describes this as a number of seasons, but the window is
# applied to each season's frame separately — confirm the intended unit.
N = 5

#### Important columns 

In [4]:
# Columns kept from every transformed season frame: the raw per-match columns,
# then the derived features whose names end in the window size N, then the label.
selected_columns = [
    'Div', 'Date', 'HomeTeam', 'AwayTeam',
    'FTHG', 'FTAG', 'FTR',
    'HTHG', 'HTAG', 'HTR',
    'HS', 'AS', 'HST', 'AST',
    'HY', 'AY', 'HR', 'AR',
    *(
        f'{stem}_{N}'
        for stem in (
            'HPTS_avg', 'APTS_avg',
            'H_gd', 'A_gd',
            'H_eff', 'A_eff',
            'HST_avg', 'AST_avg',
        )
    ),
    'target',
]

#### Create the output folder if it does not exist

In [5]:
# Directory that receives the merged per-league CSV file.
output_folder = "../data/processed/"

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race
# of testing os.path.exists() before calling os.makedirs().
os.makedirs(output_folder, exist_ok=True)

#### Main function

In [6]:
def merge_all_seasons(file_names, selected_columns, output_folder, N):
    """Build one feature table per season and write them out as a single CSV.

    Each season file is read from ``../data/raw/{league_name}/``, passed through
    ``c.data_transformation`` (with ``window=N``) and reduced to
    ``selected_columns``. From the fourth file onwards, the three preceding
    season files are loaded as well and used to add the ``H_status`` and
    ``A_status`` columns and to apply ``c.calc_h2h_stats``. Earlier seasons do
    not get these extra columns, so they are left empty for those rows after
    the merge.

    Besides its arguments, the function reads the notebook-level names
    ``league_name``, ``code``, ``FIRST_SEASON_YEAR`` and ``LAST_SEASON_YEAR``
    (the last three only to build the output file name).

    Parameters
    ----------
    file_names : sequence of str
        Season file stems (without ``.csv``), in chronological order.
    selected_columns : list of str
        Columns kept from each transformed season frame.
    output_folder : str
        Directory the merged CSV is written to.
    N : int
        Window size forwarded to ``c.data_transformation``.

    Returns
    -------
    None
        The merged table is written to disk and the path is printed.
    """
    # Number of preceding seasons handed to the team-status and head-to-head helpers.
    history_seasons = 3
    raw_folder = f"../data/raw/{league_name}"

    def _read_season(name):
        # Load the raw CSV of one season; read fresh on every call so the helpers
        # in `c` never see a frame that an earlier call may have modified.
        return pd.read_csv(f"{raw_folder}/{name}.csv")

    season_frames = []
    for i, file_name in enumerate(file_names):
        transformed_df = c.data_transformation(_read_season(file_name), ["HST", "AST"], window=N)
        # Copy so the column assignments below do not write into a view.
        current_df = transformed_df[selected_columns].copy()

        if i >= history_seasons:
            last_seasons = [_read_season(file_names[j]) for j in range(i - history_seasons, i)]
            teams_status_dict = c.create_teams_status_dict(current_df, last_seasons)
            # Teams missing from the dict get None, as .get() returns for unknown keys.
            current_df['H_status'] = current_df['HomeTeam'].map(teams_status_dict.get)
            current_df['A_status'] = current_df['AwayTeam'].map(teams_status_dict.get)

            current_df = c.calc_h2h_stats(current_df, last_seasons)

        season_frames.append(current_df)

    # Concatenate once instead of growing a DataFrame inside the loop; with no
    # input files fall back to an empty frame, since pd.concat([]) raises.
    if season_frames:
        merged_df = pd.concat(season_frames, ignore_index=True)
    else:
        merged_df = pd.DataFrame()

    output_file = os.path.join(output_folder, f"{code}_seasons_{FIRST_SEASON_YEAR}_to_{LAST_SEASON_YEAR}.csv")
    merged_df.to_csv(output_file, index=False)
    print(f"Merged data saved to {output_file}")

In [7]:
# One raw-file stem per season: "<code>_<start>_<end>" with both years zero-padded
# to two digits, for every start year in [FIRST_SEASON_YEAR, LAST_SEASON_YEAR).
file_names_to_merge = [
    f"{code}_{year:02d}_{year + 1:02d}"
    for year in range(FIRST_SEASON_YEAR, LAST_SEASON_YEAR)
]
merge_all_seasons(
    file_names_to_merge,
    selected_columns,
    output_folder,
    N,
)

Merged data saved to ../data/processed/E0_seasons_5_to_23.csv
