In [None]:
# import libraries necessary for preprocessing
import pandas as pd
import numpy as np
import sqlite3

from sklearn.model_selection import train_test_split

In [None]:
# Open a connection to the SQLite database.
con = sqlite3.connect("../laliga.sqlite")
try:
    # Read the whole Matches table into a pandas DataFrame.
    df_matches = pd.read_sql_query("SELECT * from Matches", con)
finally:
    # Close the connection even when the query raises, so the handle never leaks.
    con.close()

In [None]:
def count_points(last_5):
    """Return the points earned over a sequence of match results.

    Every 'W' entry is worth 3 points and every 'T' entry is worth 1 point;
    any other entry contributes nothing. The value 0 is accepted in place of
    a sequence and yields 0 points.
    """
    # 0 is used as a "no results" placeholder instead of an iterable.
    if last_5 == 0:
        return 0

    return sum(
        3 if outcome == 'W' else 1 if outcome == 'T' else 0
        for outcome in last_5
    )

def _points_over_last_5_matchdays(results):
    """Return, for each row of *results*, the points won in matchdays [m - 4, m].

    *results* must hold one row per (season, division, team, matchday) with a
    'result' column containing 'W', 'T' or 'L'. Matchdays below 1 never count
    towards the window. The returned integer array is aligned positionally
    with the rows of *results*.
    """
    team_keys = ['season', 'division', 'team']
    lookup_keys = team_keys + ['matchday']

    form = results[lookup_keys].copy()
    form['match_pts'] = results['result'].map({'W': 3, 'T': 1, 'L': 0})
    form = form[form['matchday'] >= 1]

    total = np.zeros(len(results))
    for lag in range(5):
        # Moving a match forward by `lag` matchdays lets a plain keyed lookup
        # fetch "the result from `lag` matchdays ago" for every row at once.
        lagged = form.copy()
        lagged['matchday'] = lagged['matchday'] + lag
        looked_up = results[lookup_keys].merge(lagged, how='left', on=lookup_keys)
        total += looked_up['match_pts'].fillna(0).to_numpy()
    return total.astype(int)


def extend_data(input_data):
    """Build a per-match feature table from the raw matches table.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Must contain the columns 'season', 'division', 'matchday', 'time',
        'date', 'home_team', 'away_team' and 'score', where 'score' is a
        string of the form '<home goals>:<away goals>'. The frame is not
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per match that has no missing values (after 'time' and 'date'
        are dropped), with these columns added:

        * 'home_score', 'away_score' - integer goals parsed from 'score'
          (the 'score' column itself is removed).
        * 'winner' - 1 for a home win, 2 for an away win, 0 for a draw.
        * 'prev_<stat>_home' / 'prev_<stat>_away' - cumulative season stats
          (rank, GF, GA, GD, W, L, T, Pts), the single-matchday result
          ('W'/'L'/'T') and the points from the last five matchdays, all taken
          from the matchday numbered one lower for the respective team. Every
          one of these columns is 0 when the team has no row for that
          matchday.
        * 'GDD' - prev_GD_home minus prev_GD_away.
        * 'head_to_head_last_5' - share of home wins, out of 5, over a rolling
          window of up to five matches with the same (home_team, away_team)
          pairing, in row order.
    """
    data = input_data.copy()

    # 'time' contains too many missing values and 'date' is not used.
    data.drop(columns=['time', 'date'], inplace=True)

    # Remove every row that still has a missing value (e.g. no score yet).
    data.dropna(inplace=True)

    # Split "H:A" into two integer goal columns and discard the raw string.
    data[['home_score', 'away_score']] = data['score'].str.split(':', expand=True).astype(int)
    data.drop(columns=['score'], inplace=True)

    home_win = data['home_score'] > data['away_score']
    away_win = data['home_score'] < data['away_score']
    data['winner'] = np.select(
        [home_win.to_numpy(), away_win.to_numpy()], [1, 2], default=0
    ).astype('int64')

    match_keys = ['season', 'division', 'matchday']

    # Indicator columns let the groupby use the built-in 'sum' instead of
    # per-group Python lambdas.
    outcomes = data[match_keys + ['home_team', 'away_team', 'home_score', 'away_score']].copy()
    outcomes['home_win'] = home_win.astype(int)
    outcomes['away_win'] = away_win.astype(int)
    outcomes['draw'] = (~(home_win | away_win)).astype(int)

    # Per-matchday stats seen from the home side and from the away side.
    home_stats = outcomes.groupby(match_keys + ['home_team']).agg(
        GF=('home_score', 'sum'), GA=('away_score', 'sum'),
        W=('home_win', 'sum'), L=('away_win', 'sum'), T=('draw', 'sum'),
    ).reset_index().rename(columns={'home_team': 'team'})

    away_stats = outcomes.groupby(match_keys + ['away_team']).agg(
        GF=('away_score', 'sum'), GA=('home_score', 'sum'),
        W=('away_win', 'sum'), L=('home_win', 'sum'), T=('draw', 'sum'),
    ).reset_index().rename(columns={'away_team': 'team'})

    # Combine both perspectives into one row per team and matchday.
    combined_stats = pd.concat([home_stats, away_stats], ignore_index=True)
    combined_stats['GD'] = combined_stats['GF'] - combined_stats['GA']
    combined_stats['Pts'] = combined_stats['W'] * 3 + combined_stats['T']
    # Kept as an object column because missing values are later filled with
    # the integer 0.
    combined_stats['result'] = pd.Series(
        np.select(
            [(combined_stats['W'] > 0).to_numpy(), (combined_stats['L'] > 0).to_numpy()],
            ['W', 'L'],
            default='T',
        ),
        index=combined_stats.index,
        dtype=object,
    )

    results = combined_stats.groupby(match_keys + ['team']).agg(
        GF=('GF', 'sum'), GA=('GA', 'sum'), GD=('GD', 'sum'),
        W=('W', 'sum'), L=('L', 'sum'), T=('T', 'sum'),
        Pts=('Pts', 'sum'), result=('result', 'first'),
    ).reset_index()

    # Turn the per-matchday numbers into running season totals. This sort is
    # kept as it determines how exact ties are ordered in the ranking below.
    totals = ['GF', 'GA', 'GD', 'W', 'L', 'T', 'Pts']
    results = results.sort_values(by=match_keys + ['Pts', 'GD', 'GF'])
    results[totals] = results.groupby(['season', 'division', 'team'])[totals].cumsum()

    # Rank teams within each matchday by points, goal difference, goals scored.
    results = results.sort_values(
        by=match_keys + ['Pts', 'GD', 'GF'],
        ascending=[True, True, True, False, False, False],
    )
    results['rank'] = results.groupby(match_keys).cumcount() + 1
    results = results.sort_values(
        by=match_keys + ['rank'], ascending=[False, True, True, True]
    ).reset_index(drop=True)

    results = results[['season', 'division', 'matchday', 'rank', 'team',
                       'GF', 'GA', 'GD', 'W', 'L', 'T', 'Pts', 'result']]

    # Points collected over the current and the four preceding matchdays.
    results['last_5'] = _points_over_last_5_matchdays(results)

    # Shift by one matchday so a match only sees stats from before it started.
    results_shifted = results.copy()
    results_shifted['matchday'] += 1
    results_shifted = results_shifted.rename(columns={
        'rank': 'prev_rank',
        'GF': 'prev_GF', 'GA': 'prev_GA', 'GD': 'prev_GD',
        'W': 'prev_W', 'L': 'prev_L', 'T': 'prev_T', 'Pts': 'prev_Pts',
        'result': 'prev_result', 'last_5': 'prev_last_5',
    })
    stat_columns = [name for name in results_shifted.columns if name.startswith('prev_')]

    # Attach the previous-matchday stats once for the home and once for the
    # away team, suffixing the columns accordingly.
    for side in ('home', 'away'):
        side_stats = results_shifted.rename(
            columns={name: name + '_' + side for name in stat_columns}
        )
        data = data.merge(
            side_stats,
            how='left',
            left_on=match_keys + [side + '_team'],
            right_on=match_keys + ['team'],
        )
        # 'team' duplicates the home/away team column used for the lookup.
        data.drop(columns=['team'], inplace=True)
        # Only the merged-in columns can hold missing values at this point.
        merged_columns = [name + '_' + side for name in stat_columns]
        data[merged_columns] = data[merged_columns].fillna(0)

    data['GDD'] = data['prev_GD_home'] - data['prev_GD_away']

    # NOTE(review): the rolling window includes the current match, so this
    # feature contains the outcome of the very match it describes - confirm
    # whether the window should exclude the current row (e.g. via shift(1)).
    data['head_to_head_last_5'] = data.groupby(['home_team', 'away_team'])['winner'].transform(
        lambda x: x.rolling(5, min_periods=1).apply(lambda y: sum(y == 1) / 5, raw=True)
    )

    return data