#### 1. Perform feature engineering on fixtures data

In [None]:
def _merge_team_totals(fixtures_df, total, side):
    """Left-join the per-team running totals onto one side of every fixture.

    ``side`` is ``'home'`` or ``'away'``. Rows of ``total`` are matched on
    ``fixture_date`` and the ``teams_<side>_id`` column, and the joined
    statistics are renamed to ``teams_<side>_*``.
    """
    stat_columns = {
        'total_goals_scored': f'teams_{side}_total_goals_scored',
        'total_goals_lost': f'teams_{side}_total_goals_lost',
        'points': f'teams_{side}_points',
        'total_points': f'teams_{side}_total_points',
        'standings': f'teams_{side}_standings',
        'points_last_5_matches': f'teams_{side}_last_five_matches_points',
    }
    merged = fixtures_df.merge(
        total[['fixture_date', 'team_id', *stat_columns]],
        left_on=['fixture_date', f'teams_{side}_id'],
        right_on=['fixture_date', 'team_id'],
        how='left',
    )
    return merged.rename(columns=stat_columns).drop(columns='team_id')


def add_statistics(fixtures_df):
    """Return a copy of ``fixtures_df`` enriched with running team statistics.

    The input must provide the columns ``fixture_date``, ``league_season``,
    ``league_round``, ``teams_home_id``, ``teams_away_id``, ``goals_home``,
    ``goals_away``, ``score_fulltime_home`` and ``score_fulltime_away``.
    The caller's frame is not modified.

    Added columns:
      * ``teams_home_goals_scored_home`` / ``teams_home_goals_lost_home`` and
        ``teams_away_goals_scored_away`` / ``teams_away_goals_lost_away``:
        cumulative goals per season restricted to home (resp. away) games.
      * ``teams_home_winner`` / ``teams_away_winner``: points earned in the
        fixture (3 for a win, 1 for a draw, 0 otherwise).
      * ``teams_<side>_total_goals_scored``, ``..._total_goals_lost``,
        ``..._points``, ``..._total_points``, ``..._standings`` and
        ``..._last_five_matches_points`` for both sides.

    NOTE(review): every running value is inclusive of the fixture on its own
    row (``cumsum``/``rolling`` include the current match). Confirm this is
    intended before using the columns as pre-match predictors.

    Returns:
        A new DataFrame sorted by ``fixture_date`` with a fresh RangeIndex.
    """
    # Work on a copy so that normalising the date column does not leak back
    # into the frame owned by the caller.
    fixtures_df = fixtures_df.copy()
    fixtures_df['fixture_date'] = pd.to_datetime(fixtures_df['fixture_date']).dt.date
    fixtures_df = fixtures_df.sort_values(by='fixture_date')

    # Venue-specific running goal totals, per season and team.
    fixtures_df['teams_home_goals_scored_home'] = fixtures_df.groupby(['league_season', 'teams_home_id'])['goals_home'].cumsum()
    fixtures_df['teams_away_goals_scored_away'] = fixtures_df.groupby(['league_season', 'teams_away_id'])['goals_away'].cumsum()
    fixtures_df['teams_home_goals_lost_home'] = fixtures_df.groupby(['league_season', 'teams_home_id'])['goals_away'].cumsum()
    fixtures_df['teams_away_goals_lost_away'] = fixtures_df.groupby(['league_season', 'teams_away_id'])['goals_home'].cumsum()

    # Points per fixture, vectorised. Comparisons involving NaN are False, so
    # fixtures without a full-time score yield 0 points for both sides.
    ft_home = fixtures_df['score_fulltime_home']
    ft_away = fixtures_df['score_fulltime_away']
    draw_points = (ft_home == ft_away).astype(int)
    fixtures_df['teams_home_winner'] = (ft_home > ft_away).astype(int) * 3 + draw_points
    fixtures_df['teams_away_winner'] = (ft_home < ft_away).astype(int) * 3 + draw_points

    # Long format: one row per (fixture, team), seen from that team's side.
    home = fixtures_df[[
        'fixture_date',
        'league_season',
        'teams_home_id',
        'goals_home',
        'goals_away',
        'teams_home_winner',
        'league_round',
    ]].rename(columns={
        'teams_home_id': 'team_id',
        'goals_home': 'goals_scored',
        'goals_away': 'goals_lost',
        'teams_home_winner': 'points',
    })
    away = fixtures_df[[
        'fixture_date',
        'league_season',
        'teams_away_id',
        'goals_away',
        'goals_home',
        'teams_away_winner',
        'league_round',
    ]].rename(columns={
        'teams_away_id': 'team_id',
        'goals_away': 'goals_scored',
        'goals_home': 'goals_lost',
        'teams_away_winner': 'points',
    })

    # ignore_index gives every row a unique label, so the column assignments
    # below align by label instead of relying on identical row order.
    total = pd.concat([home, away], ignore_index=True)
    total = total.sort_values(by='fixture_date')

    season_team = ['league_season', 'team_id']
    total['total_goals_scored'] = total.groupby(season_team)['goals_scored'].cumsum()
    total['total_goals_lost'] = total.groupby(season_team)['goals_lost'].cumsum()
    total['total_points'] = total.groupby(season_team)['points'].cumsum()

    # Rank teams by accumulated points within each (season, round); ties share
    # the best rank (method='min').
    total['standings'] = (
        total.groupby(['league_season', 'league_round'])['total_points']
        .rank(method='min', ascending=False)
        .astype(int)
    )

    # Rolling form: points over the last five matches of each team. The
    # grouping is by team only, so the window runs across season boundaries.
    total = total.sort_values(by=['team_id', 'fixture_date'])
    total['points_last_5_matches'] = (
        total.groupby('team_id')['points']
        .rolling(window=5, min_periods=1)
        .sum()
        .reset_index(level=0, drop=True)
    )
    total['points_last_5_matches'] = total['points_last_5_matches'].fillna(0).astype(int)

    fixtures_df = _merge_team_totals(fixtures_df, total, 'home')
    fixtures_df = _merge_team_totals(fixtures_df, total, 'away')

    return fixtures_df

In [None]:
# Add statistics to the finished fixtures and upsert them into fixtures_updated
def future_engineering(db_params=None):
    """Compute fixture statistics and upsert them into ``fixtures_updated``.

    Reads every fixture whose ``fixture_status_short`` is one of
    FT/WO/AET/PEN/CANC from the ``fixtures`` table, enriches it with
    ``add_statistics`` and writes all resulting columns to
    ``fixtures_updated`` using ``INSERT ... ON CONFLICT (fixture_id) DO UPDATE``.

    Args:
        db_params: optional keyword arguments for ``psycopg2.connect``. When
            omitted, the notebook's local development settings are used.

    Returns:
        The enriched DataFrame on success. On any error the message is
        printed, the transaction is rolled back and ``None`` is returned.
    """
    conflict_columns = ['fixture_id']
    if db_params is None:
        # NOTE(review): development credentials are hard-coded here; prefer
        # passing db_params (e.g. built from environment variables).
        db_params = {
            'host': 'localhost',
            'database': 'preds',
            'user': 'postgres',
            'password': 'pass',
            'port': '5432'
        }

    conn = None
    cur = None
    try:
        conn = psycopg2.connect(**db_params)
        cur = conn.cursor()

        query = """
            SELECT *
            FROM fixtures
            WHERE fixture_status_short IN ('FT', 'WO', 'AET', 'PEN', 'CANC')
        """

        fixtures_df = pd.read_sql_query(query, conn)

        df = add_statistics(fixtures_df)

        # Every column except the conflict key is overwritten on conflict.
        update_columns = [col for col in df.columns if col not in conflict_columns]
        update_set = ', '.join([f"{col} = EXCLUDED.{col}" for col in update_columns])

        # Identifiers come from the DataFrame's own column names; the row
        # values are passed separately as %s parameters.
        insert_query = """
            INSERT INTO {} ({})
            VALUES ({})
            ON CONFLICT ({}) DO UPDATE SET {}
        """.format('fixtures_updated', ','.join(df.columns), ','.join(['%s']*len(df.columns)), ','.join(conflict_columns), update_set)

        cur.executemany(insert_query, df.values.tolist())
        print('table fixtures_updated updated')

        # Commit the changes
        conn.commit()
        return df
    except Exception as e:
        print(f'Error {e}')
        if conn is not None:
            conn.rollback()
        return None
    finally:
        # Each handle is closed only if it was actually created; the cursor
        # goes first, then the connection that owns it.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()
In [None]:
import pandas as pd
import psycopg2

# Run the statistics step. `df` is the value returned by future_engineering();
# 'done' is printed afterwards regardless of what was returned.
df = future_engineering()
print('done')

#### 2. Get updated data and preprocess it

In [None]:
# Load the enriched fixtures from the fixtures_updated table
def get_updated_matches(db_params=None):
    """Read the whole ``fixtures_updated`` table into a DataFrame.

    Args:
        db_params: optional keyword arguments for ``psycopg2.connect``. When
            omitted, the notebook's local development settings are used.

    Returns:
        A DataFrame with every row and column of ``fixtures_updated``. On any
        error the message is printed and ``None`` is returned.
    """
    if db_params is None:
        # NOTE(review): development credentials are hard-coded here; prefer
        # passing db_params (e.g. built from environment variables).
        db_params = {
            'host': 'localhost',
            'database': 'preds',
            'user': 'postgres',
            'password': 'pass',
            'port': '5432'
        }

    conn = None
    try:
        conn = psycopg2.connect(**db_params)

        query = """
            SELECT *
            FROM fixtures_updated
        """

        # read_sql_query works directly on the connection, so no separate
        # cursor is opened.
        return pd.read_sql_query(query, conn)
    except Exception as e:
        print(f'Error {e}')
        return None
    finally:
        if conn is not None:
            conn.close()

In [None]:
# Load the enriched fixtures, derive the weekday and attach each league's type.
fixtures_df = get_updated_matches()
if fixtures_df is None:
    # The loader reports database errors by returning None; stop here with a
    # clear message instead of failing on the first column access below.
    raise RuntimeError('fixtures_updated could not be loaded')
s_path = 'data/contests.csv'
seasons = pd.read_csv(s_path)

fixtures_df['fixture_date'] = pd.to_datetime(fixtures_df['fixture_date'])
fixtures_df['day_of_week'] = fixtures_df['fixture_date'].dt.dayofweek
# NOTE(review): contests.csv may list a league more than once (not verifiable
# from here). Dropping exact duplicate (league_id, type) pairs keeps the left
# join from multiplying fixture rows; it is a no-op when the pairs are unique.
fixtures_df = fixtures_df.merge(
    seasons[['league_id', 'type']].drop_duplicates(),
    on='league_id',
    how='left',
)

fixtures_df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Learn the league-type vocabulary and encode the column in one pass, then
# persist the fitted encoder to disk.
label_encoder = LabelEncoder()
fixtures_df['league_type_encoded'] = label_encoder.fit_transform(fixtures_df['type'])

joblib.dump(label_encoder, 'apache_airflow/models/label_encoder_league_type.pkl')

In [None]:
# Feature/target frame for the goals model. The explicit copy makes df_goals
# an independent frame, so adding target columns below does not trigger
# pandas' SettingWithCopyWarning.
df_goals = fixtures_df[[
    'fixture_id',
    'day_of_week',
    'league_id',
    'league_round',
    'league_type_encoded',
    'teams_home_id',
    'teams_home_total_goals_scored',
    'teams_home_total_goals_lost',
    'teams_home_last_five_matches_points',
    'teams_home_goals_scored_home',
    'teams_home_goals_lost_home',
    'teams_away_id',
    'teams_away_total_goals_scored',
    'teams_away_total_goals_lost',
    'teams_away_last_five_matches_points',
    'teams_away_goals_scored_away',
    'teams_away_goals_lost_away',
    'goals_home',
    'goals_away'
]].copy()

# Binary targets (1/0), computed column-wise. Comparisons involving NaN are
# False, so fixtures with missing goals get 0 in every target.
df_goals['home_over_1'] = (df_goals['goals_home'] > 1).astype(int)
df_goals['home_over_2'] = (df_goals['goals_home'] > 2).astype(int)

df_goals['away_over_1'] = (df_goals['goals_away'] > 1).astype(int)
df_goals['away_over_2'] = (df_goals['goals_away'] > 2).astype(int)

df_goals['both_scores'] = ((df_goals['goals_home'] > 0) & (df_goals['goals_away'] > 0)).astype(int)
df_goals['over_1'] = ((df_goals['goals_home'] + df_goals['goals_away']) > 1).astype(int)
df_goals['over_2'] = ((df_goals['goals_home'] + df_goals['goals_away']) > 2).astype(int)

In [None]:
def _match_outcome(home_scores, away_scores):
    """Encode each row as 0 (home ahead), 1 (level) or 2 (everything else).

    Comparisons involving NaN are False, so rows with a missing score fall
    into class 2.
    """
    outcome = pd.Series(2, index=home_scores.index)
    outcome = outcome.mask(home_scores == away_scores, 1)
    return outcome.mask(home_scores > away_scores, 0)


# Feature/target frame for the result model. The explicit copy makes df_result
# an independent frame, so adding target columns below does not trigger
# pandas' SettingWithCopyWarning.
df_result = fixtures_df[[
    'fixture_id',
    'day_of_week',
    'league_id',
    'league_type_encoded',
    'teams_home_id',
    'teams_home_total_goals_scored',
    'teams_home_total_goals_lost',
    'teams_home_last_five_matches_points',
    'teams_home_goals_scored_home',
    'teams_home_goals_lost_home',
    'teams_home_total_points',
    'teams_home_standings',
    'score_halftime_home',
    'teams_away_id',
    'teams_away_total_goals_scored',
    'teams_away_total_goals_lost',
    'teams_away_last_five_matches_points',
    'teams_away_goals_scored_away',
    'teams_away_goals_lost_away',
    'teams_away_total_points',
    'teams_away_standings',
    'score_halftime_away',
    'goals_home',
    'goals_away'
]].copy()

# NOTE(review): fixtures with missing goals are labelled 2 here (same as the
# "away ahead" class) - confirm whether such rows should be filtered out.
df_result['result'] = _match_outcome(df_result['goals_home'], df_result['goals_away'])

# Double chance: 1 when the side wins or draws, 0 otherwise (NaN -> 0).
df_result['result_double_chance_home'] = (df_result['goals_home'] >= df_result['goals_away']).astype(int)
df_result['result_double_chance_away'] = (df_result['goals_away'] >= df_result['goals_home']).astype(int)

df_result['result_first_half'] = _match_outcome(
    df_result['score_halftime_home'], df_result['score_halftime_away']
)

#### 3. Add a callback class that records training data

In [None]:
from tensorflow.keras.callbacks import Callback

class MetricsCallback(Callback):
    """Callback that records accuracy/loss values reported at each epoch end.

    ``epoch_metrics`` maps every tracked log key to a dict with an
    ``'accuracy'`` list and a ``'loss'`` list; only the list matching the
    key's suffix is filled, the other stays empty.
    """

    def __init__(self, params):
        super().__init__()
        # The first entry is the dict handed in by the caller.
        self.params_list = [params]
        self.epoch_metrics = {}

    def on_train_begin(self, logs=None):
        # Start from a clean slate and remember self.params as well
        # (presumably populated by Keras before training - confirm).
        self.params_list.append(self.params)
        self.epoch_metrics = {}

    def on_epoch_end(self, epoch, logs=None):
        for metric_name, metric_value in (logs or {}).items():
            if metric_name.endswith('_accuracy'):
                bucket = 'accuracy'
            elif metric_name.endswith('loss'):
                bucket = 'loss'
            else:
                # Keys with any other suffix are not tracked.
                continue
            history = self.epoch_metrics.setdefault(
                metric_name, {'accuracy': [], 'loss': []}
            )
            history[bucket].append(metric_value)

    def get_metrics(self):
        """Return the per-key metric history collected so far."""
        return self.epoch_metrics

    def get_training_data(self):
        """Return ``(params_list, epoch_metrics)``."""
        return self.params_list, self.epoch_metrics

#### 4. Learning goals model

In [None]:
def define_model(numerical_inp = 15):
    """Build the goals network: one numeric input, two hidden layers, five heads.

    ``numerical_inp`` is the width of the input vector. The input feeds a
    64-unit and a 32-unit ReLU layer, followed by five single-unit sigmoid
    outputs named home_over_1, home_over_2, away_over_1, away_over_2 and
    both_scores. The model is returned uncompiled.
    """
    input_numerical = Input(shape=(numerical_inp,), name='input_numerical')

    # Shared trunk
    hidden = input_numerical
    for units in (64, 32):
        hidden = Dense(units, activation='relu')(hidden)

    # One binary head per target, all fed by the last hidden layer
    head_names = ('home_over_1', 'home_over_2', 'away_over_1', 'away_over_2', 'both_scores')
    heads = [Dense(1, activation='sigmoid', name=head)(hidden) for head in head_names]

    return Model(inputs=[input_numerical], outputs=heads)
    
def training(model, X, y_1, y_2, y_3, y_4, y_5, **params):
    """Compile and fit the goals model, recording metrics through MetricsCallback.

    ``y_1`` .. ``y_5`` are the targets for the outputs home_over_1,
    home_over_2, away_over_1, away_over_2 and both_scores, in that order.
    ``params`` must contain ``learning_rate``, ``epochs``, ``batch_size`` and
    ``validation_split``.

    Returns a tuple ``(params_list, metrics, model)``.
    """
    tf.keras.backend.clear_session()
    metrics_callback = MetricsCallback(params=params)

    targets = {
        'home_over_1': y_1,
        'home_over_2': y_2,
        'away_over_1': y_3,
        'away_over_2': y_4,
        'both_scores': y_5,
    }

    # Every head is a binary classifier tracked with accuracy.
    model.compile(
        optimizer=Adam(learning_rate=params['learning_rate']),
        loss={head: 'binary_crossentropy' for head in targets},
        metrics={head: 'accuracy' for head in targets},
    )

    model.fit(
        X,
        targets,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_split=params['validation_split'],
        verbose=1,
        callbacks=[metrics_callback],
    )

    recorded_params, recorded_metrics = metrics_callback.get_training_data()
    return recorded_params, recorded_metrics, model

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

numerical_inp = 15

# Feature matrix for the goals model (15 columns taken from df_goals)
X = df_goals[[
    'day_of_week', 'league_id', 'league_type_encoded',
    'teams_home_id',
    'teams_home_total_goals_scored', 'teams_home_total_goals_lost',
    'teams_home_last_five_matches_points',
    'teams_home_goals_scored_home', 'teams_home_goals_lost_home',
    'teams_away_id',
    'teams_away_total_goals_scored', 'teams_away_total_goals_lost',
    'teams_away_last_five_matches_points',
    'teams_away_goals_scored_away', 'teams_away_goals_lost_away',
]]

# One target series per model output, in output order
y_1, y_2, y_3, y_4, y_5 = (
    df_goals[target]
    for target in ('home_over_1', 'home_over_2', 'away_over_1', 'away_over_2', 'both_scores')
)

In [None]:
# Hyper-parameters forwarded to training() as keyword arguments
params = dict(
    validation_split=0.15,
    optimizer='adam',
    batch_size=32,
    epochs=12,
    learning_rate=0.001,
)

# Build the goals network with its default input width and fit it
b_goal_model = define_model()
returned_params, returned_metrics, goal_model = training(
    b_goal_model, X, y_1, y_2, y_3, y_4, y_5, **params
)

#### 5. Learning result model

In [None]:
def define_rm_model(numerical_inp = 19):
    """Build the result network: one numeric input, two hidden layers, four heads.

    ``numerical_inp`` is the width of the input vector. The input feeds a
    64-unit and a 32-unit ReLU layer. The heads are ``result`` and
    ``result_first_half`` (3-unit softmax each) and
    ``result_double_chance_home`` / ``result_double_chance_away`` (1-unit
    sigmoid each). The model is returned uncompiled.
    """
    input_numerical = Input(shape=(numerical_inp,), name='input_numerical')

    # Shared trunk
    hidden = input_numerical
    for units in (64, 32):
        hidden = Dense(units, activation='relu')(hidden)

    # (output name, units, activation) for each head, in output order
    head_specs = (
        ('result', 3, 'softmax'),
        ('result_first_half', 3, 'softmax'),
        ('result_double_chance_home', 1, 'sigmoid'),
        ('result_double_chance_away', 1, 'sigmoid'),
    )
    heads = [
        Dense(units, activation=activation, name=head)(hidden)
        for head, units, activation in head_specs
    ]

    return Model(inputs=[input_numerical], outputs=heads)
    
def training_rm(model, X, y_1, y_2, y_3, y_4, **params):
    """Compile and fit the result model, recording metrics through MetricsCallback.

    ``y_1`` .. ``y_4`` are the targets for the outputs result,
    result_first_half, result_double_chance_home and
    result_double_chance_away, in that order. ``params`` must contain
    ``learning_rate``, ``epochs``, ``batch_size`` and ``validation_split``.

    Returns a tuple ``(params_list, metrics, model)``.
    """
    tf.keras.backend.clear_session()
    metrics_callback = MetricsCallback(params=params)

    targets = {
        'result': y_1,
        'result_first_half': y_2,
        'result_double_chance_home': y_3,
        'result_double_chance_away': y_4,
    }
    # The two 3-class heads use sparse categorical cross-entropy, the two
    # double-chance heads use binary cross-entropy.
    losses = {
        'result': 'sparse_categorical_crossentropy',
        'result_first_half': 'sparse_categorical_crossentropy',
        'result_double_chance_home': 'binary_crossentropy',
        'result_double_chance_away': 'binary_crossentropy',
    }

    model.compile(
        optimizer=Adam(learning_rate=params['learning_rate']),
        loss=losses,
        metrics={head: 'accuracy' for head in targets},
    )

    model.fit(
        X,
        targets,
        epochs=params['epochs'],
        batch_size=params['batch_size'],
        validation_split=params['validation_split'],
        verbose=1,
        callbacks=[metrics_callback],
    )

    recorded_params, recorded_metrics = metrics_callback.get_training_data()
    return recorded_params, recorded_metrics, model

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

numerical_inp = 19

# Feature matrix for the result model (19 columns taken from df_result)
X = df_result[[
    'day_of_week', 'league_id', 'league_type_encoded',
    'teams_home_id',
    'teams_home_total_goals_scored', 'teams_home_total_goals_lost',
    'teams_home_last_five_matches_points',
    'teams_home_goals_scored_home', 'teams_home_goals_lost_home',
    'teams_home_total_points', 'teams_home_standings',
    'teams_away_id',
    'teams_away_total_goals_scored', 'teams_away_total_goals_lost',
    'teams_away_last_five_matches_points',
    'teams_away_goals_scored_away', 'teams_away_goals_lost_away',
    'teams_away_total_points', 'teams_away_standings',
]]

# One target series per model output, in output order
y_1, y_2, y_3, y_4 = (
    df_result[target]
    for target in (
        'result',
        'result_first_half',
        'result_double_chance_home',
        'result_double_chance_away',
    )
)

In [None]:
# Hyper-parameters forwarded to training_rm() as keyword arguments
params = dict(
    validation_split=0.15,
    optimizer='adam',
    batch_size=32,
    epochs=10,
    learning_rate=0.0015,
)

# Build the result network with its default input width and fit it
b_result_model = define_rm_model()
returned_params, returned_metrics, mod = training_rm(
    b_result_model, X, y_1, y_2, y_3, y_4, **params
)
returned_metrics