In [20]:

import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import os
from glicko2 import Player
from helpers import davidson_mov
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR


In [21]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm  # Progress bar

# Path to your matrices folder
folder_path = "matrices"

# Lists to store data
file_names = []
diagonal_sums = []      # To be saved as prob_draw
bottom_left_sums = []   # To be saved as prob_home
top_right_sums = []     # To be saved as prob_away
home_xGs = []           # To store home expected goals
away_xGs = []           # To store away expected goals
negative_matrices = []  # To track matrices with negative values

# Loop through all files in the folder with a progress bar
for file_name in tqdm(os.listdir(folder_path), desc="Processing matrices", unit="file"):
    file_path = os.path.join(folder_path, file_name)
    
    # Read the matrix from the file
    with open(file_path, 'r') as file:
        matrix = np.loadtxt(file, delimiter=',')
    
    # Ensure it's 6x6
    if matrix.shape == (6, 6):
        # Check for negatives and log the matrix file name
        if (matrix < 0).any():
            negative_matrices.append(file_name)
        
        # Clip negative values to 0
        matrix = np.clip(matrix, 0, None)
        
        # Strip the .csv extension and append to the file_names list
        event_id = os.path.splitext(file_name)[0]
        file_names.append(event_id)
        
        # Append data to lists
        diagonal_sums.append(np.trace(matrix))  # Main diagonal (draw_prob)
        
        # Bottom-left triangle (home_win_prob)
        bottom_left_sum = np.sum(np.tril(matrix, -1))
        bottom_left_sums.append(bottom_left_sum)
        
        # Top-right triangle (away_win_prob)
        top_right_sum = np.sum(np.triu(matrix, 1))
        top_right_sums.append(top_right_sum)
        
        # Calculate expected goals
        goal_indices = np.arange(matrix.shape[0])  # [0, 1, 2, 3, 4, 5]
        
        # Calculate marginal probabilities
        # Sum over rows to get P(Home Goals)
        P_home_goals = matrix.sum(axis=1)  # Shape: (6,)
        # Sum over columns to get P(Away Goals)
        P_away_goals = matrix.sum(axis=0)  # Shape: (6,)
        
        # Calculate expected goals
        home_xG = np.sum(goal_indices * P_home_goals)
        away_xG = np.sum(goal_indices * P_away_goals)
        
        # Append to lists
        home_xGs.append(home_xG)
        away_xGs.append(away_xG)
    else:
        print(f"Matrix {file_name} is not 6x6, shape is {matrix.shape}")

# Create a DataFrame
data = {
    "eventId": file_names,
    "prob_draw": diagonal_sums,
    "home_prob": bottom_left_sums,
    "away_prob": top_right_sums,
    "home_xG": home_xGs,
    "away_xG": away_xGs
}

matrix_probs = pd.DataFrame(data)

# Print the list of matrices with negative values
if negative_matrices:
    print("Matrices with negative values before clipping:")
    for matrix_file in negative_matrices:
        print(matrix_file)
else:
    print("No matrices with negative values found.")



Processing matrices: 100%|██████████| 5581/5581 [00:03<00:00, 1565.27file/s]

Matrices with negative values before clipping:
10952371.csv
10952373.csv
10952377.csv
10952378.csv
10952380.csv
10952382.csv
10952383.csv
10952396.csv
10952421.csv
10952422.csv
10952423.csv
10952425.csv
10952426.csv
10952427.csv
10952428.csv
10952429.csv
10952430.csv
10952431.csv
10952432.csv
10952434.csv
10952435.csv
10952436.csv
10952438.csv
10952439.csv
10952440.csv
10952441.csv
10952442.csv
10952443.csv
10952445.csv
10952446.csv
10952447.csv
10952448.csv
10952449.csv
10952450.csv
10952451.csv
10952452.csv
10952453.csv
10952454.csv
10952455.csv
10952458.csv
10952459.csv
10952460.csv
10952461.csv
10952462.csv
10952463.csv
10952465.csv
10952466.csv
10952467.csv
10952468.csv
10952469.csv
10952471.csv
10952472.csv
10952473.csv
10952474.csv
10952475.csv
10952476.csv
10952481.csv
10952482.csv
10952484.csv
10952487.csv
10952488.csv
10952489.csv
10952490.csv
10952491.csv
10952492.csv
10952493.csv
11067286.csv
11067308.csv
11067313.csv
11067317.csv
11067329.csv
11067333.csv
11067335.csv
1106




In [22]:
# Check for negatives in each column
print("Negative values in prob_draw:", (matrix_probs['prob_draw'] < 0).any())
print("Negative values in prob_home:", (matrix_probs['home_prob'] < 0).any())
print("Negative values in prob_away:", (matrix_probs['away_prob'] < 0).any())


Negative values in prob_draw: False
Negative values in prob_home: False
Negative values in prob_away: False


In [23]:
import pandas as pd

# Read the CSV files
elo_df = pd.read_csv('elo.csv')  # Replace 'elo.csv' with your actual file path
team_names_df = pd.read_csv('team_names.csv')  # Replace 'team_names.csv' with your actual file path

# Merge the two dataframes
merged_df = pd.merge(team_names_df, elo_df, left_on='Team Names', right_on='Club', how='left')

# Assign default Elo of 1500 to missing values
merged_df['Elo'] = merged_df['Elo'].fillna(1400)

# Drop the redundant Club column if present
merged_df = merged_df.drop(columns=['Club'], errors='ignore')

# Save or display the result
# Convert to dictionary
elo_dict = merged_df.set_index('Team Names')['Elo'].to_dict()


In [24]:
# import os
# import pandas as pd

# # Define the folder path
# folder_path = 'next_week_games'

# # Get all file names in the folder
# file_names = [f for f in os.listdir(folder_path) if f.startswith('next_week_games_')]

# # Sort the file names based on year and week number
# sorted_files = sorted(
#     file_names,
#     key=lambda x: pd.to_datetime(x.split('_')[-1].replace('.csv', '') + '-1', format='%Y-%W-%w')
# )

# # Initialize an empty list to store dataframes
# dataframes = []

# # Read each file and append the dataframe to the list
# for file in sorted_files:
#     file_path = os.path.join(folder_path, file)
#     df = pd.read_csv(file_path)  # Adjust read method if files are not CSV
#     dataframes.append(df)

# # Concatenate all dataframes into a single dataframe
# combined_df = pd.concat(dataframes, ignore_index=True)
# combined_df.to_csv('final_iteration.csv',index=False)

In [25]:
import os
import pandas as pd
import re

# Function to sanitize file names
def sanitize_filename(name):
    return re.sub(r'[<>:"/\\|?*]', '_', name)

# Load the CSV file
input_csv = "final_iteration.csv"  # Replace with your actual CSV file name
data = pd.read_csv(input_csv)

# Create the output folder if it doesn't exist
output_folder = "tournaments"
os.makedirs(output_folder, exist_ok=True)

# Split the data by tournament and save each to a separate CSV
for tournament in data['tournament'].unique():
    sanitized_name = sanitize_filename(tournament)
    tournament_data = data[data['tournament'] == tournament]
    output_file = os.path.join(output_folder, f"{sanitized_name}.csv")
    tournament_data.to_csv(output_file, index=False)
    print(f"Saved {tournament} data to {output_file}")

print("All tournament CSVs have been saved.")


  data = pd.read_csv(input_csv)


Saved Major League Soccer 2023 data to tournaments\Major League Soccer 2023.csv
Saved Brasileirão Série A 2023 data to tournaments\Brasileirão Série A 2023.csv
Saved UEFA Champions League 23/24 data to tournaments\UEFA Champions League 23_24.csv
Saved Championship 23/24 data to tournaments\Championship 23_24.csv
Saved UEFA Europa League 23/24 data to tournaments\UEFA Europa League 23_24.csv
Saved Ligue 1 23/24 data to tournaments\Ligue 1 23_24.csv
Saved LaLiga 23/24 data to tournaments\LaLiga 23_24.csv
Saved Premier League 23/24 data to tournaments\Premier League 23_24.csv
Saved Liga Portugal 23/24 data to tournaments\Liga Portugal 23_24.csv
Saved Bundesliga 23/24 data to tournaments\Bundesliga 23_24.csv
Saved Serie A 23/24 data to tournaments\Serie A 23_24.csv
Saved MLS 2024 data to tournaments\MLS 2024.csv
Saved Brasileiro Serie A 2024 data to tournaments\Brasileiro Serie A 2024.csv
Saved UEFA Champions League 24/25 data to tournaments\UEFA Champions League 24_25.csv
Saved UEFA Europ

In [26]:

# Load the existing final_iteration.csv
final_iteration = pd.read_csv('final_iteration.csv')

# Load the combined tournament data
combined_tournament_data = pd.read_csv('combined_tournament_data.csv')

# Merge the DataFrames on 'eventId'
merged_df = final_iteration.merge(
    combined_tournament_data[['eventId', 'homeAttack', 'homeDefence', 'awayAttack', 'awayDefence']],
    on='eventId',
    how='left'  # Use 'left' to keep all rows from final_iteration
)

# Save the updated DataFrame
merged_df.to_csv('final_iteration_updated.csv', index=False)

print("Columns added and saved to final_iteration_updated.csv.")


  final_iteration = pd.read_csv('final_iteration.csv')


Columns added and saved to final_iteration_updated.csv.


In [27]:

df = pd.read_csv('final_iteration_updated.csv')


  df = pd.read_csv('final_iteration_updated.csv')


In [28]:
df['eventId'] = df['eventId'].astype(str)
matrix_probs['eventId'] = matrix_probs['eventId'].astype(str)

df = df.merge(
    matrix_probs[['eventId', 'prob_draw', 'home_prob', 'away_prob','home_xG','away_xG']],
    on='eventId',
    how='left'  # Ensures all rows in df are kept, even if no match in matrix_probs
)

df.to_csv('final_iteration_updated.csv', index=False)

In [29]:
# df = df[['eventId','tournament','date','homeId','awayId','homeTeam','awayTeam','homeScore','awayScore','home_attack_param','home_defense_param','away_defense_param','away_attack_param','homeAttack','homeDefence','awayAttack','awayDefence','prob_home','prob_draw','prob_away','FTR','home_xG','away_xG']]

In [30]:
default_rating = 1500
default_rd = 80
default_vol = 0.06
glicko_players = {}
teams = set(df['homeTeam']).union(set(df['awayTeam']))

# Initialize Player objects with custom starting ratings
for team in teams:
    if team in elo_dict:
        starting_rating = elo_dict[team]
    else:
        # If the team is not in the elo_ratings_mov dictionary, assign a default rating
        starting_rating = 1500  # Or another value you deem appropriate
    # Create the Player object with the starting rating
    glicko_players[team] = Player(rating=starting_rating, rd=default_rd, vol=default_vol)

# Initialize lists to store ratings before the match
glicko_home_ratings = []
glicko_away_ratings = []
glicko_home_rd = []
glicko_away_rd = []
glicko_home_vol = []
glicko_away_vol = []

total_home_wins = 0
total_away_wins = 0
total_matches = 0

In [31]:
for index, row in df.iterrows():
    home_team = row['homeTeam']
    away_team = row['awayTeam']
    home_goals = row['homeScore']
    away_goals = row['awayScore']
    goal_difference = home_goals - away_goals

    # Retrieve Player objects
    home_player = glicko_players[home_team]
    away_player = glicko_players[away_team]

    # Store ratings, RD, and volatility before the match
    glicko_home_ratings.append(home_player.getRating())
    glicko_away_ratings.append(away_player.getRating())
    glicko_home_rd.append(home_player.getRd())
    glicko_away_rd.append(away_player.getRd())
    glicko_home_vol.append(home_player.vol)
    glicko_away_vol.append(away_player.vol)

    # Determine match outcome
    if goal_difference > 0:
        home_outcome = 1.0
        away_outcome = 0.0
        total_home_wins += 1
    elif goal_difference == 0:
        home_outcome = 0.5
        away_outcome = 0.5
    else:
        home_outcome = 0.0
        away_outcome = 1.0
        total_away_wins += 1

    total_matches += 1


    # Update ratings for home team
    home_player.update_player(
        rating_list=[away_player.getRating()],
        RD_list=[away_player.getRd()],
        outcome_list=[home_outcome],
        goal_difference_list=[goal_difference],
        is_home_list=[True],  # Home team is playing at home
    )

    # Update ratings for away team
    away_player.update_player(
        rating_list=[home_player.getRating()],
        RD_list=[home_player.getRd()],
        outcome_list=[away_outcome],
        goal_difference_list=[-goal_difference],
        is_home_list=[False],  # Away team is not playing at home
    )

rating list:  [-0.5756462492617337]
RD List:  [0.4605169994093869]
v:  5.726705929232926
rating list:  [-0.4973656149462989]
RD List:  [0.4509435659029199]
v:  5.947214192095335
rating list:  [-0.5756462492617337]
RD List:  [0.4605169994093869]
v:  5.726705929232926
rating list:  [-0.5259319091123444]
RD List:  [0.4559035565319133]
v:  5.865764391629058
rating list:  [-0.5259319091123444]
RD List:  [0.4559035565319133]
v:  5.586087256328288
rating list:  [-0.5241462858233314]
RD List:  [0.45569543267080215]
v:  5.7265496158669364
rating list:  [-0.5756462492617337]
RD List:  [0.4605169994093869]
v:  5.726705929232926
rating list:  [-0.6266600014484011]
RD List:  [0.4559035565319133]
v:  5.582650673457906
rating list:  [-0.5756462492617337]
RD List:  [0.4605169994093869]
v:  5.726705929232926
rating list:  [-0.5259319091123444]
RD List:  [0.4559035565319133]
v:  5.865764391629058
rating list:  [-0.5756462492617337]
RD List:  [0.4605169994093869]
v:  5.957332500098591
rating list:  [-0.4

In [32]:
# import math
# elo_dict = {}
# def davidson_mov(home_team,
#                  away_team,
#                  gd,
#                  elo_dict,
#                  total_home_wins, total_away_wins, total_matches,
#                  U=0.8,
#                  K=30,
#                  sc=600):
#     R_h = elo_dict[home_team]
#     R_a = elo_dict[away_team]

#     if gd > 0:
#         s = 1
#     elif gd < 0:
#         s = 0
#     else:
#         s = 0.5

#     gd = abs(gd)
#     mov = max(1, math.log2(gd + 1))

#     # Calculate frequencies with smoothing
#     f_home = (total_home_wins + 0.5) / (total_matches + 1)
#     f_away = (total_away_wins + 0.5) / (total_matches + 1)

#     # Calculate H
#     H = math.log10(f_home / f_away)

#     # Calculate D with home advantage
#     D = R_h - R_a + H * sc

#     # Calculate F(D)
#     numerator = math.pow(10, 0.5 * D / sc) + 0.5 * U
#     denominator = math.pow(10, 0.5 * D / sc) + math.pow(10, -0.5 * D / sc) + U
#     F_D = numerator / denominator

#     # Rating changes
#     elo_change_home = K * (s - F_D) * mov
#     new_R_h = R_h + elo_change_home

#     # For the away team
#     s_away = 1 - s
#     F_D_away = 1 - F_D
#     elo_change_away = K * (s_away - F_D_away) * mov
#     new_R_a = R_a + elo_change_away

#     return new_R_h, new_R_a


In [33]:
# elo_home_list_mov = []
# elo_away_list_mov = []

# for index, row in df.iterrows():
#     home_team = row['homeTeam']
#     away_team = row['awayTeam']
#     home_goals = row['homeScore']
#     away_goals = row['awayScore']
#     gd = home_goals - away_goals
    
#     # Initialize ELO ratings for teams if not already done
#     if home_team not in elo_dict:
#         elo_dict[home_team] = 1500
#     if away_team not in elo_dict:
#         elo_dict[away_team] = 1500

#     # Store ELO ratings before the match
#     elo_home_before_mov = elo_dict[home_team]
#     elo_away_before_mov = elo_dict[away_team]

#     elo_home_list_mov.append(elo_home_before_mov)
#     elo_away_list_mov.append(elo_away_before_mov)

#     # Update ELO ratings after the match
#     new_home_elo, new_away_elo = davidson_mov(
#         home_team,
#         away_team,
#         gd,
#         elo_dict,
#         2500,
#         1500,
#         4000,
#         K=30,
#         sc=600
#     )

#     # Update the ELO ratings in the dictionary
#     elo_dict[home_team] = new_home_elo
#     elo_dict[away_team] = new_away_elo

# df['HomeTeamElo_mov'] = elo_home_list_mov
# df['AwayTeamElo_mov'] = elo_away_list_mov

In [34]:
df['homeTeamGlickoRating'] = glicko_home_ratings
df['awayTeamGlickoRating'] = glicko_away_ratings
df['homeTeamGlickoRD'] = glicko_home_rd
df['awayTeamGlickoRD'] = glicko_away_rd

# Calculate upper and lower bounds for home team
df['homeTeamGlickoLower'] = df['homeTeamGlickoRating'] - 2 * df['homeTeamGlickoRD']
df['homeTeamGlickoUpper'] = df['homeTeamGlickoRating'] + 2 * df['homeTeamGlickoRD']

# Calculate upper and lower bounds for away team
df['awayTeamGlickoLower'] = df['awayTeamGlickoRating'] - 2 * df['awayTeamGlickoRD']
df['awayTeamGlickoUpper'] = df['awayTeamGlickoRating'] + 2 * df['awayTeamGlickoRD']


In [35]:
df.to_csv('final_iteration_updated.csv',index=False)

In [17]:
# List of columns to drop
columns_to_drop = [
    "homepenaltySaves", "awaypenaltySaves",
    "homeerrorsLeadToGoal", "awayerrorsLeadToGoal",
    "awayerrorsLeadToShot", "homeerrorsLeadToShot",
    "homepunches", "awaypunches",
    "homediveSaves", "awaydiveSaves",
    "awayhighClaims", "homehighClaims",
    "awayredCards", "homeredCards",
    "awayfinalThirdPhaseStatistic", "homefinalThirdPhaseStatistic",
    "awaytouchesInOppBox", "hometouchesInOppBox",
    "awayaccurateThroughBall", "homeaccurateThroughBall",
    "awaybigChanceScored", "homebigChanceScored",
    "homebigChanceMissed", "awaybigChanceMissed"
]

# Drop columns from the DataFrame
df = df.drop(columns=columns_to_drop, errors='ignore')  # Use errors='ignore' to avoid issues if columns are missing

# Confirm columns were dropped
print("Remaining columns in DataFrame:")
print(df.columns)


Remaining columns in DataFrame:
Index(['eventId', 'homeId', 'homeTeam', 'awayId', 'awayTeam', 'homeScore',
       'awayScore', 'FTR', 'date', 'time',
       ...
       'awayTeamGlickoUpper', 'homeAttack_y', 'homeDefence_y', 'awayAttack_y',
       'awayDefence_y', 'prob_draw_y', 'home_prob_y', 'away_prob_y',
       'home_xG_y', 'away_xG_y'],
      dtype='object', length=113)


In [18]:
df = df.fillna(0)
len(df)

6122

In [19]:
df['upper_diff'] = df['homeTeamGlickoUpper'] - df['awayTeamGlickoUpper']
df['lower_diff'] = df['homeTeamGlickoLower'] - df['awayTeamGlickoLower']
df['attack_diff'] = df['home_attack_param'] - df['away_attack_param']
df['defense_diff'] = df['home_defense_param'] - df['away_defense_param']
df['attack_diff_op'] = df['homeAttack'] - df['awayAttack']
df['defense_diff_op'] = df['homeDefence'] - df['awayDefence']
#df['elo_diff'] = df['HomeTeamElo_mov'] - df['AwayTeamElo_mov']
df['WL'] = df['home_prob'] - df['away_prob']
df['glicko_diff'] = df['homeTeamGlickoRating'] - df['awayTeamGlickoRating']


KeyError: 'homeAttack'

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a mapping of tournament_id to tournament_name
le = LabelEncoder()
df['tournament_id'] = le.fit_transform(df['tournament'])

tournament_mapping = {idx: name for idx, name in enumerate(le.classes_)}


In [None]:
# Combine homeId and awayId to get all unique team IDs
all_team_ids = pd.concat([df['homeId'], df['awayId']]).unique()

print(f"Unique Team IDs Before Encoding: {all_team_ids}")
team_encoder = LabelEncoder()
all_team_ids_encoded = team_encoder.fit_transform(all_team_ids)

# Replace homeId and awayId with encoded values
df['homeId'] = team_encoder.transform(df['homeId'])
df['awayId'] = team_encoder.transform(df['awayId'])

print(f"Home IDs After Encoding: {df['homeId'].unique()}")
print(f"Away IDs After Encoding: {df['awayId'].unique()}")
print(f"Home ID range: {df['homeId'].min()} to {df['homeId'].max()}")
print(f"Away ID range: {df['awayId'].min()} to {df['awayId'].max()}")




Unique Team IDs Before Encoding: [ 22009  22007 274650   2505  21825   2510   2512 377973 407803   2504
   2511   2506 243211 337602  52237 404108  22010   5133 337612  41618
   7080   2502 215167  39833 187643   2509  22006   2513   2508   1958
   2020   1999   1973   1963   1967   5926   5981   1957   1977   1981
  49202   1961   1960   1982   1968   1974   1966   1954   1955  37862
  37962   5143   5225 126304   5345   2416  33583  35268 230417   5194
   3177   5222   2404   3294   1760   2242  36155   1901   4921   3350
   5197   5178   5173 259121   3346   5353   5962   5171   1925  43840
   5382   5184   2032 258537   2448   3397   3248   3061   1284   2890
    661     12     71     58     74     24     46     36    263     29
     34     31     41   2999   2952   2351   2216   3245   1661   2858
      6   2833   3010     45      8   2824     59   1641   2825   2998
   6577   1644   3001     39     32     48      9     11     42     60
     30     21     25     96     61     13  

In [None]:
# # Prepare lists for numerical features, home/away IDs, tournament IDs, and targets
# numerical_features = []  # For matrix + params
# home_ids = []  # For home team embeddings
# away_ids = []  # For away team embeddings
# tournament_ids = []  # For tournament embeddings
# targets = []  # Labels
# matrices_dir = 'matrices'

# for idx, row in df.iterrows():
#     event_id = row['eventId']
#     ftr = row['FTR']  # Target

#     # Skip draws (FTR == 1)
#     if ftr == 1:
#         continue

#     # Load the corresponding matrix
#     matrix_file = os.path.join(matrices_dir, f"{event_id}.csv")
    
#     if os.path.exists(matrix_file):
#         matrix = np.loadtxt(matrix_file, delimiter=',')
        
#         # Convert matrix to tensor
#         matrix_tensor = torch.tensor(matrix, dtype=torch.float32)
        
#         # Extract additional parameters (excluding tournament ID for now)
#         params = [
#             row['upper_diff'], row['lower_diff'],
#             row['attack_diff'], row['defense_diff'],
#             row['WL'], row['prob_draw'],
#             row['attack_diff_op'], row['defense_diff_op'],
            
            
#         ]
#         params_tensor = torch.tensor(params, dtype=torch.float32)
        
#         # Combine matrix and params into a single tensor
#         combined_tensor = torch.cat([matrix_tensor.flatten(), params_tensor])  # Shape: (Flattened matrix + params)
        
#         # Store numerical features and IDs
#         numerical_features.append(combined_tensor)
#         home_ids.append(torch.tensor(row['homeId'], dtype=torch.long))  # Home team ID tensor
#         away_ids.append(torch.tensor(row['awayId'], dtype=torch.long))  # Away team ID tensor
#         tournament_ids.append(torch.tensor(row['tournament_id'], dtype=torch.long))  # Tournament ID tensor
        
#         # Store target (0 for home win, 1 for away win)
#         new_label = 0 if ftr == 0 else 1
#         targets.append(torch.tensor(new_label, dtype=torch.long))
#     else:
#         print(f"Matrix file for Event ID {event_id} not found.")

# # Stack numerical features, home IDs, away IDs, tournament IDs, and targets
# numerical_features_tensor = torch.stack(numerical_features)
# home_ids_tensor = torch.stack(home_ids)
# away_ids_tensor = torch.stack(away_ids)
# tournament_ids_tensor = torch.stack(tournament_ids)
# targets_tensor = torch.stack(targets)

# print("Numerical Features Shape:", numerical_features_tensor.shape)
# print("Home IDs Shape:", home_ids_tensor.shape)
# print("Away IDs Shape:", away_ids_tensor.shape)
# print("Tournament IDs Shape:", tournament_ids_tensor.shape)
# print("Targets Shape:", targets_tensor.shape)


In [None]:
# import torch
# from torch.utils.data import random_split

# # Prepare tensors and targets without using matrices
# data_tensors = []
# targets = []

# for idx, row in df.iterrows():
#     event_id = row['eventId']
#     ftr = row['FTR']  # Target

#     # Skip draws (FTR == 1)
#     if ftr == 1:
#         continue

#     # Extract additional parameters and convert to tensor
#     params = [
#         row['WL'],
#         row['upper_diff'],
#         row['lower_diff'],
#         row['attack_diff'],
#         row['defense_diff'],
#         row['prob_draw']
#     ]
    
#     params_tensor = torch.tensor(params, dtype=torch.float32)

#     # Adjust target: 0 for home win, 1 for away win
#     new_label = 0 if ftr == 0 else 1  # Home win -> 0, Away win -> 1

#     # Append the parameters tensor and adjusted target
#     data_tensors.append(params_tensor)
#     targets.append(torch.tensor(new_label, dtype=torch.long))

# # Convert lists to tensors
# data_tensors = torch.stack(data_tensors)
# targets = torch.stack(targets)

# print("Data Tensors Shape:", data_tensors.shape)
# print("Targets Shape:", targets.shape)

# # Define the split ratio
# train_ratio = 0.8  # 80% training, 20% testing
# train_size = int(train_ratio * len(data_tensors))
# test_size = len(data_tensors) - train_size

# # Split the dataset
# train_dataset, test_dataset = random_split(
#     list(zip(data_tensors, targets)), [train_size, test_size]
# )

# # Convert back to DataLoader or Tensor
# train_data = torch.stack([x[0] for x in train_dataset])
# train_targets = torch.stack([x[1] for x in train_dataset])
# test_data = torch.stack([x[0] for x in test_dataset])
# test_targets = torch.stack([x[1] for x in test_dataset])

# print(f"Training Data: {train_data.shape}, Training Targets: {train_targets.shape}")
# print(f"Testing Data: {test_data.shape}, Testing Targets: {test_targets.shape}")


In [None]:
# # Prepare lists for numerical features (params), matrices, home/away IDs, tournament IDs, and targets
# params_list = []       # For params from DataFrame
# matrices_list = []     # For matrices
# home_ids = []          # For home team embeddings
# away_ids = []          # For away team embeddings
# tournament_ids = []    # For tournament embeddings
# targets = []           # Labels
# matrices_dir = 'matrices'

# for idx, row in df.iterrows():
#     event_id = row['eventId']
#     ftr = row['FTR']  # Target

#     if ftr == 1:
#         continue
    
#     # Load the corresponding matrix
#     matrix_file = os.path.join(matrices_dir, f"{event_id}.csv")
    
#     if os.path.exists(matrix_file):
#         matrix = np.loadtxt(matrix_file, delimiter=',')
        
#         # Slice the matrix to keep only the first 4 rows and first 4 columns
#         matrix = matrix[:4, :4]  # Now shape is (4, 4)
        
#         # Convert matrix to tensor and keep its 4x4 shape
#         matrix_tensor = torch.tensor(matrix, dtype=torch.float32)  # Shape: (4, 4)
        
#         # Extract additional parameters (numerical features)
#         params = [
#             row['upper_diff'], row['lower_diff'],
#             row['attack_diff'], row['defense_diff'],
#             row['WL'], row['prob_draw'],
#             row['attack_diff_op'], row['defense_diff_op'],
#             #row['elo_diff']
#         ]
#         params_tensor = torch.tensor(params, dtype=torch.float32)  # Shape: (num_params,)
        
#         # Store matrices and params separately
#         matrices_list.append(matrix_tensor)     # List of tensors with shape (4, 4)
#         params_list.append(params_tensor)       # List of tensors with shape (num_params,)
        
#         # Store team and tournament IDs
#         home_ids.append(torch.tensor(row['homeId'], dtype=torch.long))
#         away_ids.append(torch.tensor(row['awayId'], dtype=torch.long))
#         tournament_ids.append(torch.tensor(row['tournament_id'], dtype=torch.long))
        
#         # Store target (0 for home win, 1 for away win)
#         new_label = 0 if ftr == 0 else 1
#         targets.append(torch.tensor(new_label, dtype=torch.long))
#     else:
#         print(f"Matrix file for Event ID {event_id} not found.")

# # Stack the lists into tensors
# matrices_tensor = torch.stack(matrices_list)        # Shape: (batch_size, 4, 4)
# params_tensor = torch.stack(params_list)            # Shape: (batch_size, num_params)
# home_ids_tensor = torch.stack(home_ids)             # Shape: (batch_size,)
# away_ids_tensor = torch.stack(away_ids)             # Shape: (batch_size,)
# tournament_ids_tensor = torch.stack(tournament_ids) # Shape: (batch_size,)
# targets_tensor = torch.stack(targets)               # Shape: (batch_size,)

# # Print shapes for verification
# print("Matrices Shape:", matrices_tensor.shape)
# print("Params Shape:", params_tensor.shape)
# print("Home IDs Shape:", home_ids_tensor.shape)
# print("Away IDs Shape:", away_ids_tensor.shape)
# print("Tournament IDs Shape:", tournament_ids_tensor.shape)
# print("Targets Shape:", targets_tensor.shape)


In [None]:
# home_sequences_list = []
# away_sequences_list = []
# sequence_length = 3  # Adjust as needed
# sequence_features = [
#     'upper_diff', 'lower_diff',
#     'attack_diff', 'defense_diff',
#     'WL', 'prob_draw',
#     'attack_diff_op', 'defense_diff_op',
#     # Add more features if necessary
# ]
df['date'] = pd.to_datetime(df['date'])
df_sorted = df.sort_values('date').reset_index(drop=True)
len(df)


5926

In [None]:
# # Initialize lists for data
# params_list = []
# matrices_list = []
# home_ids = []
# away_ids = []
# tournament_ids = []
# targets = []
# home_sequences_list = []
# away_sequences_list = []

# matrices_dir = 'matrices'
# sequence_length = 3  # Define your sequence length
# sequence_features = [
#     'upper_diff', 'lower_diff',
#     'attack_diff', 'defense_diff',
#     'WL', 'prob_draw',
#     'attack_diff_op', 'defense_diff_op',
#     'HomeTeamGlickoRating','AwayTeamGlickoRating',
#     'homeAttack','awayAttack',
#     'homeDefence','awayDefence'
#     # Add more features if necessary
# ]

# # Initialize dictionary to store past matches for each team
# team_past_matches = {}  # Key: team ID, Value: list of past match feature vectors

# for idx, row in df_sorted.iterrows():
#     event_id = row['eventId']
#     ftr = row['FTR']  # Target

#     if ftr == 1:
#         continue

#     home_id = row['homeId']
#     away_id = row['awayId']
#     tournament_id = row['tournament_id']
#     date = row['date']

#     # Load the corresponding matrix
#     matrix_file = os.path.join(matrices_dir, f"{event_id}.csv")

#     if os.path.exists(matrix_file):
#         matrix = np.loadtxt(matrix_file, delimiter=',')
        
#         # Slice the matrix to keep only the first 4 rows and first 4 columns
#         matrix = matrix[:4, :4]  # Now shape is (4, 4)
        
#         # Convert matrix to tensor and keep its 4x4 shape
#         matrix_tensor = torch.tensor(matrix, dtype=torch.float32)  # Shape: (4, 4)
        
#         # Extract additional parameters (numerical features) for the current match
#         params = row[sequence_features].values.astype(np.float32)
#         params_tensor = torch.tensor(params, dtype=torch.float32)  # Shape: (num_params,)
        
#         # Collect sequences for home and away teams
#         # Get past matches for home team
#         home_past_matches = team_past_matches.get(home_id, [])
#         if len(home_past_matches) < sequence_length:
#             # If not enough past matches, pad with zeros
#             padding_needed = sequence_length - len(home_past_matches)
#             home_padding = [np.zeros(len(sequence_features), dtype=np.float32)] * padding_needed
#             home_sequence = home_padding + home_past_matches
#         else:
#             # Take the last 'sequence_length' matches
#             home_sequence = home_past_matches[-sequence_length:]

#         # Repeat for away team
#         away_past_matches = team_past_matches.get(away_id, [])
#         if len(away_past_matches) < sequence_length:
#             padding_needed = sequence_length - len(away_past_matches)
#             away_padding = [np.zeros(len(sequence_features), dtype=np.float32)] * padding_needed
#             away_sequence = away_padding + away_past_matches
#         else:
#             away_sequence = away_past_matches[-sequence_length:]

#         # Convert sequences to tensors
#         home_sequence_tensor = torch.tensor(home_sequence, dtype=torch.float32)  # Shape: (sequence_length, num_features)
#         away_sequence_tensor = torch.tensor(away_sequence, dtype=torch.float32)

#         # Store sequences
#         home_sequences_list.append(home_sequence_tensor)
#         away_sequences_list.append(away_sequence_tensor)

#         # Store matrices and params
#         matrices_list.append(matrix_tensor)     # List of tensors with shape (4, 4)
#         params_list.append(params_tensor)       # List of tensors with shape (num_params,)
        
#         # Store team and tournament IDs
#         home_ids.append(torch.tensor(home_id, dtype=torch.long))
#         away_ids.append(torch.tensor(away_id, dtype=torch.long))
#         tournament_ids.append(torch.tensor(tournament_id, dtype=torch.long))
        
#         # Store target (0 for home win, 1 for away win)
#         new_label = 0 if ftr == 0 else 1
#         targets.append(torch.tensor(new_label, dtype=torch.long))

#         # Update past matches for both teams
#         # For the home team
#         home_match_features = params  # You can customize which features to include
#         team_past_matches.setdefault(home_id, []).append(home_match_features)
#         # For the away team
#         away_match_features = params  # Same features as home_match_features
#         team_past_matches.setdefault(away_id, []).append(away_match_features)

#     else:
#         print(f"Matrix file for Event ID {event_id} not found.")


In [None]:
def get_team_match_features(row, is_home_team):
    if is_home_team:
        team_id = row['homeId']
        is_home = 1
        goals_scored = row['homeScore']
        goals_conceded = row['awayScore']
        team_metrics = row[home_columns].to_dict() 
        # attack = row['homeAttack']
        # defence = row['homeDefence']
        # upper = row['homeTeamGlickoUpper']
        # lower = row['homeTeamGlickoLower']
        # xg = row['home_xG']
        glicko_diff = row['glicko_diff']
        attack_diff = row['attack_diff']
        defense_diff = row['defense_diff']
        attack_diff_op = row['attack_diff_op']
        defense_diff_op = row['defense_diff_op']
        wl = row['WL']
        ftr_team = 1 if row['FTR'] == 0 else 0  # Historical FTR as 1 (win), 0 (loss)
    else:
        team_id = row['awayId']
        is_home = 0
        goals_scored = row['awayScore']
        goals_conceded = row['homeScore']
        # attack = row['awayAttack']
        # defence = row['awayDefence']
        # upper = row['awayTeamGlickoUpper']
        # lower = row['awayTeamGlickoLower']
        # xg = row['away_xG']
        team_metrics = row[away_columns].to_dict() 
        glicko_diff = -row['glicko_diff']
        attack_diff = -row['attack_diff']
        defense_diff = -row['defense_diff']
        attack_diff_op = -row['attack_diff_op']
        defense_diff_op = -row['defense_diff_op']
        wl = -row['WL']
        ftr_team = 1 if row['FTR'] == 2 else 0  # Historical FTR as 1 (win), 0 (loss)

    # Include FTR as a historical feature but not for the current match
    match_features = {
        **team_metrics,
        'goals_scored': goals_scored,
        'goals_conceded': goals_conceded,
        'is_home': is_home,
        'attack_diff': attack_diff,
        'defense_diff': defense_diff,
        'attack_diff_op': attack_diff_op,
        'defense_diff_op': defense_diff_op,
        'glicko_diff': glicko_diff,
        #'Attack': attack,
        #'Defence': defence,
        #'XG': xg,
        #'upper': upper,
        #'lower': lower,
        'wl': wl,
        'draw': row['prob_draw'],
        'historical_FTR': ftr_team  # Add historical FTR for past matches only
    }

    return team_id, match_features


In [None]:
# Initialize lists for data
tournament_ids = []
targets = []
home_sequences_list = []
away_sequences_list = []

home_columns = [col for col in df.columns if col.startswith('home')]
away_columns = [col for col in df.columns if col.startswith('away')]

sequence_length = 4  # Define your sequence length

# Define the features to include in sequences
sequence_features = [
    'goals_scored', 'goals_conceded',
    'is_home', 'FTR',
    'attack_diff', 'defense_diff',
    'attack_diff_op', 'defense_diff_op',
    'Attack', 'Defence',
    'XG','glicko_diff','upper','lower',
    'wl','draw'
    #Include xG
    # Add more features if necessary
]


In [None]:


# Initialize dictionary to store past matches for each team
team_past_matches = {}  # Key: team ID, Value: list of past match feature dictionaries


for idx, row in df_sorted.iterrows():
    event_id = row['eventId']
    ftr = row['FTR']  # Target

    # Skip matches with draws if not handling them
    if ftr == 1:
        print(f"Skipping match {event_id} as it's a draw.")
        continue

    home_id = row['homeId']
    away_id = row['awayId']
    tournament_id = row['tournament_id']
    date = row['date']

    print(f"Processing match {event_id}: HomeID={home_id}, AwayID={away_id}, FTR={ftr}")

    # Extract team-specific features
    home_team_id, home_match_features = get_team_match_features(row, is_home_team=True)
    away_team_id, away_match_features = get_team_match_features(row, is_home_team=False)

    # Update past matches for both teams
    team_past_matches.setdefault(home_team_id, []).append(home_match_features)
    team_past_matches.setdefault(away_team_id, []).append(away_match_features)

    # Prepare sequences for the home team
    home_past_matches = team_past_matches[home_team_id][:-1]  # Exclude current match's FTR
    if len(home_past_matches) < sequence_length:
        padding_needed = sequence_length - len(home_past_matches)
        default_values = {key: 0 for key in sequence_features}
        home_padding = [default_values] * padding_needed
        home_sequence = home_padding + home_past_matches
    else:
        home_sequence = home_past_matches[-sequence_length:]


    # Prepare sequences for the away team
    away_past_matches = team_past_matches[away_team_id][:-1]  # Exclude current match's FTR
    if len(away_past_matches) < sequence_length:
        padding_needed = sequence_length - len(away_past_matches)
        default_values = {key: 0 for key in sequence_features}
        away_padding = [default_values] * padding_needed
        away_sequence = away_padding + away_past_matches
    else:
        away_sequence = away_past_matches[-sequence_length:]

    # Convert sequences to tensors
    home_sequence_values = [list(match.values()) for match in home_sequence]
    home_sequence_tensor = torch.tensor(home_sequence_values, dtype=torch.float32)
    away_sequence_values = [list(match.values()) for match in away_sequence]
    away_sequence_tensor = torch.tensor(away_sequence_values, dtype=torch.float32)


    # Store sequences
    home_sequences_list.append(home_sequence_tensor)
    away_sequences_list.append(away_sequence_tensor)

    tournament_ids.append(torch.tensor(tournament_id, dtype=torch.long))

    # Store target (0 for home win, 1 for away win)
    target = 0 if ftr == 0 else 1
    targets.append(torch.tensor(target, dtype=torch.long))

    print(f"Target for match {event_id}: {target}\n")



Processing match 10952495: HomeID=278, AwayID=126, FTR=0
Target for match 10952495: 0

Processing match 10952498: HomeID=277, AwayID=125, FTR=0
Target for match 10952498: 0

Processing match 10952494: HomeID=312, AwayID=277, FTR=0


ValueError: expected sequence of length 16 at dim 1 (got 64)

In [None]:
# Stack the lists into tensors
#matrices_tensor = torch.stack(matrices_list)            # Shape: (batch_size, 4, 4)
params_tensor = torch.stack(params_list)                # Shape: (batch_size, num_params)
home_ids_tensor = torch.stack(home_ids)                 # Shape: (batch_size,)
away_ids_tensor = torch.stack(away_ids)                 # Shape: (batch_size,)
tournament_ids_tensor = torch.stack(tournament_ids)     # Shape: (batch_size,)
targets_tensor = torch.stack(targets)                   # Shape: (batch_size,)
home_sequences_tensor = torch.stack(home_sequences_list)  # Shape: (batch_size, sequence_length, num_features)
away_sequences_tensor = torch.stack(away_sequences_list)  # Same shape as above

# Print shapes for verification
print("Matrices Shape:", matrices_tensor.shape)
print("Params Shape:", params_tensor.shape)
print("Home IDs Shape:", home_ids_tensor.shape)
print("Away IDs Shape:", away_ids_tensor.shape)
print("Tournament IDs Shape:", tournament_ids_tensor.shape)
print("Targets Shape:", targets_tensor.shape)
print("Home Sequences Shape:", home_sequences_tensor.shape)
print("Away Sequences Shape:", away_sequences_tensor.shape)


RuntimeError: stack expects a non-empty TensorList

In [None]:
# class FootballDataset(torch.utils.data.Dataset):
#     def __init__(self,home_ids_tensor, away_ids_tensor,
#                  tournament_ids_tensor, targets_tensor, home_sequences_tensor, away_sequences_tensor):
#         self.home_ids = home_ids_tensor
#         self.away_ids = away_ids_tensor
#         self.tournament_ids = tournament_ids_tensor
#         self.targets = targets_tensor
#         self.home_sequences = home_sequences_tensor
#         self.away_sequences = away_sequences_tensor

#     def __len__(self):
#         return len(self.targets)

#     def __getitem__(self, idx):
#         return (
#             self.home_ids[idx],
#             self.away_ids[idx],
#             self.tournament_ids[idx],
#             self.targets[idx],
#             self.home_sequences[idx],
#             self.away_sequences[idx]
#         )


In [None]:
class FootballDataset(torch.utils.data.Dataset):
    def __init__(self, tournament_ids_tensor, targets_tensor,
                 home_sequences_tensor, away_sequences_tensor,
                 home_ids_tensor=None, away_ids_tensor=None):
        self.tournament_ids = tournament_ids_tensor
        self.targets = targets_tensor
        self.home_sequences = home_sequences_tensor
        self.away_sequences = away_sequences_tensor
        self.home_ids = home_ids_tensor
        self.away_ids = away_ids_tensor

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        if self.home_ids is not None and self.away_ids is not None:
            return (
                self.tournament_ids[idx],
                self.targets[idx],
                self.home_sequences[idx],
                self.away_sequences[idx],
                self.home_ids[idx],
                self.away_ids[idx]
            )
        else:
            return (
                self.tournament_ids[idx],
                self.targets[idx],
                self.home_sequences[idx],
                self.away_sequences[idx]
            )


In [None]:
# from torch.utils.data import DataLoader, random_split

# # Create dataset
# dataset = FootballDataset()

# # Split into training and testing sets
# train_size = int(0.8 * len(dataset))
# test_size = len(dataset) - train_size
# train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# # Create DataLoaders
# batch_size = 32  # Adjust as needed
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# from torch.utils.data import random_split, TensorDataset, DataLoader
# from torch.utils.data import Subset
# # Combine numerical features, tournament IDs, and targets into a TensorDataset
# # Combine numerical features, home IDs, away IDs, tournament IDs, and targets into a TensorDataset
# full_dataset = TensorDataset(
#     data_tensors,               #params
#     matrices_tensor,
#     home_ids_tensor,            # Home team IDs
#     away_ids_tensor,            # Away team IDs
#     tournament_ids_tensor,      # Tournament IDs
#     targets_tensor              # Labels
# )

# test_tournaments = [
#     "Liga Portugal 24/25",
#     "UEFA Champions League 24/25",
#     "UEFA Europa League 24/25",
#     "Premier League 24/25",
#     "Bundesliga 24/25",
#     "Ligue 1 24/25",
#     "LaLiga 24/25",
#     "Championship 24/25",
#     "Brasileiro Serie A 2024",
#     "MLS 2024"
# ]

# # Get the encoded tournament IDs for the test tournaments
# test_tournament_ids = le.transform(test_tournaments)

# # Filter the dataset using the encoded tournament IDs (5th element is tournament_id)
# test_indices = [i for i, data in enumerate(full_dataset) if data[4].item() in test_tournament_ids]

# # Use only the last half of the test data
# half_test_size = len(test_indices) // 2
# test_indices = test_indices[half_test_size:]  # Last half of the test indices

# # Get train indices as the complement of test indices
# train_indices = [i for i in range(len(full_dataset)) if i not in test_indices]

# # Create Subset datasets
# train_dataset = Subset(full_dataset, train_indices)
# test_dataset = Subset(full_dataset, test_indices)

# # Check dataset sizes
# print(f"Training set size: {len(train_dataset)}")
# print(f"Test set size: {len(test_dataset)}")






In [None]:
# Assuming you have already prepared the tensors
# matrices_tensor, params_tensor, home_ids_tensor, away_ids_tensor,
# tournament_ids_tensor, targets_tensor, home_sequences_tensor, away_sequences_tensor

# Initialize the custom dataset
full_dataset = FootballDataset(
    home_ids_tensor=home_ids_tensor,
    away_ids_tensor=away_ids_tensor,
    tournament_ids_tensor=tournament_ids_tensor,
    targets_tensor=targets_tensor,
    home_sequences_tensor=home_sequences_tensor,
    away_sequences_tensor=away_sequences_tensor
)

# Define test tournaments and get encoded IDs
test_tournaments = [
    "Liga Portugal 24/25",
    "UEFA Champions League 24/25",
    "UEFA Europa League 24/25",
    "Premier League 24/25",
    "Bundesliga 24/25",
    "Ligue 1 24/25",
    "LaLiga 24/25",
    "Championship 24/25",
    "Brasileiro Serie A 2024",
    "MLS 2024"
]

test_tournament_ids = le.transform(test_tournaments)

# Convert tournament IDs to numpy array
tournament_ids_np = full_dataset.tournament_ids.numpy()

# Get test indices
test_indices = [i for i, tid in enumerate(tournament_ids_np) if tid in test_tournament_ids]
half_test_size = len(test_indices) // 2
test_indices = test_indices[half_test_size:]

# Get train indices
train_indices = [i for i in range(len(full_dataset)) if i not in test_indices]

# # Create Subsets
# train_dataset = Subset(full_dataset, train_indices)
# test_dataset = Subset(full_dataset, test_indices)

# # Create DataLoaders
# batch_size = 128
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# # Check sizes
# print(f"Training set size: {len(train_dataset)}")
# print(f"Test set size: {len(test_dataset)}")


In [None]:
# Assume all tensors are prepared as before

# Convert tensors to NumPy arrays
home_sequences_np = home_sequences_tensor.numpy()
away_sequences_np = away_sequences_tensor.numpy()
#home_ids_np = home_ids_tensor.numpy()
#away_ids_np = away_ids_tensor.numpy()
tournament_ids_np = tournament_ids_tensor.numpy()
targets_np = targets_tensor.numpy()

# Split data using your indices
train_indices = np.array(train_indices)
test_indices = np.array(test_indices)

# Split sequences
home_sequences_train = home_sequences_np[train_indices]
home_sequences_test = home_sequences_np[test_indices]
away_sequences_train = away_sequences_np[train_indices]
away_sequences_test = away_sequences_np[test_indices]

# Split IDs and targets
#home_ids_train = home_ids_np[train_indices]
#home_ids_test = home_ids_np[test_indices]
#away_ids_train = away_ids_np[train_indices]
#away_ids_test = away_ids_np[test_indices]
tournament_ids_train = tournament_ids_np[train_indices]
tournament_ids_test = tournament_ids_np[test_indices]
targets_train = targets_np[train_indices]
targets_test = targets_np[test_indices]

# Reshape sequences for scaling
home_sequences_train_reshaped = home_sequences_train.reshape(-1, home_sequences_train.shape[2])
away_sequences_train_reshaped = away_sequences_train.reshape(-1, away_sequences_train.shape[2])

# Fit the scaler on training data
from sklearn.preprocessing import StandardScaler

combined_sequences_train = np.vstack((home_sequences_train_reshaped, away_sequences_train_reshaped))
scaler = StandardScaler()
scaler.fit(combined_sequences_train)

# Transform sequences
home_sequences_train_scaled = scaler.transform(home_sequences_train_reshaped).reshape(home_sequences_train.shape)
away_sequences_train_scaled = scaler.transform(away_sequences_train_reshaped).reshape(away_sequences_train.shape)

home_sequences_test_reshaped = home_sequences_test.reshape(-1, home_sequences_test.shape[2])
away_sequences_test_reshaped = away_sequences_test.reshape(-1, away_sequences_test.shape[2])

home_sequences_test_scaled = scaler.transform(home_sequences_test_reshaped).reshape(home_sequences_test.shape)
away_sequences_test_scaled = scaler.transform(away_sequences_test_reshaped).reshape(away_sequences_test.shape)

# Convert back to tensors
home_sequences_train_tensor = torch.tensor(home_sequences_train_scaled, dtype=torch.float32)
away_sequences_train_tensor = torch.tensor(away_sequences_train_scaled, dtype=torch.float32)
home_sequences_test_tensor = torch.tensor(home_sequences_test_scaled, dtype=torch.float32)
away_sequences_test_tensor = torch.tensor(away_sequences_test_scaled, dtype=torch.float32)

# Convert IDs and targets to tensors
#home_ids_train_tensor = torch.tensor(home_ids_train, dtype=torch.long)
#away_ids_train_tensor = torch.tensor(away_ids_train, dtype=torch.long)
tournament_ids_train_tensor = torch.tensor(tournament_ids_train, dtype=torch.long)
targets_train_tensor = torch.tensor(targets_train, dtype=torch.long)

#home_ids_test_tensor = torch.tensor(home_ids_test, dtype=torch.long)
#away_ids_test_tensor = torch.tensor(away_ids_test, dtype=torch.long)
tournament_ids_test_tensor = torch.tensor(tournament_ids_test, dtype=torch.long)
targets_test_tensor = torch.tensor(targets_test, dtype=torch.long)

# Create datasets
train_dataset = FootballDataset(
    #home_ids_tensor=home_ids_train_tensor,
    #away_ids_tensor=away_ids_train_tensor,
    tournament_ids_tensor=tournament_ids_train_tensor,
    targets_tensor=targets_train_tensor,
    home_sequences_tensor=home_sequences_train_tensor,
    away_sequences_tensor=away_sequences_train_tensor
)

test_dataset = FootballDataset(
    #home_ids_tensor=home_ids_test_tensor,
    #away_ids_tensor=away_ids_test_tensor,
    tournament_ids_tensor=tournament_ids_test_tensor,
    targets_tensor=targets_test_tensor,
    home_sequences_tensor=home_sequences_test_tensor,
    away_sequences_tensor=away_sequences_test_tensor
)

# # Create DataLoaders
# batch_size = 128
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# class FullyConnectedNN(nn.Module):
#     def __init__(self):
#         super(FullyConnectedNN, self).__init__()
#         self.network = nn.Sequential(
#             nn.Linear(43, 64),       # Input layer (44 features to 256 neurons)
#             nn.ReLU(),      # Early dropout for regularization
            
#             nn.Linear(64, 128),      # Hidden layer 1
#             nn.ReLU(),
            
#             nn.Linear(128, 256),      # Hidden layer 2
#             nn.ReLU(),          # Dropout after hidden layer
#             nn.Dropout(0.2),

#             nn.Linear(256, 512),       # Hidden layer 3
#             nn.ReLU(),

#             nn.Linear(512, 256),        # Hidden layer 4
#             nn.ReLU(),

            
#             nn.Linear(256, 128),        # Hidden layer 5
#             nn.ReLU(),
#             nn.Dropout(0.2),

#             nn.Linear(128, 64),        # Hidden layer 6
#             nn.ReLU(),

#             nn.Linear(64, 32),        # Hidden layer 7
#             nn.ReLU(),

#             nn.Linear(32, 16),        # Hidden layer 7
#             nn.ReLU(),

#             nn.Dropout(0.3),          # Dropout on deeper layer
#             nn.Linear(16, 1)          # Output layer (binary classification)
#         )

#     def forward(self, x):
#         x = self.network(x)
#         return x  # No sigmoid here for BCEWithLogitsLoss

import torch
import torch.nn as nn

import torch

# class FullyConnectedNN(nn.Module):
#     def __init__(self, num_tournaments, num_teams, input_dim=0, tournament_embedding_dim=8, team_embedding_dim=20):
#         super(FullyConnectedNN, self).__init__()
        
#         # Embedding layers for tournaments, home, and away teams
#         self.tournament_embedding = nn.Embedding(num_tournaments, tournament_embedding_dim)
#         self.home_embedding = nn.Embedding(num_teams, team_embedding_dim)
#         self.away_embedding = nn.Embedding(num_teams, team_embedding_dim)
        
#         # Fully connected layers
#         combined_dim = input_dim + tournament_embedding_dim + 2 * team_embedding_dim + team_embedding_dim ** 2
#         print(combined_dim)
#         self.network = nn.Sequential(
#             nn.Linear(combined_dim, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),

#             nn.Linear(64, 128),
#             nn.BatchNorm1d(128),
#             nn.ReLU(),
#             nn.Dropout(0.2),

#             nn.Linear(128, 256),
#             nn.ReLU(),

#             nn.Linear(256, 128),
#             nn.ReLU(),
#             nn.Dropout(0.2),

#             nn.Linear(128, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),

#             nn.Linear(64, 32),
#             nn.ReLU(),
#             nn.Dropout(0.2),

#             nn.Linear(32, 1)  # Output layer
#         )
    
#     def forward(self, x, tournament_id, home_id, away_id):
#         # Get embeddings
#         tournament_embeds = self.tournament_embedding(tournament_id)  # (batch_size, tournament_embedding_dim)
#         home_embeds = self.home_embedding(home_id)  # (batch_size, team_embedding_dim)
#         away_embeds = self.away_embedding(away_id)  # (batch_size, team_embedding_dim)
        
#         # Element-wise multiplication to get interaction embedding
#         interaction_matrix = torch.bmm(home_embeds.unsqueeze(2), away_embeds.unsqueeze(1))  # (batch_size, 20, 20)
        
#         # Flatten the interaction matrix
#         interaction_flattened = interaction_matrix.view(interaction_matrix.size(0), -1)  # (batch_size, 400)

#         # Concatenate with other inputs
#         x = torch.cat((x, tournament_embeds, home_embeds, away_embeds, interaction_flattened), dim=1)
        
#         # Pass through the network
#         return self.network(x)



In [None]:

# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class FootballMatchPredictor(nn.Module):
#     def __init__(self, num_tournaments, num_teams, numeric_input_dim, tournament_embedding_dim=7, team_embedding_dim=16):
#         super(FootballMatchPredictor, self).__init__()
        
#         # Store team_embedding_dim as an instance variable
#         self.team_embedding_dim = team_embedding_dim

#         # Embedding layers with different dimensions
#         self.tournament_embedding = nn.Embedding(num_tournaments, tournament_embedding_dim)
#         self.home_embedding = nn.Embedding(num_teams, self.team_embedding_dim)
#         self.away_embedding = nn.Embedding(num_teams, self.team_embedding_dim)
        
#         # Numerical features branch
#         self.numeric_layers = nn.Sequential(
#             nn.Linear(numeric_input_dim, 16),
#             nn.GELU(),
#             nn.BatchNorm1d(16),
#             nn.Dropout(0.1),
#         )
        
#         # Probability matrix branch
#         self.prob_conv_layers = nn.Sequential(
#             nn.Conv2d(1, 16, kernel_size=2, padding=1),  # Input: (batch_size, 1, 6, 6)
#             nn.ReLU(),
#             nn.AvgPool2d(2),                  # After pooling: (batch_size, 16, 2, 2)# After conv: (batch_size, 32, 1, 1)
#             nn.Flatten(),
#             nn.Dropout(0.1)# Output: (batch_size, 32)
#         )
        
#         # Calculate output dimension from prob_conv_layers
#         prob_conv_output_dim = 16  # From the Flatten layer after conv and pooling
        
#         # Initialize the attention layer
#         self.attention_layer = nn.MultiheadAttention(embed_dim=self.team_embedding_dim, num_heads=2,dropout=0.1)
        
#         # Final layers
#         combined_dim = (
#             64 +                        # Numerical features branch output
#             prob_conv_output_dim +      # Probability matrix branch output
#             2 * self.team_embedding_dim +  # Embedding features after attention
#             tournament_embedding_dim    # Tournament embedding
#         )
#         print(f"Combined input dimension to final layers: {combined_dim}")
        
#         self.final_layers = nn.Sequential(
#             nn.Linear(combined_dim, 256),
#             nn.GELU(),
#             nn.BatchNorm1d(256),
#             nn.Linear(256, 128),
#             nn.Dropout(0.2),
#             nn.ReLU(),
#             nn.BatchNorm1d(128),
#             nn.Dropout(0.1),
#             nn.Linear(128, 1)  # Output layer
#         )

#     def forward(self, x_numeric, prob_matrix, tournament_id, home_id, away_id):
#         # Process numerical features
#         numeric_features = self.numeric_layers(x_numeric)  # Shape: (batch_size, 64)
        
#         # Process probability matrix
#         prob_matrix = prob_matrix.unsqueeze(1)  # Shape: (batch_size, 1, 6, 6)
#         prob_features = self.prob_conv_layers(prob_matrix)  # Shape: (batch_size, 32)
        
#         # Get embeddings
#         tournament_embed = self.tournament_embedding(tournament_id)  # Shape: (batch_size, tournament_embedding_dim)
#         home_embed = self.home_embedding(home_id)  # Shape: (batch_size, team_embedding_dim)
#         away_embed = self.away_embedding(away_id)  # Shape: (batch_size, team_embedding_dim)
        
#         # Prepare embeddings for attention mechanism
#         # Stack home and away embeddings to create a sequence length of 2
#         team_embeds = torch.stack([home_embed, away_embed], dim=0)  # Shape: (2, batch_size, team_embedding_dim)
        
#         # Apply attention mechanism
#         # Note: nn.MultiheadAttention expects inputs in (sequence_length, batch_size, embed_dim)
#         attn_output, _ = self.attention_layer(team_embeds, team_embeds, team_embeds)
#         # attn_output shape: (2, batch_size, team_embedding_dim)
        
#         # Flatten attention output
#         attn_output = attn_output.permute(1, 0, 2).contiguous()  # Shape: (batch_size, 2, team_embedding_dim)
#         embedding_features = attn_output.view(attn_output.size(0), -1)  # Shape: (batch_size, 2 * team_embedding_dim)
        
#         # Concatenate all features
#         combined_features = torch.cat([
#             numeric_features,
#             prob_features,
#             embedding_features,
#             tournament_embed
#         ], dim=1)
        
#         # Pass through final layers
#         output = self.final_layers(combined_features)
#         return output






In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class FootballMatchPredictor(nn.Module):
#     def __init__(self, num_tournaments, num_teams, numeric_input_dim,
#                  num_sequence_features, tournament_embedding_dim=7,
#                  team_embedding_dim=16, lstm_hidden_size=32):
#         super(FootballMatchPredictor, self).__init__()
        
#         self.team_embedding_dim = team_embedding_dim
#         self.lstm_hidden_size = lstm_hidden_size
        
#         # Embedding layers
#         self.tournament_embedding = nn.Embedding(num_tournaments, tournament_embedding_dim)
#         self.team_embedding = nn.Embedding(num_teams, team_embedding_dim)  # If you still want to use team embeddings
        
#         # Numerical features branch
#         self.numeric_layers = nn.Sequential(
#             nn.Linear(numeric_input_dim, 16),
#             nn.GELU(),
#             nn.BatchNorm1d(16),
#             nn.Dropout(0.1),
#         )
        
#         # Probability matrix branch
#         self.prob_conv_layers = nn.Sequential(
#             nn.Conv2d(1, 16, kernel_size=2, padding=1),
#             nn.ReLU(),
#             nn.AvgPool2d(2),
#             nn.Flatten(),
#             nn.Dropout(0.1)
#         )
        
#         # Initialize LSTM layers for home and away sequences
#         self.home_lstm = nn.LSTM(input_size=num_sequence_features, hidden_size=lstm_hidden_size,
#                                  num_layers=1, batch_first=True,
#                                  )
#         self.away_lstm = nn.LSTM(input_size=num_sequence_features, hidden_size=lstm_hidden_size,
#                                  num_layers=1, batch_first=True)
        
#         # Calculate output dimension from prob_conv_layers
#         prob_conv_output_dim = 64  # From the Flatten layer after conv and pooling
        
#         # Final layers
#         combined_dim = (
#             16 +                        # Numerical features branch output
#             prob_conv_output_dim +      # Probability matrix branch output
#             2 * lstm_hidden_size +      # LSTM outputs for home and away sequences
#             tournament_embedding_dim    # Tournament embedding
#         )
#         print(f"Combined input dimension to final layers: {combined_dim}")
        
#         self.final_layers = nn.Sequential(
#             nn.Linear(combined_dim, 256),
#             nn.GELU(),
#             nn.BatchNorm1d(256),
#             nn.Linear(256, 128),
#             nn.Dropout(0.2),
#             nn.ReLU(),
#             nn.BatchNorm1d(128),
#             nn.Dropout(0.1),
#             nn.Linear(128, 1)  # Output layer
#         )
    
#     def forward(self, x_numeric, prob_matrix, tournament_id, home_id, away_id,
#                 home_sequence, away_sequence):
#         # Process numerical features
#         numeric_features = self.numeric_layers(x_numeric)  # Shape: (batch_size, 16)
        
#         # Process probability matrix
#         prob_matrix = prob_matrix.unsqueeze(1)  # Shape: (batch_size, 1, 4, 4)
#         prob_features = self.prob_conv_layers(prob_matrix)  # Shape: (batch_size, 16)
        
#         # Get embeddings
#         tournament_embed = self.tournament_embedding(tournament_id)  # Shape: (batch_size, tournament_embedding_dim)
        
#         # Process sequences through LSTM
#         # home_sequence and away_sequence shape: (batch_size, sequence_length, num_sequence_features)
#         home_lstm_out, _ = self.home_lstm(home_sequence)
#         away_lstm_out, _ = self.away_lstm(away_sequence)
        
#         # Get the last output of the LSTM (many-to-one)
#         home_lstm_last = home_lstm_out[:, -1, :]  # Shape: (batch_size, lstm_hidden_size)
#         away_lstm_last = away_lstm_out[:, -1, :]  # Shape: (batch_size, lstm_hidden_size)
        
#         # Concatenate all features
#         combined_features = torch.cat([
#             numeric_features,
#             prob_features,
#             home_lstm_last,
#             away_lstm_last,
#             tournament_embed
#         ], dim=1)
        
#         # Pass through final layers
#         output = self.final_layers(combined_features)
#         return output
    
# num_sequence_features = len(sequence_features)  # Should match the number of features in your sequences
# num_tournaments = len(df['tournament_id'].unique())  # Unique tournament IDs
# num_teams = len(pd.concat([df['homeId'], df['awayId']]).unique())  # Unique team IDs


# # Assuming params_tensor and matrices_tensor are defined from your data preparation
# numeric_input_dim = params_tensor.shape[1]

In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class FootballMatchPredictor(nn.Module):
#     def __init__(self, num_tournaments, num_teams, numeric_input_dim,
#                  num_sequence_features, tournament_embedding_dim=7,
#                  team_embedding_dim=16, lstm_hidden_size=32):
#         super(FootballMatchPredictor, self).__init__()
        
#         self.team_embedding_dim = team_embedding_dim
#         self.lstm_hidden_size = lstm_hidden_size
        
#         # Embedding layers
#         self.tournament_embedding = nn.Embedding(num_tournaments, tournament_embedding_dim)
#         # Uncomment the next line if you want to use team embeddings
#         # self.team_embedding = nn.Embedding(num_teams, team_embedding_dim)
        
#         # Numerical features branch
#         self.numeric_layers = nn.Sequential(
#             nn.Linear(numeric_input_dim, 16),
#             nn.GELU(),
#             nn.BatchNorm1d(16),
#             nn.Dropout(0.1),
#         )
        
#         # Remove the probability matrix branch since it's no longer used
#         # self.prob_conv_layers = nn.Sequential(...)
        
#         # Initialize LSTM layers for home and away sequences
#         self.home_lstm = nn.LSTM(input_size=num_sequence_features, hidden_size=lstm_hidden_size,
#                                  num_layers=1, batch_first=True)
#         self.away_lstm = nn.LSTM(input_size=num_sequence_features, hidden_size=lstm_hidden_size,
#                                  num_layers=1, batch_first=True)
        
#         # Calculate the combined dimension for the final layers
#         combined_dim = (
#             16 +                        # Output from numeric_layers
#             2 * lstm_hidden_size +      # Outputs from home and away LSTMs
#             tournament_embedding_dim    # Tournament embedding
#             # If using team embeddings, add 2 * team_embedding_dim
#         )
#         print(f"Combined input dimension to final layers: {combined_dim}")
        
#         # Final layers
#         self.final_layers = nn.Sequential(
#             nn.Linear(combined_dim, 256),
#             nn.GELU(),
#             nn.BatchNorm1d(256),
#             nn.Linear(256, 128),
#             nn.Dropout(0.2),
#             nn.ReLU(),
#             nn.BatchNorm1d(128),
#             nn.Dropout(0.1),
#             nn.Linear(128, 1)  # Output layer
#         )
    
#     def forward(self, x_numeric, tournament_id, home_id, away_id,
#                 home_sequence, away_sequence):
#         # Process numerical features
#         numeric_features = self.numeric_layers(x_numeric)  # Shape: (batch_size, 16)
        
#         # Remove the probability matrix processing
#         # prob_features = self.prob_conv_layers(prob_matrix)
        
#         # Get embeddings
#         tournament_embed = self.tournament_embedding(tournament_id)  # Shape: (batch_size, tournament_embedding_dim)
#         # If using team embeddings, uncomment the next two lines
#         # home_embed = self.team_embedding(home_id)
#         # away_embed = self.team_embedding(away_id)
        
#         # Process sequences through LSTMs
#         home_lstm_out, _ = self.home_lstm(home_sequence)
#         away_lstm_out, _ = self.away_lstm(away_sequence)
        
#         # Get the last outputs of the LSTMs
#         home_lstm_last = home_lstm_out[:, -1, :]  # Shape: (batch_size, lstm_hidden_size)
#         away_lstm_last = away_lstm_out[:, -1, :]
        
#         # Concatenate all features
#         combined_features = torch.cat([
#             numeric_features,
#             home_lstm_last,
#             away_lstm_last,
#             tournament_embed
#             # If using team embeddings, include home_embed and away_embed
#             # home_embed,
#             # away_embed
#         ], dim=1)
        
#         # Pass through the final layers
#         output = self.final_layers(combined_features)
#         return output

# # Ensure that num_sequence_features matches the number of features in your sequences



In [None]:
# from torch.optim.lr_scheduler import StepLR
# device = torch.device('cpu')
# model.to(device)

# # Define loss function and optimizer
# # If you have class imbalance, you can use pos_weight
# # For example, pos_weight = torch.tensor([weight_value]).to(device)
# criterion = torch.nn.BCEWithLogitsLoss()  # Updated for binary classification
# optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# scheduler = StepLR(optimizer, step_size=5, gamma=0.5)  # Halves LR every 5 epochs


In [None]:
# Total number of home wins (0s) and away wins (1s)
num_home_wins = (targets_tensor == 0).sum().item()
num_away_wins = (targets_tensor == 1).sum().item()

pos_weight = num_home_wins / num_away_wins
print(pos_weight*0.68)

In [None]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class FootballMatchPredictor(nn.Module):
#     def __init__(self, num_tournaments, num_teams, numeric_input_dim, 
#                  tournament_embedding_dim=7, team_embedding_dim=16):
#         super(FootballMatchPredictor, self).__init__()
        
#         # Store dimensions
#         self.team_embedding_dim = team_embedding_dim
#         self.numeric_feature_dim = 16  # Output dimension of numeric_layers
        
#         # Embedding layers
#         self.tournament_embedding = nn.Embedding(num_tournaments, tournament_embedding_dim)
#         self.home_embedding = nn.Embedding(num_teams, team_embedding_dim)
#         self.away_embedding = nn.Embedding(num_teams, team_embedding_dim)
        
#         # Numerical features branch
#         self.numeric_layers = nn.Sequential(
#             nn.Linear(numeric_input_dim, self.numeric_feature_dim),
#             nn.ReLU(),
#             nn.BatchNorm1d(self.numeric_feature_dim),
#             nn.Dropout(0.1)
#         )
        
#         # Probability matrix branch
#         self.prob_conv_layers = nn.Sequential(
#             nn.Conv2d(1, 16, kernel_size=2,padding=1),
#             nn.ReLU(),
#             nn.AvgPool2d(2),
#             nn.Flatten()
#         )
        
#         # Calculate output dimension from prob_conv_layers dynamically
#         self.prob_conv_output_dim = self.prob_conv_layers(torch.randn(1, 1, 6, 6)).shape[1]
        
#         # Initialize the attention layers
#         self.attention_layer = nn.MultiheadAttention(embed_dim=self.team_embedding_dim, num_heads=2, dropout=0.1)
        
#         # Initialize cross-attention layer
#         self.numeric_projection = nn.Linear(self.numeric_feature_dim, self.team_embedding_dim)
#         self.cross_attention_layer = nn.MultiheadAttention(embed_dim=self.team_embedding_dim, num_heads=2, dropout=0.1)
        
#         # Final layers
#         combined_dim = (
#             self.team_embedding_dim +    # Output from cross-attention
#             self.prob_conv_output_dim + # Probability matrix branch output
#             tournament_embedding_dim     # Tournament embedding
#         )
#         print(f"Combined input dimension to final layers: {combined_dim}")
        
#         self.final_layers = nn.Sequential(
#             nn.Linear(87, 256),
#             nn.GELU(),  # Alternative nonlinearity
#             nn.Dropout(0.1),
#             nn.Linear(256, 128),
#             nn.GELU(),
#             nn.BatchNorm1d(128),
#             nn.Dropout(0.1),
#             nn.Linear(128, 1)  # Output layer (no sigmoid needed with BCEWithLogitsLoss)
#         )
#     def forward(self, x_numeric, prob_matrix, tournament_id, home_id, away_id):
#         batch_size = x_numeric.size(0)

#         # Process numerical features
#         numeric_features = self.numeric_layers(x_numeric)  # Shape: (batch_size, numeric_feature_dim)

#         # Project numeric features to match embedding dimension
#         numeric_features_proj = self.numeric_projection(numeric_features)  # Shape: (batch_size, team_embedding_dim)

#         # Process probability matrix
#         prob_matrix = prob_matrix.unsqueeze(1)  # Shape: (batch_size, 1, 6, 6)
#         prob_features = self.prob_conv_layers(prob_matrix)  # Shape: (batch_size, prob_conv_output_dim)

#         # Get embeddings
#         tournament_embed = self.tournament_embedding(tournament_id)  # Shape: (batch_size, tournament_embedding_dim)
#         home_embed = self.home_embedding(home_id)                    # Shape: (batch_size, team_embedding_dim)
#         away_embed = self.away_embedding(away_id)                    # Shape: (batch_size, team_embedding_dim)

#         # Prepare embeddings for self-attention with batch_first=True
#         team_embeds = torch.stack([home_embed, away_embed], dim=1)  # Shape: (batch_size, 2, team_embedding_dim)

#         # Apply self-attention on team embeddings
#         attn_output, _ = self.attention_layer(team_embeds, team_embeds, team_embeds)
#         # attn_output shape: (batch_size, 2, team_embedding_dim)

#         # Aggregate attention output by taking the mean over the sequence dimension
#         attn_output = attn_output.mean(dim=1)  # Shape: (batch_size, team_embedding_dim)

#         # Prepare inputs for cross-attention
#         attn_output_seq = attn_output.unsqueeze(1)                 # Shape: (batch_size, 1, team_embedding_dim)
#         numeric_features_seq = numeric_features_proj.unsqueeze(1)  # Shape: (batch_size, 1, team_embedding_dim)

#         # Apply cross-attention
#         cross_attn_output, _ = self.cross_attention_layer(
#             attn_output_seq,  # Query
#             numeric_features_seq,  # Key
#             numeric_features_seq   # Value
#         )
#         # cross_attn_output shape: (batch_size, 1, team_embedding_dim)

#         # Squeeze the sequence dimension
#         cross_attn_output = cross_attn_output.squeeze(1)  # Shape: (batch_size, team_embedding_dim)

#         # Concatenate all features
#         combined_features = torch.cat([
#             cross_attn_output,     # Output from cross-attention
#             prob_features,         # Probability matrix features
#             tournament_embed       # Tournament embedding
#         ], dim=1)

#         # Pass through final layers
#         output = self.final_layers(combined_features)
#         return output


# num_tournaments = len(df['tournament_id'].unique())  # Unique tournament IDs
# num_teams = len(pd.concat([df['homeId'], df['awayId']]).unique())  # Unique team IDs


# # Assuming params_tensor and matrices_tensor are defined from your data preparation
# numeric_input_dim = params_tensor.shape[1]

# # Instantiate the model
# model = FootballMatchPredictor(
#     num_tournaments=num_tournaments,
#     num_teams=num_teams,
#     numeric_input_dim=numeric_input_dim       # You can adjust this value
# )


In [None]:
from torch.utils.data import DataLoader, TensorDataset



train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# Output sizes for verification
print(f"Training Dataset Size: {len(train_loader.dataset)}")
print(f"Testing Dataset Size: {len(test_loader.dataset)}")


In [None]:
# import matplotlib.pyplot as plt
# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# from sklearn.metrics import roc_auc_score, f1_score


# # Initialize lists to store training and validation metrics
# train_losses = []
# val_losses = []
# train_accuracies = []
# val_accuracies = []

# # Initialize lists to store all predictions and labels for confusion matrix
# all_preds = []
# all_labels = []

# num_epochs = 15
# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     correct_train = 0  # Reset correct predictions for each epoch
#     total_train = 0    # Reset total samples for each epoch

#     for numerical_inputs, home_ids, away_ids, tournament_ids, labels in train_loader:
#         # Forward pass
#         outputs = model(numerical_inputs, tournament_ids, home_ids, away_ids)
#         loss = criterion(outputs.squeeze(1), labels.float())
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
        
#         running_loss += loss.item()
        
#         # Calculate correct predictions for this batch
#         predicted_train = (torch.sigmoid(outputs) > 0.5).long()
#         correct_train += (predicted_train == labels).sum().item()  # Correct predictions in this batch
#         total_train += labels.size(0)  # Total samples in this batch

#     avg_loss = running_loss / len(train_loader)  # Average loss over all training batches
#     train_accuracy = 100 * correct_train / (total_train *32) # Epoch-level accuracy

#     train_losses.append(avg_loss)
#     train_accuracies.append(train_accuracy)

#     print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')

#     # Validation phase
#     model.eval()
#     val_loss = 0.0
#     correct_val = 0  # Reset for each epoch
#     total_val = 0    # Reset for each epoch

#     with torch.no_grad():
#         for numerical_inputs, home_ids, away_ids, tournament_ids, labels in test_loader:
#             # Forward pass for single-sample batches
#             outputs = model(numerical_inputs, tournament_ids, home_ids, away_ids)
#             loss = criterion(outputs.squeeze(1), labels.float())
#             val_loss += loss.item()
            
#             predicted_val = (torch.sigmoid(outputs) > 0.5).long()
            
#             # Store all predictions and labels for confusion matrix
#             all_preds.extend(predicted_val.cpu().numpy())
#             all_labels.extend(labels.cpu().numpy())
            
#             correct_val += (predicted_val == labels).sum().item()  # Correct predictions
#             total_val += labels.size(0)  # Total validation samples

#     avg_val_loss = val_loss / len(test_loader)  # Average validation loss
#     val_accuracy = 100 * correct_val / total_val  # Validation accuracy

#     val_losses.append(avg_val_loss)
#     val_accuracies.append(val_accuracy)

#     print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')




In [None]:
import torch
import torch.nn as nn

class FootballMatchPredictor(nn.Module):
    def __init__(self, num_tournaments, num_sequence_features, tournament_embedding_dim=7,
                 lstm_hidden_size=64):
        super(FootballMatchPredictor, self).__init__()
        
        self.lstm_hidden_size = lstm_hidden_size
        
        # Embedding layer for tournament IDs
        self.tournament_embedding = nn.Embedding(num_tournaments, tournament_embedding_dim)
        
        # LSTM layers for home and away sequences
        self.home_lstm = nn.LSTM(input_size=num_sequence_features, hidden_size=lstm_hidden_size,
                                 num_layers=2, bidirectional=True, batch_first=True)
        self.away_lstm = nn.LSTM(input_size=num_sequence_features, hidden_size=lstm_hidden_size,
                                 num_layers=2, bidirectional=True,batch_first=True)
        
        # Calculate combined dimension for the final layers
        combined_dim = (
            4 * lstm_hidden_size +      # Outputs from home and away LSTMs
            tournament_embedding_dim    # Tournament embedding
        )
        print(f"Combined input dimension to final layers: {combined_dim}")
        
        self.final_layers = nn.Sequential(
            nn.Linear(combined_dim, 128),
            nn.BatchNorm1d(128),  # Move BatchNorm here
            nn.GELU(),
            nn.Dropout(0.2),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),  # Move BatchNorm here
            nn.GELU(),
            nn.Dropout(0.1),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),  # Move BatchNorm here
            nn.GELU(),
            nn.Linear(128, 1)  # Output layer for binary classification
        )

    
    def forward(self, tournament_id, home_sequence, away_sequence):
        # Get tournament embeddings
        tournament_embed = self.tournament_embedding(tournament_id)  # Shape: (batch_size, tournament_embedding_dim)
        
        # Process sequences through LSTMs
        home_lstm_out, _ = self.home_lstm(home_sequence)
        away_lstm_out, _ = self.away_lstm(away_sequence)
        
        # Get the last outputs of the LSTMs
        home_lstm_last = home_lstm_out[:, -1, :]  # Shape: (batch_size, lstm_hidden_size)
        away_lstm_last = away_lstm_out[:, -1, :]
        
        # Concatenate features
        combined_features = torch.cat([
            home_lstm_last,
            away_lstm_last,
            tournament_embed
        ], dim=1)
        
        # Pass through the final layers
        output = self.final_layers(combined_features)
        return output
    
    num_sequence_features = len(sequence_features)

# Update num_tournaments and num_teams based on your dataset
num_tournaments = len(df['tournament_id'].unique())
num_teams = len(pd.concat([df['homeId'], df['awayId']]).unique())

# Update numeric_input_dim based on your x_numeric tensor
# Assuming x_numeric is a tensor containing your numerical features for the current match


In [None]:
device = torch.device('cpu')

num_sequence_features = home_sequences_train_tensor.shape[2]
num_tournaments = len(np.unique(tournament_ids_np))

model = FootballMatchPredictor(
    num_tournaments=num_tournaments,
    num_sequence_features=num_sequence_features,
    tournament_embedding_dim=7,
    lstm_hidden_size=32
).to(device)


In [None]:
num_epochs = 25
learning_rate = 0.0001  # Define your learning rate

# Set device
device = torch.device('cpu')

# Assuming 'model' is already defined and moved to device
# For example:
# model = FootballMatchPredictor(...).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=torch.tensor(pos_weight*0.68))
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
scheduler = StepLR(optimizer, step_size=5, gamma=0.5)  # Halves LR every 5 epochs

In [None]:
# Initialize lists to store training and validation metrics
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []



for epoch in range(num_epochs):
    # Training phase
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for (tournament_ids, labels, home_sequences, away_sequences) in train_loader:
        # Move data to device
        tournament_ids = tournament_ids.to(device)
        labels = labels.float().unsqueeze(1).to(device)
        home_sequences = home_sequences.to(device)
        away_sequences = away_sequences.to(device)

        # Forward pass
        outputs = model(
            tournament_ids,
            home_sequences,
            away_sequences
        )
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate loss multiplied by batch size
        running_loss += loss.item() * labels.size(0)

        # Predictions and accuracy
        probs = torch.sigmoid(outputs)
        predicted_train = (probs >= 0.5).float()

        # Convert to int before comparison
        correct_train += (predicted_train.int() == labels.int()).sum().item()
        total_train += labels.size(0)

    avg_loss = running_loss / total_train
    train_accuracy = 100 * correct_train / total_train

    train_losses.append(avg_loss)
    train_accuracies.append(train_accuracy)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_loss:.4f}, Training Accuracy: {train_accuracy:.2f}%')

    # Validation phase
    model.eval()
    val_loss = 0.0
    correct_val = 0
    total_val = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for (tournament_ids, labels, home_sequences, away_sequences) in test_loader:
            # Move data to device
            tournament_ids = tournament_ids.to(device)
            labels = labels.float().unsqueeze(1).to(device)
            home_sequences = home_sequences.to(device)
            away_sequences = away_sequences.to(device)

            # Forward pass
            outputs = model(
                tournament_ids,
                home_sequences,
                away_sequences
            )
            loss = criterion(outputs, labels)

            # Accumulate loss multiplied by batch size
            val_loss += loss.item() * labels.size(0)

            # Predictions and accuracy
            probs = torch.sigmoid(outputs)
            predicted_val = (probs >= 0.5).float()

            # Store predictions and labels, flatten arrays
            all_preds.extend(predicted_val.cpu().numpy().flatten())
            all_labels.extend(labels.cpu().numpy().flatten())

            # Convert to int before comparison
            correct_val += (predicted_val.int() == labels.int()).sum().item()
            total_val += labels.size(0)

    avg_val_loss = val_loss / total_val
    val_accuracy = 100 * correct_val / total_val

    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

    # Convert predictions and labels to integer type
    all_preds_array = np.array(all_preds).astype(int)
    all_labels_array = np.array(all_labels).astype(int)

    # Compute additional metrics
    f1 = f1_score(all_labels_array, all_preds_array, average='binary')
    precision = precision_score(all_labels_array, all_preds_array)
    recall = recall_score(all_labels_array, all_preds_array)
    print(f'Validation F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

    # Adjust learning rate
    scheduler.step()

# Plot training and validation losses
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_losses, label='Training Loss')
plt.plot(range(1, num_epochs + 1), val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Loss over Epochs')
plt.legend()
plt.show()

# Plot training and validation accuracies
plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), train_accuracies, label='Training Accuracy')
plt.plot(range(1, num_epochs + 1), val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Accuracy over Epochs')
plt.legend()
plt.show()


In [None]:
# Validation phase for binary classification
model.eval()
val_loss = 0.0
correct_val = 0
total_val = 0
all_preds = []
all_labels = []
all_logits = []  # To store raw model logits for ROC AUC
tournament_results = {}  # To store results for each tournament

with torch.no_grad():
    for numerical_inputs, prob_matrix, home_ids, away_ids, tournament_ids, labels in test_loader:
        # Move data to device
        numerical_inputs = numerical_inputs.to(device)
        prob_matrix = prob_matrix.to(device)
        home_ids = home_ids.to(device)
        away_ids = away_ids.to(device)
        tournament_ids = tournament_ids.to(device)
        
        # Convert labels to float and reshape to (batch_size, 1)
        labels = labels.float().unsqueeze(1).to(device)  # Shape: (batch_size, 1)

        # Forward pass
        outputs = model(numerical_inputs, prob_matrix, tournament_ids, home_ids, away_ids)  # Shape: [batch_size, 1]
        loss = criterion(outputs, labels)  # Binary classification loss
        val_loss += loss.item()

        # Apply sigmoid to get probabilities
        probs = torch.sigmoid(outputs)  # Shape: (batch_size, 1)
        predicted_val = (probs >= 0.5).float()  # Threshold at 0.5

        # Store logits for ROC AUC calculation
        all_logits.extend(outputs.cpu().numpy())  # Raw logits

        # Store predictions and labels for overall metrics
        all_preds.extend(predicted_val.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # Store predictions and labels for each tournament
        for t_id, pred, true_label in zip(
            tournament_ids.cpu().numpy(),
            predicted_val.cpu().numpy().flatten(),
            labels.cpu().numpy().flatten()
        ):
            if t_id not in tournament_results:
                tournament_results[t_id] = {'preds': [], 'labels': []}
            tournament_results[t_id]['preds'].append(pred)
            tournament_results[t_id]['labels'].append(true_label)

        # Compute correct predictions
        correct_val += (predicted_val == labels).sum().item()
        total_val += labels.size(0)

avg_val_loss = val_loss / len(test_loader)
val_accuracy = 100 * correct_val / total_val

val_losses.append(avg_val_loss)
val_accuracies.append(val_accuracy)

print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')

# Tournament-specific accuracies
tournament_accuracies = {}
tournament_prediction_counts = {}

for t_id, results in tournament_results.items():
    preds = np.array(results['preds'])
    labels = np.array(results['labels'])
    correct_predictions = (preds == labels).sum()
    total_predictions = len(labels)
    accuracy = 100 * correct_predictions / total_predictions

    # Get the original tournament name using the mapping
    tournament_name = tournament_mapping.get(t_id, f"Unknown Tournament ({t_id})")

    # Store accuracy and count of predictions
    tournament_accuracies[tournament_name] = accuracy
    tournament_prediction_counts[tournament_name] = total_predictions

    print(f"Tournament: {tournament_name}, Accuracy: {accuracy:.2f}%, Predictions: {total_predictions}")

# Compute additional metrics
all_preds_array = np.array(all_preds).flatten()
all_labels_array = np.array(all_labels).flatten()
all_logits_array = np.array(all_logits).flatten()  # Shape: [num_samples]

# Apply sigmoid to logits to get probabilities
all_probs_array = torch.sigmoid(torch.tensor(all_logits_array)).numpy()

# Compute ROC AUC if needed
from sklearn.metrics import roc_auc_score

roc_auc = roc_auc_score(all_labels_array, all_probs_array)
print(f'Validation ROC AUC: {roc_auc:.4f}')


In [None]:
# Sort tournament accuracies by accuracy in descending order
sorted_tournament_stats = sorted(tournament_accuracies.items(), key=lambda x: x[1], reverse=True)

print("\nTournament Performance (Sorted by Accuracy):")
print(f"{'Tournament':<30} {'Accuracy (%)':<15} {'Predictions':<10}")
print("="*60)

for tournament_name, accuracy in sorted_tournament_stats:
    total_predictions = tournament_prediction_counts[tournament_name]
    print(f"{tournament_name:<30} {accuracy:<15.2f} {total_predictions:<10}")

# Compute and print overall metrics
print("\nOverall Metrics:")
print(f'Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%')
print(f'ROC AUC Score: {roc_auc:.4f}, F1 Score: {f1:.4f}')


In [None]:
import matplotlib.pyplot as plt

# Sort the data by accuracy
sorted_accuracies = sorted(tournament_accuracies.items(), key=lambda x: x[1], reverse=True)
sorted_counts = {tournament: tournament_prediction_counts[tournament] for tournament, _ in sorted_accuracies}

# Plot
plt.figure(figsize=(12, 6))
bars = plt.barh(
    [tournament for tournament, _ in sorted_accuracies],
    [accuracy for _, accuracy in sorted_accuracies]
)

# Add counts as text annotations on the bars
for bar, (tournament, accuracy) in zip(bars, sorted_accuracies):
    count = sorted_counts[tournament]
    plt.text(
        bar.get_width() + 0.5,  # Offset slightly to the right of the bar
        bar.get_y() + bar.get_height() / 2,  # Vertically centered on the bar
        f'{count} preds',
        va='center'
    )

# Label axes and title
plt.xlabel('Accuracy (%)')
plt.ylabel('Tournament')
plt.title('Accuracy by Tournament with Prediction Counts')
plt.xlim(0, 100)  # Ensure the x-axis is properly scaled
plt.show()


In [None]:
# Plotting losses and accuracies
plt.figure(figsize=(12, 5))

# Plot training and validation loss
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# Plot training and validation accuracy
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Training Accuracy')
plt.plot(val_accuracies, label='Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Training and Validation Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# Plot confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Home Win', 'Away Win'], 
            yticklabels=['Home Win', 'Away Win'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# After collecting all_preds and all_labels
auc_score = roc_auc_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print(f"AUC Score: {auc_score:.4f}, F1 Score: {f1:.4f}")


In [None]:
# Validation phase
model.eval()
val_loss = 0.0
correct_val = 0
total_val = 0
all_preds = []
all_labels = []

with torch.no_grad():
    for numerical_inputs, prob_matrix, home_ids, away_ids, tournament_ids, labels in test_loader:
        # Move data to device
        numerical_inputs = numerical_inputs.to(device)
        prob_matrix = prob_matrix.to(device)
        home_ids = home_ids.to(device)
        away_ids = away_ids.to(device)
        tournament_ids = tournament_ids.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(numerical_inputs, prob_matrix, tournament_ids, home_ids, away_ids)
        
        # Ensure outputs and labels have matching shapes
        outputs_flat = outputs.squeeze(dim=1)
        loss = criterion(outputs_flat, labels.float())
        val_loss += loss.item()

        # Predictions
        predicted_val = (torch.sigmoid(outputs_flat) > 0.5).long()

        # Store predictions and labels
        all_preds.extend(predicted_val.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        correct_val += (predicted_val == labels).sum().item()
        total_val += labels.size(0)


In [None]:
num_epochs = 30
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    for inputs, labels in train_loader:
        labels = labels.float()  # Ensure labels are float for BCEWithLogitsLoss
        
        # Forward pass
        outputs = model(inputs).squeeze(1)  # Squeeze outputs to match target size [batch_size]
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        
        # Calculate predictions for accuracy
        predicted = (torch.sigmoid(outputs) > 0.5).long()  # Apply sigmoid and threshold at 0.5
        total += labels.size(0)
        correct += (predicted == labels.long()).sum().item()

    # Calculate average loss and accuracy
    avg_loss = running_loss / len(train_loader)
    accuracy = 100 * correct / total
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Training Accuracy: {accuracy:.2f}%')


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_tensors = torch.tensor(scaler.fit_transform(numerical_features_tensor), dtype=torch.float32)


In [None]:
model.eval()  # Set the model to evaluation mode
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():  # Disable gradient calculation for evaluation
    for numerical_inputs, tournament_ids, labels in test_loader:  # Unpack all 3 tensors
        labels = labels.float()  # Ensure labels are float for BCEWithLogitsLoss
        
        # Forward pass with numerical inputs and tournament IDs
        outputs = model(numerical_inputs, tournament_ids).squeeze(1)  # Squeeze outputs to match [batch_size]
        loss = criterion(outputs, labels)
        
        test_loss += loss.item()
        
        # Predictions: Apply sigmoid and threshold at 0.5 for binary classification
        predicted = (torch.sigmoid(outputs) > 0.5).long()
        total += labels.size(0)
        correct += (predicted == labels.long()).sum().item()

# Calculate test loss and accuracy
avg_test_loss = test_loss / len(test_loader)
test_accuracy = 100 * correct / total

print(f'Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')
