In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

# Match-level records for the 2023/24 season
df = pd.read_csv("23_24_match_details.csv", sep=",")

# Season-by-season results; the code below reads the 'team',
# 'season_end_year', 'points', 'played' and 'notes' columns
achievement_df = pd.read_csv("historical_team_achievements.csv")

# Half-life (in years) of the exponential recency weighting
half_life = 5

# Points Per Game (PPG) for each recorded team/season
achievement_df['PPG'] = achievement_df['points'] / achievement_df['played']

# Age of each season relative to 2024 and its decay weight: a season that
# is `half_life` years old counts half as much as one ending in 2024
achievement_df['years_ago'] = 2024 - achievement_df['season_end_year']
achievement_df['weight'] = 0.5 ** (achievement_df['years_ago'] / half_life)

# Full team x season grid covering season end years 1993 through 2024
teams = achievement_df['team'].unique()
all_seasons = pd.DataFrame({'season_end_year': range(1993, 2025)})
team_season_index = pd.MultiIndex.from_product(
    [teams, all_seasons['season_end_year']],
    names=['team', 'season_end_year'],
)
all_teams_seasons = team_season_index.to_frame(index=False)

# Left-join the recorded seasons onto the grid; team/season pairs with no
# record keep NaN in 'PPG', 'notes' and 'weight'
recorded_seasons = achievement_df[['team', 'season_end_year', 'PPG', 'notes', 'weight']]
achievement_df_full = all_teams_seasons.merge(
    recorded_seasons,
    on=['team', 'season_end_year'],
    how='left',
)

# Rebuild 'years_ago' and 'weight' so that grid rows without a record are
# weighted as well
achievement_df_full['years_ago'] = 2024 - achievement_df_full['season_end_year']
achievement_df_full['weight'] = 0.5 ** (achievement_df_full['years_ago'] / half_life)

# Seasons without a record (presumably seasons the team did not take part
# in) count as PPG 0
achievement_df_full['PPG'] = achievement_df_full['PPG'].fillna(0)

# Empty notes so the relegation check can use a plain substring test
achievement_df_full['notes'] = achievement_df_full['notes'].fillna('')

def adjust_PPG(row):
    """Return the row's PPG, zeroed out for seasons marked as relegated.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing a
            string under 'notes' and a value under 'PPG'.

    Returns:
        0 when the substring 'Relegated' (case-sensitive) occurs in
        row['notes']; otherwise row['PPG'] unchanged.
    """
    was_relegated = 'Relegated' in row['notes']
    # A relegation season is penalized by discarding its points entirely
    return 0 if was_relegated else row['PPG']

# Per-season PPG after the relegation penalty, scaled by the recency weight
achievement_df_full['PPG_adj'] = achievement_df_full.apply(adjust_PPG, axis=1)
achievement_df_full['weighted_PPG'] = achievement_df_full['weight'] * achievement_df_full['PPG_adj']

# Achievement score per team: the weighted mean of the adjusted PPG, i.e.
# sum(weight * PPG_adj) / sum(weight) over all of the team's grid rows
seasons_by_team = achievement_df_full.groupby('team')
achievement_scores = pd.DataFrame({
    'achievement_score': seasons_by_team['weighted_PPG'].sum(),
    'total_weight': seasons_by_team['weight'].sum(),
})
achievement_scores['achievement_score'] /= achievement_scores['total_weight']

# Known spellings for every masked team label; the first entry of each list
# is the spelling used later as the team's canonical name
team_variations = {
    "Team_1": ["Arsenal"],
    "Team_2": ["Aston Villa"],
    "Team_3": ["Bournemouth"],
    "Team_4": ["Brentford"],
    "Team_5": ["Brighton & Hove Albion", "Brighton"],
    "Team_6": ["Burnley"],
    "Team_7": ["Chelsea"],
    "Team_8": ["Crystal Palace"],
    "Team_9": ["Everton"],
    "Team_10": ["Fulham"],
    "Team_11": ["Liverpool"],
    "Team_12": ["Luton Town"],
    "Team_13": ["Manchester City", "Man City"],
    "Team_14": ["Manchester United", "Manchester Utd"],
    "Team_15": ["Newcastle United", "Newcastle Utd", "Newcastle"],
    "Team_16": ["Nottingham Forest"],
    "Team_17": ["Sheffield United", "Sheffield Utd"],
    "Team_18": ["Tottenham Hotspur", "Tottenham"],
    "Team_19": ["West Ham United", "West Ham"],
    "Team_20": ["Wolves", "Wolverhampton Wanderers"],
}

# Reverse lookup: lowercased spelling -> masked label, so callers can match
# names case-insensitively by lowercasing their input first
mask_mapping = {
    spelling.lower(): label
    for label, spellings in team_variations.items()
    for spelling in spellings
}

# Function to replace team names in text with their masked counterparts
def mask_teams(text, mapping):
    """Replace every known team name in *text* with its masked label.

    Matching is case-insensitive and anchored on word boundaries; a
    possessive "'s" directly after a name is consumed by the replacement
    ("Arsenal's" becomes "Team_1").

    The text is scanned once with a single alternation pattern. Compared
    with substituting one name after another, this means a label written
    by one replacement is never re-scanned by another, and labels are
    inserted literally (backslashes in a label are not treated as regex
    template escapes).

    Args:
        text: The string to mask. Null values (None / NaN) are returned
            unchanged.
        mapping: Dict mapping team-name spellings to replacement labels.

    Returns:
        The masked string, or *text* itself when it is null or *mapping*
        is empty.
    """
    if pd.isnull(text):
        return text
    if not mapping:
        return text

    # Longest names first so that, at any position, "Newcastle United"
    # wins over its prefix "Newcastle".
    names = sorted(mapping.keys(), key=len, reverse=True)
    labels = [mapping[name] for name in names]

    # One capturing group per name: the group that matched identifies the
    # label without any case-folding lookup.
    alternatives = "|".join("(" + re.escape(name) + ")" for name in names)
    pattern = re.compile(
        r"\b(?:" + alternatives + r")(?:'s)?\b",
        flags=re.IGNORECASE,
    )
    return pattern.sub(lambda match: labels[match.lastindex - 1], text)

# Mask the 'Home' and 'Away' columns through a case-insensitive exact lookup.
# Series.map leaves NaN for any name that is not a key of mask_mapping.
df["Home_Masked"] = df["Home"].str.lower().map(mask_mapping)
df["Away_Masked"] = df["Away"].str.lower().map(mask_mapping)

# Report names that could not be masked instead of letting them pass
# silently: such rows end up without a team name and without a score.
unmapped_names = sorted(
    set(df.loc[df["Home_Masked"].isna(), "Home"].dropna())
    | set(df.loc[df["Away_Masked"].isna(), "Away"].dropna()),
    key=str,
)
if unmapped_names:
    print("Warning: team names missing from team_variations:", unmapped_names)

# Mask team mentions inside the free-text 'events' column
df["text_Masked"] = df["events"].apply(lambda x: mask_teams(x, mask_mapping))

# Canonical team name for each masked label: the first listed variation
masked_team_to_team_name = {
    team_label: names[0] for team_label, names in team_variations.items()
}

df['Home_Team'] = df['Home_Masked'].map(masked_team_to_team_name)
df['Away_Team'] = df['Away_Masked'].map(masked_team_to_team_name)

# Attach each side's achievement score by team-name lookup. A direct map
# keeps df's rows and index untouched and cannot collide with an existing
# 'achievement_score' column the way a merge followed by a rename can.
score_by_team = achievement_scores['achievement_score']
df['Home_Achievement_Score'] = df['Home_Team'].map(score_by_team)
df['Away_Achievement_Score'] = df['Away_Team'].map(score_by_team)

# Rows without a score compare as neither higher nor lower downstream, so
# make them visible here.
missing_score_rows = int(
    (df['Home_Achievement_Score'].isna() | df['Away_Achievement_Score'].isna()).sum()
)
if missing_score_rows:
    print("Warning: matches lacking an achievement score for at least one team:", missing_score_rows)

# Define labels based on which team has higher achievement score
def label_match(row):
    """Label a match by which side has the higher achievement score.

    Args:
        row: Mapping-like record providing 'Home_Achievement_Score' and
            'Away_Achievement_Score'.

    Returns:
        'Team1_Higher' when the home score is strictly greater,
        'Team2_Higher' when it is strictly smaller, and 'Equal' in every
        other case. Because both tests are strict comparisons, a NaN on
        either side also yields 'Equal'.
    """
    home_score = row['Home_Achievement_Score']
    away_score = row['Away_Achievement_Score']

    if home_score > away_score:
        return 'Team1_Higher'
    return 'Team2_Higher' if home_score < away_score else 'Equal'

# Label every match by comparing the two teams' achievement scores
df['Achievement_Label'] = df.apply(label_match, axis=1)

# Column names used by the tokenization/processing step that follows
df = df.rename(columns={"id": "textid", "text_Masked": "text", "Achievement_Label": "label"})

# Coerce the text to str (missing values become "") and wrap each value in
# a single-element list
df["text"] = df["text"].fillna("").astype(str).apply(lambda x: [x])

# 80% training, then the remaining 20% halved into validation and test;
# both splits are stratified on the label and seeded for reproducibility
train_data, temp_data = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["label"]
)
validation_data, test_data = train_test_split(
    temp_data, test_size=0.5, random_state=42, stratify=temp_data["label"]
)

# Relative label frequencies in each split
print("\nDistribution of Achievement Labels:")
for split_header, split_frame in (
    ("Training Set:", train_data),
    ("\nValidation Set:", validation_data),
    ("\nTest Set:", test_data),
):
    print(split_header)
    print(split_frame["label"].value_counts(normalize=True))

# Write each split to its own tab-separated file, without the index
for split_frame, split_path in (
    (train_data, "train_data.tsv"),
    (validation_data, "validation_data.tsv"),
    (test_data, "test_data.tsv"),
):
    split_frame.to_csv(split_path, sep="\t", index=False)

print("\nData has been split and saved into the following files:")
print("1. Training Data: train_data.tsv")
print("2. Validation Data: validation_data.tsv")
print("3. Test Data: test_data.tsv")


FileNotFoundError: [Errno 2] No such file or directory: '23_24_match_details.csv'