In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
from sklearn.model_selection import train_test_split

# Read the comma-separated match-details file into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="23_24_match_details.csv",
    sep=",",
)

# Masked label -> every spelling of that team that should be recognised.
team_variations = {
    "Team_1": ["Arsenal"],
    "Team_2": ["Aston Villa"],
    "Team_3": ["Bournemouth"],
    "Team_4": ["Brentford"],
    "Team_5": ["Brighton & Hove Albion", "Brighton"],
    "Team_6": ["Burnley"],
    "Team_7": ["Chelsea"],
    "Team_8": ["Crystal Palace"],
    "Team_9": ["Everton"],
    "Team_10": ["Fulham"],
    "Team_11": ["Liverpool"],
    "Team_12": ["Luton Town"],
    "Team_13": ["Manchester City", "Man City"],
    "Team_14": ["Manchester United"],
    "Team_15": ["Newcastle United", "Newcastle"],
    "Team_16": ["Nottingham Forest"],
    "Team_17": ["Sheffield United"],
    "Team_18": ["Tottenham Hotspur", "Tottenham"],
    "Team_19": ["West Ham United", "West Ham"],
    "Team_20": ["Wolves"],
}

# Inverted lookup {lower-cased spelling: masked label}; lower-casing the keys
# lets callers match case-insensitively by lower-casing their input first.
mask_mapping = {
    spelling.lower(): label
    for label, spellings in team_variations.items()
    for spelling in spellings
}

# Function to replace team names in text with their masked counterparts
def mask_teams(text, mapping):
    """Replace every known team name in *text* with its masked label.

    Args:
        text: The string to mask. Null values (as judged by ``pd.isnull``)
            are returned unchanged.
        mapping: Dict of team-name spelling -> replacement label.

    Returns:
        The text with each whole-word, case-insensitive occurrence of a
        mapped name replaced by its label. A directly trailing ``'s`` is
        consumed by the match, so it is dropped from the result.
    """
    if pd.isnull(text):
        return text

    # Longest spellings first, so a full name such as "brighton & hove albion"
    # is substituted before its shorter alias "brighton" gets a chance.
    names_longest_first = sorted(mapping, key=len, reverse=True)

    masked = text
    for name in names_longest_first:
        name_re = re.compile(
            rf"\b{re.escape(name)}(?:'s)?\b",
            flags=re.IGNORECASE,
        )
        masked = name_re.sub(mapping[name], masked)
    return masked

# Translate the 'Home' and 'Away' team names into masked labels via an exact,
# lower-cased lookup in mask_mapping; names without an entry become NaN.
# A missing source column yields an all-NaN masked column.
for side in ("Home", "Away"):
    masked_col = f"{side}_Masked"
    if side not in df.columns:
        df[masked_col] = np.nan
        continue
    df[masked_col] = df[side].str.lower().map(mask_mapping)

# Mask team names inside the free-text 'events' column (only that column is
# processed here). Without an 'events' column, 'text_Masked' is all NaN.
if "events" not in df.columns:
    df["text_Masked"] = np.nan
else:
    df["text_Masked"] = df["events"].apply(mask_teams, args=(mask_mapping,))

# Define binary event functions
def red_card_followed_by_goal(text):
    """Flag text in which "red card" is later followed by "goal".

    The search is case-insensitive and, because of ``re.DOTALL``, the two
    phrases may sit on different lines.

    Args:
        text: Event text; null values (per ``pd.isnull``) count as no match.

    Returns:
        1 when the pattern is found, otherwise 0.
    """
    if pd.isnull(text):
        return 0
    hit = re.search(r"red card.*?goal", text, flags=re.IGNORECASE | re.DOTALL)
    return 0 if hit is None else 1

def last_minute_goal(text):
    """Flag text that mentions a goal followed by a 90th-minute-or-later marker.

    A match needs "goal" and, later on the same line, one of: ``90'``, a
    stoppage-time stamp such as ``90+3'``, or the phrase "extra time".
    Matching is case-insensitive.

    Args:
        text: Event text; null values (per ``pd.isnull``) count as no match.

    Returns:
        1 when the pattern is found, otherwise 0.
    """
    if pd.isnull(text):
        return 0
    # "(?:\+\d+)?" admits added-time stamps ("90+3'"), which a literal "90'"
    # would miss; a bare "90'" still matches. re.DOTALL is deliberately not
    # used: "." stops at newlines, so the goal and its time marker must share
    # one line.
    pattern = r"goal.*?(90(?:\+\d+)?'|extra time)"
    return 1 if re.search(pattern, text, re.IGNORECASE) else 0

def comeback_attempt(text):
    """Flag text in which "down" is later followed by "goal".

    This is a plain textual heuristic: any case-insensitive occurrence of
    "down" with a "goal" somewhere after it (line breaks allowed) counts.

    Args:
        text: Event text; null values (per ``pd.isnull``) count as no match.

    Returns:
        1 when the pattern is found, otherwise 0.
    """
    if pd.isnull(text):
        return 0
    search_flags = re.IGNORECASE | re.DOTALL
    return int(re.search(r"down.*?goal", text, search_flags) is not None)

def var_review_red_card(text):
    """Flag text in which "VAR review" is later followed by "red card".

    Matching is case-insensitive and may span line breaks.

    Args:
        text: Event text; null values (per ``pd.isnull``) count as no match.

    Returns:
        1 when the pattern is found, otherwise 0.
    """
    if pd.isnull(text):
        return 0
    matched = bool(
        re.search(r"VAR review.*?red card", text, re.IGNORECASE | re.DOTALL)
    )
    return int(matched)

def hat_trick(text):
    """Flag text that mentions a hat-trick.

    Recognises "hat-trick", "hat trick" and "hattrick" (any letter case,
    plural forms included).

    Args:
        text: Event text; null values (per ``pd.isnull``) count as no match.

    Returns:
        1 when a hat-trick mention is found, otherwise 0.
    """
    if pd.isnull(text):
        return 0
    # The leading word boundary keeps phrases like "that trick" from matching;
    # the optional separator also accepts the unseparated spelling "hattrick".
    return 1 if re.search(r"\bhat[- ]?trick", text, re.IGNORECASE) else 0

def penalty_drama(text):
    """Flag text in which "penalty" is followed by "missed", "saved" or "contested".

    Matching is case-insensitive. ``re.DOTALL`` is not set, so the keyword
    must appear on the same line as "penalty".

    Args:
        text: Event text; null values (per ``pd.isnull``) count as no match.

    Returns:
        1 when the pattern is found, otherwise 0.
    """
    if not pd.isnull(text) and re.search(
        r"penalty.*?(missed|saved|contested)", text, re.IGNORECASE
    ):
        return 1
    return 0

# Binary event flags and their sum.
# "text_Masked" is always present at this point (the masking step earlier in
# this file creates it unconditionally, as all-NaN when 'events' is missing),
# and every detector returns 0 for null text, so no column-existence fallback
# is needed here.
event_detectors = {
    "Red_Card_Followed_By_Goal": red_card_followed_by_goal,
    "Last_Minute_Goal": last_minute_goal,
    "Comeback_Attempt": comeback_attempt,
    "VAR_Review_Red_Card": var_review_red_card,
    "Hat_Trick": hat_trick,
    "Penalty_Drama": penalty_drama,
}
for event_column, detector in event_detectors.items():
    df[event_column] = df["text_Masked"].apply(detector)

# Overall excitement score: number of event flags raised for the match.
# Summing over the same column list keeps the score in step with the flags.
df["Excitement_Score"] = df[list(event_detectors)].sum(axis=1)

# Label excitement level based on score threshold
def label_excitement(score, threshold=2):
    """Classify an excitement score as "Exciting" or "Normal".

    Args:
        score: Numeric excitement score for one match.
        threshold: Smallest score that is labelled "Exciting". Defaults to 2,
            so single-argument calls (e.g. via ``Series.apply``) behave as
            before.

    Returns:
        "Exciting" when ``score >= threshold``, otherwise "Normal". A NaN
        score compares false and is therefore labelled "Normal".
    """
    return "Exciting" if score >= threshold else "Normal"

# Turn each numeric score into its categorical excitement label.
df["Excitement_Label"] = df["Excitement_Score"].map(label_excitement)

# Switch to the column names used for tokenization and processing.
df = df.rename(
    columns={
        "id": "textid",
        "text_Masked": "text",
        "Excitement_Label": "label",
    }
)

# Force every text entry to a string (nulls become ""), then wrap each one in
# a single-element list to obtain the nested-list format.
text_as_str = df["text"].fillna("").astype(str)
df["text"] = text_as_str.map(lambda doc: [doc])

# First cut: 80% training, 20% held out, stratified on the label.
train_data, temp_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)

# Second cut: halve the held-out part into validation and test (10% each of
# the full data), again stratified on the label.
validation_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
    stratify=temp_data["label"],
)

# Report the label proportions of every split.
print("\nDistribution of Excitement Levels:")
for heading, split in (
    ("Training Set:", train_data),
    ("\nValidation Set:", validation_data),
    ("\nTest Set:", test_data),
):
    print(heading)
    print(split["label"].value_counts(normalize=True))

# Write each split to its own tab-separated file, without the index.
# NOTE(review): every column of df is written, so the original 'Home'/'Away'/
# 'events' columns (when present) and the binary event columns the label is
# derived from are saved next to the masked text -- confirm that downstream
# training only reads 'text' and 'label'.
for split, path in (
    (train_data, "train_data.tsv"),
    (validation_data, "validation_data.tsv"),
    (test_data, "test_data.tsv"),
):
    split.to_csv(path, sep="\t", index=False)

print(
    "\n".join(
        (
            "\nData has been split and saved into the following files:",
            "1. Training Data: train_data.tsv",
            "2. Validation Data: validation_data.tsv",
            "3. Test Data: test_data.tsv",
        )
    )
)


Distribution of Excitement Levels:
Training Set:
label
Normal      0.640496
Exciting    0.359504
Name: proportion, dtype: float64

Validation Set:
label
Normal      0.633333
Exciting    0.366667
Name: proportion, dtype: float64

Test Set:
label
Normal      0.645161
Exciting    0.354839
Name: proportion, dtype: float64

Data has been split and saved into the following files:
1. Training Data: train_data.tsv
2. Validation Data: validation_data.tsv
3. Test Data: test_data.tsv
