In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re

# Read the match-details CSV into the working DataFrame
df = pd.read_csv("23_24_match_details.csv", sep=",")

# Echo the loaded column names so the later column lookups can be checked by eye
print("Columns in the DataFrame:", list(df.columns), sep="\n")

# Alias table: masked label -> every spelling of that club to be recognised.
team_variations = dict(
    Team_1=["Arsenal"],
    Team_2=["Aston Villa"],
    Team_3=["Bournemouth"],
    Team_4=["Brentford"],
    Team_5=["Brighton & Hove Albion", "Brighton"],
    Team_6=["Burnley"],
    Team_7=["Chelsea"],
    Team_8=["Crystal Palace"],
    Team_9=["Everton"],
    Team_10=["Fulham"],
    Team_11=["Liverpool"],
    Team_12=["Luton Town"],
    Team_13=["Manchester City", "Man City"],
    Team_14=["Manchester United"],
    Team_15=["Newcastle United", "Newcastle"],
    Team_16=["Nottingham Forest"],
    Team_17=["Sheffield United"],
    Team_18=["Tottenham Hotspur", "Tottenham"],
    Team_19=["West Ham United", "West Ham"],
    Team_20=["Wolves"],
)

# Invert the alias table: lower-cased club name -> masked label, so lookups
# can ignore the capitalisation used in the data.
mask_mapping = {
    alias.lower(): label
    for label, aliases in team_variations.items()
    for alias in aliases
}
print("\nMask Mapping Dictionary (Lowercased):")
print(mask_mapping)

# Function to replace team names in text with their masked counterparts
def mask_teams(text, mapping):
    """Replace every team name found in ``text`` with its masked label.

    Matching is case-insensitive and anchored on word boundaries. A trailing
    possessive ``'s`` is consumed together with the name, so "Arsenal's"
    becomes just the label. The text is scanned once, left to right; at any
    position the longest name is tried first, and text that has already been
    replaced is never scanned again.

    Args:
        text: The string to mask. Null values (``None`` / ``NaN``) are
            returned unchanged.
        mapping: Dict of team name -> replacement label.

    Returns:
        The masked string, or ``text`` itself when it is null or ``mapping``
        is empty.
    """
    if pd.isnull(text):
        return text
    if not mapping:
        # An empty alternation would match the empty string at every boundary.
        return text

    # Longest names first so "Newcastle United" is preferred over "Newcastle".
    names = sorted(mapping.keys(), key=len, reverse=True)

    # One capturing group per name: match.lastindex then identifies which name
    # matched without having to re-normalise the matched text.
    alternatives = "|".join("(" + re.escape(name) + ")" for name in names)
    pattern = re.compile(r"\b(?:" + alternatives + r")(?:'s)?\b", flags=re.IGNORECASE)

    # A function replacement inserts the label literally; a plain string would
    # be treated as a template and mis-handle backslashes or group references.
    return pattern.sub(lambda match: mapping[names[match.lastindex - 1]], text)

# Mask the 'Home' and 'Away' team columns.
# Names are stripped and lower-cased before the lookup so values with stray
# whitespace or different capitalisation still match a key of mask_mapping.
for side in ("Home", "Away"):
    masked_col = f"{side}_Masked"
    df[masked_col] = df[side].str.strip().str.lower().map(mask_mapping)

    # Series.map yields NaN for names that are not keys of mask_mapping;
    # report them instead of letting them disappear silently.
    unmapped = df.loc[df[masked_col].isna() & df[side].notna(), side].unique()
    if len(unmapped) > 0:
        print(f"Warning: no mask defined for {side} team name(s): {list(unmapped)}")

# Mask team names inside the free-text columns. When a source column is
# absent the masked column is still created, filled with NaN.
for source_col, masked_text_col in (("events", "text_Masked"), ("summary", "summary_Masked")):
    if source_col in df.columns:
        df[masked_text_col] = df[source_col].apply(mask_teams, args=(mask_mapping,))
    else:
        df[masked_text_col] = np.nan

# Define functions for labeling specific scenarios in the 'events' text
def label_red_card_scenario(events):
    """Classify a match that featured a red card by whether goals are mentioned.

    The check is a case-insensitive substring search on the event text.

    Args:
        events: Event text for one match. Non-string values (for example the
            ``NaN`` pandas uses for a missing cell) are treated as having no
            events.

    Returns:
        ``"Red Card Comeback"`` when the text mentions both "red card" and
        "goal", ``"Red Card Shutout"`` when it mentions "red card" but not
        "goal", otherwise ``None``.
    """
    if not isinstance(events, str):
        # Missing cells arrive as float NaN, which has no .lower().
        return None

    events = events.lower()
    if "red card" not in events:
        return None
    if "goal" in events:
        return "Red Card Comeback"
    # The previous condition `"shutout" in events or "no goal"` was always
    # true because a non-empty string literal is truthy, so every red-card
    # match without the word "goal" was labelled a shutout. That outcome is
    # kept, but stated explicitly.
    return "Red Card Shutout"

def label_goal_thriller(events, min_goals=5):
    """Flag matches whose event text mentions "goal" many times.

    Occurrences are counted as case-insensitive, non-overlapping substrings,
    so words that merely contain "goal" (e.g. "goalkeeper") count as well.

    Args:
        events: Event text for one match. Non-string values (for example the
            ``NaN`` pandas uses for a missing cell) yield ``None``.
        min_goals: Minimum number of "goal" occurrences required for the
            label. Defaults to 5.

    Returns:
        ``"Goal-Filled Thriller"`` when the count reaches ``min_goals``,
        otherwise ``None``.
    """
    if not isinstance(events, str):
        # Missing cells arrive as float NaN, which has no .lower().
        return None

    goal_mentions = events.lower().count("goal")
    if goal_mentions >= min_goals:
        return "Goal-Filled Thriller"
    return None

def label_last_minute_goal(events):
    """Flag matches whose event text mentions a goal and a late-game marker.

    The check is a case-insensitive substring search: the text must contain
    "goal" and also either "90'" or "extra time". The two markers are not
    required to refer to the same event.

    Args:
        events: Event text for one match. Non-string values (for example the
            ``NaN`` pandas uses for a missing cell) yield ``None``.

    Returns:
        ``"Last-Minute Goal"`` when both conditions hold, otherwise ``None``.
    """
    if not isinstance(events, str):
        # `"goal" in <float NaN>` would raise TypeError.
        return None

    # Lower-case once so "Goal" / "Extra Time" match like in the other labelers.
    events = events.lower()
    if "goal" in events and ("90'" in events or "extra time" in events):
        return "Last-Minute Goal"
    return None

def label_comeback(events):
    """Classify how a comeback mentioned in the event text ended.

    All checks are case-insensitive substring searches, so words that merely
    contain "win" or "draw" also satisfy them.

    Args:
        events: Event text for one match. Non-string values (for example the
            ``NaN`` pandas uses for a missing cell) yield ``None``.

    Returns:
        ``None`` when the text does not mention "comeback"; otherwise
        ``"Late Comeback Victory"`` if it mentions "win",
        ``"Comeback to Draw"`` if it mentions "draw", and
        ``"Failed Comeback"`` if it mentions neither.
    """
    if not isinstance(events, str):
        # Missing cells arrive as float NaN, which has no .lower().
        return None

    # Lower-case once so every keyword is matched with the same case rules.
    events = events.lower()
    if "comeback" not in events:
        return None
    if "win" in events:
        return "Late Comeback Victory"
    if "draw" in events:
        return "Comeback to Draw"
    return "Failed Comeback"

# Derive one label column per scenario from the raw 'events' text.
# Each pair is (new column name, labeling function applied to every cell).
scenario_labelers = (
    ("Red_Card_Scenario", label_red_card_scenario),
    ("Goal_Thriller", label_goal_thriller),
    ("Last_Minute_Goal", label_last_minute_goal),
    ("Comeback", label_comeback),
)
for column_name, labeler in scenario_labelers:
    df[column_name] = df["events"].apply(labeler)

# Define an overall excitement level based on the scenario-specific labels
def overall_excitement(row):
    """Combine the per-scenario labels of one match into a single rating.

    The strongest tier is tested first so that a match qualifying for both
    tiers (for example a goal-filled thriller that also had a last-minute
    goal) is rated "Very Exciting" rather than being downgraded.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the keys
            'Red_Card_Scenario', 'Goal_Thriller', 'Last_Minute_Goal' and
            'Comeback'.

    Returns:
        ``"Very Exciting"`` for a goal-filled thriller or a comeback that
        ended in a win or draw; ``"Exciting"`` for a red-card comeback or a
        last-minute goal; ``"Normal"`` otherwise.
    """
    is_very_exciting = (
        row['Goal_Thriller'] == "Goal-Filled Thriller"
        or row['Comeback'] in ("Late Comeback Victory", "Comeback to Draw")
    )
    if is_very_exciting:
        return "Very Exciting"

    is_exciting = (
        row['Red_Card_Scenario'] == "Red Card Comeback"
        or row['Last_Minute_Goal'] == "Last-Minute Goal"
    )
    if is_exciting:
        return "Exciting"

    return "Normal"

# Rate every match by passing each row to overall_excitement
df["Enhanced_Label"] = df.apply(overall_excitement, axis=1)

# Preview the masked teams next to their rating
preview_columns = ["id", "Home_Masked", "Away_Masked", "Enhanced_Label"]
print("\nEnhanced Labeled Data:")
print(df.loc[:, preview_columns].head())

# Number of matches per rating (null labels excluded)
summary_counts = df["Enhanced_Label"].value_counts(dropna=True)
print("\nSummary Counts:", summary_counts, sep="\n")

# Bar chart: number of matches per enhanced label
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="Enhanced_Label", palette="viridis", ax=ax)
ax.set_title("Count of Matches by Enhanced Label")
ax.set_xlabel("Match Label")
ax.set_ylabel("Number of Matches")
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()

# Pie chart: share of matches per enhanced label
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(
    summary_counts,
    labels=summary_counts.index,
    autopct="%1.1f%%",
    colors=sns.color_palette("viridis"),
)
ax.set_title("Distribution of Enhanced Match Labels")
ax.axis("equal")  # equal aspect ratio keeps the pie circular
plt.show()

# Define save paths
save_path = "enhanced_processed_data.tsv"
columns_to_save = [
    "id", "Home_Masked", "Away_Masked", "Date", "Stadium", "Attendance", "Referee", "Enhanced_Label"
]

# Publish the masked text columns under their final names.
# Any existing column that already carries the final name is dropped first:
# renaming 'summary_Masked' to 'summary' while the raw 'summary' column is
# still present would leave two columns called 'summary', and selecting
# "summary" would then write the unmasked text to the output as well.
for masked_col, output_col in (("text_Masked", "text"), ("summary_Masked", "summary")):
    if masked_col not in df.columns:
        continue
    df = df.drop(columns=[output_col], errors="ignore").rename(columns={masked_col: output_col})
    columns_to_save.append(output_col)

# Save enhanced processed data (tab-separated, without the index)
result_table = df[columns_to_save]
result_table.to_csv(save_path, sep="\t", index=False)
print(f"\nEnhanced processed data saved to {save_path}")

# Define paths for minimal examples
minimal_example_path = "lab7_minimal_example.tsv"
minimal_example_path2 = "lab7_minimal_example2.tsv"

# Save one real match per enhanced label.
# GroupBy.first()/last() return the first/last non-null value of each column
# separately, so a row built that way can mix values from different matches
# whenever a cell is missing. drop_duplicates keeps whole rows instead.
# Sorting by label keeps the rows in label order.
minimal_example = (
    df.drop_duplicates(subset="Enhanced_Label", keep="first")
    .sort_values("Enhanced_Label")[columns_to_save]
)
minimal_example.to_csv(minimal_example_path, sep="\t", index=False)

# Same selection, taking the last match of each label instead of the first
minimal_example2 = (
    df.drop_duplicates(subset="Enhanced_Label", keep="last")
    .sort_values("Enhanced_Label")[columns_to_save]
)
minimal_example2.to_csv(minimal_example_path2, sep="\t", index=False)

print(f"Minimal examples saved as '{minimal_example_path}' and '{minimal_example_path2}'")


Columns in the DataFrame:
['id', 'Home', 'Away', 'Date', 'Stadium', 'Attendance', 'Referee', 'events', 'summary']

Mask Mapping Dictionary (Lowercased):
{'arsenal': 'Team_1', 'aston villa': 'Team_2', 'bournemouth': 'Team_3', 'brentford': 'Team_4', 'brighton & hove albion': 'Team_5', 'brighton': 'Team_5', 'burnley': 'Team_6', 'chelsea': 'Team_7', 'crystal palace': 'Team_8', 'everton': 'Team_9', 'fulham': 'Team_10', 'liverpool': 'Team_11', 'luton town': 'Team_12', 'manchester city': 'Team_13', 'man city': 'Team_13', 'manchester united': 'Team_14', 'newcastle united': 'Team_15', 'newcastle': 'Team_15', 'nottingham forest': 'Team_16', 'sheffield united': 'Team_17', 'tottenham hotspur': 'Team_18', 'tottenham': 'Team_18', 'west ham united': 'Team_19', 'west ham': 'Team_19', 'wolves': 'Team_20'}


AttributeError: 'float' object has no attribute 'lower'