In [17]:
import pandas as pd
import json
from datetime import datetime, timedelta
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pickle
import math

# Configuration
DECAY_RATE = 0.03  # per-month rate used in exp(-rate * months) prestige decay
BASE_SEASON_START_MONTH = 8  # August

# Load data
features_df = pd.read_csv("Team_Match_Features.csv", parse_dates=["Match_Date"])
# JSON is UTF-8 by specification; pin the encoding so team names with
# non-ASCII characters decode the same on every platform instead of relying on
# the locale default.
with open("train.json", encoding="utf-8") as f:
    train_data = json.load(f)
mapping_df = pd.read_csv("auto_team_name_mapping.csv")

# Team mappings and constants
# Maps a team name as spelled in train.json to the name used for lookups in
# features_df; on duplicate "train_team" values the last row wins.
team_map = dict(zip(mapping_df["train_team"], mapping_df["Mapped_Team_Name"]))
# Team names are upper-cased once here so later equality filters can compare
# against the upper-cased fallback names produced by `team.upper()`.
features_df["Team_Name"] = features_df["Team_Name"].str.upper()

# Clubs flagged by the "*_is_top5" features (presumably teams from the top
# five European leagues -- confirm with the data owner).
# NOTE(review): these names are title-case, while team names elsewhere in this
# file are upper-cased before lookup, so `team in t5_teams` probably never
# matches. Confirm against auto_team_name_mapping.csv before changing it, and
# change every definition of this set together so training and prediction stay
# consistent.
t5_teams = {
    "Manchester City", "Manchester United", "Chelsea", "Arsenal", "Liverpool",
    "Tottenham Hotspur", "Newcastle United",
    "Real Madrid", "Barcelona", "Atletico Madrid", "Sevilla", "Valencia",
    "Villarreal", "Athletic Bilbao",
    "Bayern Munich", "Borussia Dortmund", "RB Leipzig", "Bayer Leverkusen",
    "Schalke 04", "Wolfsburg", "Eintracht Frankfurt",
    "Juventus", "Inter Milan", "Milan", "Napoli", "Roma", "Atalanta", "Lazio",
    "Paris Saint-Germain", "Monaco", "Lille", "Lyon", "Marseille",
}

# Starting prestige per (upper-cased) team name; teams not listed here are
# given a base of 1 by PrestigeCalculator.calculate_prestige.
base_prestige_map = {
    "REAL MADRID": 6,
    "BARCELONA": 5,
    "BAYERN MUNICH": 5,
    "JUVENTUS": 4,
    "MANCHESTER CITY": 5,
    "LIVERPOOL": 5,
    "CHELSEA": 4,
    "PARIS SAINT-GERMAIN": 4,
    "ATLETICO MADRID": 4,
    "AC MILAN": 4,
    "INTER MILAN": 4,
    "MANCHESTER UNITED": 4,
    "ARSENAL": 3,
    "BORUSSIA DORTMUND": 3,
    "PORTO": 2,
    "BENFICA": 2,
    "ROMA": 3,
    "NAPOLI": 3,
    "SEVILLA": 3,
    "TOTTENHAM HOTSPUR": 3,
    "MONACO": 2,
    "LYON": 2,
    "VILLARREAL": 2,
    "CELTIC": 1,
    "RANGERS": 1,
}

class PrestigeCalculator:
    """Accumulate team achievements and score prestige with exponential decay.

    A team's prestige is its base value plus the sum of its achievement
    bonuses, each scaled by ``exp(-rate * months_since_achievement)``. The
    total is capped at 10 for teams that have at least one recorded
    achievement; teams with no history get their base value unchanged.

    Args:
        base_prestige: Optional mapping of team name to base prestige. When
            omitted, the module-level ``base_prestige_map`` is looked up at
            call time.
        decay_rate: Optional per-month decay rate. When omitted, the
            module-level ``DECAY_RATE`` is looked up at call time.
    """

    # Class-level defaults so instances that were created without these
    # attributes (e.g. unpickled from an earlier run) still fall back to the
    # module-level settings.
    base_prestige = None
    decay_rate = None

    # Fixed bonuses for recognised achievement types; these replace whatever
    # bonus the caller passed. Unrecognised types keep the caller's bonus.
    _TYPE_BONUSES = {
        "ucl_final": 4.0,
        "ucl_semi": 2.5,
        "ucl_qf": 1.5,
        "league_title": 2.0,
    }
    _MAX_PRESTIGE = 10
    _DAYS_PER_MONTH = 30.44
    # Finals decay at 70% of the normal rate (more significant achievement).
    _FINAL_DECAY_FACTOR = 0.7

    def __init__(self, base_prestige=None, decay_rate=None):
        # team name -> list of {"date", "bonus", "type"} records
        self.team_history = {}
        self.base_prestige = base_prestige
        self.decay_rate = decay_rate

    def add_achievement(self, team, date, bonus, achievement_type):
        """Record an achievement for ``team``.

        Args:
            team: Team name used as the history key.
            date: ``datetime`` on which the achievement happened.
            bonus: Bonus to store when ``achievement_type`` is not one of the
                recognised types; otherwise it is overridden.
            achievement_type: Achievement label, e.g. ``"ucl_final"``.
        """
        bonus = self._TYPE_BONUSES.get(achievement_type, bonus)
        self.team_history.setdefault(team, []).append({
            "date": date,
            "bonus": bonus,
            "type": achievement_type
        })

    def calculate_prestige(self, team, current_date):
        """Return the prestige of ``team`` as of ``current_date``.

        Only achievements dated strictly before ``current_date`` count. An
        achievement on or after that date would otherwise leak future results
        into the score and, because the elapsed time is negative, be
        *amplified* by the exponential instead of decayed.

        Args:
            team: Team name (same spelling used in ``add_achievement``).
            current_date: ``datetime`` the prestige is evaluated at.

        Returns:
            The base prestige when the team has no recorded history,
            otherwise ``min(base + decayed bonuses, 10)``.
        """
        base_map = base_prestige_map if self.base_prestige is None else self.base_prestige
        rate = DECAY_RATE if self.decay_rate is None else self.decay_rate

        base = base_map.get(team, 1)
        if team not in self.team_history:
            return base

        total_bonus = 0
        for achievement in self.team_history[team]:
            if achievement["date"] >= current_date:
                continue  # not yet earned at current_date
            months_passed = (current_date - achievement["date"]).days / self._DAYS_PER_MONTH
            # Slower decay for more significant achievements
            factor = self._FINAL_DECAY_FACTOR if achievement["type"] == "ucl_final" else 1.0
            total_bonus += achievement["bonus"] * math.exp(-rate * factor * months_passed)

        return min(base + total_bonus, self._MAX_PRESTIGE)  # Cap at 10

prestige_calc = PrestigeCalculator()

# Enhanced feature engineering
def create_features(row1, row2, team1, team2, date, winner=None):
    """Build the feature dictionary for one team1-vs-team2 match-up.

    Args:
        row1: Feature row for ``team1``; must provide "Team_Goals_Scored",
            "Team_Goals_Conceded", "Team_Points", "Recent_Form" and
            "Matches_Played".
        row2: Feature row for ``team2`` with the same fields.
        team1: Name of the first team.
        team2: Name of the second team.
        date: Match date passed to the prestige calculation.
        winner: Optional name of the winning team, used to derive "label".
            When omitted, a module-level ``winner`` variable is used if one
            exists (the behaviour this function originally depended on).

    Returns:
        Dict of numeric features. It contains "label" (1 when ``team1`` won,
        else 0) only when a winner is available.
    """
    p1 = prestige_calc.calculate_prestige(team1, date)
    p2 = prestige_calc.calculate_prestige(team2, date)

    def goals_per_match(row):
        played = row["Matches_Played"]
        # A team with zero recorded matches has no meaningful average; use 0.0
        # rather than dividing by zero.
        return float(row["Team_Goals_Scored"] / played) if played else 0.0

    features = {
        "team1_goals_scored": float(row1["Team_Goals_Scored"]),
        "team1_goals_conceded": float(row1["Team_Goals_Conceded"]),
        "team1_points": float(row1["Team_Points"]),
        "team1_form": float(row1["Recent_Form"]),
        "team1_is_top5": int(team1 in t5_teams),
        "team1_prestige": p1,
        "team1_avg_goals": goals_per_match(row1),
        "team2_goals_scored": float(row2["Team_Goals_Scored"]),
        "team2_goals_conceded": float(row2["Team_Goals_Conceded"]),
        "team2_points": float(row2["Team_Points"]),
        "team2_form": float(row2["Recent_Form"]),
        "team2_is_top5": int(team2 in t5_teams),
        "team2_prestige": p2,
        "team2_avg_goals": goals_per_match(row2),
        "goal_diff": float(row1["Team_Goals_Scored"] - row2["Team_Goals_Scored"]),
        "form_diff": float(row1["Recent_Form"] - row2["Recent_Form"]),
        "prestige_diff": float(p1 - p2),
        "prestige_ratio": float(p1 / (p2 + 0.1)),  # Avoid division by zero
    }

    if winner is None:
        # Backward compatibility: the function used to read a global `winner`
        # implicitly. Prefer passing it explicitly.
        winner = globals().get("winner")
    if winner is not None:
        features["label"] = int(winner == team1)
    return features

# Process historical achievements
# Credit both finalists of every season's final with a "reached the final"
# achievement dated on the day of the final.
for season, rounds in train_data.items():
    finals = rounds.get("final")
    if not finals:
        continue  # no final recorded for this season

    final = finals[0]
    print(final)
    date = datetime.strptime(final["date"], "%d/%m/%Y")
    for team in (final["team_1"], final["team_2"]):
        mapped_team = team_map.get(team, team.upper())
        # The type must be exactly "ucl_final": that is the label
        # PrestigeCalculator recognises for the finals bonus and for the
        # slower decay. The previous "ucl_" matched no known type, so neither
        # was ever applied.
        prestige_calc.add_achievement(mapped_team, date, 4.0, "ucl_final")

# Create training samples
def _latest_row_before(team, cutoff):
    """Return the newest feature row for ``team`` dated strictly before ``cutoff``.

    Returns ``None`` when ``features_df`` holds no earlier row for the team.
    """
    earlier = features_df[
        (features_df["Team_Name"] == team) & (features_df["Match_Date"] < cutoff)
    ]
    return None if earlier.empty else earlier.sort_values("Match_Date").iloc[-1]


def _canonical_name(raw_name):
    """Map a raw team name through ``team_map``, defaulting to its upper-case form."""
    return team_map.get(raw_name, raw_name.upper())


# One sample per historical match for which both teams have an earlier
# feature row; the label is 1 when team_1 won.
samples = []
for rounds in train_data.values():
    for matches in rounds.values():
        for match in matches:
            team_1 = _canonical_name(match["team_1"])
            team_2 = _canonical_name(match["team_2"])
            date = datetime.strptime(match["date"], "%d/%m/%Y")
            winner = _canonical_name(match["winner"])

            # Most recent stats available before kick-off for each side
            row1 = _latest_row_before(team_1, date)
            row2 = _latest_row_before(team_2, date)
            if row1 is not None and row2 is not None:
                # Prestige of each side as of the match date
                p1 = prestige_calc.calculate_prestige(team_1, date)
                p2 = prestige_calc.calculate_prestige(team_2, date)

                features = {
                    "team1_goals_scored": float(row1["Team_Goals_Scored"]),
                    "team1_goals_conceded": float(row1["Team_Goals_Conceded"]),
                    "team1_points": float(row1["Team_Points"]),
                    "team1_form": float(row1["Recent_Form"]),
                    "team1_is_top5": int(team_1 in t5_teams),
                    "team1_prestige": p1,
                    "team2_goals_scored": float(row2["Team_Goals_Scored"]),
                    "team2_goals_conceded": float(row2["Team_Goals_Conceded"]),
                    "team2_points": float(row2["Team_Points"]),
                    "team2_form": float(row2["Recent_Form"]),
                    "team2_is_top5": int(team_2 in t5_teams),
                    "team2_prestige": p2,
                    "goal_diff": float(row1["Team_Goals_Scored"] - row2["Team_Goals_Scored"]),
                    "form_diff": float(row1["Recent_Form"] - row2["Recent_Form"]),
                    "prestige_diff": float(p1 - p2),
                    "label": int(winner == team_1)
                }
                samples.append(features)

# Train model
train_df = pd.DataFrame(samples)
X = train_df.drop("label", axis=1)
y = train_df["label"]

# Hold out 20% of the samples for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Soft voting averages the predicted class probabilities of the three models.
ensemble_model = VotingClassifier(estimators=[
    ('xgb', XGBClassifier(eval_metric="logloss", n_estimators=150)),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
], voting='soft')

ensemble_model.fit(X_train, y_train)

# Persist the model together with the prestige state it was trained with.
# The context manager guarantees the file is flushed and closed before it is
# read back.
with open("model_with_decay.pkl", "wb") as model_file:
    pickle.dump({
        'model': ensemble_model,
        'prestige_calc': prestige_calc,
        'base_prestige_map': base_prestige_map
    }, model_file)

# This score is computed on the held-out split (X_test), not on the training
# data, so it is reported as test accuracy.
print(f"✅ Test Accuracy: {accuracy_score(y_test, ensemble_model.predict(X_test)):.2%}")

# Load model and data
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# this script itself produced. The context manager closes the handle promptly.
with open("model_with_decay.pkl", "rb") as model_file:
    model_data = pickle.load(model_file)
model = model_data['model']
base_prestige_map = model_data['base_prestige_map']

features_df = pd.read_csv("Team_Match_Features.csv", parse_dates=["Match_Date"])
mapping_df = pd.read_csv("auto_team_name_mapping.csv")
# JSON is UTF-8 by specification; do not depend on the locale default encoding.
with open("test_matchups.json", encoding="utf-8") as f:
    test_data = json.load(f)

# Initialize prestige calculator for test predictions
test_prestige = PrestigeCalculator()
# Carry over training history (this shares the same dict object; it is not a copy)
test_prestige.team_history = model_data['prestige_calc'].team_history

team_map = dict(zip(mapping_df["train_team"], mapping_df["Mapped_Team_Name"]))
features_df["Team_Name"] = features_df["Team_Name"].str.upper()

# NOTE(review): these names are title-case, while team names are upper-cased
# before lookup in this file, so `team in t5_teams` probably never matches.
# Confirm against the mapping file, and keep this set identical to the one
# used when building the training samples.
t5_teams = set([
    "Manchester City", "Manchester United", "Chelsea", "Arsenal", "Liverpool",
    "Tottenham Hotspur", "Newcastle United", "Real Madrid", "Barcelona",
    "Atletico Madrid", "Sevilla", "Valencia", "Villarreal", "Athletic Bilbao",
    "Bayern Munich", "Borussia Dortmund", "RB Leipzig", "Bayer Leverkusen",
    "Schalke 04", "Wolfsburg", "Eintracht Frankfurt", "Juventus", "Inter Milan",
    "Milan", "Napoli", "Roma", "Atalanta", "Lazio", "Paris Saint-Germain",
    "Monaco", "Lille", "Lyon", "Marseille"
])

def get_team_features(team, date):
    """Return the most recent feature row for ``team`` before ``date``.

    Args:
        team: Team name as stored in ``features_df["Team_Name"]``.
        date: Cut-off; only rows with ``Match_Date`` strictly earlier qualify.

    Returns:
        The qualifying row with the latest ``Match_Date``, or ``None`` when
        the team has no earlier row.
    """
    is_team = features_df["Team_Name"] == team
    is_earlier = features_df["Match_Date"] < date
    candidates = features_df[is_team & is_earlier]
    if not candidates.empty:
        chronological = candidates.sort_values("Match_Date")
        return chronological.iloc[-1]
    return None

def predict_match(team1, team2, date):
    """Predict match winner between team1 and team2 on given date.

    Args:
        team1: Name of first team
        team2: Name of second team
        date: Either datetime object or string in YYYY-MM-DD format

    Returns:
        The mapped (canonical) name of the predicted winner. When feature
        rows are missing or contain NaNs in the statistics the model uses,
        a warning is printed and ``team1`` is returned as the fallback.

    Raises:
        ValueError: If ``date`` is neither a string nor a datetime, or a
            string that is not in YYYY-MM-DD format.
    """
    # Handle both string and datetime inputs
    if isinstance(date, str):
        date = datetime.strptime(date, "%Y-%m-%d")
    elif not isinstance(date, datetime):
        raise ValueError("date must be either string (YYYY-MM-DD) or datetime object")

    team1 = team_map.get(team1, team1.upper())
    team2 = team_map.get(team2, team2.upper())

    # Use the shared module-level lookup instead of a duplicated inner copy.
    row1 = get_team_features(team1, date)
    row2 = get_team_features(team2, date)

    if row1 is None or row2 is None:
        print(f"⚠️ Missing team feature rows for {team1} or {team2} on {date}")
        return team1  # or any default fallback

    # Only the statistics that feed the model matter here: a NaN in an
    # unrelated column must not force the arbitrary team1 fallback.
    stat_columns = ["Team_Goals_Scored", "Team_Goals_Conceded", "Team_Points", "Recent_Form"]
    if row1[stat_columns].isnull().any() or row2[stat_columns].isnull().any():
        print(f"⚠️ NaNs in feature rows for {team1} or {team2} on {date}")
        return team1  # or skip or add imputation logic

    p1 = test_prestige.calculate_prestige(team1, date)
    p2 = test_prestige.calculate_prestige(team2, date)

    # Column names and order must match the frame the model was trained on.
    features = pd.DataFrame([{
        "team1_goals_scored": float(row1["Team_Goals_Scored"]),
        "team1_goals_conceded": float(row1["Team_Goals_Conceded"]),
        "team1_points": float(row1["Team_Points"]),
        "team1_form": float(row1["Recent_Form"]),
        "team1_is_top5": int(team1 in t5_teams),
        "team1_prestige": p1,
        "team2_goals_scored": float(row2["Team_Goals_Scored"]),
        "team2_goals_conceded": float(row2["Team_Goals_Conceded"]),
        "team2_points": float(row2["Team_Points"]),
        "team2_form": float(row2["Recent_Form"]),
        "team2_is_top5": int(team2 in t5_teams),
        "team2_prestige": p2,
        "goal_diff": float(row1["Team_Goals_Scored"] - row2["Team_Goals_Scored"]),
        "form_diff": float(row1["Recent_Form"] - row2["Recent_Form"]),
        "prestige_diff": float(p1 - p2),
    }])

    # Final safety net for any NaN that made it into the feature vector.
    if features.isnull().any().any():
        print(f"⚠️ Feature vector contains NaN values for {team1} vs {team2} on {date}")
        return team1  # or use imputer or skip

    # Index 1 is read as the probability that team1 wins (label 1 in training);
    # ties at exactly 0.5 go to team1.
    pred_prob = model.predict_proba(features)[0]
    return team1 if pred_prob[1] >= 0.5 else team2

def resolve_placeholder(name, r16_results, qf_results=None, sf_results=None):
    """Turn a bracket placeholder into the name of the team that advanced.

    Recognised forms:
      * ``"Winner of QF<k>"`` -> winner of the k-th (1-based) entry of ``qf_results``
      * ``"Winner of SF<k>"`` -> winner of the k-th (1-based) entry of ``sf_results``
      * ``"Winner of <A> vs <B>"`` -> winner of the matching round-of-16 tie
      * anything else is returned unchanged (already a team name)

    Args:
        name: Team name or placeholder string.
        r16_results: List of round-of-16 result dicts with "team_1",
            "team_2" and "winner" keys.
        qf_results: Optional list of quarter-final result dicts.
        sf_results: Optional list of semi-final result dicts.

    Returns:
        The resolved team name.

    Raises:
        ValueError: If a placeholder cannot be resolved from the results
            supplied, or its index is not an integer.
    """
    indexed_rounds = (
        ("Winner of QF", qf_results, "qf_results"),
        ("Winner of SF", sf_results, "sf_results"),
    )
    for prefix, results, label in indexed_rounds:
        if not name.startswith(prefix):
            continue
        position = int(name.replace(prefix, "")) - 1  # placeholders are 1-based
        if results is None or not (0 <= position < len(results)):
            raise ValueError(f"Could not resolve placeholder '{name}' in {label}")
        return results[position]["winner"]

    if not name.startswith("Winner of "):
        return name  # plain team name, nothing to resolve

    wanted = name[len("Winner of "):]
    for played in r16_results:
        if f"{played['team_1']} vs {played['team_2']}" == wanted:
            return played["winner"]
    raise ValueError(f"Could not resolve placeholder '{name}' in r16_results")

def _predict_round(matches, resolve):
    """Predict every match of one knockout round.

    Args:
        matches: Iterable of dicts with "team_1", "team_2" and "date"
            (YYYY-MM-DD) entries; team entries may be bracket placeholders.
        resolve: Callable that turns a team entry into a concrete team name.

    Returns:
        List of {"team_1", "team_2", "winner"} dicts in input order.
    """
    results = []
    for match in matches:
        t1 = resolve(match["team_1"])
        t2 = resolve(match["team_2"])
        date = datetime.strptime(match["date"], "%Y-%m-%d")
        results.append({"team_1": t1, "team_2": t2, "winner": predict_match(t1, t2, date)})
    return results


# Simulate each test season's bracket round by round; every later round
# resolves its "Winner of ..." placeholders from the rounds predicted so far.
submission_rows = []

for season_id, (season, rounds) in enumerate(test_data.items()):
    # --- Round of 16 --- (team names are already concrete)
    r16_results = _predict_round(rounds["round_of_16_matchups"], lambda name: name)

    # --- Quarterfinals ---
    qf_results = _predict_round(
        rounds["quarter_finals_matchups"],
        lambda name: resolve_placeholder(name, r16_results),
    )

    # --- Semifinals ---
    sf_results = _predict_round(
        rounds["semi_finals_matchups"],
        lambda name: resolve_placeholder(name, r16_results, qf_results=qf_results),
    )

    # --- Final --- (a single match rather than a list)
    final_results = _predict_round(
        [rounds["final_matchup"]],
        lambda name: resolve_placeholder(
            name, r16_results, qf_results=qf_results, sf_results=sf_results
        ),
    )

    # --- Collect all rounds for output ---
    round_results = {
        "round_of_16": r16_results,
        "quarter_finals": qf_results,
        "semi_finals": sf_results,
        "final": final_results
    }
    submission_rows.append({
        "id": season_id,
        "season": season,
        "predictions": json.dumps(round_results)
    })

# Save to CSV in sample_submission format
pd.DataFrame(submission_rows).to_csv("predictions.csv", index=False)
# Report the file that was actually written.
print("✅ predictions.csv generated.")

{'team_1': 'Milan', 'team_2': 'Liverpool', 'date': '25/05/2005', 'winner': 'Liverpool'}
{'team_1': 'Arsenal', 'team_2': 'Barcelona', 'date': '17/05/2006', 'winner': 'Barcelona'}
{'team_1': 'Milan', 'team_2': 'Liverpool', 'date': '23/05/2007', 'winner': 'Milan'}
{'team_1': 'Manchester United', 'team_2': 'Chelsea', 'date': '21/05/2008', 'winner': 'Manchester United'}
{'team_1': 'Barcelona', 'team_2': 'Manchester United', 'date': '27/05/2009', 'winner': 'Barcelona'}
{'team_1': 'Bayern Munich', 'team_2': 'Internazionale', 'date': '22/05/2010', 'winner': 'Internazionale'}
{'team_1': 'Barcelona', 'team_2': 'Manchester United', 'date': '28/05/2011', 'winner': 'Barcelona'}
{'team_1': 'Bayern Munich', 'team_2': 'Chelsea', 'date': '19/05/2012', 'winner': 'Chelsea'}
{'team_1': 'Borussia Dortmund', 'team_2': 'Bayern Munich', 'date': '25/05/2013', 'winner': 'Bayern Munich'}
{'team_1': 'Real Madrid', 'team_2': 'Atletico Madrid', 'date': '24/05/2014', 'winner': 'Real Madrid'}
{'team_1': 'Juventus', '