# Load Cleaned Data

In [10]:

import pandas as pd
import numpy as np
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Read the cleaned statistics CSV into `df`, report its shape, and preview it.
cleaned_csv_path = '/Users/nazmul/Desktop/Project/ucl/data/ucl_stats_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)
print(f"Loaded cleaned dataset: {df.shape}")
df.head()

Loaded cleaned dataset: (685, 11)


Unnamed: 0,year,team,match_played,wins,draws,losts,goals_scored,goals_conceded,gd,group_point,champions
0,1993,Marseille,6,3,3,0,14,4,10,9,1
1,1993,Milan,7,6,0,1,11,2,9,12,0
2,1993,Rangers,6,2,4,0,7,5,2,8,0
3,1993,Club Brugge,6,2,1,3,5,8,-3,5,0
4,1993,CSKA Moscow,6,0,2,4,2,11,-9,2,0


In [11]:
# Give the terse / misspelled source columns descriptive names.
# `rename` returns a new frame; names missing from `df` are skipped silently.
column_renames = {
    'match_played': 'matches_played',
    'losts': 'losses',
    'gd': 'goal_diff',
}
df = df.rename(columns=column_renames)

print(" Columns renamed for clarity")
print(f"Columns: {list(df.columns)}")


 Columns renamed for clarity
Columns: ['year', 'team', 'matches_played', 'wins', 'draws', 'losses', 'goals_scored', 'goals_conceded', 'goal_diff', 'group_point', 'champions']


# Create League Categories

In [13]:

# League label -> club names as they may appear in the `team` column.
# Several spellings of the same club are listed on purpose (e.g. "Milan" / "AC Milan").
leagues = {
    "LaLiga": [
        "Barcelona", "Real Madrid", "Atletico Madrid", "Valencia",
        "Deportivo La Coruna", "Real Sociedad", "Celta Vigo",
        "Villarreal", "Real Betis", "Sevilla", "Malaga",
    ],
    "Premier": [
        "Manchester United", "Blackburn Rovers", "Newcastle United",
        "Chelsea", "Leeds United", "Liverpool", "Tottenham Hotspur",
        "Manchester City", "Leicester City", "Arsenal",
    ],
    "Bundesliga": [
        "Werder Bremen", "Bayern Munich", "Borussia Dortmund",
        "Bayer Leverkusen", "Hertha BSC", "Stuttgart", "Schalke 04",
        "Hamburg", "Wolfsburg", "Borussia Monchengladbach",
        "RB Leipzig", "1899 Hoffenheim",
    ],
    "Ligue1": [
        "Marseille", "Monaco", "Paris Saint-Germain", "Nantes",
        "Auxerre", "Bordeaux", "Lyon", "Lille", "Montpellier",
    ],
    "SerieA": [
        "Milan", "Juventus", "Parma", "Fiorentina", "Lazio",
        "Roma", "Udinese", "Napoli", "Inter Milan", "Atalanta",
        "Internazionale", "AC Milan", "Inter",
    ],
    # Placeholder: any team not listed above falls back to the 'Misc' label below.
    "Misc": [],
}

# Invert the table into team -> league. The empty "Misc" list contributes no
# entries, so unmatched teams come back as NaN from `map` and are then labelled 'Misc'.
team_to_league = {
    team_name: league_name
    for league_name, team_names in leagues.items()
    for team_name in team_names
}
df['league'] = df['team'].map(team_to_league).fillna('Misc')

print(" League categories created")
print(f"\nLeague distribution:")
print(df['league'].value_counts())


 League categories created

League distribution:
league
Misc          333
Premier        84
LaLiga         80
SerieA         69
Bundesliga     63
Ligue1         56
Name: count, dtype: int64


# Create Ratio and Performance Features

In [14]:

print("\n Creating advanced features...")

# Shared denominator for every per-match feature.
matches = df['matches_played']

# Insertion order below is the order in which the columns are appended to `df`.
derived_features = {
    # Share of matches won / lost / drawn
    'win_ratio': df['wins'] / matches,
    'loss_ratio': df['losses'] / matches,
    'draw_ratio': df['draws'] / matches,
    # Goals per match
    'goals_scored_per_match': df['goals_scored'] / matches,
    'goals_conceded_per_match': df['goals_conceded'] / matches,
    # Performance metrics; the +1 keeps each denominator non-zero
    'win_to_loss_ratio': df['wins'] / (df['losses'] + 1),
    'goal_efficiency': df['wins'] / (df['goals_scored'] + 1),
    'defensive_strength': matches / (df['goals_conceded'] + 1),
    'points_per_match': df['group_point'] / matches,
    # Per-match goal difference, and matches minus goals conceded as a share
    # of matches (negative whenever goals conceded exceed matches played)
    'goal_difference_per_match': df['goal_diff'] / matches,
    'clean_sheet_potential': (matches - df['goals_conceded']) / matches,
}
for feature_name, feature_values in derived_features.items():
    df[feature_name] = feature_values

# Composite score: weighted sum of three of the columns created above.
df['dominance_score'] = (
    df['win_ratio'] * 0.4
    + df['goal_difference_per_match'] * 0.3
    + df['points_per_match'] * 0.3
)

print(f" Features created. New shape: {df.shape}")
# NOTE(review): the slice is positional (everything after the first 11 columns),
# so the listing also reports 'league', which was added before this cell.
print(f"\nNew features: {list(df.columns[11:])}")



 Creating advanced features...
 Features created. New shape: (685, 24)

New features: ['league', 'win_ratio', 'loss_ratio', 'draw_ratio', 'goals_scored_per_match', 'goals_conceded_per_match', 'win_to_loss_ratio', 'goal_efficiency', 'defensive_strength', 'points_per_match', 'goal_difference_per_match', 'clean_sheet_potential', 'dominance_score']


In [15]:
# Encode Categorical Variables
import pickle
from pathlib import Path

from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column. The original `team` / `league` strings
# stay in `df`; the integer codes are added as new columns next to them.
# NOTE(review): the codes are arbitrary integer labels — confirm downstream
# models do not interpret them as an ordering.
team_encoder = LabelEncoder()
league_encoder = LabelEncoder()

df['team_encoded'] = team_encoder.fit_transform(df['team'])
df['league_encoded'] = league_encoder.fit_transform(df['league'])

# Persist the fitted encoders for later use.
encoder_files = {
    '/Users/nazmul/Desktop/Project/ucl/models/team_encoder.pkl': team_encoder,
    '/Users/nazmul/Desktop/Project/ucl/models/league_encoder.pkl': league_encoder,
}
for encoder_path, fitted_encoder in encoder_files.items():
    # Create the target folder first: open(..., 'wb') raises FileNotFoundError
    # when the parent directory is missing (e.g. on a fresh checkout).
    Path(encoder_path).parent.mkdir(parents=True, exist_ok=True)
    with open(encoder_path, 'wb') as f:
        pickle.dump(fitted_encoder, f)

print("Categorical variables encoded")
print(f"Teams encoded: {len(team_encoder.classes_)}")
print(f"Leagues encoded: {len(league_encoder.classes_)}")


Categorical variables encoded
Teams encoded: 142
Leagues encoded: 6


# Prepare Final Dataset

In [16]:
# Build the modeling table: every column except the raw `team`, `league`
# and `year` columns; their encoded counterparts (where present) are kept.
is_excluded = df.columns.isin(['team', 'league', 'year'])
feature_cols = df.columns[~is_excluded].tolist()
df_final = df.loc[:, feature_cols].copy()

print(f"\n Final dataset prepared")
print(f"Shape: {df_final.shape}")
print(f"Features: {list(df_final.columns)}")




 Final dataset prepared
Shape: (685, 23)
Features: ['matches_played', 'wins', 'draws', 'losses', 'goals_scored', 'goals_conceded', 'goal_diff', 'group_point', 'champions', 'win_ratio', 'loss_ratio', 'draw_ratio', 'goals_scored_per_match', 'goals_conceded_per_match', 'win_to_loss_ratio', 'goal_efficiency', 'defensive_strength', 'points_per_match', 'goal_difference_per_match', 'clean_sheet_potential', 'dominance_score', 'team_encoded', 'league_encoded']


In [17]:
# Write both tables to CSV without the index column:
#   - df_final: the modeling table
#   - df: the full frame, which still carries the original team names
engineered_csv_path = '/Users/nazmul/Desktop/Project/ucl/data/ucl_stats_engineered.csv'
with_names_csv_path = '/Users/nazmul/Desktop/Project/ucl/data/ucl_stats_with_names.csv'

csv_exports = (
    (df_final, engineered_csv_path),
    (df, with_names_csv_path),
)
for frame, destination in csv_exports:
    frame.to_csv(destination, index=False)

print("\n Datasets saved:")
print("   -ucl_stats_engineered.csv (for modeling)")
print("   -ucl_stats_with_names.csv (with original names)")

# Closing banner followed by a preview of the modeling table.
banner_rule = "=" * 70
print("\n" + banner_rule)
print("FEATURE ENGINEERING COMPLETE")
print(banner_rule)
df_final.head()


 Datasets saved:
   -ucl_stats_engineered.csv (for modeling)
   -ucl_stats_with_names.csv (with original names)

FEATURE ENGINEERING COMPLETE


Unnamed: 0,matches_played,wins,draws,losses,goals_scored,goals_conceded,goal_diff,group_point,champions,win_ratio,...,goals_conceded_per_match,win_to_loss_ratio,goal_efficiency,defensive_strength,points_per_match,goal_difference_per_match,clean_sheet_potential,dominance_score,team_encoded,league_encoded
0,6,3,3,0,14,4,10,9,1,0.5,...,0.666667,3.0,0.2,1.2,1.5,1.666667,0.333333,1.15,87,2
1,7,6,0,1,11,2,9,12,0,0.857143,...,0.285714,3.0,0.5,2.333333,1.714286,1.285714,0.714286,1.242857,88,5
2,6,2,4,0,7,5,2,8,0,0.333333,...,0.833333,2.0,0.25,1.0,1.333333,0.333333,0.166667,0.633333,106,3
3,6,2,1,3,5,8,-3,5,0,0.333333,...,1.333333,0.5,0.333333,0.666667,0.833333,-0.5,-0.333333,0.233333,43,3
4,6,0,2,4,2,11,-9,2,0,0.0,...,1.833333,0.0,0.0,0.5,0.333333,-1.5,-0.833333,-0.35,38,3
