# In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

def get_result(row):
    """Label a match outcome from its goal counts.

    ``row`` is anything indexable by the keys ``'gh'`` and ``'ga'`` (for
    example a DataFrame row). Returns ``'home_win'`` when ``gh`` exceeds
    ``ga``, ``'away_win'`` when ``ga`` exceeds ``gh``, and ``'draw'``
    otherwise -- which includes the case where either value is NaN, since
    both comparisons are then false.
    """
    home_goals, away_goals = row['gh'], row['ga']

    if home_goals > away_goals:
        return 'home_win'
    if home_goals < away_goals:
        return 'away_win'
    return 'draw'

# Load the match table. Dates that cannot be parsed become NaT (errors='coerce')
# and are then dropped by the cut-off filter, because NaT >= '2000-01-01' is False.
df = pd.read_parquet('games.parquet')
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.loc[df['date'] >= '2000-01-01']

# Competitions to keep. Domestic names are matched after lower-casing the
# 'competition' column, UEFA names after upper-casing it.
top_leagues = ['england', 'spain', 'italy', 'germany', 'france']
uefa_comp = ['UEFA EL', 'UEFA CL', 'UEFA CONF L']
condition_domestic = df['competition'].str.lower().isin(top_leagues)
condition_uefa = df['competition'].str.upper().isin(uefa_comp)

# Keep rows matching either list, ordered by date (later steps rely on
# "last rows" meaning "most recent games").
df_filtered = df.loc[condition_domestic | condition_uefa].sort_values(by='date')

# Target variable: 'home_win' / 'away_win' / 'draw', derived row by row.
df_filtered['match_result'] = df_filtered.apply(get_result, axis=1)

# Feature engineering with proper time-based splits
def _recent_side_form(past_data, team_col, scored_col, conceded_col, window=5):
    """Summarise each team's last ``window`` games on one side (home or away).

    Returns a DataFrame indexed by team with the mean goals scored, mean goals
    conceded and win rate over those games. Teams with no game on this side
    in ``past_data`` are simply absent from the result.
    """
    # tail() keeps the last rows per team in frame order, so "last" means
    # "most recent" only when past_data is sorted by date.
    recent = past_data.groupby(team_col, sort=False).tail(window)

    form = recent[[team_col, scored_col, conceded_col]].copy()
    form.columns = ['team', 'avg_goals_scored', 'avg_goals_conceded']
    # A NaN score compares False, so such a game counts as "not a win".
    form['win_rate'] = (
        form['avg_goals_scored'] > form['avg_goals_conceded']
    ).astype(float)
    return form.groupby('team', sort=False).mean()


def calculate_rolling_stats(data, date):
    """Calculate per-team form statistics using only games before ``date``.

    For every team that appears in ``data`` as a home *or* away side, the
    team's last 5 home games and last 5 away games strictly before ``date``
    are summarised separately and the two summaries are averaged.

    Args:
        data: DataFrame with columns ``date``, ``home``, ``away``, ``gh``
            (home goals) and ``ga`` (away goals). It should be sorted by
            ``date`` ascending so that "last 5" means "most recent 5".
        date: Cut-off; only rows with ``data['date'] < date`` are used.

    Returns:
        ``{team: {'avg_goals_scored': ..., 'avg_goals_conceded': ...,
        'win_rate': ...}}``. When a team has past games on only one side,
        that side's values are used on their own; a team with no past games
        at all gets NaN for every metric.
    """
    past_data = data[data['date'] < date]

    # Cover both columns: a team seen only as an away side still needs stats.
    teams = pd.concat([data['home'], data['away']], ignore_index=True).dropna().unique()

    home_form = _recent_side_form(past_data, 'home', 'gh', 'ga').reindex(teams)
    # As the away side the team scores 'ga' and concedes 'gh'.
    away_form = _recent_side_form(past_data, 'away', 'ga', 'gh').reindex(teams)

    # Average the home and away summaries per team. The group mean skips NaN,
    # so a missing side no longer wipes out the side that does have data.
    combined = pd.concat([home_form, away_form]).groupby(level=0, sort=False).mean()

    return combined.to_dict(orient='index')

def prepare_features(df, team_stats):
    """Build the feature table for a batch of matches.

    Args:
        df: DataFrame of matches with columns ``home``, ``away``,
            ``competition`` and a datetime ``date`` column.
        team_stats: Mapping ``team -> {'avg_goals_scored',
            'avg_goals_conceded', 'win_rate'}``.

    Returns:
        DataFrame with exactly one row per match, in the same order and with
        the same index as ``df``, so labels taken from ``df`` line up with it.
        Columns: six team-form columns (``home_*`` / ``away_*``), one
        ``comp_<name>`` dummy per competition present in ``df``, and
        ``month``. The form columns are NaN for a match unless *both* teams
        are present in ``team_stats``.
    """
    # (feature suffix, key in team_stats) in output column order.
    stat_fields = (
        ('avg_goals', 'avg_goals_scored'),
        ('avg_conceded', 'avg_goals_conceded'),
        ('win_rate', 'win_rate'),
    )
    missing = float('nan')

    # Collect whole columns as lists and build the frame once; growing a
    # DataFrame cell by cell with .loc is quadratic and yields only the rows
    # that were written, which could reorder rows in the join below.
    columns = {
        f'{side}_{suffix}': []
        for side in ('home', 'away')
        for suffix, _ in stat_fields
    }
    for home_team, away_team in zip(df['home'], df['away']):
        both_known = home_team in team_stats and away_team in team_stats
        for side, team in (('home', home_team), ('away', away_team)):
            for suffix, key in stat_fields:
                value = team_stats[team][key] if both_known else missing
                columns[f'{side}_{suffix}'].append(value)

    features = pd.DataFrame(columns, index=df.index, dtype=float)

    # Add competition features (same index as df, so this is a plain
    # side-by-side join with no reordering).
    competition_dummies = pd.get_dummies(df['competition'], prefix='comp')
    features = pd.concat([features, competition_dummies], axis=1)

    # Add month feature (seasonality)
    features['month'] = df['date'].dt.month

    return features

# Split data chronologically: matches dated up to the 80th-percentile date
# form the training set, everything later is the test set.
train_date = df_filtered['date'].quantile(0.8)
train_df = df_filtered[df_filtered['date'] <= train_date]
test_df = df_filtered[df_filtered['date'] > train_date]

# Build training features one match date at a time, so that each match only
# sees statistics computed from strictly earlier games.
train_batches = []
y_train = []

for date in sorted(train_df['date'].unique()):
    # Calculate stats using only past data
    team_stats = calculate_rolling_stats(train_df, date)

    # Get matches for current date
    current_matches = train_df[train_df['date'] == date]

    # Prepare features
    current_features = prepare_features(current_matches, team_stats)

    if not current_features.empty:
        train_batches.append(current_features)
        # Pick labels by the features' own index so X and y stay aligned even
        # if the feature rows come back in a different order than the matches.
        y_train.extend(
            current_matches.loc[current_features.index, 'match_result'].tolist()
        )

# Concatenate once; concatenating inside the loop copies the growing frame on
# every iteration.
X_train = pd.concat(train_batches) if train_batches else pd.DataFrame()

# Test features use statistics from the training data only. Every training
# date precedes every test date, so "training games before this test date" is
# the whole training set for all test matches: compute the stats once.
team_stats = calculate_rolling_stats(train_df, test_df['date'].min())
X_test = prepare_features(test_df, team_stats)
y_test = test_df.loc[X_test.index, 'match_result'].tolist()

# Handle missing values. The test frame is first aligned to the training
# columns: competitions seen only in training become all-zero columns and
# competitions seen only in test are dropped, so the model always receives
# the feature set it was fitted on.
X_train = X_train.fillna(0)
X_test = X_test.reindex(columns=X_train.columns).fillna(0)

# Scale numeric features (fit on training data only, then apply to test)
numeric_features = X_train.select_dtypes(include=['float64', 'int64']).columns
scaler = StandardScaler()
X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test[numeric_features] = scaler.transform(X_test[numeric_features])

# Train model. The default lbfgs solver already fits a multinomial model for
# a multi-class target; the explicit multi_class argument is deprecated in
# recent scikit-learn releases, so it is omitted.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Output: KeyboardInterrupt (the cell was interrupted before it finished running)