In [10]:
import pandas as pd
import os

# --- 1. Load and Clean Historical Kaggle Data ---
# Reads the historical results CSV, keeps the match-identifying and score
# columns under snake_case names, spells out the one-letter result code,
# reduces the season label to the integer before its first '/', keeps
# seasons from 2018 onwards and stamps the match dates as UTC.
print("Loading and cleaning historical Kaggle data...")
kaggle_file_path = r"C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\english_premier_league_2000-2025.csv"
df_kaggle = pd.read_csv(kaggle_file_path)

# Column selection and renaming in one chained expression; `rename` returns
# a new frame, so `df_kaggle` itself is left untouched.
df_kaggle_clean = df_kaggle[[
    'Season', 'MatchDate', 'HomeTeam', 'AwayTeam',
    'FullTimeHomeGoals', 'FullTimeAwayGoals', 'FullTimeResult',
]].rename(columns={
    'Season': 'season',
    'MatchDate': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'FullTimeHomeGoals': 'home_goals',
    'FullTimeAwayGoals': 'away_goals',
    'FullTimeResult': 'result_short',
})

# Replace the short code column with the long label; any code other than
# H/A/D maps to NaN.
df_kaggle_clean['result'] = df_kaggle_clean.pop('result_short').map(
    {'H': 'HOME_TEAM', 'A': 'AWAY_TEAM', 'D': 'DRAW'}
)

# Keep only the part of the season label before the first '/' as an int.
df_kaggle_clean['season'] = df_kaggle_clean['season'].map(
    lambda season_label: int(season_label.split('/')[0])
)
df_kaggle_clean = df_kaggle_clean.loc[df_kaggle_clean['season'].ge(2018)]

# Parsed dates are tz-naive here; localize them to UTC.
df_kaggle_clean = df_kaggle_clean.assign(
    date=pd.to_datetime(df_kaggle_clean['date']).dt.tz_localize('UTC')
)

# --- 2. Load and Clean Recent API Data ---
# Reads the recent match results CSV and parses its 'date' column.
print("Loading and cleaning recent API data...")
api_file_path = r"C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\premier_league_matches.csv"
df_api = pd.read_csv(api_file_path)
# utc=True always produces a tz-aware UTC column: values that already carry
# an offset are converted to UTC and naive values are treated as UTC.
# The Kaggle and odds dates in this script are localized to UTC, and a naive
# or mixed-offset column here could not be sorted or merged against them.
df_api['date'] = pd.to_datetime(df_api['date'], utc=True)

# --- 3. Load and Clean Betting Odds Data ---
# Reads the latin1-encoded odds CSV, keeps only rows whose 'Div' is 'E0',
# and reduces them to the match keys plus the three B365 odds columns,
# renamed to match the column names used by the results data.
print("Loading and cleaning betting odds data...")
odds_file_path = r"C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\all_avail_games.csv"

# Only the file read can raise FileNotFoundError, so only it is guarded.
try:
    df_odds_raw = pd.read_csv(odds_file_path, encoding='latin1')
except FileNotFoundError:
    print("ERROR: Make sure the odds file is in your 'data/raw/' folder.")
    raise

odds_cols_to_keep = ['Date', 'HomeTeam', 'AwayTeam', 'B365H', 'B365D', 'B365A']

# Row filter, column selection and renaming in a single chained expression.
df_odds = (
    df_odds_raw
    .loc[df_odds_raw['Div'] == 'E0', odds_cols_to_keep]
    .rename(columns={
        'Date': 'date',
        'HomeTeam': 'home_team',
        'AwayTeam': 'away_team',
        'B365H': 'odds_home_win',
        'B365D': 'odds_draw',
        'B365A': 'odds_away_win',
    })
)

# NOTE(review): the date strings are parsed with pandas' default format
# inference; if this file stores day-first dates, confirm they are read as
# intended, since the merge below matches on the exact date.
df_odds['date'] = pd.to_datetime(df_odds['date']).dt.tz_localize('UTC')

# --- 4. Combine All Datasets ---
# Stacks the Kaggle and API results, removes matches that appear in both,
# then attaches the betting odds to each remaining match.
print("Combining all data sources...")
# First, combine the match results (Kaggle rows first, API rows second)
df_combined = pd.concat([df_kaggle_clean, df_api], ignore_index=True)
# Duplicate matches share the same date, so their relative order after the
# sort decides which one keep='last' retains. The default quicksort is not
# stable; a stable sort keeps the concat order among ties, so the API row
# (concatenated last) is the one that survives, deterministically.
df_combined.sort_values('date', kind='mergesort', inplace=True)
df_combined.drop_duplicates(subset=['date', 'home_team', 'away_team'], keep='last', inplace=True)

# Now, merge the combined results with the odds data.
# A left join keeps every match; matches without an odds row get NaN odds.
df_final = pd.merge(df_combined, df_odds, on=['date', 'home_team', 'away_team'], how='left')

# --- 5. Final Touches and Save ---
# Encode the outcome as an integer label: 1 = home win, 0 = draw, 2 = away win.
df_final['target'] = df_final['result'].map({'HOME_TEAM': 1, 'DRAW': 0, 'AWAY_TEAM': 2})
# Rows whose result is missing or not one of the three labels have no
# target and are dropped; only then can the column be cast to int.
df_final.dropna(subset=['target'], inplace=True)
df_final['target'] = df_final['target'].astype(int)

# Fill any missing odds with 0 for now
# NOTE(review): this fills NaN in every column of df_final, not only the
# odds columns — confirm that zero-filling the other columns is intended.
df_final.fillna(0, inplace=True)

# Save to a new 'processed' file (the folder is created if it is missing)
save_path = r"C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\processed\matches_with_features.csv"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
df_final.to_csv(save_path, index=False)

# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n.
print(f"\nSUCCESS! Clean master dataset with odds saved to: {save_path}")
print(f"Total matches: {len(df_final)}")

Loading and cleaning historical Kaggle data...
Loading and cleaning recent API data...
Loading and cleaning betting odds data...
Combining all data sources...
\nSUCCESS! Clean master dataset with odds saved to: C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\processed\matches_with_features.csv
Total matches: 2660
