In [2]:
import pandas as pd

# 1. Load both datasets
# Make sure the filenames and paths are correct
kaggle_file_path = r'C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\english_premier_league_2000-2025.csv'
df_kaggle = pd.read_csv(kaggle_file_path)

api_file_path = r'C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\premier_league_matches.csv'
df_api = pd.read_csv(api_file_path)
df_api['date'] = pd.to_datetime(df_api['date'])

# 2. --- Clean and prepare the Kaggle Data ---
# Select only the columns we need
df_kaggle_clean = df_kaggle[['Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals', 'FullTimeAwayGoals', 'FullTimeResult']].copy()

# Rename columns to match our API data's format
df_kaggle_clean.rename(columns={
    'Season': 'season',
    'MatchDate': 'date',
    'HomeTeam': 'home_team',
    'AwayTeam': 'away_team',
    'FullTimeHomeGoals': 'home_goals',
    'FullTimeAwayGoals': 'away_goals',
    'FullTimeResult': 'result_short' # Rename temporarily before mapping
}, inplace=True)

# Map the result ('H', 'A', 'D') to the same format as our API data
df_kaggle_clean['result'] = df_kaggle_clean['result_short'].map({'H': 'HOME_TEAM', 'A': 'AWAY_TEAM', 'D': 'DRAW'})
df_kaggle_clean.drop(columns=['result_short'], inplace=True)

# Convert the date column to datetime objects for proper sorting
df_kaggle_clean['date'] = pd.to_datetime(df_kaggle_clean['date'])

df_kaggle_clean['date'] = df_kaggle_clean['date'].dt.tz_localize('UTC')

# Extract the starting year of the season (e.g., '2023/24' -> 2023)
df_kaggle_clean['season'] = df_kaggle_clean['season'].apply(lambda x: int(x.split('/')[0]))


# 3. Filter for the 7 seasons we want (2018-2025)
df_kaggle_clean = df_kaggle_clean[df_kaggle_clean['season'] >= 2018]


# 4. --- Combine the historical and recent data ---
df_combined = pd.concat([df_kaggle_clean, df_api], ignore_index=True)


# 5. --- Final Cleanup ---
# Sort by date and remove any duplicate matches, keeping the last one
df_combined.sort_values('date', inplace=True)
df_combined.drop_duplicates(subset=['date', 'home_team', 'away_team'], keep='last', inplace=True)


# --- Final Verification ---
print("Successfully combined historical (Kaggle) and recent (API) data.")
print(f"Total matches in the final dataset: {len(df_combined)}")
print("\nLast 5 rows of the new combined dataset:")
print(df_combined.tail())

# We will now use df_combined as our main DataFrame for the rest of the project
df = df_combined.copy()

Successfully combined historical (Kaggle) and recent (API) data.
Total matches in the final dataset: 2660

Last 5 rows of the new combined dataset:
      season                      date                  home_team  \
2655     NaN 2025-08-30 16:30:00+00:00            Leeds United FC   
2656     NaN 2025-08-31 13:00:00+00:00  Brighton & Hove Albion FC   
2657     NaN 2025-08-31 13:00:00+00:00       Nottingham Forest FC   
2658     NaN 2025-08-31 15:30:00+00:00               Liverpool FC   
2659     NaN 2025-08-31 18:00:00+00:00             Aston Villa FC   

                away_team  home_goals  away_goals     result  
2655  Newcastle United FC           0           0       DRAW  
2656   Manchester City FC           2           1  HOME_TEAM  
2657   West Ham United FC           0           3  AWAY_TEAM  
2658           Arsenal FC           1           0  HOME_TEAM  
2659    Crystal Palace FC           0           3  AWAY_TEAM  


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# --- 1. Data Cleaning and Preparation ---

# Fix the missing season values for recent games by inferring from the date
df['season'] = df.apply(lambda row: row['date'].year if row['date'].month >= 8 else row['date'].year - 1, axis=1)

# Ensure date is a datetime object and sort
df['date'] = pd.to_datetime(df['date'])
df.sort_values('date', inplace=True)

# Create our numerical target variable
df['target'] = df['result'].map({'HOME_TEAM': 1, 'DRAW': 0, 'AWAY_TEAM': 2})
# Drop rows where the result is unknown for any reason
df.dropna(subset=['target'], inplace=True)
df['target'] = df['target'].astype(int)


# --- 2. Feature Engineering: Rolling Form ---
print("Calculating rolling form features...")
ROLLING_WINDOW = 5
teams = pd.unique(df[['home_team', 'away_team']].values.ravel('K'))
form_features = ['form_points', 'form_goals_scored', 'form_goals_conceded', 'form_goal_difference']
for side in ['home', 'away']:
    for feature in form_features:
        df[f'{side}_{feature}'] = 0.0

for team in teams:
    team_matches = df[(df['home_team'] == team) | (df['away_team'] == team)]
    points = team_matches.apply(lambda row: 3 if (row['home_team'] == team and row['result'] == 'HOME_TEAM') or \
                                                 (row['away_team'] == team and row['result'] == 'AWAY_TEAM') else \
                                             1 if row['result'] == 'DRAW' else 0, axis=1)
    goals_scored = team_matches.apply(lambda row: row['home_goals'] if row['home_team'] == team else row['away_goals'], axis=1)
    goals_conceded = team_matches.apply(lambda row: row['away_goals'] if row['home_team'] == team else row['home_goals'], axis=1)
    
    rolling_stats = {
        'form_points': points.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean(),
        'form_goals_scored': goals_scored.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean(),
        'form_goals_conceded': goals_conceded.shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean(),
        'form_goal_difference': (goals_scored - goals_conceded).shift(1).rolling(window=ROLLING_WINDOW, min_periods=1).mean()
    }

    for feature, values in rolling_stats.items():
        df.loc[team_matches.index, f'home_{feature}'] = team_matches.index.map(values).where(team_matches['home_team'] == team, df.loc[team_matches.index, f'home_{feature}'])
        df.loc[team_matches.index, f'away_{feature}'] = team_matches.index.map(values).where(team_matches['away_team'] == team, df.loc[team_matches.index, f'away_{feature}'])

df.fillna(0, inplace=True)


# --- 3. Feature Engineering: League Table (with season resets) ---
print("Calculating league table features...")
team_stats = {}
current_season = None
for side in ['home', 'away']:
    df[f'{side}_rank'], df[f'{side}_points'], df[f'{side}_goals_for'], df[f'{side}_goals_against'] = [20, 0, 0, 0]

for index, row in df.iterrows():
    if row['season'] != current_season:
        current_season = row['season']
        team_stats = {}
        print(f"--- Processing season: {current_season}-{current_season+1} ---")

    for team_type, team in [('home', row['home_team']), ('away', row['away_team'])]:
        if team not in team_stats:
            team_stats[team] = {'points': 0, 'goals_for': 0, 'goals_against': 0}

    if len(team_stats) > 0:
        temp_table = pd.DataFrame.from_dict(team_stats, orient='index')
        temp_table['goal_difference'] = temp_table['goals_for'] - temp_table['goals_against']
        temp_table = temp_table.sort_values(['points', 'goal_difference'], ascending=False)
        temp_table['rank'] = range(1, len(temp_table) + 1)
        
        home_team, away_team = row['home_team'], row['away_team']
        df.loc[index, 'home_rank'] = temp_table.loc[home_team, 'rank']
        df.loc[index, 'away_rank'] = temp_table.loc[away_team, 'rank']
        df.loc[index, 'home_points'] = team_stats[home_team]['points']
        df.loc[index, 'away_points'] = team_stats[away_team]['points']

    if row['result'] == 'HOME_TEAM': team_stats[row['home_team']]['points'] += 3
    elif row['result'] == 'AWAY_TEAM': team_stats[row['away_team']]['points'] += 3
    else:
        team_stats[row['home_team']]['points'] += 1
        team_stats[row['away_team']]['points'] += 1
    
    team_stats[row['home_team']]['goals_for'] += row['home_goals']
    team_stats[row['home_team']]['goals_against'] += row['away_goals']
    team_stats[row['away_team']]['goals_for'] += row['away_goals']
    team_stats[row['away_team']]['goals_against'] += row['home_goals']


# --- 4. Model Training ---
print("\nTraining the final model...")
features = [
    'home_form_points', 'home_form_goals_scored', 'home_form_goals_conceded', 'home_form_goal_difference',
    'away_form_points', 'away_form_goals_scored', 'away_form_goals_conceded', 'away_form_goal_difference',
    'home_rank', 'home_points',
    'away_rank', 'away_points',
]

# Drop the first season (2018) as the features will be unstable
df_model = df[df['season'] > 2018].copy()

X = df_model[features]
y = df_model['target']

# Use data from before the 2024 season to train, and test on the 2024-25 season
X_train = X[df_model['season'] < 2024]
X_test = X[df_model['season'] >= 2024]
y_train = y[df_model['season'] < 2024]
y_test = y[df_model['season'] >= 2024]

# Initialize and train the Random Forest model
final_model = RandomForestClassifier(n_estimators=100, random_state=42, min_samples_leaf=5)
final_model.fit(X_train, y_train)

# Evaluate the final model
final_accuracy = final_model.score(X_test, y_test)

print("\n--- FINAL MODEL EVALUATION ---")
print(f"Model accuracy on unseen data (2024+ seasons): {final_accuracy*100:.2f}%")



Calculating rolling form features...
Calculating league table features...
--- Processing season: 2018-2019 ---
--- Processing season: 2019-2020 ---
--- Processing season: 2020-2021 ---
--- Processing season: 2021-2022 ---
--- Processing season: 2022-2023 ---
--- Processing season: 2023-2024 ---
--- Processing season: 2024-2025 ---
--- Processing season: 2025-2026 ---

Training the final model...

--- FINAL MODEL EVALUATION ---
Model accuracy on unseen data (2024+ seasons): 47.89%


In [5]:
import pandas as pd
import os

# --- 1. Load Both Raw Datasets ---
kaggle_file_path = r'C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\english_premier_league_2000-2025.csv'
df_kaggle = pd.read_csv(kaggle_file_path)

# This is the file created by our API script
api_file_path = r'C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\premier_league_matches.csv'
df_api = pd.read_csv(api_file_path)


# --- 2. Clean and Unify the Kaggle Data ---
print("Cleaning historical data from Kaggle...")
# Select and rename columns to match our convention
df_kaggle_clean = df_kaggle[['Season', 'MatchDate', 'HomeTeam', 'AwayTeam', 'FullTimeHomeGoals', 'FullTimeAwayGoals', 'FullTimeResult']].copy()
df_kaggle_clean.rename(columns={
    'Season': 'season', 'MatchDate': 'date', 'HomeTeam': 'home_team', 'AwayTeam': 'away_team',
    'FullTimeHomeGoals': 'home_goals', 'FullTimeAwayGoals': 'away_goals', 'FullTimeResult': 'result_short'
}, inplace=True)

# Map the result format ('H'/'A'/'D') to our standard format
df_kaggle_clean['result'] = df_kaggle_clean['result_short'].map({'H': 'HOME_TEAM', 'A': 'AWAY_TEAM', 'D': 'DRAW'})
df_kaggle_clean.drop(columns=['result_short'], inplace=True)

# Extract the season year
df_kaggle_clean['season'] = df_kaggle_clean['season'].apply(lambda x: int(x.split('/')[0]))
df_kaggle_clean = df_kaggle_clean[df_kaggle_clean['season'] >= 2018]


# --- 3. Clean and Unify the API Data ---
print("Cleaning recent data from the API...")
# The API data already has the correct column names, we just need to handle the date
# The date column from the API is already timezone-aware (UTC), which is good.
df_api['date'] = pd.to_datetime(df_api['date'])


# --- 4. FIX THE TIMEZONE ERROR ---
# Convert the Kaggle date column to datetime, then make it timezone-aware (localizing to UTC)
df_kaggle_clean['date'] = pd.to_datetime(df_kaggle_clean['date']).dt.tz_localize('UTC')


# --- 5. Combine and Save the Final Dataset ---
print("Combining datasets...")
df_combined = pd.concat([df_kaggle_clean, df_api], ignore_index=True)

# Sort by date and remove any duplicates
df_combined.sort_values('date', inplace=True)
df_combined.drop_duplicates(subset=['date', 'home_team', 'away_team'], keep='last', inplace=True)

# Add our numerical target column
df_combined['target'] = df_combined['result'].map({'HOME_TEAM': 1, 'DRAW': 0, 'AWAY_TEAM': 2})
df_combined.dropna(subset=['target'], inplace=True) # Drop rows if result is unknown
df_combined['target'] = df_combined['target'].astype(int)


# --- 6. Save to a new 'processed' folder ---
save_path = r'C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\cleaned_matches.csv'
os.makedirs(os.path.dirname(save_path), exist_ok=True)
df_combined.to_csv(save_path, index=False)

print(f"\\nSUCCESS! Clean master dataset saved to: {save_path}")
print(f"Total matches: {len(df_combined)}")
print("You can now use this clean file for all future feature engineering and modeling.")


Cleaning historical data from Kaggle...
Cleaning recent data from the API...
Combining datasets...
\nSUCCESS! Clean master dataset saved to: C:\Users\HRITHIK S\MY PROJECTS\football-predictor\data\raw\cleaned_matches.csv
Total matches: 2660
You can now use this clean file for all future feature engineering and modeling.
