In [None]:
# Cleaning la_liga_history.csv
import os
import pandas as pd
from pathlib import Path

# Set project root and define directories (update this path before running)
project_root = Path("PATH/TO/La_Liga_Match_Predictor").resolve()  # Replace with your actual path
os.chdir(project_root)

data_dir = project_root / "01_scraping"
output_dir = project_root / "02_preprocessing"
output_dir.mkdir(parents=True, exist_ok=True)

# Load the scraped La Liga match history
la_liga_data_path = data_dir / "la_liga_history.csv"
la_liga_history = pd.read_csv(la_liga_data_path)

# Convert Date column to datetime
la_liga_history["Date"] = pd.to_datetime(la_liga_history["Date"])

# Keep only relevant columns. The explicit .copy() makes the subset an independent
# frame, so the column assignments below never write into a slice of the raw data.
keep_cols = ["Season", "Date", "Team", "Opponent", "Result", "GF", "GA", "SoT", "PK", "PKatt"]
la_liga_history = la_liga_history[keep_cols].copy()

# Convert Result column into binary Target variable (1 = win, 0 = anything else)
la_liga_history["Target"] = (la_liga_history["Result"] == "W").astype(int)

# Standardize team names: hyphens become spaces, then accented / short names are
# mapped onto one canonical spelling.
team_name_mapping = {
    "Alavés": "Alaves",
    "Cádiz": "Cadiz",
    "Atlético Madrid": "Atletico Madrid",
    "Leganés": "Leganes",
    "Betis": "Real Betis"
}
for name_col in ("Team", "Opponent"):
    la_liga_history[name_col] = (
        la_liga_history[name_col]
        .str.replace("-", " ", regex=False)  # literal replacement, independent of pandas' regex default
        .replace(team_name_mapping)
    )

# Assign numeric codes to teams (alphabetical order of the names in the Team column)
team_mapping = {team: idx for idx, team in enumerate(sorted(la_liga_history["Team"].unique()))}
la_liga_history["Team_Code"] = la_liga_history["Team"].map(team_mapping)
la_liga_history["Opp_Code"] = la_liga_history["Opponent"].map(team_mapping)

# An opponent whose name never appears in the Team column gets a NaN Opp_Code;
# report it instead of letting it pass silently (usually a spelling not covered
# by team_name_mapping).
unmapped_opponents = la_liga_history.loc[la_liga_history["Opp_Code"].isna(), "Opponent"].unique()
if len(unmapped_opponents) > 0:
    print(f"Opponents without a Team_Code: {sorted(map(str, unmapped_opponents))}")

# Save cleaned data
cleaned_data_path = output_dir / "la_liga_history.csv"
la_liga_history.to_csv(cleaned_data_path, index=False)

print("Success.")

In [4]:
# Cleaning la_liga_recent.csv using la_liga_history.csv
from pathlib import Path
import pandas as pd
import os

# Set the project root (update this path before running)
project_root = Path("PATH/TO/La_Liga_Match_Predictor").resolve()  # Replace with your actual path
os.chdir(project_root)

data_dir = project_root / "01_scraping"
output_dir = project_root / "02_preprocessing"
output_dir.mkdir(parents=True, exist_ok=True)

recent_games_path = data_dir / "la_liga_recent.csv"
recent_games = pd.read_csv(recent_games_path)

# Parse dates so that the "most recent matches" sort further down is chronological
# rather than a lexicographic sort of the raw CSV strings.
recent_games["Date"] = pd.to_datetime(recent_games["Date"])

# Clean Data
recent_games["Team"] = recent_games["Team"].str.replace("-", " ", regex=False)

# Standardize team names
recent_games["Team"] = recent_games["Team"].replace({
    "Alavés": "Alaves",
    "Cádiz": "Cadiz",
    "Atlético Madrid": "Atletico Madrid",
    "Leganés": "Leganes",
    "Betis": "Real Betis"
})

# Load the cleaned history file with Team_Code
history_data_path = output_dir / "la_liga_history.csv"
history_data = pd.read_csv(history_data_path)

# Extract the team mapping (one row per distinct Team / Team_Code pair)
team_mapping = history_data[["Team", "Team_Code"]].drop_duplicates()
team_mapping_dict = dict(zip(team_mapping["Team"], team_mapping["Team_Code"]))

# Map Team_Code for recent games
recent_games["Team_Code"] = recent_games["Team"].map(team_mapping_dict)

# Ensure all teams are mapped properly; name the offenders so the spelling can be fixed
unmapped_teams = recent_games.loc[recent_games["Team_Code"].isna(), "Team"].unique()
if len(unmapped_teams) > 0:
    print(
        "Some teams in `la_liga_recent.csv` are missing a `Team_Code`: "
        f"{sorted(map(str, unmapped_teams))}"
    )
# Compute Recent Form Score
def calculate_recent_form_score(team, data, n_matches=6, win_weight=30,
                                goal_diff_weight=5, goal_diff_scale=10):
    """Score a team's recent form from its latest matches.

    Parameters
    ----------
    team : value compared against ``data["Team_Code"]``.
    data : DataFrame with ``Team_Code``, ``Date``, ``Result``, ``GF`` and ``GA`` columns.
    n_matches : number of most recent rows (by ``Date``, descending) to consider.
    win_weight : points awarded when every one of the ``n_matches`` is a win.
    goal_diff_weight : points awarded per ``goal_diff_scale`` goals of goal difference.
    goal_diff_scale : goal difference that is worth ``goal_diff_weight`` points.

    Returns
    -------
    ``(wins / n_matches) * win_weight + (goal_difference / goal_diff_scale) * goal_diff_weight``.
    The win share is always divided by ``n_matches``, even when the team has fewer
    rows than that; a team with no rows scores 0.
    """
    recent_matches = (
        data.loc[data["Team_Code"] == team]
        .sort_values("Date", ascending=False)
        .head(n_matches)
    )
    # Vectorised count of wins instead of Python-level sum() over the Series
    wins = recent_matches["Result"].eq("W").sum()
    goal_difference = recent_matches["GF"].sum() - recent_matches["GA"].sum()

    return ((wins / n_matches) * win_weight) + ((goal_difference / goal_diff_scale) * goal_diff_weight)

# The form score depends only on the team, so compute it once per distinct
# Team_Code and broadcast it to the rows instead of re-filtering and re-sorting
# the whole frame for every row.
form_by_team = {
    team: calculate_recent_form_score(team, recent_games)
    for team in recent_games["Team_Code"].dropna().unique()
}
# Rows without a Team_Code match no games and therefore score 0, as before.
recent_games["Recent_Form_Score"] = recent_games["Team_Code"].map(form_by_team).fillna(0.0)


# Compute Key Stats Score

key_stats = ["GF", "GA", "SoT", "PK", "PKatt"]

# Min-max scale each key stat over the recent games.
# A stat with a single distinct value has a zero range and scales to NaN;
# the row-wise sum below skips NaN, so such a stat contributes nothing.
for stat in key_stats:
    stat_min = recent_games[stat].min()
    stat_max = recent_games[stat].max()
    recent_games[f"{stat}_Scaled"] = (recent_games[stat] - stat_min) / (stat_max - stat_min)

# NOTE(review): GA is added with the same sign as GF here — confirm that is intended.
recent_games["Key_Stats_Score"] = recent_games[[f"{stat}_Scaled" for stat in key_stats]].sum(axis=1) * 5

# Save `la_liga_recent.csv`
recent_games_path = output_dir / "la_liga_recent.csv"
recent_games.to_csv(recent_games_path, index=False)

print("Success.")

Success.


In [1]:
# Getting md23.csv from la_liga_24_25

import pandas as pd
from pathlib import Path

# Set paths (update the project root before running)
project_root = Path("PATH/TO/La_Liga_Match_Predictor").resolve()  # Replace with your actual path
data_dir = project_root / "01_scraping"
output_dir = project_root / "02_preprocessing"
output_dir.mkdir(parents=True, exist_ok=True)

# Load 24/25 La Liga data
future_games_path = data_dir / "la_liga_24_25.csv"
future_games = pd.read_csv(future_games_path)

# Select row labels 221 through 230 (`.loc` slices include both ends), taken to be
# Matchweek 23.
# NOTE(review): the previous comment said "rows 212 to 221" while the code has always
# sliced 221:230 — confirm the range against the scraped file.
MD23_FIRST_ROW = 221
MD23_LAST_ROW = 230
md23 = future_games.loc[MD23_FIRST_ROW:MD23_LAST_ROW].reset_index(drop=True)

# Rename columns
md23 = md23.rename(columns={"Home": "Team", "Away": "Opponent"})

# Standardize team names
team_name_mapping = {
    "Alavés": "Alaves",
    "Cádiz": "Cadiz",
    "Atlético Madrid": "Atletico Madrid",
    "Leganés": "Leganes",
    "Betis": "Real Betis"
}
md23["Team"] = md23["Team"].replace(team_name_mapping)
md23["Opponent"] = md23["Opponent"].replace(team_name_mapping)

# Load history data to get team codes
history_data_path = output_dir / "la_liga_history.csv"
history_data = pd.read_csv(history_data_path)

# Build the name -> code lookup from the distinct (Team, Team_Code) pairs
team_codes = history_data[["Team", "Team_Code"]].drop_duplicates()
team_mapping = dict(zip(team_codes["Team"], team_codes["Team_Code"]))

# Map Team_Code and Opp_Code
md23["Team_Code"] = md23["Team"].map(team_mapping)
md23["Opp_Code"] = md23["Opponent"].map(team_mapping)

# A fixture team missing from the history file ends up with a NaN code; report it
unmapped_names = pd.concat([
    md23.loc[md23["Team_Code"].isna(), "Team"],
    md23.loc[md23["Opp_Code"].isna(), "Opponent"],
]).unique()
if len(unmapped_names) > 0:
    print(f"Fixture teams without a code in `la_liga_history.csv`: {sorted(map(str, unmapped_names))}")

# Save cleaned `md23.csv`
md23_path = output_dir / "md23.csv"
md23.to_csv(md23_path, index=False)

print("Success.")

Success.


In [3]:
# Scoring the md23.csv fixtures using la_liga_history.csv and la_liga_recent.csv

import pandas as pd
import numpy as np
from pathlib import Path

# Set paths (update the project root before running).
# The three input files are written to <project root>/02_preprocessing by the
# earlier cells, so resolve that sub-directory rather than the project root itself.
project_root = Path("PATH/TO/La_Liga_Match_Predictor").resolve()  # Replace with your actual path
preprocessing_dir = project_root / "02_preprocessing"

# Load files
history_data_path = preprocessing_dir / "la_liga_history.csv"
recent_data_path = preprocessing_dir / "la_liga_recent.csv"
md23_path = preprocessing_dir / "md23.csv"
# Output deliberately shares the input path: md23.csv is overwritten with the scored fixtures
md23_scores_path = preprocessing_dir / "md23.csv"

history_data = pd.read_csv(history_data_path)
recent_data = pd.read_csv(recent_data_path)
md23 = pd.read_csv(md23_path)

# Convert Date to datetime for sorting
history_data["Date"] = pd.to_datetime(history_data["Date"])
recent_data["Date"] = pd.to_datetime(recent_data["Date"])

# Compute Past Encounters Score
def calculate_past_encounter_score(team, opponent, data, n_matches=5, weight=35):
    """Score ``team`` on its most recent head-to-head meetings with ``opponent``.

    Parameters
    ----------
    team, opponent : values compared against ``data["Team_Code"]`` / ``data["Opp_Code"]``.
    data : DataFrame with ``Team_Code``, ``Opp_Code``, ``Date`` and ``Result`` columns,
        where ``Result`` is given from the point of view of ``Team_Code``.
    n_matches : number of most recent meetings considered.
    weight : score awarded when ``team`` won all ``n_matches`` meetings.

    Returns
    -------
    ``(wins / n_matches) * weight``; 0 when the two sides have never met.

    Notes
    -----
    A meeting may be stored once or twice (one row per side). Rows are therefore
    collapsed to one entry per ``Date`` before the window is taken, so mirrored rows
    do not use up the window. A win is recognised from either side's row.
    Assumes a loss is encoded as ``"L"`` in ``Result`` — TODO confirm against the data.
    """
    own_rows = (data["Team_Code"] == team) & (data["Opp_Code"] == opponent)
    mirrored_rows = (data["Team_Code"] == opponent) & (data["Opp_Code"] == team)
    is_meeting = own_rows | mirrored_rows

    # `team` won when its own row says "W" or the opponent's row says "L"
    team_won = (own_rows & (data["Result"] == "W")) | (mirrored_rows & (data["Result"] == "L"))

    # One entry per match date: True if any row for that date records a win for `team`
    won_by_date = team_won[is_meeting].groupby(data.loc[is_meeting, "Date"]).any()
    latest_meetings = won_by_date.sort_index(ascending=False).head(n_matches)
    team_wins = latest_meetings.sum()

    return (team_wins / n_matches) * weight

# Evaluate the head-to-head score fixture by fixture against the full history,
# then attach the resulting column to the fixture table.
past_encounter_scores = md23.apply(
    lambda fixture: calculate_past_encounter_score(
        fixture["Team_Code"], fixture["Opp_Code"], history_data
    ),
    axis=1,
)
md23["Past_Encounters_Score"] = past_encounter_scores

# Compute Recent Form Score
def calculate_recent_form_score(team, data, n_matches=6, win_weight=30,
                                goal_diff_weight=5, goal_diff_scale=10):
    """Score a team's recent form from its latest matches.

    Parameters
    ----------
    team : value compared against ``data["Team_Code"]``.
    data : DataFrame with ``Team_Code``, ``Date``, ``Result``, ``GF`` and ``GA`` columns.
    n_matches : number of most recent rows (by ``Date``, descending) to consider.
    win_weight : points awarded when every one of the ``n_matches`` is a win.
    goal_diff_weight : points awarded per ``goal_diff_scale`` goals of goal difference.
    goal_diff_scale : goal difference that is worth ``goal_diff_weight`` points.

    Returns
    -------
    ``(wins / n_matches) * win_weight + (goal_difference / goal_diff_scale) * goal_diff_weight``.
    The win share is always divided by ``n_matches``, even when the team has fewer
    rows than that; a team with no rows scores 0.
    """
    recent_matches = (
        data.loc[data["Team_Code"] == team]
        .sort_values("Date", ascending=False)
        .head(n_matches)
    )
    # Vectorised count of wins instead of Python-level sum() over the Series
    wins = recent_matches["Result"].eq("W").sum()
    goal_difference = recent_matches["GF"].sum() - recent_matches["GA"].sum()

    return ((wins / n_matches) * win_weight) + ((goal_difference / goal_diff_scale) * goal_diff_weight)

md23["Recent_Form_Score"] = md23["Team_Code"].apply(
    lambda team: calculate_recent_form_score(team, recent_data))

# Compute Key Stats Score using `la_liga_recent.csv`
key_stats = ["GF", "GA", "SoT", "PK", "PKatt"]

# Group by Team_Code and compute mean from recent games
team_avg_stats = recent_data.groupby("Team_Code")[key_stats].mean().reset_index()

# `md23.csv` is read from and written back to the same file, so after a first run it
# already carries the key-stat columns. Drop any such leftovers before merging;
# otherwise the merge produces GF_x / GF_y and the scaling loop fails with a KeyError.
stale_columns = key_stats + [f"{stat}_Scaled" for stat in key_stats]
md23 = md23.drop(columns=stale_columns, errors="ignore")

# Merge into `md23`
md23 = md23.merge(team_avg_stats, on="Team_Code", how="left")

# Min-max scale the Key Stats across the fixtures.
# A stat with a single distinct value has a zero range and scales to NaN;
# the row-wise sum below skips NaN, so such a stat contributes nothing.
for stat in key_stats:
    stat_min = md23[stat].min()
    stat_max = md23[stat].max()
    md23[f"{stat}_Scaled"] = (md23[stat] - stat_min) / (stat_max - stat_min)

md23["Key_Stats_Score"] = md23[[f"{stat}_Scaled" for stat in key_stats]].sum(axis=1) * 5

# Unpredictability Factor: a random integer from 0 to 5 per fixture.
# Seeded so a re-run reproduces the same file.
UNPREDICTABILITY_SEED = 42  # set to None for a fresh, non-reproducible draw
rng = np.random.default_rng(UNPREDICTABILITY_SEED)
md23["Unpredictability_Factor"] = rng.integers(0, 6, size=len(md23))

# Save `md23.csv`
md23.to_csv(md23_scores_path, index=False)

print("Success")

Success
