In [679]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# Cap rendered DataFrames at 10 rows so cell outputs stay short.
pd.options.display.max_rows = 10

In [680]:
# Read the match history (the first CSV column becomes the index) and, in the
# same step, parse the kick-off strings in `date_GMT` into datetime values so
# they can be compared and decomposed later on.
df = pd.read_csv("LaLigaHistory.csv", index_col=0).assign(
    date_GMT=lambda matches: pd.to_datetime(
        matches["date_GMT"], format="%b %d %Y - %I:%M%p"
    )
)

In [681]:
# Display the DataFrame and its data types for verification
#print(df.dtypes)

In [682]:
# Turn the text columns into integer category codes the model can consume.
# Each column is encoded independently of the others.
df["home_team_code"] = pd.Categorical(df["home_team_name"]).codes
df["away_team_code"] = pd.Categorical(df["away_team_name"]).codes
df["stadium_code"] = pd.Categorical(df["stadium_name"]).codes

# Day of the week of the kick-off as an integer feature.
df["day_of_week"] = df["date_GMT"].dt.dayofweek

# Numeric label for the match result: "W" -> 1, "L" -> 0, "D" -> 2.
# Any other value in `Result` maps to NaN.
df["target"] = df["Result"].map({"W": 1, "L": 0, "D": 2})

# Display the first few rows to ensure correct mapping and transformations

#df.head()

In [683]:
# Chronological split: matches before 2023-08-05 train the model; matches from
# that date up to (but not including) 2024-04-01 are held out for evaluation.
train = df.loc[df["date_GMT"] < "2023-08-05"]
test = df.loc[df["date_GMT"].between("2023-08-05", "2024-04-01", inclusive="left")]

# Random forest with a fixed seed so that reruns build the same trees.
rf = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=10)

# Feature columns handed to the model.
predictor = [
    "home_team_code",
    "away_team_code",
    "stadium_code",
    "day_of_week",
    "Game Week",
    "time",
]

rf.fit(train[predictor], train["target"])
preds = rf.predict(test[predictor])
# NOTE: despite its name, `error` holds the accuracy returned by accuracy_score.
error = accuracy_score(test["target"], preds)

In [684]:
# Create a DataFrame to compare actual vs predicted results
combined = pd.DataFrame(dict(actual=test["target"], predicted = preds))

# Display a crosstabulation of actual vs predicted results for better analysis
pd.crosstab(index = combined["actual"], columns=combined["predicted"])

predicted,0,1,2
actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,25,40,12
1,11,97,25
2,18,61,9


In [685]:
# Group the matches by home side and pull out one team's home games to try the
# rolling-average helper on a single group first.
grouped_home_team_df = df.groupby(by="home_team_name")
group_home = grouped_home_team_df.get_group(name="Real Madrid")

In [686]:
def rolling_avg(group_home, cols, new_cols, window=5):
    """Add rolling means over the previous ``window`` matches to one team's rows.

    Parameters
    ----------
    group_home : pandas.DataFrame
        Matches of a single team. Must contain a ``date_GMT`` column and
        every column named in ``cols``. The frame passed in is not modified.
    cols : list of str
        Columns to average.
    new_cols : list of str
        Names for the rolling-average columns, in the same order as ``cols``.
    window : int, default 5
        Number of preceding matches in each average.

    Returns
    -------
    pandas.DataFrame
        The rows sorted by ``date_GMT`` with ``new_cols`` added. Rows that do
        not have ``window`` earlier matches (or whose average is NaN) are
        dropped.

    Raises
    ------
    ValueError
        If ``cols`` and ``new_cols`` differ in length.
    """
    if len(cols) != len(new_cols):
        raise ValueError("cols and new_cols must have the same length")

    # sort_values returns a new frame, so the caller's data stays untouched.
    ordered = group_home.sort_values("date_GMT")
    # closed='left' leaves the current match out of its own window, so each
    # average only uses matches played before it (no look-ahead leakage).
    rolling_stats = ordered[cols].rolling(window, closed='left').mean()
    ordered[new_cols] = rolling_stats
    return ordered.dropna(subset=new_cols)

In [687]:
# Home-side match statistics to smooth, and the names of the derived columns.
cols = ["home_team_goal_count", "home_team_shots", "home_team_shots_on_target"]
new_cols = [stat + "_rolling" for stat in cols]

In [688]:
# Preview the helper on the single-team group; the result is only displayed,
# not stored.
rolling_avg(group_home, cols, new_cols)

Unnamed: 0_level_0,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),home_ppg,...,Result,time,home_team_code,away_team_code,stadium_code,day_of_week,target,home_team_goal_count_rolling,home_team_shots_rolling,home_team_shots_on_target_rolling
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1541258100,2018-11-03 15:15:00,complete,68050.0,Real Madrid,Real Valladolid,Jesús Gil Manzano,11,2.00,1.80,2.11,...,W,15,19,21,21,5,1,1.6,15.6,7.8
1543693500,2018-12-01 19:45:00,complete,69653.0,Real Madrid,Valencia CF,José Luis González González,14,2.17,1.50,2.11,...,W,19,19,26,21,5,1,1.6,16.6,8.8
1544895000,2018-12-15 17:30:00,complete,55229.0,Real Madrid,Rayo Vallecano,Ricardo De Burgos Bengoetxea,16,2.29,0.57,2.11,...,L,17,19,17,21,5,0,1.2,15.8,8.0
1546795800,2019-01-06 17:30:00,complete,53412.0,Real Madrid,Real Sociedad,José Luis Munuera Montero,18,2.38,1.44,2.11,...,L,17,19,20,21,6,0,1.2,14.6,7.8
1547910900,2019-01-19 15:15:00,complete,68232.0,Real Madrid,Sevilla FC,Antonio Miguel Mateu Lahoz,20,2.11,1.30,2.11,...,L,15,19,24,21,5,0,1.2,15.4,8.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1743890400,2025-04-05 22:00:00,incomplete,,Real Madrid,Valencia CF,,30,0.00,0.00,0.00,...,D,22,19,26,21,5,2,0.0,-1.0,-1.0
1745100000,2025-04-19 22:00:00,incomplete,,Real Madrid,Athletic Club Bilbao,,32,0.00,0.00,0.00,...,D,22,19,1,21,5,2,0.0,-1.0,-1.0
1746309600,2025-05-03 22:00:00,incomplete,,Real Madrid,Celta de Vigo,,34,0.00,0.00,0.00,...,D,22,19,6,21,5,2,0.0,-1.0,-1.0
1747173600,2025-05-13 22:00:00,incomplete,,Real Madrid,RCD Mallorca,,36,0.00,0.00,0.00,...,D,22,19,16,21,1,2,0.0,-1.0,-1.0


In [689]:
# Build the rolling features one home team at a time and stitch the pieces
# back together. Iterating over the groups instead of `groupby(...).apply`
# keeps the `home_team_name` column in every piece and avoids pandas'
# deprecated behaviour of `apply` operating on the grouping column. The dict
# keys become the outer `home_team_name` index level that `apply` produced.
df_rolling = pd.concat(
    {
        team: rolling_avg(team_matches, cols, new_cols)
        for team, team_matches in df.groupby("home_team_name")
    },
    names=["home_team_name"],
)
#df_rolling

  df_rolling = df.groupby("home_team_name").apply(lambda x: rolling_avg(x,cols,new_cols))


In [690]:
# Display only: droplevel returns a new frame and the result is not assigned,
# so df_rolling itself still carries the "home_team_name" index level here.
df_rolling.droplevel("home_team_name")

Unnamed: 0_level_0,date_GMT,status,attendance,home_team_name,away_team_name,referee,Game Week,Pre-Match PPG (Home),Pre-Match PPG (Away),home_ppg,...,Result,time,home_team_code,away_team_code,stadium_code,day_of_week,target,home_team_goal_count_rolling,home_team_shots_rolling,home_team_shots_on_target_rolling
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1667044800,2022-10-29 12:00:00,complete,,Almería,Celta de Vigo,Carlos Del Cerro Grande,12,1.80,0.60,1.74,...,W,12,0,6,31,5,1,1.8,11.4,5.6
1668016800,2022-11-09 18:00:00,complete,11454.0,Almería,Getafe CF,Alejandro Muñíz Ruiz,14,2.00,1.33,1.74,...,W,18,0,10,31,2,1,2.2,13.4,5.4
1673182800,2023-01-08 13:00:00,complete,,Almería,Real Sociedad,Javier Alberola Rojas,16,2.14,2.14,1.74,...,L,13,0,20,31,6,0,2.0,12.6,5.0
1673795700,2023-01-15 15:15:00,complete,,Almería,Atletico Madrid,José María Sánchez Martínez,17,1.88,2.00,1.74,...,D,15,0,2,31,6,2,2.0,12.4,4.8
1674849600,2023-01-27 20:00:00,complete,11269.0,Almería,RCD Espanyol,Javier Iglesias Villanueva,19,1.78,1.22,1.74,...,W,20,0,15,31,4,1,1.6,13.8,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1743890400,2025-04-05 22:00:00,incomplete,,Villarreal,Athletic Club Bilbao,,30,1.00,0.00,1.00,...,D,22,27,1,27,5,2,0.0,-1.0,-1.0
1745100000,2025-04-19 22:00:00,incomplete,,Villarreal,Real Sociedad,,32,1.00,0.00,1.00,...,D,22,27,20,27,5,2,0.0,-1.0,-1.0
1746309600,2025-05-03 22:00:00,incomplete,,Villarreal,CA Osasuna,,34,1.00,0.00,1.00,...,D,22,27,4,27,5,2,0.0,-1.0,-1.0
1747173600,2025-05-13 22:00:00,incomplete,,Villarreal,Leganés,,36,1.00,1.00,1.00,...,D,22,27,13,27,1,2,0.0,-1.0,-1.0


In [691]:
# Replace the index with a plain 0..n-1 range so every row has a unique label.
df_rolling.index = pd.RangeIndex(len(df_rolling))
#df_rolling

In [692]:
from sklearn.metrics import precision_score

def future(data, predictor, cutoff='2024-04-01'):
    """Fit the shared model on matches before ``cutoff`` and predict the rest.

    Refits the module-level ``rf`` estimator in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``date_GMT``, ``target`` and every column in ``predictor``.
    predictor : list of str
        Feature columns passed to the model.
    cutoff : str or datetime-like, default '2024-04-01'
        Matches strictly before this moment are used for training; matches
        at or after it are predicted.

    Returns
    -------
    precision : float
        Weighted precision of the predictions against ``target``.
    combined : pandas.DataFrame
        One row per predicted match, indexed like the predicted rows, with
        columns ``actual`` and ``predictor`` (the predicted class).
    """
    train = data[data["date_GMT"] < cutoff]
    # `>=` rather than `>`: a match stamped exactly at the cutoff must land in
    # one of the two sets instead of being dropped from both.
    test = data[data["date_GMT"] >= cutoff]

    rf.fit(train[predictor], train["target"])
    preds = rf.predict(test[predictor])

    combined = pd.DataFrame(dict(actual = test["target"], predictor = preds), index = test.index)
    precision = precision_score(test["target"], preds, average='weighted')

    return precision, combined



In [693]:
# Score the model with the rolling features added to the base predictors.
# NOTE(review): the rows shown after the cutoff include fixtures whose status
# is "incomplete" and whose Result is "D"; confirm whether those rows should
# count toward the precision reported here.
precision, combined = future(df_rolling, predictor + new_cols) 
precision

np.float64(0.8239309523744013)

In [694]:
#combined

In [695]:
# Attach date, teams and recorded result to every prediction by matching on
# the shared row index.
combined = pd.merge(
    combined,
    df_rolling[["date_GMT", "home_team_name", "away_team_name", "Result"]],
    left_index=True,
    right_index=True,
)
combined

Unnamed: 0,actual,predictor,date_GMT,home_team_name,away_team_name,Result
29,0,1,2024-04-21 14:15:00,Almería,Villarreal,L
30,0,1,2024-04-27 14:15:00,Almería,Getafe CF,L
31,0,0,2024-05-16 19:30:00,Almería,FC Barcelona,L
32,1,1,2024-05-25 16:30:00,Almería,Cadiz,W
119,2,1,2024-04-14 16:30:00,Athletic Club Bilbao,Villarreal,D
...,...,...,...,...,...,...
2136,2,2,2025-04-05 22:00:00,Villarreal,Athletic Club Bilbao,D
2137,2,2,2025-04-19 22:00:00,Villarreal,Real Sociedad,D
2138,2,1,2025-05-03 22:00:00,Villarreal,CA Osasuna,D
2139,2,2,2025-05-13 22:00:00,Villarreal,Leganés,D


In [696]:

def calculate_final_standings(combined, top_n=20):
    """Turn predicted match outcomes into a league table.

    Parameters
    ----------
    combined : pandas.DataFrame
        One row per match with columns ``home_team_name``,
        ``away_team_name`` and ``predictor`` (the predicted class:
        1 = home win, 0 = away win, 2 = draw). Any other value awards no
        points, but both teams still appear in the table.
    top_n : int, default 20
        Number of rows kept from the top of the table.

    Returns
    -------
    pandas.DataFrame
        Columns ``Team`` and ``Points``, sorted by points in descending
        order. Teams level on points keep the order in which they first
        appear in ``combined``. The table is also printed.
    """
    standings = {}

    # Plain column iteration is enough here and avoids building a Series per
    # row the way iterrows does.
    fixtures = zip(
        combined["home_team_name"],
        combined["away_team_name"],
        combined["predictor"],
    )
    for home_team, away_team, result in fixtures:
        # Register both sides so teams without any points still get listed.
        standings.setdefault(home_team, 0)
        standings.setdefault(away_team, 0)

        if result == 1:
            standings[home_team] += 3
        elif result == 0:
            standings[away_team] += 3
        elif result == 2:
            standings[home_team] += 1
            standings[away_team] += 1

    standings_df = (
        pd.DataFrame(list(standings.items()), columns=['Team', 'Points'])
        # A stable sort makes the order of tied teams deterministic.
        .sort_values(by='Points', ascending=False, kind='stable')
        .reset_index(drop=True)
        .head(top_n)
    )

    print("\nFinal League Standings:")
    print(standings_df)

    return standings_df


# The trailing semicolon keeps the cell output to the printed table only.
calculate_final_standings(combined);

    


Final League Standings:
               Team  Points
0       Real Madrid      91
1     Real Sociedad      82
2        Real Betis      69
3        Sevilla FC      68
4      FC Barcelona      68
..              ...     ...
15       Villarreal      47
16          Leganés      47
17     RCD Mallorca      45
18        Getafe CF      43
19  Real Valladolid      39

[20 rows x 2 columns]
