In [112]:
import pandas as pd

#loading the full match history (path is relative to the notebook's working directory)
df = pd.read_csv('all_matches.csv')

#flagging the World Cup Qualifier matches (case-insensitive, a missing tournament counts as "no match")
is_qualifier = df['tournament'].str.contains("World Cup Qualifier", case=False, na=False)

#keeping only the qualifiers, dropping every row played on a neutral field
#and renumbering the remaining rows with a fresh index
#(`== False` is kept on purpose instead of `~` so non-boolean/missing values are handled exactly as before)
world_cup_qualifiers = (
    df.loc[is_qualifier]
    .loc[lambda matches: matches['neutral'] == False]
    .reset_index(drop=True)
)

#making the index start at 1 instead of 0
world_cup_qualifiers.index += 1

world_cup_qualifiers

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,country,neutral
1,1933-06-11,Sweden,Estonia,6,2,World Cup qualifier,Sweden,False
2,1933-06-29,Lithuania,Sweden,0,2,World Cup qualifier,Lithuania,False
3,1933-09-24,Yugoslavia,Switzerland,2,2,World Cup qualifier,Yugoslavia,False
4,1933-10-15,Poland,Czechoslovakia,1,2,World Cup qualifier,Poland,False
5,1933-10-29,Switzerland,Romania,2,2,World Cup qualifier,Switzerland,False
...,...,...,...,...,...,...,...,...
6812,2024-10-15,Japan,Australia,1,1,World Cup qualifier,Japan,False
6813,2024-10-15,China,Indonesia,2,1,World Cup qualifier,China,False
6814,2024-10-15,Colombia,Chile,4,0,World Cup qualifier,Colombia,False
6815,2024-10-15,Brazil,Peru,4,0,World Cup qualifier,Brazil,False


In [113]:
#adding the target variable: the goal difference of each match,
#computed as home goals minus away goals (positive = home side scored more)
world_cup_qualifiers['goal_difference'] = world_cup_qualifiers['home_score'].sub(
    world_cup_qualifiers['away_score']
)
world_cup_qualifiers

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,country,neutral,goal_difference
1,1933-06-11,Sweden,Estonia,6,2,World Cup qualifier,Sweden,False,4
2,1933-06-29,Lithuania,Sweden,0,2,World Cup qualifier,Lithuania,False,-2
3,1933-09-24,Yugoslavia,Switzerland,2,2,World Cup qualifier,Yugoslavia,False,0
4,1933-10-15,Poland,Czechoslovakia,1,2,World Cup qualifier,Poland,False,-1
5,1933-10-29,Switzerland,Romania,2,2,World Cup qualifier,Switzerland,False,0
...,...,...,...,...,...,...,...,...,...
6812,2024-10-15,Japan,Australia,1,1,World Cup qualifier,Japan,False,0
6813,2024-10-15,China,Indonesia,2,1,World Cup qualifier,China,False,1
6814,2024-10-15,Colombia,Chile,4,0,World Cup qualifier,Colombia,False,4
6815,2024-10-15,Brazil,Peru,4,0,World Cup qualifier,Brazil,False,4


In [115]:
#helpers that look up the rolling averages for the Home and Away team of a single match row
def get_home_rolling_averages(row):
    """Return the home team's scored/conceded averages for one match row.

    Meant to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'home_team'`` and ``'date'``.

    Returns
    -------
    pd.Series
        Two values, in order: home scored average, home conceded average.

    Notes
    -----
    Reads the notebook-level ``world_cup_qualifiers`` frame and delegates to
    ``calculate_averages``, which is defined in another cell. That helper is
    expected to return exactly four values; only the first two are used here.
    NOTE(review): presumably the averages only cover matches before ``date`` -
    confirm against ``calculate_averages``.
    """
    scored_avg, conceded_avg, _unused_scored, _unused_conceded = calculate_averages(
        world_cup_qualifiers, row['home_team'], row['date']
    )
    return pd.Series([scored_avg, conceded_avg])

def get_away_rolling_averages(row):
    """Return the away team's scored/conceded averages for one match row.

    Meant to be used with ``DataFrame.apply(..., axis=1)``.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``'away_team'`` and ``'date'``.

    Returns
    -------
    pd.Series
        Two values, in order: away scored average, away conceded average.

    Notes
    -----
    Reads the notebook-level ``world_cup_qualifiers`` frame and delegates to
    ``calculate_averages``, which is defined in another cell. That helper is
    expected to return exactly four values; only the last two are used here.
    NOTE(review): presumably the averages only cover matches before ``date`` -
    confirm against ``calculate_averages``.
    """
    _unused_scored, _unused_conceded, scored_avg, conceded_avg = calculate_averages(
        world_cup_qualifiers, row['away_team'], row['date']
    )
    return pd.Series([scored_avg, conceded_avg])

In [116]:
#computing the home-side averages row by row and storing them as two new columns
home_averages = world_cup_qualifiers.apply(get_home_rolling_averages, axis=1)
world_cup_qualifiers[['home_scored_avg', 'home_conceded_avg']] = home_averages

#same for the away side
away_averages = world_cup_qualifiers.apply(get_away_rolling_averages, axis=1)
world_cup_qualifiers[['away_scored_avg', 'away_conceded_avg']] = away_averages

#discarding every match that has a missing value in any column
#(this includes matches for which an average could not be computed)
world_cup_qualifiers = world_cup_qualifiers.dropna()

world_cup_qualifiers

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,country,neutral,goal_difference,home_scored_avg,home_conceded_avg,away_scored_avg,away_conceded_avg
368,1965-09-19,Luxembourg,Yugoslavia,2,5,World Cup qualifier,Luxembourg,False,-3,1.0,3.7,1.3,1.3
403,1968-06-19,Finland,Belgium,1,2,World Cup qualifier,Finland,False,-1,1.1,2.9,2.8,2.5
408,1968-10-09,Belgium,Finland,6,1,World Cup qualifier,Belgium,False,5,2.3,1.6,0.8,4.4
413,1968-10-16,Belgium,Yugoslavia,3,0,World Cup qualifier,Belgium,False,3,2.7,1.3,1.3,1.0
426,1968-11-06,France,Norway,0,1,World Cup qualifier,France,False,-1,3.8,0.6,0.7,3.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6812,2024-10-15,Japan,Australia,1,1,World Cup qualifier,Japan,False,0,2.4,0.5,0.9,1.1
6813,2024-10-15,China,Indonesia,2,1,World Cup qualifier,China,False,1,1.5,0.8,0.7,3.6
6814,2024-10-15,Colombia,Chile,4,0,World Cup qualifier,Colombia,False,4,1.3,0.6,0.6,2.1
6815,2024-10-15,Brazil,Peru,4,0,World Cup qualifier,Brazil,False,4,2.4,0.4,0.3,1.1


In [117]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

#features X: the scored/conceded averages of the home and the away team
feature_columns = ['home_scored_avg', 'home_conceded_avg', 'away_scored_avg', 'away_conceded_avg']
X = world_cup_qualifiers[feature_columns]

#target variable y: the goal difference of the match
y = world_cup_qualifiers['goal_difference']

#holding out 20% of the matches as test set; the fixed random_state keeps the split reproducible
#NOTE(review): the split is random, not chronological - confirm that mixing earlier and later matches is intended
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#initializing and training the Gradient Boosting Regressor (seeded for reproducibility)
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

#initializing and training the Linear Regression on the same training set
lr_model = LinearRegression().fit(X_train, y_train)

#showing the fitted linear model as the cell output
lr_model