In [2]:
import pandas as pd
import pickle
import requests
import time
from bs4 import BeautifulSoup

In [3]:
model = pickle.load(open('XGBoostmodel.pkl','rb'))

In [4]:
def rolling_averages(group, cols, new_cols):
    group = group.sort_values('date')
    # closed = 'left' is take the current week out
    rolling_stats = group[cols].rolling(3, closed='left').mean()
    group[new_cols] = rolling_stats
    group = group.dropna(subset=new_cols)
    return group

In [6]:

def get_data(team_name, opponent_name, home, time_start, day):
    standings_url = 'https://fbref.com/en/comps/9/Premier-League-Stats'
    data = requests.get(standings_url)
    soup = BeautifulSoup(data.text)
    standings_table = soup.select('table.stats_table')[0]
    links = [l.get('href') for l in standings_table.find_all('a')]
    links = [l for l in links if '/squads/' in l]
    name = [l.split('/')[-1].replace('-Stats', '').replace('-',' ') for l in links]
    df = pd.DataFrame({'team': name, 'link': links})
    team_url = df[df['team'] == team_name]['link'].values[0]

    team_data = requests.get(f'https://fbref.com{team_url}')
    matches = pd.read_html(team_data.text, match='Scores & Fixtures')[0]

    soup = BeautifulSoup(team_data.text)
    links = [l.get('href') for l in soup.find_all('a')]
    links = [l for l in links if l and '/all_comps/shooting/' in l]
    data = requests.get(f'https://fbref.com{links[0]}')
    shooting = pd.read_html(data.text, match='Shooting')[0]
    shooting.columns = shooting.columns.droplevel()

    # Because some team don't have shooting stat then when merge it error
    try:
        team_data = matches.merge(shooting[['Date', 'Sh', 'SoT', 'Dist', 'FK', 'PK', 'PKatt']], on='Date')
    except ValueError:
        # nothing to do
        pass

    # only want teamd data in PremierLeauge compettition
    team_data = team_data[team_data['Comp'] == 'Premier League']
    team_data['Season'] = 2024
    team_data['Team'] = team_name
    team_data.columns = [c.lower() for c in team_data.columns]

    # from parameter
    opponent_code_df = pd.read_csv('team_code.csv')
    opp_code = opponent_code_df[opponent_code_df['opponent'] == opponent_name]['opp_code'].values[0]
    venue_Home = 1 if home else 0
    mapping_day = {'Mon': 0, 'Tue': 1, 'Wed': 2, 'Thu': 3, 'Fri': 4, 'Sat': 5, 'Sun': 6}
    day_code = mapping_day[day]
    hour = time_start

    cols = ['gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt', 'xg', 'xga']
    new_cols = [f'{c}_rollinig' for c in cols]

    team_data['date'] = pd.to_datetime(team_data['date'])
    team_data = team_data.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))
    team_data = team_data.droplevel('team')
    team_data.index = range(team_data.shape[0])

    team_data = team_data.tail(1)
    team_data['opp_code'] = opp_code
    team_data['venue_Home'] = venue_Home
    team_data['day_code'] = day_code
    team_data['hour'] = hour

    cols_to_return = ['venue_Home', 'opp_code', 'hour', 'day_code'] + new_cols

    return team_data[cols_to_return]



In [7]:
def predict_match(model, match_data_team1, match_data_team2):
    inverse_mapping = { 1: 'Win', 0: 'Not win'}
    match_data_to_predict = pd.concat([match_data_team1, match_data_team2], axis=0)
    predicted = model.predict(match_data_to_predict)
    predicted_team1 = inverse_mapping[predicted[0]]
    predicted_team2 = inverse_mapping[predicted[1]]
    return predicted_team1, predicted_team2

In [10]:
# Input Here 
time_start = 19
day = 'Wed'

# First Team
team1 = 'Southampton'
team2_opp = 'Chelsea'
home1 = True

# Second Team
team1_y = 'Chelsea'
team2_opp_y = 'Southampton'
home2 = False

match_data_team1 = get_data(team1, team2_opp, home=home1, time_start=time_start, day=day)
time.sleep(5)
match_data_team2 = get_data(team1_y, team2_opp_y, home=home2, time_start=time_start, day=day)

  matches = pd.read_html(team_data.text, match='Scores & Fixtures')[0]
  shooting = pd.read_html(data.text, match='Shooting')[0]
  team_data = team_data.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))
  matches = pd.read_html(team_data.text, match='Scores & Fixtures')[0]
  shooting = pd.read_html(data.text, match='Shooting')[0]
  team_data = team_data.groupby('team').apply(lambda x: rolling_averages(x, cols, new_cols))


In [11]:
predicted_mutd_labels, predicted_ars_labels = predict_match(model, match_data_team1, match_data_team2)

print(f'{team1} will: {predicted_mutd_labels} {team2_opp}')
print(f'{team1_y} will: {predicted_ars_labels} {team2_opp_y}')

Southampton will: Not win Chelsea
Chelsea will: Win Southampton
