In [1]:
!playwright install
# !pip install playwright

In [2]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

from playwright.async_api import async_playwright
from playwright.async_api import TimeoutError

import pandas as pd 
import numpy as np

In [8]:
START_YEAR = 2019
END_YEAR = 2023

COMPETITIONS = {
    'ENG': 'england/premier-league', 
    'SPA': 'spain/laliga',
    'ITA': 'italy/serie-a',
    'GER': 'germany/bundesliga',
    'FRA': 'france/ligue-1'
}

COUNTRIES = ['ENG', 'SPA', 'ITA', 'GER', 'FRA']

In [4]:
class Season:

    time_pattern = r'(\d{2}\.\d{2}\.\s\d{2}:\d{2})'

    def __init__(self, year, country, page):
        self.country = country
        self.year = year
        self.page = page
        self.results = {
                'time': [],
                'home_team': [],
                'home_score': [],
                'away_team': [],
                'away_score': [],
            }
        self.standings = []
        self.results_url = f"https://www.flashscore.com/football/{COMPETITIONS[country]}-{self.year}-{self.year+1}/results/"
        self.fixtures_url = f"https://www.flashscore.com/football/{COMPETITIONS[country]}-{self.year}-{self.year+1}/fixtures/"
        self.standings_url = f"https://www.flashscore.com/football/{COMPETITIONS[country]}-{self.year}-{self.year+1}/standings/"

    async def extract_results(self, url):
        try:
            await self.page.wait_for_selector('a.event__more.event__more--static', timeout=5000)
            await self.page.content()
            
            while True:
                show_more_button_locator = self.page.locator('a.event__more.event__more--static')
                if await show_more_button_locator.is_visible():
                    try:
                        await show_more_button_locator.click(timeout=10000)
                        await self.page.wait_for_load_state('domcontentloaded', timeout=10000)
                    except TimeoutError:
                        pass
                else:
                    print(f"No 'Show more' button visible for {self.year}.")
                    break
        except TimeoutError:
            print("No show more button on the page, trying to extract data")
        try:
            times = await self.page.locator("div.event__time").all_text_contents()
            home_teams = await self.page.locator("div.event__participant.event__participant--home").all_text_contents()
            home_scores = await self.page.locator("div.event__score.event__score--home").all_text_contents()
            away_teams = await self.page.locator("div.event__participant.event__participant--away").all_text_contents()
            away_scores = await self.page.locator("div.event__score.event__score--away").all_text_contents()

            self.results['time'].extend(times)
            self.results['home_team'].extend(home_teams)
            self.results['away_team'].extend(away_teams)
            if url == self.fixtures_url:
                self.results['home_score'].extend(np.full(len(times), np.nan))
                self.results['away_score'].extend(np.full(len(times), np.nan))
            else:
                self.results['home_score'].extend(home_scores)
                self.results['away_score'].extend(away_scores)
        except Exception as e:
            print(e)
            # print(f"{'fixtures' if url == self.fixtures_url else 'results'} not found for {COMPETITIONS[self.country]} {self.year} - {self.year+1}")
    
    async def extract_standings(self):
        try:
            await self.page.wait_for_selector('a.tableCellParticipant__name', timeout=5000)
            await self.page.content()

            print("extracting standings")
            teams = await self.page.locator("a.tableCellParticipant__name").all_text_contents()
            print(teams)
            for team in teams:
                team_container = self.page.locator(f"div.ui-table__row:has-text('{team}')")
                values = await team_container.locator("span.table__cell.table__cell--value").all_text_contents()

                team_data = {
                    'TEAM': team,
                    'MP': values[0],
                    'W': values[1],
                    'D': values[2],
                    'L': values[3],
                    'PTS': values[-1].strip()
                }
                self.standings.append(team_data)

            indiv_df = pd.DataFrame(self.standings)
            indiv_df.to_csv(f'../data/machine_learning/tables/{self.country}_{self.year}.csv', index=False)
        except Exception as e:
            print(e)

    async def get_data(self, results=True):
        if results:
            urls = [self.results_url, self.fixtures_url]
        else:
            urls = [self.standings_url]

        for url in urls:
            await self.page.goto(url)

            if results:
                await self.extract_results(url)
            else:
                await self.extract_standings()

    @staticmethod
    def determine_correct_year(row):
        month = int(row['time'].split('.')[1]) 
        if month >= 7:
            return row['season_start_year']
        else:
            return row['season_end_year']

    def get_frame_scores(self):

        print(f"Results: {self.results}")

        indiv_df = pd.DataFrame(self.results)
        indiv_df['season_start_year'] = self.year
        indiv_df['season_end_year'] = self.year+1
        indiv_df['year'] = indiv_df.apply(self.determine_correct_year, axis=1)
        indiv_df['country'] = self.country

        indiv_df['time'] = indiv_df['time'].str.extract(self.time_pattern)[0] # issue with Italy 20.09 Verona vs Roma 20/21
        indiv_df['time'] = pd.to_datetime(indiv_df['time'] + ' ' + indiv_df['year'].astype(str), format='%d.%m. %H:%M %Y')
        indiv_df = indiv_df.sort_values(by='time')
        indiv_df['weekday'] = indiv_df['time'].dt.weekday
        indiv_df['month'] = indiv_df['time'].dt.month
        indiv_df['day'] = indiv_df['time'].dt.day
        indiv_df['hour'] = indiv_df['time'].dt.hour
        indiv_df['minute'] = indiv_df['time'].dt.minute

        indiv_df.to_csv(f"../data/scores/raw/{self.country}{self.year}_scores.csv",index=False)
        print(f"{len(self.results['time'])} rows of data found in {self.year}, saved to CSV")

Get results

In [9]:
async with async_playwright() as p:

    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()

    for country in COMPETITIONS.keys():

        if country != 'ITA':
            continue
    
        # for year in range(START_YEAR, END_YEAR+1):
        for year in range(START_YEAR, END_YEAR+1):

            season = Season(year, country, page)
            await season.get_data()
            season.get_frame_scores()

    await browser.close()

No 'Show more' button visible for 2019.
No show more button on the page, trying to extract data
Results: {'time': ['03.08. 02:45', '03.08. 02:45', '03.08. 02:45', '03.08. 02:45', '03.08. 00:00', '02.08. 02:45', '02.08. 02:45', '02.08. 02:45', '02.08. 02:45', '02.08. 00:00', '30.07. 03:45', '30.07. 03:45', '30.07. 03:45', '30.07. 01:30', '30.07. 01:30', '30.07. 01:30', '30.07. 01:30', '30.07. 01:30', '29.07. 03:45', '29.07. 01:30', '27.07. 03:45', '27.07. 01:30', '27.07. 01:30', '27.07. 01:30', '27.07. 01:30', '26.07. 23:15', '26.07. 03:45', '26.07. 01:30', '25.07. 23:15', '25.07. 03:45', '24.07. 03:45', '24.07. 01:30', '23.07. 03:45', '23.07. 03:45', '23.07. 03:45', '23.07. 03:45', '23.07. 03:45', '23.07. 01:30', '22.07. 03:45', '22.07. 01:30', '21.07. 03:45', '20.07. 03:45', '20.07. 01:30', '20.07. 01:30', '20.07. 01:30', '20.07. 01:30', '19.07. 23:15', '19.07. 03:45', '19.07. 01:30', '18.07. 23:15', '17.07. 03:45', '17.07. 01:30', '16.07. 03:45', '16.07. 03:45', '16.07. 03:45', '16.0

Get table for 2023-24

In [None]:
async with async_playwright() as p:

    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()

    for country in COMPETITIONS.keys():
    
        # for year in range(START_YEAR, END_YEAR+1):
        for year in range(2019, 2024):

            season = Season(year, country, page)
            await season.get_data(results=False)

    await browser.close()