In [1]:
# One-time environment setup: fetch the browser binaries that Playwright drives.
# The `playwright` command is provided by the Python package, so on a fresh
# environment the commented-out pip line below has to be run first.
!playwright install
# !pip install playwright

In [1]:
import asyncio
import nest_asyncio
nest_asyncio.apply()

from playwright.async_api import async_playwright
from playwright.async_api import TimeoutError

import pandas as pd 


In [17]:
# First and last season start years to scrape; the driver loop iterates
# range(START_YEAR, END_YEAR + 1), so both ends are included.
START_YEAR = 2023
END_YEAR = 2023

# Country code -> "<country>/<league>" path segment used to build the
# Flashscore results URL for that competition.
COMPETITIONS = dict(
    ENG='england/premier-league',
    SPA='spain/laliga',
    ITA='italy/serie-a',
    GER='germany/bundesliga',
    FRA='france/ligue-1',
)

# Country codes in scrape order, derived from COMPETITIONS so the two
# cannot drift apart.
COUNTRIES = list(COMPETITIONS)

In [18]:
class Season:
    """Results of one league season scraped from a Flashscore results page.

    Typical use: construct with a Playwright page, ``await get_results()`` to
    scrape the raw text columns into ``self.results``, then call
    ``get_frame_scores()`` to parse them and write a CSV.

    Attributes:
        country: Key into ``COMPETITIONS`` identifying the league.
        year: Calendar year in which the season starts.
        page: Playwright (async) page used for all browser interaction.
        results: ``None`` until ``get_results()`` has run; afterwards a dict
            mapping column name to a list of scraped text values.
        url: Results page URL built from the competition path and the
            ``year``-``year + 1`` season label.
    """

    # Matches the first "DD.MM. HH:MM" substring of a scraped time cell.
    time_pattern = r'(\d{2}\.\d{2}\.\s\d{2}:\d{2})'

    # Link that loads the next batch of matches on the results page.
    show_more_selector = 'a.event__more.event__more--static'

    # Give up after this many back-to-back click timeouts instead of retrying forever.
    max_consecutive_timeouts = 5

    def __init__(self, year, country, page):
        self.country = country
        self.year = year
        self.page = page
        self.results = None
        self.url = f"https://www.flashscore.com/football/{COMPETITIONS[country]}-{self.year}-{self.year+1}/results/"

    async def get_results(self):
        """Load the results page, expand it fully and scrape the raw columns.

        Clicks the "Show more" link until it is no longer visible, then stores
        the text of every time/team/score cell in ``self.results``.

        Raises:
            TimeoutError: If neither the "Show more" link nor any match row
                appears after navigation, or if clicking the link times out
                ``max_consecutive_timeouts`` times in a row.
        """
        await self.page.goto(self.url)
        try:
            await self.page.wait_for_selector(self.show_more_selector)
        except TimeoutError:
            # A page short enough to need no "Show more" link is fine; only
            # fail when no match rows were rendered either.
            if await self.page.locator("div.event__time").count() == 0:
                raise

        show_more_button = self.page.locator(self.show_more_selector)
        consecutive_timeouts = 0
        while await show_more_button.is_visible():
            try:
                await show_more_button.click(timeout=10000)
                await self.page.wait_for_load_state('domcontentloaded', timeout=10000)
            except TimeoutError:
                # Occasional timeouts are tolerated (the loop re-checks the
                # button), but a button that never becomes clickable must not
                # spin forever.
                consecutive_timeouts += 1
                if consecutive_timeouts >= self.max_consecutive_timeouts:
                    raise
            else:
                consecutive_timeouts = 0
        print(f"No 'Show more' button visible for {self.year}.")

        column_selectors = {
            'time': "div.event__time",
            'home_team': "div.event__participant.event__participant--home",
            'home_score': "div.event__score.event__score--home",
            'away_team': "div.event__participant.event__participant--away",
            'away_score': "div.event__score.event__score--away",
        }
        scraped = {}
        for column, selector in column_selectors.items():
            scraped[column] = await self.page.locator(selector).all_text_contents()
        self.results = scraped

    @staticmethod
    def determine_correct_year(row):
        """Pick the calendar year a match belongs to from its month.

        Args:
            row: Mapping with a ``'time'`` string whose second dot-separated
                field is the month, plus ``'season_start_year'`` and
                ``'season_end_year'``.

        Returns:
            ``row['season_start_year']`` for months June-December, otherwise
            ``row['season_end_year']``.
        """
        month = int(row['time'].split('.')[1])
        if month >= 6:
            return row['season_start_year']
        else:
            return row['season_end_year']

    def _build_frame(self):
        """Turn ``self.results`` into a DataFrame sorted by parsed kickoff time.

        Raises:
            RuntimeError: If ``get_results()`` has not populated ``self.results``.
            ValueError: If the scraped columns differ in length or are empty.
        """
        if self.results is None:
            raise RuntimeError(
                "No results scraped yet; await get_results() before get_frame_scores()."
            )

        lengths = {column: len(values) for column, values in self.results.items()}
        if len(set(lengths.values())) > 1:
            raise ValueError(f"Scraped columns have mismatched lengths: {lengths}")
        if not any(lengths.values()):
            raise ValueError(f"No matches scraped for {self.country} {self.year}.")

        frame = pd.DataFrame(self.results)
        frame['season_start_year'] = self.year
        frame['season_end_year'] = self.year + 1
        # The year must be derived from the raw text before 'time' is overwritten below.
        frame['correct_year'] = frame.apply(self.determine_correct_year, axis=1)

        frame['time'] = frame['time'].str.extract(self.time_pattern)[0]  # issue with Italy 20.09 Verona vs Roma 20/21
        # The scraped stamp carries no year, so append the inferred one before parsing.
        frame['time'] = pd.to_datetime(
            frame['time'] + ' ' + frame['correct_year'].astype(str),
            format='%d.%m. %H:%M %Y',
        )
        return frame.sort_values(by='time')

    def get_frame_scores(self):
        """Parse the scraped results and save them as a per-season CSV.

        Writes ``../data/scores/raw/<country><year>_scores.csv`` and prints the
        number of rows saved.

        Raises:
            RuntimeError: If ``get_results()`` has not been awaited first.
            ValueError: If the scraped columns are empty or inconsistent.
        """
        frame = self._build_frame()
        frame.to_csv(f"../data/scores/raw/{self.country}{self.year}_scores.csv", index=False)
        print(f"{len(frame)} rows of data found in {self.year}, saved to CSV")

In [None]:
async with async_playwright() as p:

    browser = await p.chromium.launch(headless=True)
    page = await browser.new_page()

    for country in COMPETITIONS.keys():
    
        for year in range(START_YEAR, END_YEAR+1):

            season = Season(year, country, page)
            await season.get_results()
            season.get_frame_scores()

    await browser.close()