In [1]:
# One-time environment setup: download the browser binaries Playwright drives.
# This needs the `playwright` package to be installed already; if it is not,
# run the pip command that is commented out below first.
!playwright install
# !pip install playwright

In [2]:
import asyncio
import os

import nest_asyncio
import pandas as pd
from playwright.async_api import async_playwright
from playwright.async_api import TimeoutError

nest_asyncio.apply()


In [3]:
# Default season range for frame_for_scores. A season is identified by the
# year it starts in (1992 -> the 1992-1993 results page), and both bounds are
# inclusive when the range is scraped.
START_YEAR = 1992
END_YEAR = 2022

In [20]:
BASE_URL = "https://www.flashscore.com/football/england"

async def get_scores(start_year, end_year, max_click_timeouts=5):
    """Scrape Premier League results for every season in a range of years.

    For each season the results page is opened in headless Chromium, the
    "Show more" link is clicked until it is no longer visible, and the text
    of the time, team and score elements is collected.

    Args:
        start_year: Starting year of the first season to scrape.
        end_year: Starting year of the last season to scrape (inclusive).
        max_click_timeouts: Number of consecutive click timeouts tolerated
            on one season before the "Show more" loop is abandoned and the
            rows loaded so far are used.

    Returns:
        dict mapping each season's starting year to a dict with the keys
        'time', 'home_team', 'home_score', 'away_team' and 'away_score',
        each holding the list of text contents found on the page.
    """
    show_more_selector = 'a.event__more.event__more--static'
    results = {}

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()

            for year in range(start_year, end_year + 1):
                url = f'{BASE_URL}/premier-league-{year}-{year+1}/results/'
                await page.goto(url)

                # TimeoutError is Playwright's (imported at the top of the
                # notebook). A page that never shows the link should not
                # abort the whole run; the loop below then exits at once.
                try:
                    await page.wait_for_selector(show_more_selector)
                except TimeoutError:
                    print(f"'Show more' button never appeared for {year}; using the rows already on the page.")

                show_more_button_locator = page.locator(show_more_selector)
                consecutive_timeouts = 0
                while True:
                    if not await show_more_button_locator.is_visible():
                        print(f"No 'Show more' button visible for {year}.")
                        break
                    try:
                        await show_more_button_locator.click(timeout=10000)
                        await page.wait_for_load_state('domcontentloaded', timeout=10000)
                        consecutive_timeouts = 0
                    except TimeoutError:
                        # Retry, but not forever: a link that stays visible
                        # yet can never be clicked would otherwise hang here.
                        consecutive_timeouts += 1
                        if consecutive_timeouts >= max_click_timeouts:
                            print(f"Giving up on 'Show more' for {year} after {max_click_timeouts} consecutive timeouts.")
                            break

                times = await page.locator("div.event__time").all_text_contents()
                home_teams = await page.locator("div.event__participant.event__participant--home").all_text_contents()
                home_scores = await page.locator("div.event__score.event__score--home").all_text_contents()
                away_teams = await page.locator("div.event__participant.event__participant--away").all_text_contents()
                away_scores = await page.locator("div.event__score.event__score--away").all_text_contents()

                results[year] = {
                    'time': times,
                    'home_team': home_teams,
                    'home_score': home_scores,
                    'away_team': away_teams,
                    'away_score': away_scores,
                }
        finally:
            # Close the browser on the error path as well as on success.
            await browser.close()

    return results

In [21]:
def determine_correct_year(row, season_start_month=8):
    """Return the calendar year in which a match was played.

    The scraped 'time' text carries day and month but no year, so the year
    is picked from the season the match belongs to: months from
    `season_start_month` onwards fall in the season's first calendar year,
    earlier months in its second.

    The default cutoff is August. A June cutoff would misdate the June and
    July matches of the 2019-20 season, which finished in summer 2020, by
    assigning them to 2019.

    Args:
        row: Mapping or DataFrame row with 'time' (text shaped like
            'DD.MM. HH:MM'), 'season_start_year' and 'season_end_year'.
        season_start_month: First month (1-12) counted as part of the
            season's starting year.

    Returns:
        row['season_start_year'] or row['season_end_year'].
    """
    # The month is the second dot-separated field of 'DD.MM. HH:MM'.
    month = int(row['time'].split('.')[1])
    if month >= season_start_month:
        return row['season_start_year']
    return row['season_end_year']

In [22]:
def get_frame_scores(results, output_dir="./data/scores"):
    """Turn scraped season results into sorted per-season CSV files.

    For every season a DataFrame is built from the scraped lists, the
    year-less 'time' text is completed with the year returned by
    determine_correct_year and parsed into a timestamp, the rows are sorted
    chronologically and the frame is written to
    `<output_dir>/PL<year>_scores.csv`.

    Args:
        results: dict mapping a season's starting year to a dict of
            equal-length lists keyed 'time', 'home_team', 'home_score',
            'away_team' and 'away_score' (the shape get_scores returns).
        output_dir: Directory the CSV files are written to; it is created
            if it does not exist.

    Raises:
        ValueError: If a season's lists differ in length or a 'time' value
            does not match the 'DD.MM. HH:MM' layout.
    """
    # to_csv does not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    for year, result in results.items():

        indiv_df = pd.DataFrame(result)
        if indiv_df.empty:
            # A row-wise apply on an empty frame does not yield a column,
            # so an empty season is reported and skipped instead of crashing.
            print(f"No rows of data found in {year}, nothing saved")
            continue

        indiv_df['season_start_year'] = year
        indiv_df['season_end_year'] = year+1
        indiv_df['correct_year'] = indiv_df.apply(determine_correct_year, axis=1)
        indiv_df['time'] = pd.to_datetime(
            indiv_df['time'] + ' ' + indiv_df['correct_year'].astype(str),
            format='%d.%m. %H:%M %Y',
        )
        indiv_df = indiv_df.sort_values(by='time')

        indiv_df.to_csv(os.path.join(output_dir, f"PL{year}_scores.csv"), index=False)
        print(f"{len(result['time'])} rows of data found in {year}, saved to CSV")

In [23]:
async def frame_for_scores(start_year=START_YEAR, end_year=END_YEAR):
    """Scrape the seasons from start_year through end_year and save them.

    Awaits get_scores for the given range and passes what it returns
    straight to get_frame_scores. Nothing is returned.
    """
    get_frame_scores(await get_scores(start_year, end_year))

In [24]:
await frame_for_scores(start_year=1992, end_year=2022)

No 'Show more' button visible for 1992.
No 'Show more' button visible for 1993.
No 'Show more' button visible for 1994.
No 'Show more' button visible for 1995.
No 'Show more' button visible for 1996.
No 'Show more' button visible for 1997.
No 'Show more' button visible for 1998.
No 'Show more' button visible for 1999.
No 'Show more' button visible for 2000.
No 'Show more' button visible for 2001.
No 'Show more' button visible for 2002.
No 'Show more' button visible for 2003.
No 'Show more' button visible for 2004.
No 'Show more' button visible for 2005.
No 'Show more' button visible for 2006.
No 'Show more' button visible for 2007.
No 'Show more' button visible for 2008.
No 'Show more' button visible for 2009.
No 'Show more' button visible for 2010.
No 'Show more' button visible for 2011.
No 'Show more' button visible for 2012.
No 'Show more' button visible for 2013.
No 'Show more' button visible for 2014.
No 'Show more' button visible for 2015.
No 'Show more' button visible for 2016.


In [None]:
# get_frame_scores stores 'time' as a full timestamp; parse it on load so the
# column comes back as datetimes rather than plain strings.
df = pd.read_csv('./data/scores/PL1992_scores.csv', parse_dates=['time'])