In [9]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import datetime as dt
import time
import pandas as pd

In [10]:
# Season label lookup: starting year -> "start-end" string, newest season first
season_mapping = {
    start_year: f"{start_year}-{start_year + 1}"
    for start_year in range(2023, 2018, -1)
}

In [11]:
# Normalise the scraped rows so they all have the same length (multi-word team names are merged into one field)
def check(data, expected_len=12):
    """Normalise tokenised league-table rows to a common length.

    Each row is a list of whitespace-split tokens. A row longer than
    ``expected_len`` is treated as having a multi-word team name: the surplus
    tokens that follow the first token are joined with single spaces into one
    team-name field, so the row ends up with exactly ``expected_len`` fields.
    Rows shorter than ``expected_len`` are dropped.

    Parameters
    ----------
    data : list of list of str
        Tokenised rows, first token first.
    expected_len : int, default 12
        Number of fields every returned row must have.

    Returns
    -------
    list of list of str
        The rows that could be normalised, in their original order.
    """
    cleaned_list = []
    for row in data:
        surplus = len(row) - expected_len
        if surplus < 0:
            # Too few tokens to be a table row; skip it.
            continue

        # The name occupies one token plus however many surplus tokens exist,
        # so names of any word count collapse into a single field.
        name_end = 2 + surplus
        cleaned_list.append([row[0], ' '.join(row[1:name_end]), *row[name_end:]])

    return cleaned_list

In [12]:
# Create a function that takes the season and scrapes the league table
def table(season):
    """Scrape the Understat EPL league table for one season and return Arsenal's row.

    Parameters
    ----------
    season : int
        Season identifier placed in the Understat URL; it must also be a key
        of ``season_mapping``.

    Returns
    -------
    pandas.DataFrame
        The league-table rows whose ``Team`` is ``'Arsenal'``, with the
        abbreviated column headers renamed, ``Position`` cast to ``int`` and a
        ``season_name`` column holding the label from ``season_mapping``.

    Raises
    ------
    KeyError
        If ``season`` is not a key of ``season_mapping``.
    selenium.common.exceptions.NoSuchElementException
        If the league table element is not found on the page.
    ValueError
        If no rows could be parsed from the table text.
    """
    # Resolve the label first so an unknown season fails before a browser is launched.
    season_name = season_mapping[season]

    driver = webdriver.Chrome()
    try:
        # Let element lookups retry for a few seconds instead of failing immediately.
        driver.implicitly_wait(10)
        driver.get(f"https://understat.com/league/EPL/{season}")
        table_text = driver.find_element(By.ID, 'league-chemp').text
    finally:
        # Always close the browser, even if the scrape raised.
        driver.quit()

    # Split the text into rows of tokens, then normalise every row to the same length.
    rows = check([line.split() for line in table_text.split('\n')])
    if not rows:
        raise ValueError(f"No league table rows could be parsed for season {season}")

    # The first row is the header; the rest are data rows (index kept 1-based).
    header, *records = rows
    league = pd.DataFrame(records, columns=header, index=range(1, len(rows)))

    league = league.rename(columns={'№': 'Position',
                                    'M': 'MatchesPlayed',
                                    'W': 'Wins',
                                    'D': 'Draw',
                                    'L': 'Loss',
                                    'G': 'GoalsScored',
                                    'GA': 'GoalsAgainst',
                                    'PTS': 'Points'}).astype({'Position': int})

    # .assign returns a new frame, so the season label is never written onto a slice.
    df_arsenal = league[league['Team'] == 'Arsenal'].assign(season_name=season_name)

    return df_arsenal


### Historical Data

In [None]:
# Scrape Arsenal's league-table row for each of the seasons 2019 through 2022
df_arsenal_19, df_arsenal_20, df_arsenal_21, df_arsenal_22 = map(table, (2019, 2020, 2021, 2022))

# Stack the four single-season frames into one historical table
previous_season = [df_arsenal_19, df_arsenal_20, df_arsenal_21, df_arsenal_22]
df_league_1922_table = pd.concat(previous_season, ignore_index=True)

# Write the raw historical table to CSV (utf-8-sig prefixes the file with a UTF-8 byte-order mark)
df_league_1922_table.to_csv(
    "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Old Season/temp_leaguetable19-22_raw.csv",
    index=False,
    encoding='utf-8-sig',
)

### Current Season Data
Run the code below to add new data for the current season to the existing data above, then export the updated data.

In [None]:
# Scrape the 2023 season, whose table still changes as matches are played
df_arsenal_23 = table(2023)

# Load the historical 2019-2022 table from its CSV export
previous_table = pd.read_csv(
    "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Old Season/temp_leaguetable19-22_raw.csv"
)

# Append the 2023 row beneath the historical rows
joined_season = pd.concat([previous_table, df_arsenal_23], ignore_index=True)

# Save the combined historical + current table as the updated CSV
joined_season.to_csv(
    "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Updated tables/temp_updatedleaguetable_raw.csv",
    index=False,
    encoding='utf-8-sig',
)