In [40]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

In [41]:
# Map each season's starting year to its 'YYYY-YYYY' label (newest season first).
# The labels are what the scraping functions below interpolate into FBref URLs.
season_mapping = {
    start_year: f'{start_year}-{start_year + 1}'
    for start_year in range(2023, 2018, -1)
}

# Function to scrape - Wages

In [42]:
def wages(season):
    """Scrape Arsenal's wage table for one season from FBref.

    Parameters
    ----------
    season : str
        Season label that is interpolated into the FBref URL, e.g.
        ``'2022-2023'`` (the values of ``season_mapping``).

    Returns
    -------
    pandas.DataFrame
        The first five columns of the page's ``id="wages"`` table plus a
        ``season_name`` column holding ``season``. Rows without a value in
        ``'Weekly Wages'`` are removed; the remaining rows keep their
        original index labels (the index is not reset).

    Raises
    ------
    KeyError
        If ``'Weekly Wages'`` is not among the first five columns.
    """
    page_url = f'https://fbref.com/en/squads/18bb7c10/{season}/wages/Arsenal-Wage-Details'

    wages_table = pd.read_html(page_url, attrs={'id': "wages"})[0]

    # Take an explicit copy of the column slice: assigning a new column to a
    # bare ``iloc`` slice is chained assignment and triggers
    # SettingWithCopyWarning on pandas versions without copy-on-write.
    df_wages = wages_table.iloc[:, 0:5].copy()
    df_wages['season_name'] = season

    # Remove blank rows (no weekly wage recorded).
    df_wages = df_wages.dropna(subset=['Weekly Wages'])

    return df_wages


In [44]:
# Scrape the four completed seasons in chronological order (2019 through 2022).
dfs_to_concat = [
    wages(season_mapping[start_year]) for start_year in (2019, 2020, 2021, 2022)
]
# Keep the per-season frames available under their individual names as well.
df_wages_19, df_wages_20, df_wages_21, df_wages_22 = dfs_to_concat

# Stack the seasons into one table with a fresh 0..n-1 index.
df_wages_1922_list = pd.concat(dfs_to_concat, ignore_index=True)

# Export the combined historical table to .csv
wages_history_csv = "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Old Season/temp_wages19-22_raw.csv"
df_wages_1922_list.to_csv(wages_history_csv, index=False, encoding='utf-8-sig')

Run the cell below to scrape the current season's wages and append them to the historical table exported above.

In [45]:
# Scrape the current season (re-run whenever the data should be refreshed)
df_wages_23 = wages(season_mapping[2023])

# Load the historical 2019-2022 wages table from its CSV export (pandas read_csv)
wages_history_csv = "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Old Season/temp_wages19-22_raw.csv"
previous_table = pd.read_csv(wages_history_csv)

# Append the new season below the old seasons, renumbering the index
joined_season = pd.concat([previous_table, df_wages_23], ignore_index=True)

# Export the combined (old + new) table as the updated wages file
wages_updated_csv = "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Updated tables/temp_updatedwages_raw.csv"
joined_season.to_csv(wages_updated_csv, index=False, encoding='utf-8-sig')

# Function to scrape - Nationality

In [46]:

def nationality(season):
    """Scrape the distinct player nationalities from Arsenal's FBref roster page.

    Opens the roster page for ``season`` in a Chrome WebDriver session, reads
    the text of every element with class ``roster-player-info``, splits it
    into lines and keeps the fourth line of each element as the nationality.
    NOTE(review): this relies on the fourth text line of each roster card
    being the nationality -- confirm against the current page layout.

    Parameters
    ----------
    season : str
        Season label that is interpolated into the FBref URL, e.g.
        ``'2022-2023'`` (the values of ``season_mapping``).

    Returns
    -------
    pandas.DataFrame
        Single-column frame named ``'Nationality'`` with duplicates and
        missing values removed.

    Raises
    ------
    IndexError
        If no roster element yields at least four text lines (e.g. the page
        did not load or its layout changed).
    """
    page_url = f"https://fbref.com/en/squads/18bb7c10/{season}/roster/Arsenal-Roster-Details"

    driver = webdriver.Chrome()
    try:
        driver.get(page_url)
        player_cards = driver.find_elements(By.CLASS_NAME, 'roster-player-info')
        # Extract the text while the browser session is still alive; the
        # elements cannot be read once the driver has quit.
        card_lines = [card.text.split('\n') for card in player_cards]
    finally:
        # Always shut the browser down so repeated calls do not leak
        # Chrome/chromedriver processes, even when the scrape fails.
        driver.quit()

    df_nationality = pd.DataFrame(card_lines)
    if df_nationality.shape[1] < 4:
        raise IndexError(
            f"No roster entry with at least 4 text lines found at {page_url}"
        )

    # Same steps, in the same order, as before: de-duplicate, renumber the
    # index, name the column, then drop rows with no nationality.
    df_nations = (
        df_nationality.iloc[:, [3]]
        .drop_duplicates()
        .reset_index(drop=True)
        .rename(columns={3: 'Nationality'})
        .dropna()
    )

    return df_nations

### Historical Data

In [47]:
# Scrape the four completed seasons in chronological order (2019 through 2022).
previous_season = [
    nationality(season_mapping[start_year]) for start_year in (2019, 2020, 2021, 2022)
]
# Keep the per-season frames available under their individual names as well.
df_nationality_19, df_nationality_20, df_nationality_21, df_nationality_22 = previous_season

# Join all the old data together into one table with a fresh 0..n-1 index.
df_nationality_1922_table = pd.concat(previous_season, ignore_index=True)

# Export the combined historical table to .csv
nationality_history_csv = "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Old Season/temp_nationality19-22_raw.csv"
df_nationality_1922_table.to_csv(nationality_history_csv, index=False, encoding='utf-8-sig')

### Current Season Data
Run the code below to add new data for the current season to the existing data above, then export the updated data.

In [49]:
# Scrape the current season (re-run whenever the data should be refreshed)
df_nationality_23 = nationality(season_mapping[2023])

# Load the historical 2019-2022 nationality table from its CSV export (pandas read_csv)
nationality_history_csv = "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Old Season/temp_nationality19-22_raw.csv"
previous_table = pd.read_csv(nationality_history_csv)

# Append the new season below the old seasons, renumbering the index
joined_season = pd.concat([previous_table, df_nationality_23], ignore_index=True)

# Export the combined (old + new) table as the updated nationality file
nationality_updated_csv = "/Users/Razak/Desktop/Arsenal Database Project/scraped csv files/Raw csv/Updated tables/temp_updatednationality_raw.csv"
joined_season.to_csv(nationality_updated_csv, index=False, encoding='utf-8-sig')