In [1]:
# Import required libraries
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd

# Database connector 
import mysql.connector
from mysql.connector import Error

In [2]:
# Connect to the database
connection = mysql.connector.connect(
    host = 'localhost',
    port = 3306,
    user = 'root',
    password = '#',
    database = 'arsenaldb'
)

# Function to execute a SQL query 
def execute_query(connection, query):
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        connection.commit()
        print("Successful")
    except Error as err:
        print(f"Error: '{err}'")

# Read and Run Query
def read_query(connection, query):
    cursor = connection.cursor()
    result = None
    try:
        cursor.execute(query)
        result = cursor.fetchall() # Reads data from the database without making any changes to it.
        return result
    except Error as err:
        print(f'Error: "{err}"')

# Use DB
execute_query(connection, """USE ArsenalDB""")

Successful


Create database table - Wages

In [141]:
# Drop Table
execute_query(connection, """DROP TABLE IF EXISTS wages""")

# Create Table
wages_table = """
CREATE TABLE wages(
    SeasonID INT NOT NULL,
    PlayerID INT NOT NULL,
    PlayerName VARCHAR(80) NOT NULL,
    Wages INT NOT NULL
)
"""
execute_query(connection, wages_table)

Successful
Successful


In [3]:
# Assign the seasons accordingly
season_mapping = {
    2023:'2023-2024',
    2022:'2022-2023',
    2021:'2021-2022',
    2020:'2020-2021',
    2019:'2019-2020'
}

# Define a function that takes a season, scrapes the data and stores in 'df_wages' variable accordingly

def wages(season):

    page_url = f'https://fbref.com/en/squads/18bb7c10/{season}/wages/Arsenal-Wage-Details'

    df_wages = pd.read_html(page_url, attrs = {'id':"wages"})[0]
    
    # Filter for neeeded data
    df_wages = df_wages.loc[:,['Player','Weekly Wages']]

    # Rename Column
    df_wages.rename(columns = {'Player' :'PlayerName', 'Weekly Wages':'Wages'}, inplace=True)

    # Drop the last row
    df_wages.drop(df_wages.tail(1).index, inplace=True)
    
    # Clean the wages column to remove commas and convert to integer
    df_wages['Wages'] = df_wages['Wages'].apply(lambda x: int(x.rsplit(' ', 3)[0].split(' ', 1)[1].replace(",", "")))

    # Add season name
    df_wages['SeasonName'] = season

    # Update names in wages table to match players table
    # Sead Kolašinac
    df_wages.loc[df_wages['PlayerName'] == "Sead Kolašinac", 'PlayerName'] = "Sead Kolasinac"

    # Fabio Viera
    df_wages.loc[df_wages['PlayerName'] == "Fabio Vieira", 'PlayerName'] = "Fábio Vieira"

    # Timber
    df_wages.loc[df_wages['PlayerName'] == "Jurriën Timber", 'PlayerName'] = "Jurrien Timber"

    # Hein
    df_wages.loc[df_wages['PlayerName'] == "Karl Jakob Hein", 'PlayerName'] = "Karl Hein"

    return df_wages


In [143]:
# Enter desired season
df_wages_19 = wages(season_mapping[2019])
df_wages_20 = wages(season_mapping[2020])
df_wages_21 = wages(season_mapping[2021])
df_wages_22 = wages(season_mapping[2022])
df_wages_23 = wages(season_mapping[2023])

dfs_to_concat = [df_wages_19, df_wages_20, df_wages_21, df_wages_22, df_wages_23]
df_wages_table = pd.concat(dfs_to_concat, ignore_index=True)

In [144]:
df_wages_table.describe

<bound method NDFrame.describe of                     PlayerName   Wages SeasonName
0                   Mesut Özil  350000  2019-2020
1    Pierre-Emerick Aubameyang  250000  2019-2020
2          Alexandre Lacazette  182115  2019-2020
3              Héctor Bellerín  110000  2019-2020
4                   David Luiz  100962  2019-2020
..                         ...     ...        ...
123             Mohamed Elneny   55000  2023-2024
124          Takehiro Tomiyasu   55000  2023-2024
125               Fabio Vieira   45000  2023-2024
126           Emile Smith Rowe   40000  2023-2024
127            Karl Jakob Hein   10000  2023-2024

[128 rows x 3 columns]>

Pull Season and Player ID information from the database

In [4]:
# Season
season_table = """SELECT *FROM season"""
season_result = read_query(connection, season_table)
season_columns = ['SeasonID','SeasonName','League']

# Convert to pandas dataframe
df_season = pd.DataFrame(season_result, columns=season_columns)
df_season

Unnamed: 0,SeasonID,SeasonName,League
0,1,2019-2020,English Premier League
1,2,2020-2021,English Premier League
2,3,2021-2022,English Premier League
3,4,2022-2023,English Premier League
4,5,2023-2024,English Premier League


In [5]:
# Players 
players_table = """SELECT * FROM players"""
players_result = read_query(connection, players_table)
players_column = ['PlayerID','PlayerName','NationCode','Nation']

# Convert to a dataframe
df_players = pd.DataFrame(players_result, columns=players_column)
df_players.describe

<bound method NDFrame.describe of     PlayerID                 PlayerName NationCode   Nation
0          1                 Bernd Leno         de  Germany
1          2                 Matt Macey        eng  England
2          3  Sokratis Papastathopoulos         gr   Greece
3          4           Shkodran Mustafi         de  Germany
4          5                Rob Holding        eng  England
..       ...                        ...        ...      ...
86        87      Nathan Butler-Oyedeji        nan      nan
87        88                 David Raya         es    Spain
88        89                Kai Havertz         de  Germany
89        90             Jurrien Timber        nan      nan
90        91                Declan Rice        eng  England

[91 rows x 4 columns]>

### Join all tables (Wages, Season and Players)

In [148]:
# SeasonID, PlayerID, PlayerName, Wages
df_table = pd.merge(df_wages_table, df_season, on = 'SeasonName', how = 'left')
df_full = pd.merge(df_table, df_players, on = 'PlayerName', how = 'left')

# Filter to get needed columns
df_full = df_full.loc[:,['SeasonID','PlayerID','PlayerName','Wages']]
df_full.describe

<bound method NDFrame.describe of      SeasonID  PlayerID                 PlayerName   Wages
0           1        33                 Mesut Özil  350000
1           1        36  Pierre-Emerick Aubameyang  250000
2           1        18        Alexandre Lacazette  182115
3           1        27            Héctor Bellerín  110000
4           1        23                 David Luiz  100962
..        ...       ...                        ...     ...
123         5        51             Mohamed Elneny   55000
124         5        64          Takehiro Tomiyasu   55000
125         5        72               Fábio Vieira   45000
126         5        15           Emile Smith Rowe   40000
127         5        40                  Karl Hein   10000

[128 rows x 4 columns]>

Convert to a dictionary and insert table into database

In [149]:
# Convert to a dictionary
dict_wages = df_full.to_dict(orient='index')

sql_command = []

for i in range(len(dict_wages)):
    columns = ", ".join(dict_wages[i].keys())
    values = ", ".join(f'"{value}"' for value in dict_wages[i].values())

    sql_query = f'INSERT INTO wages ({columns}) VALUES ({values})'
    sql_command.append(sql_query)

sql_command[127]

'INSERT INTO wages (SeasonID, PlayerID, PlayerName, Wages) VALUES ("5", "40", "Karl Hein", "10000")'

In [150]:
# Execute the query to insert data into the table
for query in sql_command:
    try:
        execute_query(connection, query)
    except Error as err:
        print(f"Error: '{err}'")

Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful

In [6]:
# Run Query
run_table = """SELECT * 
FROM wages;"""
results = read_query(connection, run_table)

# View as dataframe
# Convert to pandas dataframe
league_columns = ['SeasonID','PlayerID','PlayerName','Wages']
df_table = pd.DataFrame(results, columns = league_columns)
df_table.head()

Unnamed: 0,SeasonID,PlayerID,PlayerName,Wages
0,1,33,Mesut Özil,350000
1,1,36,Pierre-Emerick Aubameyang,250000
2,1,18,Alexandre Lacazette,182115
3,1,27,Héctor Bellerín,110000
4,1,23,David Luiz,100962


In [7]:
connection.close()