In [1]:
# Import required libraries
import requests
import re
from bs4 import BeautifulSoup
import datetime as dt
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Database connector 
import mysql.connector
from mysql.connector import Error

Connect to the database

In [2]:
# Connect to the database
connection = mysql.connector.connect(
    host = 'localhost',
    port = 3306,
    user = 'root',
    password = '#',
    database = 'arsenaldb'
)

# Function to execute a SQL query 
def execute_query(connection, query):
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        connection.commit()
        print("Successful")
    except Error as err:
        print(f"Error: '{err}'")

# Read and Run Query
def read_query(connection, query):
    cursor = connection.cursor()
    result = None
    try:
        cursor.execute(query)
        result = cursor.fetchall() # Reads data from the database without making any changes to it.
        return result
    except Error as err:
        print(f'Error: "{err}"')

# Use DB
execute_query(connection, """USE ArsenalDB""")

Successful


# Create Database Table

### Players

In [61]:
# Drop Table
execute_query(connection, """ 
DROP TABLE IF EXISTS players
""")

# Create Table
players_table = """
CREATE TABLE players(
	PlayerID INT NOT NULL PRIMARY KEY UNIQUE KEY AUTO_INCREMENT,
    PlayerName VARCHAR(80) NOT NULL,
    NationCode VARCHAR(5),
    Nation	VARCHAR(30)
)
"""
execute_query(connection, players_table)

Successful
Successful


### Nationality

In [68]:
# Drop Table
execute_query(connection, """ 
DROP TABLE IF EXISTS nationality
""")

# Create Table
nationality_table = """
CREATE TABLE nationality(
	NationId INT NOT NULL PRIMARY KEY UNIQUE KEY AUTO_INCREMENT,
    Nation	VARCHAR(30) NOT NULL,
    NationCode VARCHAR(5) NOT NULL
);
"""
execute_query(connection, nationality_table)

Successful
Successful


## Get PLAYERS data

In [3]:
# Assign the seasons accordingly
season_mapping = {
    2023:'2023-2024',
    2022:'2022-2023',
    2021:'2021-2022',
    2020:'2020-2021',
    2019:'2019-2020'
}

def players(season):

    # Scrape the data
    headers = {
    "User-Agent" : "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }

    url = f'https://www.transfermarkt.com/arsenal-fc/kader/verein/11/plus/0/galerie/0?saison_id={season}'
    response = requests.get(url, headers=headers)

    soup = BeautifulSoup(response.content, "html.parser")

    # Player Check 1/2
    odd_player = soup.select('table[class="items"] tr[class="odd"]')

    a_list = []
    for i in range(len(odd_player)):
        a_list.append(odd_player[i].text.split('\n'))

    # Remove empty indexes in the list    
    odd_list = []
    for sublist in a_list:
        n_sublist = [value.strip() for value in sublist if value.strip()]
        odd_list.append(n_sublist)

    # Player Check 2/2
    even_player = soup.select('table[class="items"] tr[class="even"]')

    b_list = []
    for i in range(len(even_player)):
        b_list.append(even_player[i].text.split('\n'))
        
    even_list = []
    for sublist in b_list:
        n_sublist = [value.strip() for value in sublist if value.strip()]
        even_list.append(n_sublist)

    # Join the lists
    player_list = odd_list + even_list

    
    # Convert the list to a pandas dataframe
    df_players = pd.DataFrame(player_list)
    df_players = df_players.iloc[:,0:3]
    df_players.rename(columns = {0:'ShirtNumber',
                                1:'PlayerName',
                                2:'Position'},
                    inplace=True)
    
    df_players['SeasonName'] = season_mapping[season]
    
    return df_players


In [11]:
df_players_19 = players(2019)
df_players_20 = players(2020)
df_players_21 = players(2021)
df_players_22 = players(2022)
df_players_23 = players(2023)


# Join all the old data together into a table,
previous_season = [df_players_19,df_players_20,df_players_21,df_players_22, df_players_23]
df_players_table = pd.concat(previous_season, ignore_index=True)

In [12]:
df_players_table.describe()

<bound method NDFrame.describe of     ShirtNumber                 PlayerName            Position season_name
0             1                 Bernd Leno          Goalkeeper   2019-2020
1            33                 Matt Macey          Goalkeeper   2019-2020
2             5  Sokratis Papastathopoulos         Centre-Back   2019-2020
3            20           Shkodran Mustafi         Centre-Back   2019-2020
4            16                Rob Holding         Centre-Back   2019-2020
..          ...                        ...                 ...         ...
183           8            Martin Ødegaard  Attacking Midfield   2023-2024
184          21               Fábio Vieira  Attacking Midfield   2023-2024
185          11         Gabriel Martinelli         Left Winger   2023-2024
186           7                Bukayo Saka        Right Winger   2023-2024
187           9              Gabriel Jesus      Centre-Forward   2023-2024

[188 rows x 4 columns]>

Filter data for Players Table - Distinct Player Name

In [13]:
# Filter to keep only PlayerName and drop duplicates
df_players = df_players_table.loc[:,['PlayerName']].drop_duplicates(ignore_index=True)
df_players.describe

<bound method NDFrame.describe of                    PlayerName
0                  Bernd Leno
1                  Matt Macey
2   Sokratis Papastathopoulos
3            Shkodran Mustafi
4                 Rob Holding
..                        ...
86      Nathan Butler-Oyedeji
87                 David Raya
88                Kai Havertz
89             Jurrien Timber
90                Declan Rice

[91 rows x 1 columns]>

## Get Nationality Data

In [14]:
# Assign the seasons accordingly
season_mapping = {
    2023:'2023-2024',
    2022:'2022-2023',
    2021:'2021-2022',
    2020:'2020-2021',
    2019:'2019-2020'
}

# Function takes the season, performs data cleaning and returns nationality
def nationality(season):

    # Web scrape using Selenium
    driver = webdriver.Chrome()

    page_url = f"https://fbref.com/en/squads/18bb7c10/{season}/roster/Arsenal-Roster-Details"
    driver.get(page_url) 

    name = driver.find_elements(By.CLASS_NAME, 'roster-player-info')

    # Clean dataset
    name_list = []
    for i in range(len(name)):
        name_list.append(name[i].text.split('\n'))
    
    df_nationality = pd.DataFrame(name_list)

    a = df_nationality.iloc[:,3]
    df_nations = pd.DataFrame(a)

    # Remove duplicates
    df_nations = df_nations.drop_duplicates()
    df_nations.reset_index(drop=True, inplace=True)

    # Rename column
    df_nations.rename(columns = {3:'Nationality'}, inplace=True)

    return df_nations

In [16]:
# Enter desired season
df_nationality_19 = nationality(season_mapping[2019])
df_nationality_20 = nationality(season_mapping[2020])
df_nationality_21 = nationality(season_mapping[2021])
df_nationality_22 = nationality(season_mapping[2022])
df_nationality_23 = nationality(season_mapping[2023])

In [17]:
# Join all the old data together into a table,
previous_season = [df_nationality_19,df_nationality_20,df_nationality_21,df_nationality_22, df_nationality_23]
df_nationality_table = pd.concat(previous_season, ignore_index=True)
df_nationality_table.describe

<bound method NDFrame.describe of                      Nationality
0      National Team: Germany de
1    National Team: Argentina ar
2       Citizenship: England eng
3       National Team: Brazil br
4       National Team: Greece gr
..                           ...
110     Citizenship: England eng
111       Citizenship: Wales wls
112     National Team: Norway no
113    National Team: Germany de
114    National Team: Belgium be

[115 rows x 1 columns]>

In [18]:
# Data Cleaning
# Drop rows where Nationality == None
df_nationality_table = df_nationality_table[df_nationality_table['Nationality'].notna()]

# Iterate to extract just the Country's name
df_nationality_table['Nation'] = df_nationality_table['Nationality'].apply(lambda x: x.split(':')[-1].strip().rsplit(' ', 1)[0])

# Assign a new column for the nationa code
df_nationality_table['NationCode'] = df_nationality_table['Nationality'].apply(lambda x: x.rsplit(' ',1)[-1])

# Filter Data
df_nationality_table = df_nationality_table.loc[:,['Nation','NationCode']].drop_duplicates(ignore_index=True)

# Remove empty rows
df_nationality_table.describe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nationality_table['Nation'] = df_nationality_table['Nationality'].apply(lambda x: x.split(':')[-1].strip().rsplit(' ', 1)[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nationality_table['NationCode'] = df_nationality_table['Nationality'].apply(lambda x: x.rsplit(' ',1)[-1])


<bound method NDFrame.describe of                     Nation NationCode
0                  Germany         de
1                Argentina         ar
2                  England        eng
3                   Brazil         br
4                   Greece         gr
5   Bosnia and Herzegovina         ba
6                    Spain         es
7                 Scotland        sct
8                 Portugal         pt
9              Switzerland         ch
10                  France         fr
11                 Uruguay         uy
12                   Wales        wls
13                   Gabon         ga
14           Côte d'Ivoire         ci
15                 Armenia         am
16           United States         us
17                 Iceland         is
18               Australia         au
19                 Estonia         ee
20                   Egypt         eg
21                   Ghana         gh
22                  Norway         no
23                   Japan         jp
24              

Insert into 'nationality table'

## Get Wages Data

Aim is to get the Player, Nation and the NationCode

In [19]:
# Assign the seasons accordingly
season_mapping = {
    2023:'2023-2024',
    2022:'2022-2023',
    2021:'2021-2022',
    2020:'2020-2021',
    2019:'2019-2020'
}

# Define a function that takes a season, scrapes the data and stores in 'df_wages' variable accordingly

def wages(season):

    page_url = f'https://fbref.com/en/squads/18bb7c10/{season}/wages/Arsenal-Wage-Details'

    df_wages = pd.read_html(page_url, attrs = {'id':"wages"})[0]
    
    # Filter for neeeded data
    df_wages = df_wages.loc[:,['Player','Nation']]

    # Rename Column
    df_wages.rename(columns = {'Player' :'PlayerName'}, inplace=True)

    # Drop the last row
    df_wages.drop(df_wages.tail(1).index, inplace=True)
    
    # Assign a new column for the nationa code
    df_wages['NationCode'] = df_wages['Nation'].apply(lambda x: x.rsplit(' ',1)[-2])
    
    # Filter Data
    df_wages = df_wages.loc[:,['PlayerName','NationCode']]
    
    # Remove blank rows
    #df_wages = df_wages.drop(df_wages[df_wages['Weekly Wages'].isna()].index)

    return df_wages


In [20]:
# Enter desired season
df_wages_19 = wages(season_mapping[2019])
df_wages_20 = wages(season_mapping[2020])
df_wages_21 = wages(season_mapping[2021])
df_wages_22 = wages(season_mapping[2022])
df_wages_23 = wages(season_mapping[2023])

dfs_to_concat = [df_wages_19, df_wages_20, df_wages_21, df_wages_22, df_wages_23]
df_wages_table = pd.concat(dfs_to_concat, ignore_index=True)

In [None]:
# Data cleaning - Drop duplicates
df_wages_table = df_wages_table.drop_duplicates(ignore_index=True)
df_wages_table.describe

## JOIN tables together

Wages - PlayerName, CountryCode
Nationality - Country, CountryCode
Players - PlayerName

In [22]:
df_table = pd.merge(df_wages_table, df_nationality_table, on = 'NationCode')
df_full = pd.merge(df_table, df_players, on = 'PlayerName', how = 'right')
df_full.describe

<bound method NDFrame.describe of                    PlayerName NationCode   Nation
0                  Bernd Leno         de  Germany
1                  Matt Macey        eng  England
2   Sokratis Papastathopoulos         gr   Greece
3            Shkodran Mustafi         de  Germany
4                 Rob Holding        eng  England
..                        ...        ...      ...
86      Nathan Butler-Oyedeji        NaN      NaN
87                 David Raya         es    Spain
88                Kai Havertz         de  Germany
89             Jurrien Timber        NaN      NaN
90                Declan Rice        eng  England

[91 rows x 3 columns]>

In [23]:
# Convert to a dictionary
dict_players = df_full.to_dict(orient = 'index')
dict_players

{0: {'PlayerName': 'Bernd Leno', 'NationCode': 'de', 'Nation': 'Germany'},
 1: {'PlayerName': 'Matt Macey', 'NationCode': 'eng', 'Nation': 'England'},
 2: {'PlayerName': 'Sokratis Papastathopoulos',
  'NationCode': 'gr',
  'Nation': 'Greece'},
 3: {'PlayerName': 'Shkodran Mustafi',
  'NationCode': 'de',
  'Nation': 'Germany'},
 4: {'PlayerName': 'Rob Holding', 'NationCode': 'eng', 'Nation': 'England'},
 5: {'PlayerName': 'Konstantinos Mavropanos',
  'NationCode': 'gr',
  'Nation': 'Greece'},
 6: {'PlayerName': 'Kieran Tierney',
  'NationCode': 'sct',
  'Nation': 'Scotland'},
 7: {'PlayerName': 'Tolaji Bola', 'NationCode': nan, 'Nation': nan},
 8: {'PlayerName': 'Cédric Soares', 'NationCode': 'pt', 'Nation': 'Portugal'},
 9: {'PlayerName': 'Granit Xhaka',
  'NationCode': 'ch',
  'Nation': 'Switzerland'},
 10: {'PlayerName': 'Dani Ceballos', 'NationCode': 'es', 'Nation': 'Spain'},
 11: {'PlayerName': 'Joe Willock', 'NationCode': 'eng', 'Nation': 'England'},
 12: {'PlayerName': 'James Ola

Insert into the database table

In [58]:
sql_command = []

for i in range(len(dict_players)):
    columns = ", ".join(dict_players[i].keys())
    values = ", ".join(f'"{value}"' for value in dict_players[i].values())

    sql_query = f'INSERT INTO players ({columns}) VALUES ({values})'
    sql_command.append(sql_query)

sql_command[15]

'INSERT INTO players (PlayerName, NationCode, Nation) VALUES ("Nicolas Pépé", "ci", "Côte d\'Ivoire")'

In [None]:
# Execute the query to insert data into the table
for query in sql_command:
    try:
        execute_query(connection, query)
    except Error as err:
        print(f"Error: '{err}'")

View Table

In [64]:
# Execute the query to insert data into the table
for query in sql_command:
    try:
        execute_query(connection, query)
    except Error as err:
        print(f"Error: '{err}'")

Unnamed: 0,PlayerID,PlayerName,NationCode,Nation
0,1,Bernd Leno,de,Germany
1,2,Matt Macey,eng,England
2,3,Sokratis Papastathopoulos,gr,Greece
3,4,Shkodran Mustafi,de,Germany
4,5,Rob Holding,eng,England
...,...,...,...,...
86,87,Nathan Butler-Oyedeji,,
87,88,David Raya,es,Spain
88,89,Kai Havertz,de,Germany
89,90,Jurrien Timber,,


Update Database

In [3]:
# Gabriel
execute_query(connection, """
UPDATE players 
SET PlayerName = 'Gabriel Dos Santos'
WHERE PlayerName = 'Gabriel Magalhães';
""")


Successful


Insert into Nationality table

In [75]:
dict_nationality = df_nationality_table.to_dict(orient='index')

sql_command = []

for i in range(len(dict_nationality)):
    columns = ", ".join(dict_nationality[i].keys())

    # Store each value in double quotes
    values = ", ".join(f'"{value}"' for value in dict_nationality[i].values())
    sql_query = f'INSERT INTO nationality ({columns}) VALUES ({values})'
    sql_command.append(sql_query)

sql_command[0]

'INSERT INTO nationality (Nation, NationCode) VALUES ("Germany", "de")'

In [76]:
# Execute the query to insert data into the table
for query in sql_command:
    try:
        execute_query(connection, query)
    except Error as err:
        print(f"Error: '{err}'")

Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful
Successful


In [5]:
# Run Query
run_table = """SELECT * 
FROM nationality;"""
results = read_query(connection, run_table)

# View as dataframe
# Convert to pandas dataframe
league_columns = ['NationID','Nation','NationCode']
df_table = pd.DataFrame(results, columns = league_columns)
df_table.head()

Unnamed: 0,NationID,Nation,NationCode
0,1,Germany,de
1,2,Argentina,ar
2,3,England,eng
3,4,Brazil,br
4,5,Greece,gr


In [6]:
# Run Query
run_table = """SELECT * 
FROM players;"""
results = read_query(connection, run_table)

# View as dataframe
# Convert to pandas dataframe
league_columns = ['PlayerID','PlayerName','NationCode','Nation']
df_table = pd.DataFrame(results, columns = league_columns)
df_table.head()

Unnamed: 0,PlayerID,PlayerName,NationCode,Nation
0,1,Bernd Leno,de,Germany
1,2,Matt Macey,eng,England
2,3,Sokratis Papastathopoulos,gr,Greece
3,4,Shkodran Mustafi,de,Germany
4,5,Rob Holding,eng,England
