In [1]:
# Importing all library

import pandas as pd
import time
import pyautogui
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup


In [2]:


# Initialize WebDriver 
def initialize_driver(url):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(5)
    return driver

# Adjust zoom leve, becuse selenium need see all the webpage
def adjust_zoom(driver, zoom_out_times=6):
    for _ in range(zoom_out_times):
        pyautogui.hotkey('ctrl', 'subtract')

# Get season data
def getting_new_webpage(driver, season_text, next_season_text):
    time.sleep(1)
    element = driver.find_element(By.XPATH, f'//span[text()="{season_text}"]')
    ActionChains(driver).click(element).send_keys(next_season_text).perform()
    ActionChains(driver).send_keys(Keys.ENTER).perform()
    
    # Check if a second interaction is necessary, becuse in transfer webpage we have season from "" until ""
    try:
        element = driver.find_element(By.XPATH, f'//span[text()="{season_text}"]')
        ActionChains(driver).click(element).send_keys(next_season_text).perform()
        ActionChains(driver).send_keys(Keys.ENTER).perform()
    except:
        print(f"No second interaction required for season {season_text}.")
    
    # In webpage with season statistics we have "Show" button
    try:
        button = driver.find_element(By.CSS_SELECTOR, 'input[type="submit"].button.small[value="Show"]')
        button.click()
        print(f"Clicked 'Show' for season {next_season_text}.")
    # In webpage with transfer season we have "Display selection" button
    except:
        try:
            button = driver.find_element(By.CSS_SELECTOR, 'input[type="submit"].button.small[value="Display selection"]')
            button.click()
            print(f"Clicked 'Display selection' for season {next_season_text}.")
        except Exception as e:
            print(f"Failed to click button for season {next_season_text}: {e}")

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', class_="items").tbody
    trs = table.find_all('tr')

    return trs

# Getting season data
def getting_season_data(trs, next_season_text):
    teams, places, matches, wins, draws, loses, goals, goals_diff, points = ([] for _ in range(9))

    for tr in trs:
        team_element = tr.find('td', class_="no-border-links hauptlink")
        place_element = tr.find('td', class_="rechts hauptlink")
        
        if team_element and place_element:
            teams.append(team_element.text)
            places.append(place_element.text)
            tds = tr.find_all('td', class_="zentriert")
            matches.append(tds[1].text.strip() if len(tds) > 1 else '')
            wins.append(tds[2].text.strip() if len(tds) > 2 else '')
            draws.append(tds[3].text.strip() if len(tds) > 3 else '')
            loses.append(tds[4].text.strip() if len(tds) > 4 else '')
            goals.append(tds[5].text.strip() if len(tds) > 5 else '')
            goals_diff.append(tds[6].text.strip() if len(tds) > 6 else '')
            points.append(tds[7].text.strip() if len(tds) > 7 else '')
        else:
            print(f"Missing data for one of the elements in row: {tr}")

    return pd.DataFrame({
        'Teams': teams,
        'Places': places,
        'Matches': matches,
        'Wins': wins,
        'Draws': draws,
        'Loses': loses,
        'Goals': goals,
        'Goals Diff': goals_diff,
        'Points': points,
        'Year': next_season_text
    })

# Getting season data for transfers
def getting_transfers_data(trs, next_season_text):
    teams, spends, balances = [], [], []

    for tr in trs:
        team_element = tr.find('td', class_="hauptlink no-border-links")
        spend_element = tr.find('td', class_="rechts hauptlink redtext")
        balance_element = tr.find('td', class_="rechts hauptlink")

        if team_element and spend_element and balance_element:
            teams.append(team_element.text)
            spends.append(spend_element.text)
            balances.append(balance_element.text)
        else:
            print(f"Missing data for one of the elements in row: {tr}")

    return pd.DataFrame({
        'Teams': teams,
        'Spend': spends,
        'Balance': balances,
        'Year': next_season_text
    })

def main_getting_scraping_data(url, parse_function):
    driver = initialize_driver(url)
    adjust_zoom(driver)
    
    start_season, end_season = 24, 5
    df_seasons_statistics = pd.DataFrame()

    for year in range(start_season, end_season - 1, -1):
        season_text = f"{str(year).zfill(2)}/{str(year+1).zfill(2)}"
        next_season_text = f"{str(year-1).zfill(2)}/{str(year).zfill(2)}"
        trs = getting_new_webpage(driver, season_text, next_season_text)
        df_season = parse_function(trs, next_season_text)
        df_seasons_statistics = pd.concat([df_seasons_statistics, df_season], ignore_index=True)

    driver.quit()
    return df_seasons_statistics


In [3]:
# Webpage url with league table information 
url_transfers = "https://www.transfermarkt.co.uk/premier-league/einnahmenausgaben/wettbewerb/GB1"

# Running the script for league table
df_seasons_statistics_transfers = main_getting_scraping_data(url_transfers, getting_transfers_data)
df_seasons_statistics_transfers

Clicked 'Display selection' for season 23/24.
Clicked 'Display selection' for season 22/23.
Clicked 'Display selection' for season 21/22.
Clicked 'Display selection' for season 20/21.
Clicked 'Display selection' for season 19/20.
Clicked 'Display selection' for season 18/19.
Clicked 'Display selection' for season 17/18.
Clicked 'Display selection' for season 16/17.
Clicked 'Display selection' for season 15/16.
Clicked 'Display selection' for season 14/15.
Clicked 'Display selection' for season 13/14.
Clicked 'Display selection' for season 12/13.
Clicked 'Display selection' for season 11/12.
Clicked 'Display selection' for season 10/11.
Clicked 'Display selection' for season 09/10.
Clicked 'Display selection' for season 08/09.
Clicked 'Display selection' for season 07/08.
Clicked 'Display selection' for season 06/07.
Clicked 'Display selection' for season 05/06.
Clicked 'Display selection' for season 04/05.


Unnamed: 0,Teams,Spend,Balance,Year
0,Chelsea FC,€464.10m,€-186.60m,23/24
1,Tottenham Hotspur,€272.10m,€-151.40m,23/24
2,Manchester City,€259.60m,€-133.80m,23/24
3,Arsenal FC,€235.10m,€-165.90m,23/24
4,Manchester United,€202.30m,€-143.96m,23/24
...,...,...,...,...
395,Blackburn Rovers,€7.88m,€-175k,04/05
396,Middlesbrough FC,€6.75m,€-3.30m,04/05
397,Crystal Palace,€5.15m,€-5.05m,04/05
398,Manchester City,€1.50m,€6.44m,04/05


In [4]:
# Webpage url with transfers information 
url_league_table = "https://www.transfermarkt.co.uk/premier-league/tabelle/wettbewerb/GB1?saison_id=2024"

# Running the script for transfer table 
df_seasons_statistics_table = main_getting_scraping_data(url_league_table, getting_season_data)
df_seasons_statistics_table

No second interaction required for season 24/25.
Clicked 'Show' for season 23/24.
No second interaction required for season 23/24.
Clicked 'Show' for season 22/23.
No second interaction required for season 22/23.
Clicked 'Show' for season 21/22.
No second interaction required for season 21/22.
Clicked 'Show' for season 20/21.
No second interaction required for season 20/21.
Clicked 'Show' for season 19/20.
No second interaction required for season 19/20.
Clicked 'Show' for season 18/19.
No second interaction required for season 18/19.
Clicked 'Show' for season 17/18.
No second interaction required for season 17/18.
Clicked 'Show' for season 16/17.
No second interaction required for season 16/17.
Clicked 'Show' for season 15/16.
No second interaction required for season 15/16.
Clicked 'Show' for season 14/15.
No second interaction required for season 14/15.
Clicked 'Show' for season 13/14.
No second interaction required for season 13/14.
Clicked 'Show' for season 12/13.
No second intera

Unnamed: 0,Teams,Places,Matches,Wins,Draws,Loses,Goals,Goals Diff,Points,Year
0,\nMan City\n,1,38,28,7,3,96:34,62,91,23/24
1,\nArsenal\n,2,38,28,5,5,91:29,62,89,23/24
2,\nLiverpool\n,3,38,24,10,4,86:41,45,82,23/24
3,\nAston Villa\n,4,38,20,8,10,76:61,15,68,23/24
4,\nTottenham\n,5,38,20,6,12,74:61,13,66,23/24
...,...,...,...,...,...,...,...,...,...,...
395,\nPortsmouth\n,16,38,10,9,19,43:59,-16,39,04/05
396,\nWest Brom \n,17,38,6,16,16,36:61,-25,34,04/05
397,\nCrystal Palace\n,18,38,7,12,19,41:62,-21,33,04/05
398,\nNorwich\n,19,38,7,12,19,42:77,-35,33,04/05


In [5]:
#  Saving data as csv

df_seasons_statistics_table.to_csv('seasons_statistics_table.csv', index=False)

df_seasons_statistics_transfers.to_csv('seasons_statistics_transfers.csv', index=False)