In [1]:
# Import libraries
import io
import os
from datetime import datetime

import numpy as np
import pandas as pd
import requests as rq

In [2]:
# Preview a single season file (division E0, season code 1920) to inspect its columns
sample_url = 'https://www.football-data.co.uk/mmz4281/1920/E0.csv'
file_1 = pd.read_csv(sample_url)
file_1.head()

Unnamed: 0,Div,Date,Time,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,...,AvgC<2.5,AHCh,B365CAHH,B365CAHA,PCAHH,PCAHA,MaxCAHH,MaxCAHA,AvgCAHH,AvgCAHA
0,E0,09/08/2019,20:00,Liverpool,Norwich,4,1,H,4,0,...,3.43,-2.25,1.91,1.99,1.94,1.98,1.99,2.07,1.9,1.99
1,E0,10/08/2019,12:30,West Ham,Man City,0,5,A,0,1,...,2.91,1.75,1.95,1.95,1.96,1.97,2.07,1.98,1.97,1.92
2,E0,10/08/2019,15:00,Bournemouth,Sheffield United,1,1,D,0,0,...,1.92,-0.5,1.95,1.95,1.98,1.95,2.0,1.96,1.96,1.92
3,E0,10/08/2019,15:00,Burnley,Southampton,3,0,H,0,0,...,1.71,0.0,1.87,2.03,1.89,2.03,1.9,2.07,1.86,2.02
4,E0,10/08/2019,15:00,Crystal Palace,Everton,0,0,D,0,0,...,1.71,0.25,1.82,2.08,1.97,1.96,2.03,2.08,1.96,1.93


In [4]:
# Output folders for the 3 leagues (created on demand right before the first save)
premiership_folder = 'Premiership'
championship_folder = 'Championship'
league1_folder = 'League 1'

fields_to_extract = ['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG']

# Tunable settings
NUM_SEASONS = 32          # how many seasons to fetch, counting back from the current one
SEASON_START_MONTH = 8    # assumption: a new season kicks off in August - adjust if needed
REQUEST_TIMEOUT = 10      # seconds
BASE_URL = 'https://www.football-data.co.uk/mmz4281'

# (label used in messages, division code used in URLs/file names, output folder)
LEAGUES = [
    ('Premiership', 'E0', premiership_folder),
    ('Championship', 'E1', championship_folder),
    ('League 1', 'E2', league1_folder),
]


def season_start_year(today):
    """Return the calendar year in which the season containing `today` started.

    Dates before SEASON_START_MONTH belong to the season that began in the
    previous calendar year (e.g. January 2025 is part of season 2024/2025).

    Parameters
    ----------
    today : object with integer `year` and `month` attributes (e.g. datetime).
    """
    if today.month >= SEASON_START_MONTH:
        return today.year
    return today.year - 1


def season_code(start_year):
    """Build the 4-character season code 'cyny' (cy = start year, ny = next year).

    Examples: 2024 -> '2425', 1999 -> '9900'.
    """
    return f'{start_year % 100:02d}{(start_year + 1) % 100:02d}'


def parse_league_csv(raw_bytes, season):
    """Parse one downloaded season file into a frame with the wanted columns.

    Parameters
    ----------
    raw_bytes : bytes
        Raw CSV content as downloaded.
    season : str
        Season code inserted as a `season` column at position 1.

    Returns
    -------
    pandas.DataFrame
        Columns from `fields_to_extract` plus `season`; rows that are
        entirely empty are dropped and the index is reset.
    """
    try:
        data = pd.read_csv(io.BytesIO(raw_bytes), usecols=fields_to_extract)
    except (ValueError, UnicodeDecodeError):
        # Reached when a requested column is missing (ValueError) or the file is
        # not valid UTF-8. NOTE(review): this fallback assumes the first six
        # columns are Div, Date, HomeTeam, AwayTeam, FTHG, FTAG - otherwise the
        # column selection below raises KeyError.
        data = pd.read_csv(io.BytesIO(raw_bytes), encoding='ISO-8859-1', usecols=range(6))
        data['Time'] = np.nan
        data = data[fields_to_extract]

    # Handling any additional empty rows
    data = data.dropna(how='all').reset_index(drop=True)
    data.insert(1, 'season', season)
    return data


def download_league_season(label, division, folder, season):
    """Download one division/season file and save the cleaned CSV into `folder`.

    The file is fetched once and parsed from the response body. Prints the
    outcome and returns True when a file was saved, False otherwise.
    """
    file_name = f'{division}_{season}.csv'
    path = os.path.join(folder, file_name)
    url = f'{BASE_URL}/{season}/{division}.csv'

    try:
        response = rq.get(url, timeout=REQUEST_TIMEOUT)
    except rq.RequestException as error:
        # Report and carry on so one network hiccup does not abort the whole run
        print(f'{label} download failed for the season {season}: {error}')
        return False

    if response.status_code != 200:
        print(f'{label} file not found for the season {season}')
        return False

    data = parse_league_csv(response.content, season)

    # to_csv does not create missing directories, so make sure the folder exists
    os.makedirs(folder, exist_ok=True)
    data.to_csv(path, index=False)
    print(f'Saved {file_name} to {path}')
    return True


# The guard is true inside a notebook kernel, so the download runs as before;
# it only keeps the network calls from firing if this code is imported elsewhere.
if __name__ == '__main__':
    # Get the current date and the year in which the current season started
    current_date = datetime.now()
    formated_date = season_start_year(current_date)

    # Walk back NUM_SEASONS seasons, newest first, fetching every league for each
    for seasons_back in range(NUM_SEASONS):
        season = season_code(formated_date - seasons_back)
        for label, division, folder in LEAGUES:
            download_league_season(label, division, folder, season)


Saved E0_2425.csv to Premiership\E0_2425.csv
Saved E1_2425.csv to Championship\E1_2425.csv
Saved E2_2425.csv to League 1\E2_2425.csv
Saved E0_2324.csv to Premiership\E0_2324.csv
Saved E1_2324.csv to Championship\E1_2324.csv
Saved E2_2324.csv to League 1\E2_2324.csv
Saved E0_2223.csv to Premiership\E0_2223.csv
Saved E1_2223.csv to Championship\E1_2223.csv
Saved E2_2223.csv to League 1\E2_2223.csv
Saved E0_2122.csv to Premiership\E0_2122.csv
Saved E1_2122.csv to Championship\E1_2122.csv
Saved E2_2122.csv to League 1\E2_2122.csv
Saved E0_2021.csv to Premiership\E0_2021.csv
Saved E1_2021.csv to Championship\E1_2021.csv
Saved E2_2021.csv to League 1\E2_2021.csv
Saved E0_1920.csv to Premiership\E0_1920.csv
Saved E1_1920.csv to Championship\E1_1920.csv
Saved E2_1920.csv to League 1\E2_1920.csv
Saved E0_1819.csv to Premiership\E0_1819.csv
Saved E1_1819.csv to Championship\E1_1819.csv
Saved E2_1819.csv to League 1\E2_1819.csv
Saved E0_1718.csv to Premiership\E0_1718.csv
Saved E1_1718.csv to Cha