In [1]:
# Web scraping to obtain EPL historical data from https://fbref.com/en/comps/9/history/Premier-League-Seasons

import requests
from bs4 import BeautifulSoup
import pandas as pd

# webpage for webscraping 
url = "https://fbref.com/en/comps/9/history/Premier-League-Seasons"

# using the GET request function 
response = requests.get(url)
response.raise_for_status()  #to ensure that the GET request was successful before proceeding

# processing the contenst of the html
soup = BeautifulSoup(response.content, 'html.parser')

# to get the table which contains the data for each season using the find function 
table = soup.find('table', {'id': 'seasons'})

# hence the headers and rows of the table should be extracted 
headers = [th.text for th in table.find('thead').find_all('th')]
rows = []
for row in table.find('tbody').find_all('tr'):
    cells = row.find_all('td')
    if cells:
        rows.append([cell.text.strip() for cell in cells])

# Create a DataFrame for the data scraped 
df = pd.DataFrame(rows, columns=headers[1:])  # Exclude the first header as it is for row numbers

# therefore the dataframe should be displayed 
print(df)

                   Competition Name # Squads                Champion  \
0                    Premier League       20                           
1                    Premier League       20    Manchester City - 91   
2                    Premier League       20    Manchester City - 89   
3                    Premier League       20    Manchester City - 93   
4                    Premier League       20    Manchester City - 86   
..                              ...      ...                     ...   
121  Football League First Division       16         Sunderland - 48   
122                 Football League       14         Sunderland - 42   
123                 Football League       12            Everton - 29   
124                 Football League       12  Preston North End - 33   
125                 Football League       12  Preston North End - 40   

                            Top Scorer  
0                                       
1                  Erling Haaland - 27  
2           

In [10]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the data
file_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\14-15.csv"
data = pd.read_csv(file_path)



# Display the first few rows of the data
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
  Div      Date    HomeTeam        AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR  \
0  E0  16-08-14     Arsenal  Crystal Palace     2     1   H     1     1   D   
1  E0  16-08-14   Leicester         Everton     2     2   D     1     2   A   
2  E0  16-08-14  Man United         Swansea     1     2   A     0     1   A   
3  E0  16-08-14         QPR            Hull     0     1   A     0     0   D   
4  E0  16-08-14       Stoke     Aston Villa     0     1   A     0     0   D   

   ... HST  AST  HF  AF  HC  AC  HY  AY  HR  AR  
0  ...   6    2  13  19   9   3   2   2   0   1  
1  ...   3    3  16  10   3   6   1   1   0   0  
2  ...   5    4  14  20   4   0   2   4   0   0  
3  ...   6    4  10  10   8   9   1   2   0   0  
4  ...   2    2  14   9   2   8   0   3   0   0  

[5 rows x 23 columns]


In [11]:
# preprocessing of data dowloaded from https://www.football-data.co.uk/englandm.php

import os
import pandas as pd

# the data for each season was downloaded in different csv files.
# the section combines all the csv files in a single file


# Defining the file path/directory cwhere the data files are (csv files)
directory = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset"

# An empty list is initialised to hold the DataFrames
dataframes = []

# Iterating through each file in the path 
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        # Append the DataFrame to the list
        dataframes.append(df)

# Concatenate all the DataFrames into one DataFrame
New_combined_df = pd.concat(dataframes, ignore_index=True)

# Hence the new path where the new CSV will be located (the new combined dataset)
output_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\New_combined_dataset.csv"

# Save the combined DataFrame to a single CSV file
New_combined_df.to_csv(output_path, index=False)

print(f"New_Combined CSV file saved at: {output_path}")

New_Combined CSV file saved at: C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\New_combined_dataset.csv


In [23]:
# preprocessing of the new combined dataset

import pandas as pd

# Load the dataset from the directory to inspect the data then print the first few rows to check
Data_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\New_combined_dataset.csv"
df = pd.read_csv(Data_path)
print(df.head())

# Handling missing values by checking through the dataset and ensuring it contains the appropriate data types

# Check for missing values
print(df.isnull().sum())

# Then check through the data if there are duplicate rows or columns
print(df.duplicated().sum())

# Check through the dataset to have an overview of the data types to ensure they are appropriate
print(df.dtypes)

# Drop rows if high percentage (above 50%) of missing values occur
df = df.dropna(thresh=len(df) * 0.5, axis=1)

# Hence fill missing numerical values with the median
df = df.fillna(df.median())

# Fill missing categorical values with the mode
for column in df.select_dtypes(include=['object']).columns:
    df[column] = df[column].fillna(df[column].mode()[0])

df

  Div      Date    HomeTeam        AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR  \
0  E0  16-08-14     Arsenal  Crystal Palace     2     1   H     1     1   D   
1  E0  16-08-14   Leicester         Everton     2     2   D     1     2   A   
2  E0  16-08-14  Man United         Swansea     1     2   A     0     1   A   
3  E0  16-08-14         QPR            Hull     0     1   A     0     0   D   
4  E0  16-08-14       Stoke     Aston Villa     0     1   A     0     0   D   

   ... AvgC<2.5  AHCh  B365CAHH  B365CAHA  PCAHH  PCAHA  MaxCAHH  MaxCAHA  \
0  ...      NaN   NaN       NaN       NaN    NaN    NaN      NaN      NaN   
1  ...      NaN   NaN       NaN       NaN    NaN    NaN      NaN      NaN   
2  ...      NaN   NaN       NaN       NaN    NaN    NaN      NaN      NaN   
3  ...      NaN   NaN       NaN       NaN    NaN    NaN      NaN      NaN   
4  ...      NaN   NaN       NaN       NaN    NaN    NaN      NaN      NaN   

   AvgCAHH  AvgCAHA  
0      NaN      NaN  
1      NaN      Na

  df = df.fillna(df.median())


Unnamed: 0,Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,...,B365C<2.5,MaxC>2.5,MaxC<2.5,AvgC>2.5,AvgC<2.5,AHCh,PCAHH,PCAHA,AvgCAHH,AvgCAHA
0,E0,16-08-14,Arsenal,Crystal Palace,2,1,H,1,1,D,...,2.04,1.87,2.17,1.79,2.06,-0.25,1.955,1.96,1.93,1.95
1,E0,16-08-14,Leicester,Everton,2,2,D,1,2,A,...,2.04,1.87,2.17,1.79,2.06,-0.25,1.955,1.96,1.93,1.95
2,E0,16-08-14,Man United,Swansea,1,2,A,0,1,A,...,2.04,1.87,2.17,1.79,2.06,-0.25,1.955,1.96,1.93,1.95
3,E0,16-08-14,QPR,Hull,0,1,A,0,0,D,...,2.04,1.87,2.17,1.79,2.06,-0.25,1.955,1.96,1.93,1.95
4,E0,16-08-14,Stoke,Aston Villa,0,1,A,0,0,D,...,2.04,1.87,2.17,1.79,2.06,-0.25,1.955,1.96,1.93,1.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3795,E0,19/05/2024,Crystal Palace,Aston Villa,5,0,H,2,0,H,...,2.75,1.50,2.90,1.45,2.78,-0.75,1.780,2.16,1.81,2.05
3796,E0,19/05/2024,Liverpool,Wolves,2,0,H,2,0,H,...,5.00,1.19,5.60,1.15,5.17,-2.75,2.040,1.85,2.04,1.82
3797,E0,19/05/2024,Luton,Fulham,2,4,A,1,2,A,...,2.75,1.50,2.79,1.48,2.69,0.25,1.990,1.93,1.96,1.91
3798,E0,19/05/2024,Man City,West Ham,3,1,H,2,1,H,...,4.50,1.22,5.20,1.18,4.82,-3.00,1.990,1.90,1.96,1.91
