In [24]:
# Web scraping to obtain EPL historical data from https://fbref.com/en/comps/9/history/Premier-League-Seasons

import requests
from bs4 import BeautifulSoup
import pandas as pd

# webpage for webscraping 
url = "https://fbref.com/en/comps/9/history/Premier-League-Seasons"

# using the GET request function 
response = requests.get(url)
response.raise_for_status()  #to ensure that the GET request was successful before proceeding

# processing the contenst of the html
soup = BeautifulSoup(response.content, 'html.parser')

# to get the table which contains the data for each season using the find function 
table = soup.find('table', {'id': 'seasons'})

# hence the headers and rows of the table should be extracted 
headers = [th.text for th in table.find('thead').find_all('th')]
rows = []
for row in table.find('tbody').find_all('tr'):
    cells = row.find_all('td')
    if cells:
        rows.append([cell.text.strip() for cell in cells])

# Create a DataFrame for the data scraped 
df = pd.DataFrame(rows, columns=headers[1:])  # Exclude the first header as it is for row numbers

# therefore the dataframe should be displayed 
print(df)

                   Competition Name # Squads                Champion  \
0                    Premier League       20                           
1                    Premier League       20    Manchester City - 91   
2                    Premier League       20    Manchester City - 89   
3                    Premier League       20    Manchester City - 93   
4                    Premier League       20    Manchester City - 86   
..                              ...      ...                     ...   
121  Football League First Division       16         Sunderland - 48   
122                 Football League       14         Sunderland - 42   
123                 Football League       12            Everton - 29   
124                 Football League       12  Preston North End - 33   
125                 Football League       12  Preston North End - 40   

                            Top Scorer  
0                                       
1                  Erling Haaland - 27  
2           

In [45]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the data
file_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\14-15.csv"
data = pd.read_csv(file_path)



# Display the first few rows of the data
print("First few rows of the dataset:")
print(data.head())

First few rows of the dataset:
  Div      Date    HomeTeam        AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR  \
0  E0  16-08-14     Arsenal  Crystal Palace     2     1   H     1     1   D   
1  E0  16-08-14   Leicester         Everton     2     2   D     1     2   A   
2  E0  16-08-14  Man United         Swansea     1     2   A     0     1   A   
3  E0  16-08-14         QPR            Hull     0     1   A     0     0   D   
4  E0  16-08-14       Stoke     Aston Villa     0     1   A     0     0   D   

   ... HST  AST  HF  AF  HC  AC  HY  AY  HR  AR  
0  ...   6    2  13  19   9   3   2   2   0   1  
1  ...   3    3  16  10   3   6   1   1   0   0  
2  ...   5    4  14  20   4   0   2   4   0   0  
3  ...   6    4  10  10   8   9   1   2   0   0  
4  ...   2    2  14   9   2   8   0   3   0   0  

[5 rows x 23 columns]


In [46]:
# preprocessing of data dowloaded from https://www.football-data.co.uk/englandm.php

import os
import pandas as pd

# the data for each season was downloaded in different csv files.
# the section combines all the csv files in a single file


# Defining the file path/directory cwhere the data files are (csv files)
directory = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset"

# An empty list is initialised to hold the DataFrames
dataframes = []

# Iterating through each file in the path 
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # Read the CSV file into a DataFrame
        filepath = os.path.join(directory, filename)
        df = pd.read_csv(filepath)
        # Append the DataFrame to the list
        dataframes.append(df)

# Concatenate all the DataFrames into one DataFrame
New_combined_df = pd.concat(dataframes, ignore_index=True)

# Hence the new path where the new CSV will be located (the new combined dataset)
output_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\New_combined_dataset.csv"

# Save the combined DataFrame to a single CSV file
New_combined_df.to_csv(output_path, index=False)

print(f"New_Combined CSV file saved at: {output_path}")

New_Combined CSV file saved at: C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\New_combined_dataset.csv


In [48]:
# DATA CLEANING 
# Cleaning the new combined dataset

import pandas as pd

# Load the dataset from the directory to inspect the data then print the first few rows to check
Data_path = r"C:\Users\HP\OneDrive - University of Hertfordshire\EPL Dataset\New_combined_dataset.csv"
df = pd.read_csv(Data_path)
print(df.head())

# Handling missing values by checking through the dataset and ensuring it contains the appropriate data types

# Check through the data if there are duplicate rows or columns
print(df.duplicated().sum())

# then Check for missing values
print(df.isnull().sum())

# Check through to Inspect the datatypes 
print(df.dtypes)

# Drop rows that have missing values 
df = df.dropna(axis=0, how='any')

# confirm if there are no more missing values
print(df.isnull().sum())


# To be double sure, print the length of specific columns 
column_name = 'Half Time Home Team Goal' 
column = df[column_name]
print(f"The length of the column '{column_name}' is: {len(column)}")

# check the characteristics of the new data and display the first few rows of the data
print("The first few rows of the dataset:")
print(df.head())

# Hence, using the for loop check the length of all columns to ensure they are the same 
for column in df.columns:
    print(f"The length of the column '{column}' is: {len(df[column])}")

  Div      Date    HomeTeam        AwayTeam  Full Time Home Team Goals  \
0  E0  16-08-14     Arsenal  Crystal Palace                          2   
1  E0  16-08-14   Leicester         Everton                          2   
2  E0  16-08-14  Man United         Swansea                          1   
3  E0  16-08-14         QPR            Hull                          0   
4  E0  16-08-14       Stoke     Aston Villa                          0   

   Full Time Away Team Goals Full Time Result  Half Time Home Team Goal  \
0                          1                H                         1   
1                          2                D                         1   
2                          2                A                         0   
3                          1                A                         0   
4                          1                A                         0   

   Half Time Away Team Goals Half Time Result  ... Home Team Shots on Target  \
0                       

In [49]:
# DATA PREPROCESSING 
# preprocessing of the new combined dataset

from sklearn.preprocessing import StandardScaler


# The One-hot encoding technique is used to encode the categorical variables into numerical varaibles
df = pd.get_dummies(df, drop_first=True)

# Feature scaling 
# Normalising the numerical variable to ensure they are on the same scale
scaler = StandardScaler()
numerical_features = df.select_dtypes(include=['float64', 'int64']).columns
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [None]:
# EXPLORATORY DATA ANALYSIS


# The first step is to get the summary statistics to understand data distribution 
print(df.describe())

# Then patterns, correlation and outliers are identified using visualisation 

import matplotlib.pyplot as plt
import seaborn as sns

# Histograms to identify the distribution 
df.hist(bins=30, figsize=(20, 15))
plt.show()

# Box plots to identify outliers
plt.figure(figsize=(20, 10))
sns.boxplot(data=df)
plt.show()

# Correlation heatmap for relationship between features
plt.figure(figsize=(20, 15))
sns.heatmap(df.corr(), annot=True, fmt='.2f')
plt.show()