In [None]:
# 760 means we have the correct data: each team plays 38 matches (19 home and 19 away).
# Multiplying 38*20/2 gives the number of matches in a season (380); however, we have one row per team for each matchweek,
# hence we don't divide by 2 and get 38*20 = 760, which is also the number of rows that we've got.

In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrape configuration: the URL prefix shared by every schedule page, the
# season labels to visit ('2018-2019' through '2023-2024') and the matchweek
# numbers 1..38 requested for each season.
base_url = 'https://www.worldfootball.net/schedule/eng-premier-league-'
seasons = ['{}-{}'.format(start, start + 1) for start in range(2018, 2024)]
matchweeks = range(1, 38 + 1)

# Accumulator for the scraped rows (one dict per table row)
data = list()

def clean_team_name(name):
    """Return the first line of ``name`` with surrounding whitespace removed.

    Everything from the first newline onwards is discarded, so any extra
    text that follows the team name on later lines of the cell is dropped.
    """
    first_line, _, _ = name.partition("\n")
    return first_line.strip()

# Scrape the league table shown on the page of every matchweek of every season.
# On each page the first "standard_tabelle" table whose header row equals
# expected_headers is used; each of its data rows is appended to `data` as a
# dict tagged with the matchweek and season it was read from.
expected_headers = ["#", "Team", "M.", "W", "D", "L", "goals", "Dif.", "Pt."]
request_timeout = 30  # seconds; requests.get has no default timeout and could hang forever
request_delay = 1     # seconds to pause after each page before the next request
skipped_pages = []    # (season, week) pairs that contributed no rows

for season in seasons:
    for week in matchweeks:
        url = f'{base_url}{season}-spieltag/{week}/'

        # One failed page must not abort the whole scrape: report it,
        # remember it, pause, and carry on with the next matchweek.
        try:
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Request failed for {season} matchweek {week}: {exc}")
            skipped_pages.append((season, week))
            time.sleep(request_delay)
            continue

        soup = BeautifulSoup(response.text, "lxml")

        # Pick the table whose headers identify it as the standings table
        target_table = None
        for table in soup.find_all("table", class_="standard_tabelle"):
            headers = [header.text.strip() for header in table.find_all('th')]
            if headers == expected_headers:
                target_table = table
                break

        if target_table is not None:
            for row in target_table.find_all('tr')[1:]:  # Skip the header row
                cells = row.find_all('td')
                # Cells are read up to index 9, so a row with fewer than ten
                # <td> elements (e.g. an empty row) cannot be parsed as a team row.
                if len(cells) < 10:
                    continue
                # cells[1] is intentionally not read (the rows have one more
                # <td> than the table has headers; presumably a crest/logo
                # column - verify against the page).
                data.append({
                    # Some rows have an empty rank cell (apparently teams level
                    # with the one above - verify). Store None instead of ''
                    # so that isnull() checks detect the gap.
                    'Position': cells[0].text.strip() or None,
                    'Team': clean_team_name(cells[2].text.strip()),
                    'Matches': cells[3].text.strip(),
                    'Wins': cells[4].text.strip(),
                    'Draws': cells[5].text.strip(),
                    'Losses': cells[6].text.strip(),
                    'Goals': cells[7].text.strip(),
                    'Difference': cells[8].text.strip(),
                    'Points': cells[9].text.strip(),
                    'Matchweek': week,
                    'Season': season
                })
            print(f"Data collected for {season} matchweek {week}")
        else:
            print(f"No target table found for {season} matchweek {week}")
            skipped_pages.append((season, week))

        # Delay the next request
        time.sleep(request_delay)  # Wait for 1 second

# Make gaps obvious: the progress log is long and a single miss is easy to overlook
if skipped_pages:
    print(f"WARNING: no data for {len(skipped_pages)} page(s): {skipped_pages}")

# Convert the data list to a DataFrame (all scraped values are strings;
# Matchweek is the integer loop counter)
df = pd.DataFrame(data)

# Display the DataFrame
df

Data collected for 2018-2019 matchweek 1
Data collected for 2018-2019 matchweek 2
Data collected for 2018-2019 matchweek 3
Data collected for 2018-2019 matchweek 4
Data collected for 2018-2019 matchweek 5
Data collected for 2018-2019 matchweek 6
Data collected for 2018-2019 matchweek 7
Data collected for 2018-2019 matchweek 8
Data collected for 2018-2019 matchweek 9
Data collected for 2018-2019 matchweek 10
Data collected for 2018-2019 matchweek 11
Data collected for 2018-2019 matchweek 12
Data collected for 2018-2019 matchweek 13
Data collected for 2018-2019 matchweek 14
Data collected for 2018-2019 matchweek 15
Data collected for 2018-2019 matchweek 16
Data collected for 2018-2019 matchweek 17
Data collected for 2018-2019 matchweek 18
Data collected for 2018-2019 matchweek 19
Data collected for 2018-2019 matchweek 20
Data collected for 2018-2019 matchweek 21
Data collected for 2018-2019 matchweek 22
Data collected for 2018-2019 matchweek 23
Data collected for 2018-2019 matchweek 24
D

Unnamed: 0,Position,Team,Matches,Wins,Draws,Losses,Goals,Difference,Points,Matchweek,Season
0,1,Liverpool FC,1,1,0,0,4:0,4,3,1,2018-2019
1,2,Chelsea FC,1,1,0,0,3:0,3,3,1,2018-2019
2,3,AFC Bournemouth,1,1,0,0,2:0,2,3,1,2018-2019
3,,Crystal Palace,1,1,0,0,2:0,2,3,1,2018-2019
4,,Manchester City,1,1,0,0,2:0,2,3,1,2018-2019
...,...,...,...,...,...,...,...,...,...,...,...
4555,16,Brentford FC,38,10,9,19,56:65,-9,39,38,2023-2024
4556,17,Nottingham Forest,38,9,9,20,49:67,-18,32,38,2023-2024
4557,18,Luton Town,38,6,8,24,52:85,-33,26,38,2023-2024
4558,19,Burnley FC,38,5,9,24,41:78,-37,24,38,2023-2024


In [22]:
# Re-display the scraped table; long frames are abbreviated to the first and last rows
df

Unnamed: 0,Position,Team,Matches,Wins,Draws,Losses,Goals,Difference,Points,Matchweek,Season
0,1,Liverpool FC,1,1,0,0,4:0,4,3,1,2018-2019
1,2,Chelsea FC,1,1,0,0,3:0,3,3,1,2018-2019
2,3,AFC Bournemouth,1,1,0,0,2:0,2,3,1,2018-2019
3,,Crystal Palace,1,1,0,0,2:0,2,3,1,2018-2019
4,,Manchester City,1,1,0,0,2:0,2,3,1,2018-2019
...,...,...,...,...,...,...,...,...,...,...,...
4555,16,Brentford FC,38,10,9,19,56:65,-9,39,38,2023-2024
4556,17,Nottingham Forest,38,9,9,20,49:67,-18,32,38,2023-2024
4557,18,Luton Town,38,6,8,24,52:85,-33,26,38,2023-2024
4558,19,Burnley FC,38,5,9,24,41:78,-37,24,38,2023-2024


In [21]:
# Preview the first 40 rows (with 20 teams per matchweek this is presumably
# the first two matchweeks of the first season - verify against the output)
df.head(40)

Unnamed: 0,Position,Team,Matches,Wins,Draws,Losses,Goals,Difference,Points,Matchweek,Season
0,1.0,Liverpool FC,1,1,0,0,4:0,4,3,1,2018-2019
1,2.0,Chelsea FC,1,1,0,0,3:0,3,3,1,2018-2019
2,3.0,AFC Bournemouth,1,1,0,0,2:0,2,3,1,2018-2019
3,,Crystal Palace,1,1,0,0,2:0,2,3,1,2018-2019
4,,Manchester City,1,1,0,0,2:0,2,3,1,2018-2019
5,,Watford FC,1,1,0,0,2:0,2,3,1,2018-2019
6,7.0,Manchester United,1,1,0,0,2:1,1,3,1,2018-2019
7,,Tottenham Hotspur,1,1,0,0,2:1,1,3,1,2018-2019
8,9.0,Everton FC,1,0,1,0,2:2,0,1,1,2018-2019
9,,Wolverhampton Wanderers,1,0,1,0,2:2,0,1,1,2018-2019


In [None]:
# Sanity checks on the scraped table.
# A notebook cell only renders its last expression, so the null counts are
# passed to display() explicitly; as a bare statement in the middle of the
# cell they would be computed and then silently discarded.
# (display is provided by IPython in every notebook session.)
display(df.isnull().sum())  # missing values per column

# Number of rows per team
df["Team"].value_counts()

In [None]:
# Rows per team, ordered alphabetically by team name instead of by count.
# The counts are re-ordered by sort_index() anyway, so value_counts() is
# told not to sort them by frequency first.
team_counts = df["Team"].value_counts(sort=False).sort_index()
team_counts

In [None]:
# Format some columns to align with the other dataframes, and drop columns that are not needed.
# Written as one non-mutating chain so the cell can be re-run safely:
# rename() ignores labels that have already been renamed, and errors='ignore'
# stops drop() from raising KeyError once 'Goals' has already been removed.
df = (
    df.rename(columns={'Team': 'HomeTeam', 'Difference': 'Goal_Difference'})
      .drop(columns=['Goals'], errors='ignore')
)

In [None]:
# Keep only the starting year of each season ('2018-2019' -> 2018) so that
# Season is a numerical value.
# astype(str) makes the cell safe to re-run: once Season is already an
# integer column, the .str accessor would otherwise raise.
df['Season'] = df['Season'].astype(str).str.split('-').str[0].astype(int)

In [None]:
# Preview the first 40 rows again (this cell sits after the column
# rename/drop and Season conversion cells, so it shows their result
# when the notebook is run top to bottom)
df.head(40)

In [None]:
# Missing values per column (isna is the pandas alias of isnull);
# note that empty strings are not counted as missing
df.isna().sum()

In [None]:
# List the column labels currently present in the DataFrame
df.columns

In [None]:
# Write the table to disk without the row index, then confirm on stdout
df.to_csv(path_or_buf='premier_league_data.csv', index=False)
print("Data scraping completed and saved to premier_league_data.csv")

In [None]:
# Number of rows per season; a complete season is expected to contribute
# 760 rows (38 matchweeks x 20 teams)
df["Season"].value_counts()

In [None]:
# Count rows whose Position is missing (NaN/None). Empty strings are not
# treated as missing by pandas, so blank-but-present values are not counted.
df["Position"].isna().sum()