## This file will focus strictly on scraping data from the website fbref.com


In [3]:
pip install requests beautifulsoup4 pandas




In [10]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os


### Defining functions


`get_team_data()` takes the path to a saved HTML file, parses the first `<table>` it contains, and returns the table rows as a DataFrame

In [16]:
def get_team_data(file_path):
    """Parse the first HTML table in a saved page into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Path to a UTF-8 encoded text file containing the page's HTML.

    Returns
    -------
    pandas.DataFrame
        One row per ``<tr>`` found in the table's ``<tbody>``. Every value is
        the stripped text of a ``<th>``/``<td>`` cell, so all values are
        strings. Column names are taken from the header cells' ``aria-label``
        attribute, falling back to the cell text when the attribute is absent.
        If the table has body rows but no header cells, pandas' default
        integer column labels are used.

    Raises
    ------
    ValueError
        If the HTML contains no ``<table>`` element, or (raised by pandas) if
        no set of header cells matches the number of cells per body row.
    """
    # Read the HTML content from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Only the first table in the document is used
    table = soup.find('table')
    if not table:
        raise ValueError("No table found in the HTML content")

    # Extract data from tbody: one list of stripped cell texts per row
    tbody = table.find('tbody')
    data = []
    if tbody:
        for row in tbody.find_all('tr'):
            data.append([cell.text.strip() for cell in row.find_all(['th', 'td'])])

    # Extract column names from thead
    thead = table.find('thead')
    header_cells = []
    if thead:
        header_cells = thead.find_all('th')
        header_rows = thead.find_all('tr')
        # pandas sizes the frame by the longest row, so that is the width the
        # column names have to match.
        width = max((len(row_data) for row_data in data), default=None)
        if header_rows and width is not None and len(header_cells) != width:
            # A multi-row header (e.g. a grouping row above the real labels)
            # yields more <th> cells than there are data columns. Use the
            # last header row instead, but only when it actually fits.
            last_row_cells = header_rows[-1].find_all('th')
            if len(last_row_cells) == width:
                header_cells = last_row_cells
    column_names = [th.get('aria-label', th.text.strip()) for th in header_cells]

    # Passing an empty column list together with non-empty rows makes pandas
    # raise a column-count mismatch; let it assign default labels instead.
    if data and not column_names:
        return pd.DataFrame(data)

    return pd.DataFrame(data, columns=column_names)
    
    
    

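`get_all_seasons()` loops through the per-season `.txt` files (`2017-2018.txt` through `2023-2024.txt`), extracts each table with `get_team_data()`, tags the rows with a `Season` column, and concatenates everything into a single DataFrame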

In [20]:
def get_all_seasons(seasons=None, data_dir=''):
    """Load one table per season and stack them into a single DataFrame.

    For every season label the file ``<data_dir>/<season>.txt`` is parsed with
    ``get_team_data`` and a ``Season`` column holding the label is added.

    Parameters
    ----------
    seasons : iterable of str, optional
        Season labels, which are also the file stems. Defaults to the seven
        seasons from '2017-2018' through '2023-2024'.
    data_dir : str, optional
        Directory containing the season files. The default ``''`` resolves
        the files relative to the current working directory.

    Returns
    -------
    pandas.DataFrame
        All seasons concatenated in the given order with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``pandas.concat`` when ``seasons`` is empty; errors raised
        by ``get_team_data`` (e.g. a missing file) propagate unchanged.
    """
    if seasons is None:
        seasons = ['2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024']

    all_seasons = []
    for season in seasons:
        # os.path.join('', name) returns name unchanged, so the default
        # data_dir keeps the original working-directory lookup.
        path = os.path.join(data_dir, f'{season}.txt')
        current_season = get_team_data(path)
        current_season['Season'] = season  # Add a season column
        all_seasons.append(current_season)

    return pd.concat(all_seasons, ignore_index=True)

In [7]:
# TODO: implement get_player_data(url)

### Getting team data from the website

In [21]:
# Build the combined table covering every season handled by get_all_seasons()
seasons_data = get_all_seasons()
# Write the combined table to a CSV file, leaving out the DataFrame index
output_path = 'seasons_data.csv'
seasons_data.to_csv(output_path, index=False)
# Preview the first rows of the combined table
print(seasons_data.head())

  Rk            Squad  Country Rank Matches Played Wins Draws Losses  \
0  1  Manchester City  eng ENG    1             38   32     4      2   
1  2         Juventus   it ITA    1             38   30     5      3   
2  3    Bayern Munich   de GER    1             34   27     3      4   
3  4        Paris S-G   fr FRA    1             38   29     6      3   
4  5        Barcelona   es ESP    1             38   28     9      1   

  Goals For Goals Against  ... Points Points/Match    xG xG Allowed  \
0       106            27  ...    100         2.63  78.6       23.8   
1        86            24  ...     95         2.50  55.3       27.4   
2        92            28  ...     84         2.47  75.8       31.7   
3       108            29  ...     93         2.45  86.1       34.3   
4        99            29  ...     93         2.45  83.5       41.9   

  xG Difference xG Difference/90 Attendance/Game          Top Team Scorer  \
0         +54.8            +1.44          53,812       Sergio A

### Getting Player data
