# Teams: League Table & Scores Scraper

Importing packages that we are going to use:

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

### Class: Scrape_Teams_Data

First, we write a class "Scrape_Teams_Data" with two functions for scraping data from our website. The class takes two arguments: url_table (the url where we can find the league table) and url_scores (where we can find scores). There is always a different url for each season. The first function "league_table" scrapes data from the first url and creates a Pandas dataframe with the league table data. The second function "scores" takes a logical argument "regular", and returns a Pandas dataframe with scores from all games in the given season. The argument "regular" is True if the season which we are scraping was regular and False if it was not regular. Regular season means there were no extra rounds after all teams had played against each other twice. Irregular seasons usually include extra championship and relegation rounds. The argument makes sure we follow the correct html structure of the website because regular seasons' websites have a slightly different structure than the irregular ones.

In [2]:
class Scrape_Teams_Data:
    """Scrape one season's league table and match scores into DataFrames.

    Each page is downloaded with ``requests``, parsed with BeautifulSoup's
    ``html.parser``, and the ``<table>`` inside the first
    ``<div class="table_container">`` is read row by row. Individual cells
    are located through their ``data-stat`` attribute and their text is
    stored unchanged, so every column of the returned frames holds strings.

    Args:
        url_table: URL of the page containing the league table.
        url_scores: URL of the page containing the scores and fixtures.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection would block forever.
    """

    # (data-stat attribute, DataFrame column label), in output column order.
    _TABLE_COLUMNS = (
        ("rank", "Rank"),
        ("squad", "Team"),
        ("games", "Games"),
        ("wins", "Wins"),
        ("draws", "Draws"),
        ("losses", "Losses"),
        ("goals_for", "Goals for"),
        ("goals_against", "Goals against"),
        ("goal_diff", "Goal Difference"),
        ("points", "Points"),
        ("last_5", "Last 5 Games"),
        ("attendance_per_g", "Attendance per Game"),
        ("top_team_scorers", "Top Team Scorers"),
        ("top_keeper", "Top Goalkeeper"),
    )
    _SCORES_COLUMNS = (
        ("gameweek", "Game Week"),
        ("dayofweek", "Weekday"),
        ("date", "Date"),
        ("time", "Time"),
        ("squad_a", "Home Team"),
        ("score", "Score"),
        ("squad_b", "Away Team"),
        ("attendance", "Attendance"),
        ("venue", "Venue"),
        ("referee", "Referee"),
    )

    def __init__(self, url_table, url_scores, timeout=30):
        self.url_table = url_table
        self.url_scores = url_scores
        self.timeout = timeout

    def _fetch_table(self, url):
        """Download *url* and return the table inside its first table container.

        Raises:
            requests.HTTPError: if the server answers with an error status
                (e.g. the scraper was blocked or rate limited).
            AttributeError: if the page has no ``div.table_container``.
        """
        response = requests.get(url, timeout=self.timeout)
        # Fail here with the real HTTP status instead of later with an
        # unrelated "'NoneType' object has no attribute 'table'".
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.find("div", {'class': 'table_container'}).table

    @staticmethod
    def _table_to_frame(html_table, columns, header_stat, header_tag="th"):
        """Collect the text of the requested cells of every row into a frame.

        Args:
            html_table: Parsed ``<table>`` element to read.
            columns: ``(data-stat, label)`` pairs; one output column each.
            header_stat: The ``data-stat`` whose cell uses ``header_tag``
                instead of ``td``.
            header_tag: Tag name of that cell (``"th"`` or ``"td"``).

        Raises:
            AttributeError: if a row lacks one of the requested cells.
        """
        data = {label: [] for _, label in columns}
        for body in html_table.find_all("tbody"):
            for row in body.find_all("tr"):
                for stat, label in columns:
                    tag = header_tag if stat == header_stat else "td"
                    data[label].append(row.find(tag, {'data-stat': stat}).text)
        return pd.DataFrame(data)

    def league_table(self):
        """Return the league table found at ``url_table`` as a DataFrame.

        One row per table row; the rank is read from the row's ``th`` cell
        and every other column from a ``td`` cell.
        """
        html_table = self._fetch_table(self.url_table)
        return self._table_to_frame(html_table, self._TABLE_COLUMNS, "rank")

    def scores(self, regular=True):
        """Return the scores table found at ``url_scores`` as a DataFrame.

        Args:
            regular: Selects the page layout. When True the game week is
                read from the row's ``th`` cell; when False it is read from
                a ``td`` cell. All other columns are read the same way in
                both layouts, so the returned frame has identical columns.
        """
        html_table = self._fetch_table(self.url_scores)
        gameweek_tag = "th" if regular else "td"
        return self._table_to_frame(
            html_table, self._SCORES_COLUMNS, "gameweek", gameweek_tag
        )

Now we take the urls of the pages we want to scrape and pass them as arguments to the class defined above. We call the function "league_table" to obtain a dataframe with the current season's league table data. To be able to work with the scraped data, we also wrote it to a csv file, which we are going to read in other notebooks.

In [3]:
# Current-season pages: league table first, scores and fixtures second.
urls = [
    "https://fbref.com/en/comps/66/Czech-First-League-Stats",
    "https://fbref.com/en/comps/66/schedule/Czech-First-League-Scores-and-Fixtures",
]
table_url, scores_url = urls
table = Scrape_Teams_Data(table_url, scores_url).league_table()
# Export for the other notebooks (disabled; the path is machine specific):
#table.to_csv(r"C:\Users\Honza Stuchlík\Documents\IES\Data Processing in Python\Czech-Football-League\league_table.csv", index = False)
table

Unnamed: 0,Rank,Team,Games,Wins,Draws,Losses,Goals for,Goals against,Goal Difference,Points,Last 5 Games,Attendance per Game,Top Team Scorers,Top Goalkeeper
0,1,Slavia Prague,15,13,2,0,42,6,36,41,W W W W W,2019,Abdallah Sima - 8,Ondřej Kolář
1,2,Sparta Prague,16,11,2,3,33,18,15,35,W D D W W,1445,Lukáš Juliš - 11,Florin Niță
2,3,Jablonec,16,11,2,3,33,16,17,35,D W W W W,722,Ivan Schranz - 6,Jan Hanuš
3,4,Slovácko,16,9,3,4,30,17,13,30,W W W W W,498,Jan Kliment - 6,Vít Nemrava
4,5,Sigma Olomouc,16,6,7,3,25,19,6,25,D D L L W,819,David Houska - 4,Aleš Mandous
5,6,Slovan Liberec,15,7,4,4,24,16,8,25,W W D D W,984,Michael Rabušic - 7,Filip Nguyen
6,7,Viktoria Plzeň,16,7,3,6,30,21,9,24,W L L D W,1333,"Aleš Čermák, Jean-David Beauguel - 6",Aleš Hruška
7,8,České Budĕjov.,16,6,6,4,23,23,0,24,D L W W W,471,"Benjamin Čolić, Patrik Brandner - 5",Jaroslav Drobný
8,9,Baník Ostrava,15,6,5,4,20,13,7,23,D W D L D,1579,Dyjan Carlos De Azevedo - 6,Jan Laštůvka
9,10,FK Pardubice,16,6,4,6,15,19,-4,22,L D W L L,589,David Huf - 5,Marek Boháč


Now, we call the function "scores" on the same class. The season was regular, which is the default setting, so we do not need to specify the argument "regular". This time we obtain a dataframe with scores from the current season.

In [4]:
# Same two pages as before; `regular` is left at its default of True.
table_url, scores_url = urls
scores = Scrape_Teams_Data(table_url, scores_url).scores()
scores

Unnamed: 0,Game Week,Weekday,Date,Time,Home Team,Score,Away Team,Attendance,Venue,Referee
0,1,Fri,2020-08-21,18:00,Viktoria Plzeň,3–1,Opava,2813,Doosan Arena,Alex Denev
1,1,Sat,2020-08-22,17:00,Fastav Zlín,1–2,Slovácko,1282,Stadion Letná,Pavel Královec
2,1,Sat,2020-08-22,17:00,Příbram,1–3,Teplice,1350,Energon Aréna,Ondřej Berka
3,1,Sat,2020-08-22,17:00,Sigma Olomouc,1–0,Slovan Liberec,2216,Andrův stadion,Paval Julínek
4,1,Sat,2020-08-22,19:30,Zbrojovka Brno,1–4,Sparta Prague,2500,Městský fotbalový stadion Srbská,Pavel Franek
...,...,...,...,...,...,...,...,...,...,...
343,34,Fri,2021-05-28,,Mladá Boleslav,,Bohemians 1905,,Adidas Aréna,
344,34,Fri,2021-05-28,,Sparta Prague,,Zbrojovka Brno,,Generali Arena,
345,34,Fri,2021-05-28,,Slovácko,,Fastav Zlín,,Městský fotbalový stadion Miroslava Vale...,
346,34,Fri,2021-05-28,,Slavia Prague,,České Budĕjov.,,Sinobo Stadium,


The second url represents an "irregular" season. This time we specify the argument "regular" as False, and we get a dataframe in the same format as for a regular season.

In [5]:
# The 2018-2019 pages use the "irregular" layout, so regular=False is required.
season_1819 = Scrape_Teams_Data(
    "https://fbref.com/en/comps/66/2427/2018-2019-Czech-First-League-Stats",
    "https://fbref.com/en/comps/66/2427/schedule/2018-2019-Czech-First-League-Scores-and-Fixtures",
)
scores = season_1819.scores(regular=False)
scores

Unnamed: 0,Game Week,Weekday,Date,Time,Home Team,Score,Away Team,Attendance,Venue,Referee
0,1,Fri,2018-07-20,18:30,Dukla Prague,1–3,Viktoria Plzeň,5039,Stadion Juliska,Pavel Franek
1,1,Sat,2018-07-21,17:00,Příbram,1–1,Teplice,3758,Energon Aréna,Radek Příhoda
2,1,Sat,2018-07-21,17:00,Fastav Zlín,3–2,Mladá Boleslav,3852,Stadion Letná,Zbyněk Proske
3,1,Sat,2018-07-21,17:00,Slovan Liberec,1–0,Karviná,4548,Stadion u Nisy,Paval Julínek
4,1,Sat,2018-07-21,19:00,Sparta Prague,2–0,Opava,12186,Generali Arena,Pavel Královec
...,...,...,...,...,...,...,...,...,...,...
306,5,Sat,2019-05-25,17:00,Slovácko,2–0,Karviná,3124,Městský fotbalový stadion Miroslava Vale...,Cieslar Ondřej
307,5,Sat,2019-05-25,17:00,Příbram,3–2,Dukla Prague,1266,Energon Aréna,Miroslav Zelinka
308,5,Sun,2019-05-26,15:00,Jablonec,1–0,Slovan Liberec,3410,Stadion Střelnice,Ondřej Pechanec
309,5,Sun,2019-05-26,15:00,Slavia Prague,2–1,Sparta Prague,17000,Sinobo Stadium,Ondřej Berka


Since we get a dataframe in the same format for both regular and irregular seasons, we can easily append the dataframes for all seasons that we scraped into a single dataframe to make our work easier. The result is again written into a csv file, so that we can load it and process it in another notebook.

In [6]:
# Scrape every season separately; 2018-2019 and 2019-2020 use the
# "irregular" page layout and therefore need regular=False.
scores_1516 = Scrape_Teams_Data("https://fbref.com/en/comps/66/1459/2015-2016-Czech-First-League-Stats", "https://fbref.com/en/comps/66/1459/schedule/2015-2016-Czech-First-League-Scores-and-Fixtures").scores()
scores_1617 = Scrape_Teams_Data("https://fbref.com/en/comps/66/1518/2016-2017-Czech-First-League-Stats", "https://fbref.com/en/comps/66/1518/schedule/2016-2017-Czech-First-League-Scores-and-Fixtures").scores()
scores_1718 = Scrape_Teams_Data("https://fbref.com/en/comps/66/1623/2017-2018-Czech-First-League-Stats", "https://fbref.com/en/comps/66/1623/schedule/2017-2018-Czech-First-League-Scores-and-Fixtures").scores()
scores_1819 = Scrape_Teams_Data("https://fbref.com/en/comps/66/2427/2018-2019-Czech-First-League-Stats", "https://fbref.com/en/comps/66/2427/schedule/2018-2019-Czech-First-League-Scores-and-Fixtures").scores(regular = False)
scores_1920 = Scrape_Teams_Data("https://fbref.com/en/comps/66/3226/2019-2020-Czech-First-League-Stats", "https://fbref.com/en/comps/66/3226/schedule/2019-2020-Czech-First-League-Scores-and-Fixtures").scores(regular = False)
scores_2021 = Scrape_Teams_Data("https://fbref.com/en/comps/66/Czech-First-League-Stats", "https://fbref.com/en/comps/66/schedule/Czech-First-League-Scores-and-Fixtures").scores()
scores_dfs = [scores_1516, scores_1617, scores_1718, scores_1819, scores_1920, scores_2021]

# Stack all seasons in chronological order with a fresh 0..n-1 index.
# pd.concat replaces the DataFrame.append loop: append was removed in
# pandas 2.0, and it copied the accumulated frame on every iteration.
scores = pd.concat(scores_dfs, ignore_index=True, sort=False)
# Export for the other notebooks (disabled; the path is machine specific):
#scores.to_csv(r"C:\Users\Honza Stuchlík\Documents\IES\Data Processing in Python\Czech-Football-League\scores.csv", index = False)
scores

Unnamed: 0,Game Week,Weekday,Date,Time,Home Team,Score,Away Team,Attendance,Venue,Referee
0,1,Fri,2015-07-24,17:30,Viktoria Plzeň,2–1,Slavia Prague,11233,Doosan Arena,Pavel Franek
1,1,Fri,2015-07-24,19:00,Vysočina Jihlava,0–0,Sparta Prague,3894,Stadion v Jiráskově ulici,Tomas Kocourek
2,1,Sat,2015-07-25,17:00,Příbram,2–3,Jablonec,4182,Energon Aréna,Pavel Královec
3,1,Sat,2015-07-25,17:00,Slovácko,4–3,Dukla Prague,3726,Městský fotbalový stadion Miroslava Vale...,Zbyněk Proske
4,1,Sat,2015-07-25,17:00,Zbrojovka Brno,2–1,Baník Ostrava,5326,Městský fotbalový stadion Srbská,Libor Kovařík
...,...,...,...,...,...,...,...,...,...,...
1732,34,Fri,2021-05-28,,Mladá Boleslav,,Bohemians 1905,,Adidas Aréna,
1733,34,Fri,2021-05-28,,Sparta Prague,,Zbrojovka Brno,,Generali Arena,
1734,34,Fri,2021-05-28,,Slovácko,,Fastav Zlín,,Městský fotbalový stadion Miroslava Vale...,
1735,34,Fri,2021-05-28,,Slavia Prague,,České Budĕjov.,,Sinobo Stadium,
