In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
# Season-reviews page that the later cells scrape.
url = 'https://www.premierleague.com/history/season-reviews'
# Bound the request: without a timeout, requests waits indefinitely on an
# unresponsive server and the notebook would hang on this cell.
res = requests.get(url, timeout=30)     # Response object
res.raise_for_status()      # Raises an HTTPError if the server answered with a 4xx/5xx status

In [3]:
# Parse the response body into a BeautifulSoup tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=res.text, features='html.parser')

In [4]:
# CSS selectors for the scraped page; the later cells look them up by position.
selectors = [
    '.history-season-reviews__heading-main',  # [0] season labels
    '.history-season-reviews__heading',       # [1] champion names
    '.tableBodyContainer > tr',               # [2] table rows holding the per-season stats
    '.history-season-reviews__stat',          # [3] review stat entries
]

In [5]:
# Season labels: the text of every element matched by the first selector.
yearElemList = soup.select(selectors[0])
yearList = [heading.get_text() for heading in yearElemList]
print(yearList)

['2020/21', '2019/20', '2018/19', '2017/18', '2016/17', '2015/16', '2014/15', '2013/14', '2012/13', '2011/12', '2010/11', '2009/10', '2008/09', '2007/08', '2006/07', '2005/06', '2004/05', '2003/04', '2002/03', '2001/02', '2000/01', '1999/00', '1998/99', '1997/98', '1996/97', '1995/96', '1994/95', '1993/94', '1992/93']


In [6]:
# Champion names: the text of every element matched by the second selector.
championElemList = soup.select(selectors[1])
championList = [heading.get_text() for heading in championElemList]
print(championList)

['Manchester City', 'Liverpool', 'Manchester City', 'Manchester City', 'Chelsea', 'Leicester City', 'Chelsea', 'Manchester City', 'Manchester United', 'Manchester City', 'Manchester United', 'Chelsea', 'Manchester United', 'Manchester United', 'Manchester United', 'Chelsea', 'Chelsea', 'Arsenal', 'Manchester United', 'Arsenal', 'Manchester United', 'Manchester United', 'Manchester United', 'Arsenal', 'Manchester United', 'Manchester United', 'Blackburn Rovers', 'Manchester United', 'Manchester United']


In [7]:
# The table statistics list contains the next 5 columns of data.
# Each matched row's text is split into its newline-separated cells, then the
# rows are transposed so that tableStatsList[i] is the i-th column across all seasons.
tableStatsElemList = soup.select(selectors[2])
tableStatsList = list(zip(*(
    row.get_text().strip().split('\n')
    for row in tableStatsElemList
)))

In [8]:
# Column 0 of the transposed table stats, copied from a tuple into a list.
gamesPlayedList = [*tableStatsList[0]]
print(gamesPlayedList)

['38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '38', '42', '42', '42']


In [9]:
# Column 1 of the transposed table stats, copied from a tuple into a list.
gamesWonList = [*tableStatsList[1]]
print(gamesWonList)

['27', '32', '32', '32', '30', '23', '26', '27', '28', '28', '23', '27', '28', '27', '28', '29', '29', '26', '25', '26', '24', '28', '22', '23', '21', '25', '27', '27', '24']


In [10]:
# Column 2 of the transposed table stats, copied from a tuple into a list.
gamesDrawnList = [*tableStatsList[2]]
print(gamesDrawnList)

['5', '3', '2', '4', '3', '12', '9', '5', '5', '5', '11', '5', '6', '6', '5', '4', '8', '12', '8', '9', '8', '7', '13', '9', '12', '7', '8', '11', '12']


In [11]:
# Column 3 of the transposed table stats, copied from a tuple into a list.
gamesLostList = [*tableStatsList[3]]
print(gamesLostList)

['6', '3', '4', '2', '5', '3', '3', '6', '5', '5', '4', '6', '4', '5', '5', '5', '1', '0', '5', '3', '6', '3', '3', '6', '5', '6', '7', '4', '6']


In [12]:
# Column 4 of the transposed table stats, copied from a tuple into a list.
pointsList = [*tableStatsList[4]]
print(pointsList)

['86', '99', '98', '100', '93', '81', '87', '86', '89', '89', '80', '86', '90', '87', '89', '91', '95', '90', '83', '87', '80', '91', '79', '78', '75', '82', '89', '92', '84']


In [13]:
# The review statistics list contains the last 2 columns of data.
# Each matched element's text is split into its newline-separated parts.
reviewStatsElemList = soup.select(selectors[3])
reviewStatsList = [
    stat.get_text().strip().split('\n')
    for stat in reviewStatsElemList
]

In [14]:
# Entries at even positions (0, 2, 4, ...) are taken as the top-scorer stats.
# The first item of each entry is dropped (presumably the stat's label; confirm
# against the page markup), leaving only the player names.
topGoalScorerList = [entry[1:] for entry in reviewStatsList[0::2]]
print(topGoalScorerList)

[['Harry Kane'], ['Jamie Vardy'], ['Pierre-Emerick Aubameyang', 'Sadio Mané', 'Mohamed Salah'], ['Mohamed Salah'], ['Harry Kane'], ['Harry Kane'], ['Sergio Agüero'], ['Luis Suárez'], ['Robin van Persie'], ['Robin van Persie'], ['Dimitar Berbatov', 'Carlos Tevez'], ['Didier Drogba'], ['Nicolas Anelka'], ['Cristiano Ronaldo'], ['Didier Drogba'], ['Thierry Henry'], ['Thierry Henry'], ['Thierry Henry'], ['Ruud van Nistelrooy'], ['Thierry Henry'], ['Jimmy Floyd Hasselbaink'], ['Kevin Phillips'], ['Jimmy Floyd Hasselbaink', 'Michael Owen', 'Dwight Yorke'], ['Dion Dublin', 'Michael Owen', 'Chris Sutton'], ['Alan Shearer'], ['Alan Shearer'], ['Alan Shearer'], ['Andrew Cole'], ['Teddy Sheringham']]


In [15]:
# Entries at odd positions (1, 3, 5, ...) are taken as the clean-sheet stats.
# The first item of each entry is dropped (presumably the stat's label; confirm
# against the page markup), leaving only the player names.
mostCleanSheetsList = [entry[1:] for entry in reviewStatsList[1::2]]
print(mostCleanSheetsList)

[['Ederson'], ['Ederson'], ['Alisson'], ['David de Gea'], ['Thibaut Courtois'], ['Petr Cech'], ['Joe Hart'], ['Petr Cech', 'Wojciech Szczesny'], ['Joe Hart'], ['Joe Hart'], ['Joe Hart'], ['Petr Cech', 'Pepe Reina'], ['Edwin van der Sar'], ['Pepe Reina'], ['Pepe Reina'], ['Pepe Reina'], ['Petr Cech'], ['Jens Lehmann', 'Edwin van der Sar'], ['Brad Friedel'], ['Nigel Martyn'], ['Fabien Barthez', 'Paul Jones', 'Sander Westerveld'], ['Ed de Goey'], ['David Seaman'], ['Peter Schmeichel'], ['Nigel Martyn'], ['Peter Schmeichel'], ['Peter Schmeichel'], ['David Seaman'], ['Bobby Mimms']]


In [16]:
# Column name -> per-season values; key order here is the column order of the DataFrame.
data_dict = {
    'SeasonYear': yearList,
    'Champion': championList,
    'GamesPlayed': gamesPlayedList,
    'GamesWon': gamesWonList,
    'GamesDrawn': gamesDrawnList,
    'GamesLost': gamesLostList,
    'Points': pointsList,
    'TopGoalScorer': topGoalScorerList,
    'MostCleanSheets': mostCleanSheetsList,
}

In [17]:
# Assemble the per-season columns into a single table and display it.
df = pd.DataFrame(data=data_dict)
df

Unnamed: 0,SeasonYear,Champion,GamesPlayed,GamesWon,GamesDrawn,GamesLost,Points,TopGoalScorer,MostCleanSheets
0,2020/21,Manchester City,38,27,5,6,86,[Harry Kane],[Ederson]
1,2019/20,Liverpool,38,32,3,3,99,[Jamie Vardy],[Ederson]
2,2018/19,Manchester City,38,32,2,4,98,"[Pierre-Emerick Aubameyang, Sadio Mané, Mohame...",[Alisson]
3,2017/18,Manchester City,38,32,4,2,100,[Mohamed Salah],[David de Gea]
4,2016/17,Chelsea,38,30,3,5,93,[Harry Kane],[Thibaut Courtois]
5,2015/16,Leicester City,38,23,12,3,81,[Harry Kane],[Petr Cech]
6,2014/15,Chelsea,38,26,9,3,87,[Sergio Agüero],[Joe Hart]
7,2013/14,Manchester City,38,27,5,6,86,[Luis Suárez],"[Petr Cech, Wojciech Szczesny]"
8,2012/13,Manchester United,38,28,5,5,89,[Robin van Persie],[Joe Hart]
9,2011/12,Manchester City,38,28,5,5,89,[Robin van Persie],[Joe Hart]


In [18]:
# Write the table to disk; index=False leaves the integer row index out of the file.
df.to_csv(path_or_buf='premier-league-season-review.csv', index=False)