In [3]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

In [7]:
import requests

### Prototyping Scraping on Nicola Mirotic

In [4]:
url = "https://www.euroleague.net/competition/players/showplayer?pcode=000796&seasoncode=E2021#!currentstats"

In [8]:
html = requests.get(url)

In [14]:
html.text;

In [17]:
soup = BeautifulSoup(html.text, "html5lib")

#### Grabbing player's name

In [90]:
player_data = soup.find_all(class_="player-data")

In [91]:
player_data[0].find_all(class_="name")[0].get_text()

'MIROTIC, NIKOLA'

#### Grabbing player's height and DOB

In [108]:
second_summary_spans = player_data[0].find_all(class_='summary-second')[0].find_all("span")

In [109]:
second_summary_spans[0].get_text().split(":")[1]

' 2.08'

In [110]:
second_summary_spans[1].get_text().split(":")[1]

' 11 February, 1991'

#### Grabbing player's position

In [96]:
first_summary_spans = player_data[0].find_all(class_="summary-first")[0].find_all("span")

In [106]:
first_summary_spans[1].find_all("span")[1].get_text()

'Forward'

#### Grabbing player's stats

In [73]:
stats_table = soup.find('table', id = "tblPlayerPhaseStatistics" )

In [85]:
stats_table.find_all(class_="PlayerGridRow")[0].find_all("td") # Stats

[<td class="PlayerTitleColumn">Totals</td>,
 <td>9</td>,
 <td>9</td>,
 <td>225:39</td>,
 <td>149</td>,
 <td>33/51</td>,
 <td>18/35</td>,
 <td>29/34</td>,
 <td>12</td>,
 <td>39</td>,
 <td>51</td>,
 <td>12</td>,
 <td>12</td>,
 <td>9</td>,
 <td>3</td>,
 <td>1</td>,
 <td>12</td>,
 <td>29</td>,
 <td>194</td>]

In [135]:
stats = stats_table.find_all(class_="PlayerGridRow")[0].find_all("td")

In [136]:
for stat in stats:
    print(stat.get_text())

Totals
9
9
225:39
149
33/51
18/35
29/34
12
39
51
12
12
9
3
1
12
29
194


In [137]:
len(stats)

19

In [131]:
headers = stats_table.find_all("tr")[1].find_all("th") # headers

In [132]:
for header in headers[1:]:
    print(header.get_text())

G
GS
Min
Pts
2FG
3FG
FT
O
D
T
As
St
To
Fv
Ag
Cm
Rv
PIR


In [138]:
len(headers)

19

#### Create dictionary with player's stats

In [190]:
def player_dictionary_creation(soup):
    player_data_dict = {}
    player_data = soup.find_all(class_="player-data")
    name_of_player = player_data[0].find_all(class_="name")[0].get_text().split(",")
    player_data_dict['name'] = name_of_player[1] + " " + name_of_player[0]
    first_summary_spans = player_data[0].find_all(class_="summary-first")[0].find_all("span")
    player_data_dict['position'] = first_summary_spans[1].find_all("span")[1].get_text()
    second_summary_spans = player_data[0].find_all(class_='summary-second')[0].find_all("span")
    player_data_dict['height'] = second_summary_spans[0].get_text().split(":")[1]
    player_data_dict['dob'] = second_summary_spans[1].get_text().split(":")[1]
    stats_table = soup.find('table', id = "tblPlayerPhaseStatistics" )
    stats = stats_table.find_all(class_="PlayerGridRow")[0].find_all("td")
    headers = stats_table.find_all("tr")[1].find_all("th") 
    for i, (stat, header) in enumerate(zip(stats,headers)):
        if i!=0:
            player_data_dict[header.get_text()] = stat.get_text()
    return player_data_dict

In [191]:
Mirotic_dict = player_dictionary_creation(soup)

In [192]:
Mirotic_dict

{'name': ' NIKOLA MIROTIC',
 'position': 'Forward',
 'height': ' 2.08',
 'dob': ' 11 February, 1991',
 'G': '9',
 'GS': '9',
 'Min': '225:39',
 'Pts': '149',
 '2FG': '33/51',
 '3FG': '18/35',
 'FT': '29/34',
 'O': '12',
 'D': '39',
 'T': '51',
 'As': '12',
 'St': '12',
 'To': '9',
 'Fv': '3',
 'Ag': '1',
 'Cm': '12',
 'Rv': '29',
 'PIR': '194'}

### Grabbing all players from leaders table

#### Are we allowed to do it?

In [254]:
robots = requests.get("https://www.euroleague.net/robots.txt")

In [257]:
BeautifulSoup(robots.text,"html5lib")

<html><head></head><body>User-agent: Sosospider
Disallow: /

User-agent: Yandex
Disallow: /

User-agent: Baiduspider
Disallow: /

User-agent: *
Crawl-delay: 15</body></html>

#### We are allowed to scrape

In [193]:
url_leaders_1 = "https://www.euroleague.net/main/statistics?agg=PerGame&mode=Leaders&seasonmode=Single&entity=Players&cat=Score&seasoncode=E2021&page=1"
url_leaders_2 = "https://www.euroleague.net/main/statistics?agg=PerGame&mode=Leaders&seasonmode=Single&entity=Players&cat=Score&seasoncode=E2021&page=2"

In [242]:
html_leaders_2 = requests.get(url_leaders_2)

In [243]:
html_leaders_2.text;

In [244]:
soup_leaders_2 = BeautifulSoup(html_leaders_2.text, "html5lib")

#### Grabbing the url of each player

In [275]:
player_tags_even = soup_leaders_1.find_all("table")[0].find_all("tr",class_="StatsAlternatingGridResults")

In [278]:
player_tags_odd = soup_leaders_1.find_all("table")[0].find_all("tr",class_="StatsRowAlternatingGridResults")

In [282]:
player_tags_odd[3].find_all("a")[0].get('href')

'/competition/players/showplayer?pcode=008989&seasoncode=E2021'

In [283]:
for tag in player_tags_odd:
    print(tag.find_all("a")[0].get('href'))

/competition/players/showplayer?pcode=011212&seasoncode=E2021
/competition/players/showplayer?pcode=002580&seasoncode=E2021
/competition/players/showplayer?pcode=000796&seasoncode=E2021
/competition/players/showplayer?pcode=008989&seasoncode=E2021
/competition/players/showplayer?pcode=007032&seasoncode=E2021
/competition/players/showplayer?pcode=005985&seasoncode=E2021
/competition/players/showplayer?pcode=003108&seasoncode=E2021
/competition/players/showplayer?pcode=007870&seasoncode=E2021
/competition/players/showplayer?pcode=009006&seasoncode=E2021
/competition/players/showplayer?pcode=009754&seasoncode=E2021
/competition/players/showplayer?pcode=004866&seasoncode=E2021
/competition/players/showplayer?pcode=007831&seasoncode=E2021
/competition/players/showplayer?pcode=000925&seasoncode=E2021
/competition/players/showplayer?pcode=004554&seasoncode=E2021
/competition/players/showplayer?pcode=009866&seasoncode=E2021
/competition/players/showplayer?pcode=007200&seasoncode=E2021
/competi

#### Putting it all together

In [252]:
base_url = "https://www.euroleague.net"
leaders_urls = [url_leaders_1,url_leaders_2]

In [290]:
def players_urls():
    players_url_list = []
    for url in leaders_urls:
        html_leaders = requests.get(url)
        soup_leaders = BeautifulSoup(html_leaders.text, "html5lib")
        player_tags_even = soup_leaders.find_all("table")[0].find_all("tr",class_="StatsAlternatingGridResults")
        player_tags_odd = soup_leaders.find_all("table")[0].find_all("tr",class_="StatsRowAlternatingGridResults")
        player_tags = player_tags_even + player_tags_odd
        players_url_list.extend([tag.find_all("a")[0].get('href') for tag in player_tags])
    return players_url_list

In [291]:
players_url_list = players_urls()

In [292]:
len(players_url_list)

76

In [293]:
def create_soup(url):
    html = requests.get(url)
    soup = BeautifulSoup(html.text, "html5lib")
    return soup

In [294]:
def list_of_player_dicts():
    player_list = []
    players_url_list = players_urls()
    for relative_url in players_url_list:
        absolute_url = base_url + relative_url
        soup = create_soup(absolute_url)
        player_dict = player_dictionary_creation(soup)
        player_list.append(player_dict)
    return player_list

In [295]:
Player_List = list_of_player_dicts()

In [296]:
len(Player_List)

76

In [317]:
Player_List[20:30]

[{'name': ' SASHA VEZENKOV',
  'position': 'Forward',
  'height': ' 2.06',
  'dob': ' 6 August, 1995',
  'G': '9',
  'GS': '9',
  'Min': '260:42',
  'Pts': '93',
  '2FG': '23/44',
  '3FG': '11/38',
  'FT': '14/19',
  'O': '13',
  'D': '36',
  'T': '49',
  'As': '5',
  'St': '10',
  'To': '4',
  'Fv': '3',
  'Ag': '3',
  'Cm': '13',
  'Rv': '22',
  'PIR': '109'},
 {'name': ' MALCOLM DELANEY',
  'position': 'Guard',
  'height': ' 1.91',
  'dob': ' 11 March, 1989',
  'G': '5',
  'GS': '5',
  'Min': '120:53',
  'Pts': '51',
  '2FG': '7/17',
  '3FG': '8/21',
  'FT': '13/18',
  'O': '2',
  'D': '10',
  'T': '12',
  'As': '16',
  'St': '2',
  'To': '9',
  'Fv': '0',
  'Ag': '1',
  'Cm': '9',
  'Rv': '15',
  'PIR': '49'},
 {'name': ' JANIS STRELNIEKS',
  'position': 'Guard',
  'height': ' 1.91',
  'dob': ' 1 September, 1989',
  'G': '7',
  'GS': '7',
  'Min': '166:17',
  'Pts': '71',
  '2FG': '11/18',
  '3FG': '14/32',
  'FT': '7/8',
  'O': '5',
  'D': '15',
  'T': '20',
  'As': '12',
  'St': 

#### Save the list as json file

In [1]:
import json

In [319]:
with open("Players.json","w") as f:
    json.dump(Player_List,f)

In [2]:
import pandas as pd

In [4]:
f = open("Players.json")

In [5]:
player_list_from_json = json.load(f)

In [7]:
df = pd.DataFrame(player_list_from_json) 

In [8]:
df.head()

Unnamed: 0,name,position,height,dob,G,GS,Min,Pts,2FG,3FG,...,D,T,As,St,To,Fv,Ag,Cm,Rv,PIR
0,DARYL MACON JR,Guard,1.88,"29 November, 1995",9,6,234:25,153,24/33,24/53,...,14,18,24,4,17,1,0,24,31,149
1,WILL CLYBURN,Forward,2.01,"17 May, 1990",9,1,262:29,150,33/73,17/43,...,35,50,11,7,18,4,5,12,42,156
2,CHRIS JONES,Guard,1.88,"10 April, 1993",9,9,255:00,141,41/65,13/28,...,26,27,22,16,15,1,3,16,24,153
3,TORNIKE SHENGELIA,Forward,2.06,"5 October, 1991",5,5,138:01,77,20/35,6/15,...,13,21,9,5,8,0,2,10,20,82
4,OSCAR DA SILVA,Forward,2.05,"21 September, 1998",8,4,213:32,114,42/59,4/11,...,21,29,8,9,8,6,2,18,21,131


In [9]:
!ls

Players.json  Prototyping_scraping_on_single_player.ipynb  README.md


In [10]:
!mkdir data

In [11]:
df.to_csv("data/players.csv",index=False)

In [12]:
df2 = pd.read_csv("data/players.csv")

In [17]:
pd.testing.assert_frame_equal(df,df2)

AssertionError: Attributes of DataFrame.iloc[:, 2] (column name="height") are different

Attribute "dtype" are different
[left]:  object
[right]: float64