# Table of Contents
 <p>

In [1]:
# Beautiful Soup for scraping data from the web
from bs4 import BeautifulSoup
import requests

# Save into csv file
import csv

# Pandas to reimport csv data
import pandas as pd

In [2]:
def log_progress(sequence, every=None, size=None, name='Items', delete=False):
    """Yield the items of ``sequence`` while driving a notebook progress widget.

    This is a generator, so nothing is validated or displayed until the
    first item is requested.

    Parameters
    ----------
    sequence : iterable
        Items to pass through unchanged, in order.
    every : int, optional
        The widget is refreshed on the first item and then on every
        ``every``-th item. When omitted and the size is known it defaults to
        1 for up to 200 items and to ``int(size / 200)`` otherwise. It must
        be given when the size cannot be determined.
    size : int, optional
        Total number of items. Defaults to ``len(sequence)`` when the
        sequence supports ``len``.
    name : str
        Prefix of the text label shown with the bar.
    delete : bool
        When true, the widget is closed after a successful full iteration.

    Yields
    ------
    object
        Each element of ``sequence``.

    Raises
    ------
    AssertionError
        If the size is unknown and ``every`` was not supplied.
    """
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    # Work out the total; an object without len() is treated as an iterator.
    unknown_length = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            unknown_length = True

    if size is None:
        assert every is not None, 'sequence is iterator, set every'
    elif every is None:
        every = 1 if size <= 200 else int(size / 200)     # every 0.5%

    # With an unknown total the bar is shown full, in the "info" style.
    if unknown_length:
        bar = IntProgress(min=0, max=1, value=1)
        bar.bar_style = 'info'
    else:
        bar = IntProgress(min=0, max=size, value=0)
    caption = HTML()
    container = VBox(children=[caption, bar])
    display(container)

    count = 0
    try:
        for count, item in enumerate(sequence, 1):
            refresh = count == 1 or count % every == 0
            if refresh and unknown_length:
                caption.value = '{name}: {index} / ?'.format(
                    name=name,
                    index=count
                )
            elif refresh:
                bar.value = count
                caption.value = u'{name}: {index} / {size}'.format(
                    name=name,
                    index=count,
                    size=size
                )
            yield item
    except:
        # Catch-all on purpose: flag the bar, then let the error propagate.
        bar.bar_style = 'danger'
        raise
    else:
        bar.bar_style = 'success'
        bar.value = count
        # An empty sequence leaves the count at 0, which is shown as "?".
        caption.value = "{name}: {index}".format(
            name=name,
            index=str(count or '?')
        )
        if delete:
            container.close()

In [22]:
# Scrape one transfermarkt manager profile and store "club|manager" lines.
headers = {'User-Agent': 'Mozilla/5.00'}

base_url = "https://www.transfermarkt.com"

manager_url = "/jose-mourinho/profil/trainer/781"
# The first path segment of the profile URL doubles as the manager name.
manager_name = manager_url.split("/")[1]

url = base_url + manager_url
# A timeout keeps the cell from hanging forever on a stalled connection.
r = requests.get(url, headers=headers, timeout=30)
r.raise_for_status()
manager_data_page = BeautifulSoup(r.text, 'html.parser')

# Locate the career table through one of its rows, then take all sibling rows.
anchor_row = manager_data_page.find("tr", {"class": "ausfallzeiten_k"})
if anchor_row is None:
    raise ValueError("career table not found on " + url)
manager_table = anchor_row.find_parent().find_all("tr")

# Open the output only once the page was parsed, so a failed request does not
# truncate an existing file; `with` closes the handle even on errors.
with open('data-managers.csv', 'w', encoding='utf-8') as csvfile:
    for tr in manager_table:
        tds = tr.find_all("td")
        # tds[-3] needs at least three cells; shorter rows are skipped.
        if len(tds) >= 3 and tds[-3].text == "Manager":
            club = tds[1].text.replace(" ", "")
            csvfile.write(club + "|" + manager_name + "\n")

# Other website

In [101]:
%%time

# Scrape every soccerbase manager page and write one
# "club|manager|from_year|to_year" line per career entry.
base_url = "http://www.soccerbase.com/managers/manager.sd?manager_id={0}"

nbrFailure = []   # ids whose page could not be fetched or parsed
nbrNoInfo = []    # ids whose page reports no career information

# `with` closes the file even if the long-running loop is interrupted.
with open('data-managers-new.csv', 'w', encoding='utf-8') as csvfile:
    for manager_id in log_progress(range(1, 4654), every=1):
        url = base_url.format(str(manager_id))

        # A single network hiccup should not abort the whole run.
        try:
            r = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            nbrFailure.append(manager_id)
            continue
        manager_data_page = BeautifulSoup(r.text, 'html.parser')

        try:
            manager_name = manager_data_page.find('div', {"class": "headlineBlock"}).text.replace("\n", "")

            for club in manager_data_page.find_all('td', {"class": "first bull"}):
                club_name = club.text
                # The two elements after the club cell hold the start and end
                # dates; the part after the comma is kept as the year.
                from_year = club.find_next().find_next().get_text().split(",")[1].replace(" ", "")
                to_year = club.find_next().find_next().find_next().get_text().split(",")[1].replace(" ", "")

                # Can contain duplicates (=> self-pointing edges)
                csvfile.write(club_name + "|" + manager_name + "|" + from_year + "|" + to_year + "\n")

        except Exception:
            # `Exception` (not a bare except) so Ctrl-C can still stop the run.
            # Two kinds of failures: either there is no career information
            # for that manager ...
            if len(manager_data_page.find_all("div", {"class": "nodataBlock nodataInSoccer"})) > 0:
                nbrNoInfo.append(manager_id)
            else:
                # ... or any other kind of error.
                nbrFailure.append(manager_id)

CPU times: user 1min 52s, sys: 4.6 s, total: 1min 56s
Wall time: 28min 53s


In [6]:
import time

In [7]:
# Same soccerbase scrape as before, but pausing regularly to go easier on the
# server. Writes one "club|manager|from_year|to_year" line per career entry.
headers = {'User-Agent': 'Mozilla/5.00'}
base_url = "http://www.soccerbase.com/managers/manager.sd?manager_id={0}"

nbrFailure = []   # ids whose page could not be fetched or parsed
nbrNoInfo = []    # ids whose page reports no career information

# `with` closes the file even if the long-running loop is interrupted.
with open('data-managers-new_te.csv', 'w', encoding='utf-8') as csvfile:
    for manager_id in log_progress(range(1, 4654), every=1):
        # Pause 10 seconds after every 50 requests.
        if (manager_id % 50 == 0):
            time.sleep(10)

        url = base_url.format(str(manager_id))

        # A single network hiccup should not abort the whole run.
        try:
            r = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            nbrFailure.append(manager_id)
            continue
        manager_data_page = BeautifulSoup(r.text, 'html.parser')

        try:
            manager_name = manager_data_page.find('div', {"class": "headlineBlock"}).text.replace("\n", "")

            for club in manager_data_page.find_all('td', {"class": "first bull"}):
                club_name = club.text
                # The two elements after the club cell hold the start and end
                # dates; the part after the comma is kept as the year.
                from_year = club.find_next().find_next().get_text().split(",")[1].replace(" ", "")
                to_year = club.find_next().find_next().find_next().get_text().split(",")[1].replace(" ", "")

                # Can contain duplicates (=> self-pointing edges)
                csvfile.write(club_name + "|" + manager_name + "|" + from_year + "|" + to_year + "\n")

        except Exception:
            # `Exception` (not a bare except) so Ctrl-C can still stop the run.
            # Two kinds of failures: either there is no career information
            # for that manager ...
            if len(manager_data_page.find_all("div", {"class": "nodataBlock nodataInSoccer"})) > 0:
                nbrNoInfo.append(manager_id)
            else:
                # ... or any other kind of error.
                nbrFailure.append(manager_id)

In [8]:
# Summary of the scraping run: ids that failed vs. ids without career data.
print("Total: 4653")
print("Failure:", len(nbrFailure))
print("No Info:", len(nbrNoInfo))

Total: 4653
Failure: 4105
No Info: 38


In [44]:
# Load the pipe-separated club/manager pairs; the file has no header row,
# so the column names are supplied here.
df = pd.read_csv(
    "data-managers.csv",
    sep="|",
    header=None,
    names=["Club", "Manager"],
)
df.head(10)

Unnamed: 0,Club,Manager
0,Scarborough,A C Bell
1,Walsall,A E Parsloe
2,Walsall,A G Burton
3,Stoke,A J Barker
4,Chelsea,A Leslie Knighton
5,Middlesbro,A Walker
6,Orlando City,Adrian Heath
7,Coventry,Adrian Heath
8,Coventry,Adrian Heath
9,Sheff Utd,Adrian Heath


In [89]:
# Summary statistics (count / unique / top / freq) for both columns.
df.describe()

Unnamed: 0,Club,Manager
count,2575,2575
unique,795,1370
top,Stockport,Roy Hodgson
freq,27,16


In [46]:
# Distinct clubs and managers (missing values, if any, count as one entry).
print("Number of clubes:    ", df["Club"].nunique(dropna=False))
print("Number of managers:  ", df["Manager"].nunique(dropna=False))

Number of clubes:     795
Number of managers:   1370


In [65]:
# Histogram of career length: maps "number of rows a manager appears in"
# to "how many managers have that many rows".
# value_counts does this in one pass instead of re-filtering the whole
# column once per manager. Rows with a missing manager name are ignored.
rows_per_manager = df["Manager"].value_counts()
manager_clubs_freq = {
    int(n_rows): int(n_managers)
    for n_rows, n_managers in rows_per_manager.value_counts().sort_index().items()
}

manager_clubs_freq

{1: 812,
 2: 281,
 3: 127,
 4: 62,
 5: 35,
 6: 18,
 7: 17,
 8: 6,
 9: 8,
 10: 1,
 12: 2,
 16: 1}

# Create Network

In [68]:
# NetworkX
import networkx as nx

# Start from an empty undirected graph; the manager nodes are added next.
G = nx.Graph()

In [69]:
# Register every distinct manager as a node of the graph in one call.
G.add_nodes_from(df.Manager.unique())

In [None]:
# Link between two managers if they coach the same team


In [None]:
from itertools import combinations

# Connect every pair of distinct managers who appear for the same club.
# Using the distinct names per club avoids self-loops when a manager is
# listed more than once for one club.
for club, managers in df.groupby("Club")["Manager"]:
    G.add_edges_from(combinations(managers.dropna().unique(), 2))

In [70]:
# Managers recorded for Chelsea.
df.loc[df["Club"] == "Chelsea", "Manager"]

4        A Leslie Knighton
361     Danny Blanchflower
385            Dave Sexton
449           Glenn Hoddle
848              Ron Suart
1014        Tommy Docherty
1154            Graham Rix
1165       Claudio Ranieri
Name: Manager, dtype: object

In [86]:
# Check whether any manager name contains "Jose Mourinho".
# Missing names are dropped first so the substring test only sees strings.
for manager in df.Manager.dropna().unique():
    if "Jose Mourinho" in manager:
        print(manager)

In [80]:
# All rows recorded for Josep Guardiola.
df.loc[df["Manager"] == "Josep Guardiola"]

Unnamed: 0,Club,Manager
1563,Man City,Josep Guardiola
1564,B Munich,Josep Guardiola
1565,Barcelona,Josep Guardiola


In [82]:
# Spot-check a single soccerbase profile (manager_id=1908) and list the
# clubs found on it next to the parsed manager name.
r = requests.get(
    "http://www.soccerbase.com/managers/manager.sd?manager_id=1908",
    headers=headers,
    timeout=30,  # do not hang the kernel on a stalled connection
)
manager_data_page = BeautifulSoup(r.text, 'html.parser')

manager_name = manager_data_page.find('div', {"class": "headlineBlock"}).text.replace("\n", "")

for club in manager_data_page.find_all('td', {"class": "first bull"}):
    club_name = club.text
    print(club_name, manager_name)

Man Utd Jose Mourinho
Chelsea Jose Mourinho
Real Madrid Jose Mourinho
Inter Jose Mourinho
Chelsea Jose Mourinho


In [84]:
"-"+manager_name+"-"

'-Jose Mourinho-'