In [20]:
import requests
from bs4 import BeautifulSoup


def get_page_soup_headers(
    url: str, save_html: bool = False, timeout: float = 30.0
) -> "BeautifulSoup | tuple[BeautifulSoup, bytes]":
    """Fetch a URL with a browser-like User-Agent and parse it with BeautifulSoup.

    Args:
        url (str): The URL to fetch the HTML from.
        save_html (bool): When True, also return the raw response body so the
            caller can persist it.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        BeautifulSoup: the parsed page when ``save_html`` is False.
        tuple[BeautifulSoup, bytes]: ``(pageSoup, raw_content)`` when
            ``save_html`` is True.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """

    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36"  # noqa: E501
    headers = {"User-Agent": USER_AGENT}

    pageTree = requests.get(url, headers=headers, timeout=timeout)
    # Fail here on an error status instead of parsing an error page and
    # failing later with an unrelated lookup error.
    pageTree.raise_for_status()
    pageSoup = BeautifulSoup(pageTree.content, "html.parser")

    if save_html:
        return pageSoup, pageTree.content
    return pageSoup

In [6]:
import pandas as pd

# Lookup inputs: the club name as written in one of the translation columns,
# and its numeric team id.
club, team_id = "Newcastle United", 762

# Placeholder; a later cell assigns the parsed page to this name.
pageSoup = None

In [7]:
def _get_team_id_and_club(team_id, club):
    """Resolve a club name to its Transfermarkt link slug and numeric id.

    The team translation table is downloaded and searched for a row where
    ``club`` equals any of the supported name columns; the first matching row
    supplies the result.

    Args:
        team_id: Not used for the lookup; the returned id always comes from
            the translation table.
        club (str): Club name as spelled in any of the name columns listed in
            ``name_columns`` below.

    Returns:
        tuple: ``(club, team_id)`` where ``club`` is the ``transfermarkt_link``
        value and ``team_id`` is the ``transfermarkt`` value converted to int.

    Raises:
        IndexError: if no row of the translation table matches ``club``.
    """
    df = pd.read_csv(
        "https://raw.githubusercontent.com/ian-shepherd/reus_data/main/raw-data/team_translations.csv",
        # keep empty cells as empty strings rather than NaN
        keep_default_na=False,
    )

    # Any one of these columns may hold the spelling the caller passed in.
    name_columns = [
        "fbref_name",
        "transfermarkt_name",
        "transfermarkt_link",
        "fcpython",
        "fivethirtyeight",
    ]
    filtered_df = df[df[name_columns].eq(club).any(axis=1)]

    if filtered_df.empty:
        # Same exception type the bare .iloc[0] would raise, but the message
        # now says which club could not be resolved.
        raise IndexError(f"No team translation found for club {club!r}")

    match = filtered_df.iloc[0]
    club = match["transfermarkt_link"]
    team_id = int(match["transfermarkt"])

    return club, team_id


# Swap the human-readable club name for the slug/id pair returned by the
# translation lookup, then show the resolved pair as the cell output.
club, team_id = _get_team_id_and_club(team_id=team_id, club=club)
(club, team_id)

('newcastle-united', 762)

In [92]:
# Staff role to look up; must be one of the keys of role_dict below.
role = "Manager"
# role = "Goalkeeping Coach"


# Maps a staff role name to the numeric id that is placed after
# "/personalie_id/" when the staff-history URL is built.
# TODO: order by number or alphabetical
# TODO: check missing (Man City)
role_dict = {
    "Manager": 1,
    "Caretaker Manager": 10,
    "Assistant Manager": 2,
    "Goalkeeping Coach": 3,
    "Conditioning Coach": 11,
    "Fitness Coach": 22,
    "Technical Coach": 38,
    "Chief Analyst": 16,
    "Youth Coach": 104,
    "Head of Academy Coaching": 106,
    "Video Analyst": 70,
    "Match Analyst": 146,
    "Director of Football": 13,
    "Sporting Director": 54,
    "Team Manager": 55,
    "Academy Manager": 67,
    "Chief Executive Officer": 25,
    "Advisor of management": 102,
    "Director": 188,
    "Chairman": 28,
    "Board Member": 39,
    "Owner": 105,
    "Chief Scout": 53,
    "Head of Scouting": 90,
    "Scout": 7,
    "Youth Chief Scout": 167,
    "Youth Scout": 166,
    "Head of Youth Scouting": 140,
    "Head of Medical": 46,
    "Club Doctor": 19,
    "Physiotherapist": 12,
    "Sports Scientist": 71,
    "Masseur": 45,
    "Medical Director Physiotherapy": 180,
    "Marketing Staff": 44,
    "Media Officer": 92,
    "Media worker": 131,
    "Sponsoring": 56,
    "Head of Media and Communication": 179,
    "Photographer": 151,
    "Kit Manager": 20,
    "Club representative": 149,
    "Director of Finance": 81,
    "Advisor": 118,
    "Loan Player Manager": 158,
    "President": 17,
    "Vice-President": 27,
    # NOTE(review): same id (28) as "Chairman" above — confirm this is intended
    "Vice-Chairman": 28,
    "Member of administrative board": 59,
    "Marketing/Management": 83,
    "Director of Marketing and Sales": 57,
    "Honorary President": 86,
    "Nutritionist": 130,
    "Academy Staff": 139,
    "Chief Instructor": 127,
    "Development Coach": 187,
    # NOTE(review): the next two keys look misspelled ("cooridnator",
    # "Co-oridnator"); left unchanged because roles are looked up by the
    # exact key string — confirm before renaming.
    "Academy Goalkeeping cooridnator": 144,
    "Goalkeeping Co-oridnator": 65,
}

In [93]:
# Staff-history URL assembled from the club slug, the team id and the
# personalie_id that role_dict maps the selected role to.
url = f"https://www.transfermarkt.com/{club}/mitarbeiterhistorie/verein/{team_id}/personalie_id/{role_dict[role]}"
url

'https://www.transfermarkt.com/newcastle-united/mitarbeiterhistorie/verein/762/personalie_id/1'

In [94]:
# Download and parse the staff-history page; save_html is left at its default,
# so only the parsed soup is returned.
pageSoup = get_page_soup_headers(url)

In [95]:
# Take the second <table> on the page (index 1) and its <tbody>.
# NOTE(review): presumably the staff-history table — confirm. The index lookup
# raises IndexError when the page contains fewer than two tables.
table = pageSoup.find_all("table")[1]
tbody = table.find("tbody")

# Every <tr> below the tbody; find_all searches recursively, so rows of any
# nested tables are included as well.
rows = tbody.find_all("tr")

In [96]:
# Build one record (dict) per staff member found in the table rows
mylist = []

for row in rows:
    # Process only rows tagged "odd" or "even". BeautifulSoup exposes the
    # multi-valued "class" attribute as a list and .get() returns None when
    # the attribute is absent, so normalise to a list before testing.
    # (The earlier membership test discarded its result, so any row that
    # merely had a class attribute was let through.)
    row_classes = row.get("class") or []
    if "odd" not in row_classes and "even" not in row_classes:
        continue

    # Name comes from the alt text of the row's first image
    name = row.find("img")["alt"]
    # Kept in its own variable so the page `url` built in an earlier cell is
    # not overwritten each time this cell runs.
    profile_url = row.find("a")["href"]

    # The third cell nested inside the row's first cell is stored as "dob"
    dob = row.find("td").find_all("td")[2].text
    cells = row.find_all("td", {"class": "zentriert"})

    # No <img> in the first centred cell: find() returns None and indexing
    # None raises TypeError, which is recorded as a missing nation.
    try:
        nation = cells[0].find("img")["alt"]
    except TypeError:
        nation = None

    appointed = cells[1].text
    left = cells[2].text
    time_in_post = row.find("td", {"class": "rechts"}).text

    # Rows with fewer centred cells (non-manager roles) have no matches / ppg
    matches = cells[3].text if len(cells) > 3 else None
    ppg = cells[4].text if len(cells) > 4 else None

    # One record per staff member; the keys are the output column names
    mylist.append(
        {
            "name": name,
            "url": profile_url,
            "dob": dob,
            "nation": nation,
            "appointed": appointed,
            "left": left,
            "time_in_post": time_in_post,
            "matches": matches,
            "ppg": ppg,
        }
    )

{'name': 'Eddie Howe', 'url': '/eddie-howe/profil/trainer/10976', 'dob': 'Nov 29, 1977', 'nation': 'England', 'appointed': 'Nov 8, 2021', 'left': '-', 'time_in_post': '769 days\xa0', 'matches': '99', 'ppg': '1.75'}
{'name': 'Steve Bruce', 'url': '/steve-bruce/profil/trainer/447', 'dob': 'Dec 31, 1960', 'nation': 'England', 'appointed': 'Jul 17, 2019', 'left': 'Oct 20, 2021', 'time_in_post': '826 days\xa0', 'matches': '97', 'ppg': '1.15'}
{'name': 'Rafael Benítez', 'url': '/rafael-benitez/profil/trainer/1522', 'dob': 'Apr 16, 1960', 'nation': 'Spain', 'appointed': 'Mar 11, 2016', 'left': 'Jun 30, 2019', 'time_in_post': '1206 days\xa0', 'matches': '146', 'ppg': '1.48'}
{'name': 'Steve McClaren', 'url': '/steve-mcclaren/profil/trainer/458', 'dob': 'May 3, 1961', 'nation': 'England', 'appointed': 'Jun 10, 2015', 'left': 'Mar 11, 2016', 'time_in_post': '275 days\xa0', 'matches': '31', 'ppg': '0.87'}
{'name': 'John Carver', 'url': '/john-carver/profil/trainer/3660', 'dob': 'Jan 16, 1965', 'n