In [2]:
########################################################
# Author: Izzat Zanail - izzat.zanail@gmail.com
# Language: Python 3.12
# Created: 2024-03-21
# Updated: -
# Project: Transfermarkt Malaysia Super League Data Scraper
# Description: Scrape MSL football teams data
########################################################

In [4]:
# Import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd

In [6]:
# Extract team page links for each season

season_year = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]
trsfrmkt_msl_link = "https://www.transfermarkt.com/malaysia-super-league/startseite/wettbewerb/MYS1/plus/?saison_id="

YearList = []
TeamsList = []
TeamLinksList = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

for year in range(0, len(season_year)):
    page_year = trsfrmkt_msl_link + str(season_year[year])
    pageTree = requests.get(page_year, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    
    Teams = pageSoup.find_all("td", {"class": "hauptlink no-border-links"})
    TeamLinks = pageSoup.find_all("td", {"class": "hauptlink no-border-links"})
    
    for i in range(0, len(Teams)):
        str_Teams = str(Teams[i]).split('title="',1)[1].split('">',1)[0]
        TeamsList.append(str_Teams)
    for i in range(0, len(TeamLinks)):
        str_TeamLinks = str(TeamLinks[i]).split('a href="',1)[1].split('"',1)[0]
        TeamLinksList.append("https://www.transfermarkt.com" + str_TeamLinks)
    for i in range(0, len(Teams)):
        YearList.append(str(season_year[year] + 1))

In [7]:
# Create Dataframe for the extracted links

team_links = pd.DataFrame({
                    "Year":YearList,
                    "Team":TeamsList,
                    "Link":TeamLinksList
})
team_links

Unnamed: 0,Year,Team,Link
0,2017,Johor Darul Ta'zim,https://www.transfermarkt.com/johor-darul-tazi...
1,2017,Melaka United FC,https://www.transfermarkt.com/melaka-united-fc...
2,2017,Kedah FA,https://www.transfermarkt.com/kedah-darul-aman...
3,2017,Selangor FC,https://www.transfermarkt.com/selangor-fc/star...
4,2017,Sri Pahang FC,https://www.transfermarkt.com/sri-pahang-fc/st...
...,...,...,...
94,2024,PDRM FC,https://www.transfermarkt.com/pdrm-fa/startsei...
95,2024,Kuching City,https://www.transfermarkt.com/kuching-city/sta...
96,2024,Negeri Sembilan FC,https://www.transfermarkt.com/negeri-sembilan-...
97,2024,Kelantan Darul Naim,https://www.transfermarkt.com/kelantan-united/...


In [8]:
# Export the DataFrame to csv file

team_links.to_csv(r"C:\Users\izzat\OneDrive\Desktop\MalaysiaSuperLeague_TransfermrktDataScraping\datacsv\MSL_TeamPages.csv", index=False)

In [9]:
# Create lists

SeasonList = []
PlayerTeamsList = []
PlayersList = []
NumberList = []
AgeList = []
PositionList = []
NationList = []
ValueList = []
grouped_positionList = []
cleaned_values = []

In [10]:
# Scrape player data, from links that were previously scraped

for link in range(0, len(TeamLinksList)):
    page = TeamLinksList[link]
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    
    TeamNames = pageSoup.find_all("h1", {"class": "data-header__headline-wrapper data-header__headline-wrapper--oswald"})
    Players = pageSoup.find_all("img", {"class": "bilderrahmen-fixed lazy lazy"})
    Numbers = pageSoup.find_all("div", {"class": "rn_nummer"})
    Age = pageSoup.find_all("td", {"class": "zentriert"})
    Positions = pageSoup.find_all("table", {"class": "inline-table"})
    Nationality = pageSoup.find_all("td", {"class": "zentriert"})
    Values = pageSoup.find_all("td", {"class": "rechts hauptlink"})
    
    season_year = str(int(TeamLinksList[link].split("/", 8)[8]) + 1)
    
    #if season 2024, skip 3 rows, else, skip 4 rows 
    if season_year == "2024":
        r = 3
    else:
        r = 4
    
    for i in range(0, len(Players)):
        if season_year == "2024":
            SeasonList.append("2024/2025") # Starting from year 2024, Malaysia Super League will start in May 2024 and end in 2025.
        else:
            SeasonList.append(season_year) 
    
    for i in range(0, len(Players)):
        str_Team = str(TeamNames).split('">\n            ',1)[1].split('        </h1>',1)[0]
        PlayerTeamsList.append(str_Team)
    
    for i in range(0, len(Players)):
        str_Players = str(Players[i]).split('" class',1)[0].split('<img alt="',1)[1]
        PlayersList.append(str_Players)
    
    for i in range(0, len(Numbers)):
        str_Numbers = str(Numbers[i]).split('class="rn_nummer">',1)[1].split('</div>',1)[0]
        NumberList.append(str_Numbers)
    
    for i in range(1, (len(Players)*r), r):
        str_Age = str(Age[i]).split("(",1)[1].split(")",1)[0]
        AgeList.append(str_Age)
        
    for i in range(0, len(Positions)):
        str_Position = str(Positions[i]).split('<td>', 1)[1].split('</td>', 1)[0].split('\n ', 1)[1].split('           ', 1)[1].split('        ',1)[0]
        PositionList.append(str_Position)
    
    # Convert specific position names into a more general name - Marked as "Position Type"
    for j in range(0, len(Positions)):
        if 'Striker' in PositionList[j]:
            grouped_positionList.append('Forwards')
        elif 'Forward' in PositionList[j]:
            grouped_positionList.append('Forwards')
        elif 'Midfield' in PositionList[j]:
            grouped_positionList.append('Midfielders')
        elif 'Midfielder' in PositionList[j]:
            grouped_positionList.append('Midfielders')
        elif 'Winger' in PositionList[j]:
            grouped_positionList.append('Midfielders')
        elif 'Back' in PositionList[j]:
            grouped_positionList.append('Defenders')
        elif 'Defender' in PositionList[j]:
            grouped_positionList.append('Defenders')
        else:
            grouped_positionList.append(PositionList[j])
    
    for i in range(2, (len(Players)*r), r): 
        str_Nationality = str(Nationality[i]).split('" class',1)[0].split('<img alt="',1)[1]
        NationList.append(str_Nationality)
    
    for i in range(0, len(Values)):
        ValueList.append(Values[i].text)
        
    # Data cleaning - change "Value" datatype from string type to float type
    for a in range(0, len(Values)):
        if 'k' in ValueList[a]:
            str_a = str(ValueList[a]).split('€')[1].split('k')[0]
            flt_a = float(str_a)*1000
            cleaned_values.append(flt_a)
        elif 'm' in ValueList[a]:
            str_a = str(ValueList[a]).split('€')[1].split('m')[0]
            flt_a = float(str_a)*1000000
            cleaned_values.append(flt_a)
        else:
            cleaned_values.append(float(a))


In [11]:
# Create DataFrame for full extracted data

msl_full_df = pd.DataFrame({
                         "Season":SeasonList,
                         "Team":PlayerTeamsList,
                         "Player":PlayersList,
                         "Number":NumberList,
                         "Age":AgeList,
                         "Position":PositionList,
                         "Position Type":grouped_positionList,
                         "Nationality":NationList,
                         "ValueEUR":cleaned_values                        
                        })
msl_full_df

Unnamed: 0,Season,Team,Player,Number,Age,Position,Position Type,Nationality,ValueEUR
0,2017,Johor Darul Ta'zim,Farizal Marlias,1,29,Goalkeeper,Goalkeeper,Malaysia,0.0
1,2017,Johor Darul Ta'zim,Izham Tarmizi,24,24,Goalkeeper,Goalkeeper,Malaysia,1.0
2,2017,Johor Darul Ta'zim,Haziq Nadzli,30,17,Goalkeeper,Goalkeeper,Malaysia,2.0
3,2017,Johor Darul Ta'zim,Bruno Soares,2,27,Centre-Back,Defenders,Brazil,700000.0
4,2017,Johor Darul Ta'zim,Marcos António,6,32,Centre-Back,Defenders,Brazil,400000.0
...,...,...,...,...,...,...,...,...,...
3389,2024/2025,Penang FC,Kogileswaran Raj,9,25,Right Winger,Midfielders,Malaysia,25.0
3390,2024/2025,Penang FC,Amer Azahar,11,28,Right Winger,Midfielders,Malaysia,26.0
3391,2024/2025,Penang FC,Nabil Lapti,30,31,Right Winger,Forwards,Malaysia,27.0
3392,2024/2025,Penang FC,Khairil Anuar,83,29,Right Winger,Forwards,Malaysia,1500000.0


In [12]:
# Export the DataFrame to csv file

msl_full_df.to_csv(r"C:\Users\izzat\OneDrive\Desktop\MalaysiaSuperLeague_TransfermrktDataScraping\datacsv\MSL_FullData.csv", index=False)