In [309]:
########################################################
# Author: Izzat Zanail - izzat.zanail@gmail.com
# Language: Python 3.12
# Created: 2024-03-21
# Updated: 2024-10-13
# Project: Malaysia Super League Data Analysis
# Description: MSL Football Clubs Data Scraping and Analysis from Transfermarkt website
########################################################

In [310]:
# Import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import math

In [311]:
# # Scrape MSL teams' data

season_year = [2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
trsfrmkt_msl_link = 'https://www.transfermarkt.com/malaysia-super-league/startseite/wettbewerb/MYS1/plus/?saison_id='

YearList = []
TeamsList = []
TeamsListCleaned = []
TeamSquadSize = []
TeamNoForeigners = []
TeamMarketValue = []
TeamLinksList = []
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36'}

for year in range(0, len(season_year)):
    page_year = trsfrmkt_msl_link + str(season_year[year])
    pageTree = requests.get(page_year, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    
    Teams = pageSoup.find_all('td', {'class': 'hauptlink no-border-links'})
    SquadSize = pageSoup.find_all('td', {'class': 'zentriert'})
    NoForeigners = pageSoup.find_all('td', {'class': 'zentriert'})
    MarketValue = pageSoup.find_all('td', {'class': 'rechts'})
    TeamLinks = pageSoup.find_all('td', {'class': 'hauptlink no-border-links'})
    
    for i in range(0, len(Teams)):
        str_Teams = str(Teams[i]).split('title="',1)[1].split('">',1)[0]
        TeamsList.append(str_Teams)
        #if 'Johor Darul' in str_Teams:
        #    str_Teams = 'Johor Darul Takzim FC'
        #    TeamsList.append(str_Teams)
        #elif 'Kelantan United' in str_Teams:
        #    str_Teams = 'Kelantan Darul Naim FC'
        #    TeamsList.append(str_Teams)
        #else:
        #    TeamsList.append(str_Teams)
    for i in range(4, len(Teams)*4+4, 4):
        str_SquadSize = str(SquadSize[i]).split('>', 2)[2].split('<')[0]
        TeamSquadSize.append(str_SquadSize)
    for i in range(6, len(Teams)*4+6, 4):
        str_NoForeigners = str(NoForeigners[i]).split('>')[1].split('<')[0]
        TeamNoForeigners.append(str_NoForeigners)
    for i in range(3, len(Teams)*2+2, 2):
        str_MarketValue = str(MarketValue[i]).split('€')[1].split('<')[0]
        if 'k' in str_MarketValue:
            str_mv = str_MarketValue.split('k')[0]
            flt_mv = float(str_mv)*1000
            TeamMarketValue.append(flt_mv)
        elif 'm' in str_MarketValue:
            str_mv = str_MarketValue.split('m')[0]
            flt_mv = float(str_mv)*1000000
            TeamMarketValue.append(flt_mv)
        else:
            TeamMarketValue.append(float(flt_mv))
    for i in range(0, len(TeamLinks)):
        str_TeamLinks = str(TeamLinks[i]).split('a href="',1)[1].split('"',1)[0]
        TeamLinksList.append("https://www.transfermarkt.com" + str_TeamLinks)
    for i in range(0, len(Teams)):
        YearList.append(str(season_year[year] + 1))

# Data cleaning - clean Team names
for i in range(0, len(TeamsList)):
    if "Johor Darul Ta'zim" in TeamsList[i]:
        new_name = "Johor Darul Ta'zim FC"
        TeamsListCleaned.append(new_name)
    elif "Kedah FA" in TeamsList[i]:
        new_name = 'Kedah Darul Aman FC'
        TeamsListCleaned.append(new_name)
    elif "Kelantan United" in TeamsList[i]:
        new_name = 'Kelantan Darul Naim FC'
        TeamsListCleaned.append(new_name)
    elif "Kelantan Darul Naim" in TeamsList[i]:
        new_name = 'Kelantan Darul Naim FC'
        TeamsListCleaned.append(new_name)
    else:
        TeamsListCleaned.append(TeamsList[i])

In [312]:
# Create Dataframe for the extracted links

df_msl_teams = pd.DataFrame({
                    "SEASON":YearList,
                    "TEAM":TeamsListCleaned,
                    #"TEAM_CLEANED":TeamsListCleaned,
                    "SQUAD_SIZE":TeamSquadSize,
                    "NO_OF_FOREIGNERS":TeamNoForeigners,
                    "MARKET_VALUE_EUR":TeamMarketValue,
                    "LINK":TeamLinksList
})
# set dtypes for each column
df_msl_teams['SEASON'] = df_msl_teams['SEASON'].astype(str)
df_msl_teams['TEAM'] = df_msl_teams['TEAM'].astype(str)
df_msl_teams['SQUAD_SIZE'] = df_msl_teams['SQUAD_SIZE'].astype(int)
df_msl_teams['NO_OF_FOREIGNERS'] = df_msl_teams['NO_OF_FOREIGNERS'].astype(int)
df_msl_teams['LINK'] = df_msl_teams['LINK'].astype(str)

# remove duplicated rows of Year 2025
df_msl_teams = df_msl_teams[df_msl_teams['SEASON'] != '2025']



df_msl_teams.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99 entries, 0 to 98
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SEASON            99 non-null     object 
 1   TEAM              99 non-null     object 
 2   SQUAD_SIZE        99 non-null     int32  
 3   NO_OF_FOREIGNERS  99 non-null     int32  
 4   MARKET_VALUE_EUR  99 non-null     float64
 5   LINK              99 non-null     object 
dtypes: float64(1), int32(2), object(3)
memory usage: 4.6+ KB


In [313]:
df_msl_teams.head(n=10)

Unnamed: 0,SEASON,TEAM,SQUAD_SIZE,NO_OF_FOREIGNERS,MARKET_VALUE_EUR,LINK
0,2017,Johor Darul Ta'zim FC,36,9,6630000.0,https://www.transfermarkt.com/johor-darul-tazi...
1,2017,Melaka United FC,52,10,3130000.0,https://www.transfermarkt.com/melaka-united-fc...
2,2017,Kedah Darul Aman FC,33,3,3030000.0,https://www.transfermarkt.com/kedah-darul-aman...
3,2017,Selangor FC,35,8,2980000.0,https://www.transfermarkt.com/selangor-fc/star...
4,2017,Sri Pahang FC,33,5,2850000.0,https://www.transfermarkt.com/sri-pahang-fc/st...
5,2017,Selangor FC II,38,7,2830000.0,https://www.transfermarkt.com/selangor-fa-ii/s...
6,2017,Kelantan FC,45,9,2730000.0,https://www.transfermarkt.com/kelantan-fa/star...
7,2017,Terengganu FC II,36,6,2580000.0,https://www.transfermarkt.com/terengganu-fc-ii...
8,2017,FELDA United FC,35,8,2250000.0,https://www.transfermarkt.com/felda-united-fc/...
9,2017,Penang FC,41,8,2250000.0,https://www.transfermarkt.com/penang-fc/starts...


In [314]:
df_msl_teams.tail(n=10)

Unnamed: 0,SEASON,TEAM,SQUAD_SIZE,NO_OF_FOREIGNERS,MARKET_VALUE_EUR,LINK
89,2024,Sabah FC,34,8,3890000.0,https://www.transfermarkt.com/sabah-fc/startse...
90,2024,Sri Pahang FC,26,5,3800000.0,https://www.transfermarkt.com/sri-pahang-fc/st...
91,2024,Kedah Darul Aman FC,40,5,3750000.0,https://www.transfermarkt.com/kedah-darul-aman...
92,2024,Kuala Lumpur City FC,29,5,3380000.0,https://www.transfermarkt.com/kuala-lumpur-cit...
93,2024,Perak FC,28,8,3030000.0,https://www.transfermarkt.com/perak-fc/startse...
94,2024,Kuching City,37,10,2920000.0,https://www.transfermarkt.com/kuching-city/sta...
95,2024,Kelantan Darul Naim FC,38,10,2610000.0,https://www.transfermarkt.com/kelantan-united/...
96,2024,PDRM FC,35,9,2550000.0,https://www.transfermarkt.com/pdrm-fa/startsei...
97,2024,Negeri Sembilan FC,30,8,2480000.0,https://www.transfermarkt.com/negeri-sembilan-...
98,2024,Penang FC,30,5,2450000.0,https://www.transfermarkt.com/penang-fc/starts...


In [315]:
# Export the DataFrame to csv file

df_msl_teams.to_csv('C:\\Users\\izzat\\OneDrive\\Desktop\\MalaysiaSuperLeague_TransfermrktDataScraping\\datacsv\\MSL_TeamPages.csv', index=False)
df_msl_teams.to_csv('C:\\Users\\izzat\\OneDrive\\Desktop\\MalaysiaSuperLeague_DataAnalysis\\MalaysiaSuperLeagueDataAnalysis\\datacsv\\MSL_TeamPages.csv', index=False)

In [316]:
# Scrape MSL players' data

# Create lists

SeasonList = []
PlayerTeamsList = []
PlayersList = []
NumberList = []
AgeList = []
PositionList = []
NationList = []
ValueList = []
grouped_positionList = []
cleaned_values = []
cleaned_team_names = []

# Scrape player data, from links that were previously scraped

for link in range(0, len(TeamLinksList)):
    page = TeamLinksList[link]
    pageTree = requests.get(page, headers=headers)
    pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
    
    TeamNames = pageSoup.find_all("h1", {"class": "data-header__headline-wrapper data-header__headline-wrapper--oswald"})
    Players = pageSoup.find_all("img", {"class": "bilderrahmen-fixed lazy lazy"})
    Numbers = pageSoup.find_all("div", {"class": "rn_nummer"})
    Age = pageSoup.find_all("td", {"class": "zentriert"})
    Positions = pageSoup.find_all("table", {"class": "inline-table"})
    Nationality = pageSoup.find_all('img', {'class': 'flaggenrahmen'})
    Values = pageSoup.find_all("td", {"class": "rechts hauptlink"})
    
    season_year = str(int(TeamLinksList[link].split("/", 8)[8]))
    
    #if season 2024, skip 3 rows, else, skip 4 rows 
    if season_year == "2024":
        r = 3
    else:
        r = 4
    
    for i in range(0, len(Players)):
        SeasonList.append(season_year)
        #if season_year == "2024":
        #    SeasonList.append("2024/2025") # Starting from year 2024, Malaysia Super League will start in May 2024 and end in 2025.
        #else:
        #    SeasonList.append(season_year) 
    
    for i in range(0, len(Players)):
        str_Team = str(TeamNames).split('>')[1].split('\n            ')[1].split('        </h1')[0]
        #if 'Johor Darul' in str_Teams:
        #    str_Teams = 'Johor Darul Takzim FC'
        #    PlayerTeamsList.append(str_Teams)
        #else:
        #    PlayerTeamsList.append(str_Teams)
        PlayerTeamsList.append(str_Team)
    
    for i in range(0, len(Players)):
        str_Players = str(Players[i]).split('" class',1)[0].split('<img alt="',1)[1]
        PlayersList.append(str_Players)
    
    for i in range(0, len(Numbers)):
        str_Numbers = str(Numbers[i]).split('class="rn_nummer">',1)[1].split('</div>',1)[0]
        NumberList.append(str_Numbers)
           
    for i in range(0, len(Positions)):
        str_Position = str(Positions[i]).split('<td>', 1)[1].split('</td>', 1)[0].split('\n ', 1)[1].split('           ', 1)[1].split('        ',1)[0]
        PositionList.append(str_Position)
    
    # Convert specific position names into a more general name - Marked as "Position Type"
    for j in range(0, len(Positions)):
        if 'Striker' in PositionList[j]:
            grouped_positionList.append('Forwards')
        elif 'Forward' in PositionList[j]:
            grouped_positionList.append('Forwards')
        elif 'Midfield' in PositionList[j]:
            grouped_positionList.append('Midfielders')
        elif 'Midfielder' in PositionList[j]:
            grouped_positionList.append('Midfielders')
        elif 'Winger' in PositionList[j]:
            grouped_positionList.append('Midfielders')
        elif 'Back' in PositionList[j]:
            grouped_positionList.append('Defenders')
        elif 'Defender' in PositionList[j]:
            grouped_positionList.append('Defenders')
        else:
            grouped_positionList.append(PositionList[j])
    
    #for i in range(2, (len(Players)*r), r): 
    #    str_Nationality = str(Nationality[i]).split('" class',1)[0].split('<img alt="',1)[1]
    #    NationList.append(str_Nationality)

    for i in range(1, (len(Players)*r), r):
        str_Age = str(Age[i]).split("(",1)[1].split(")",1)[0]
        AgeList.append(str_Age)
    
    for i in range(0, len(Values)):
        ValueList.append(Values[i].text)
        
# Data cleaning - change "Value" datatype from string type to float type
for a in range(0, len(ValueList)):
    if 'k' in ValueList[a]:
        str_a = str(ValueList[a]).split('€')[1].split('k')[0]
        flt_a = float(str_a)*1000
        cleaned_values.append(flt_a)
    elif 'm' in ValueList[a]:
        str_a = str(ValueList[a]).split('€')[1].split('m')[0]
        flt_a = float(str_a)*1000000
        cleaned_values.append(flt_a)
    else:
        cleaned_values.append(float(a))

# Data cleaning - clean Team names
for b in range(0, len(PlayerTeamsList)):
    if "Johor Darul Ta'zim" in PlayerTeamsList[b]:
        new_name = "Johor Darul Ta'zim FC"
        cleaned_team_names.append(new_name)
    elif "Kedah FA" in PlayerTeamsList[b]:
        new_name = 'Kedah Darul Aman FC'
        cleaned_team_names.append(new_name)
    elif "Kelantan United" in PlayerTeamsList[b]:
        new_name = 'Kelantan Darul Naim FC'
        cleaned_team_names.append(new_name)
    elif "Kelantan Darul Naim" in PlayerTeamsList[b]:
        new_name = 'Kelantan Darul Naim FC'
        cleaned_team_names.append(new_name)
    elif "Kuching City" in PlayerTeamsList[b]:
        new_name = 'Kuching City FC'
        cleaned_team_names.append(new_name)
    else:
        cleaned_team_names.append(PlayerTeamsList[b])


In [317]:
# Create DataFrame for full extracted data

df_msl_player = pd.DataFrame({
                         "SEASON":SeasonList,
                         "TEAM":PlayerTeamsList,
                         "TEAM_CLEANED":cleaned_team_names,
                         "PLAYER":PlayersList,
                         "NUMBER":NumberList,
                         "AGE":AgeList,
                         "POSITION":PositionList,
                         "POSITION_TYPE":grouped_positionList,
                         #"NATIONALITY":NationList,
                         "VALUE_EUR":ValueList,
                         "VALUE_EUR_1":cleaned_values                         
                        })
df_msl_player

# clean AGE
df_msl_player['AGE'] = np.where(df_msl_player['AGE'] == '-', int(0), df_msl_player['AGE'])
df_msl_player['AGE'] = df_msl_player['AGE'].astype('int')

df_msl_player = df_msl_player[df_msl_player['AGE'] != 0]

# remove duplicated rows of Year 2025 & reset index
df_msl_player = df_msl_player.drop_duplicates().reset_index()

# remove column 'index'
df_msl_player = df_msl_player.drop(columns=['index'])

# clean VALUE_EUR
df_msl_player['VALUE_EUR'] = np.where(df_msl_player['VALUE_EUR'] == '-', '€0n', df_msl_player['VALUE_EUR'])

df_msl_player['VALUE_EUR_CLEANED'] = ''
for i in range(0, len(df_msl_player)):
    val = df_msl_player.loc[i, 'VALUE_EUR'].split('€')[1]

    res_k = 'k' in val
    res_m = 'm' in val
    res_nothing = 'n' in val

    if res_k:
        new_val = float(val.split('k')[0]) * 1000
    elif res_m:
        new_val = float(val.split('m')[0]) * 1000000
    elif res_nothing:
        new_val = float(val.split('n')[0])*0

    df_msl_player.loc[i, 'VALUE_EUR_CLEANED'] = new_val

df_msl_player['VALUE_EUR'] = df_msl_player['VALUE_EUR_1']

# remove column 'VALUE_EUR_CLEANED'
df_msl_player = df_msl_player.drop(columns=['VALUE_EUR_CLEANED', 'VALUE_EUR_1'])

df_msl_player.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3434 entries, 0 to 3433
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SEASON         3434 non-null   object 
 1   TEAM           3434 non-null   object 
 2   TEAM_CLEANED   3434 non-null   object 
 3   PLAYER         3434 non-null   object 
 4   NUMBER         3434 non-null   object 
 5   AGE            3434 non-null   int32  
 6   POSITION       3434 non-null   object 
 7   POSITION_TYPE  3434 non-null   object 
 8   VALUE_EUR      3434 non-null   float64
dtypes: float64(1), int32(1), object(7)
memory usage: 228.2+ KB


In [318]:
# Export the DataFrame to csv file

df_msl_player.to_csv('C:\\Users\\izzat\\OneDrive\\Desktop\\MalaysiaSuperLeague_TransfermrktDataScraping\\datacsv\\MSL_FullData.csv', index=False)
df_msl_player.to_csv('C:\\Users\\izzat\\OneDrive\\Desktop\\MalaysiaSuperLeague_DataAnalysis\\MalaysiaSuperLeagueDataAnalysis\\datacsv\\MSL_FullData.csv', index=False)