In [54]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time 
import csv
from pathlib import Path
import os

Get Overview Details

In [55]:
def getPlayerDetails(html_source):
    """Collect a player's overview fields from a saved player page.

    Args:
        html_source: HTML text of the player page.

    Returns:
        A new dict that may contain 'Name', 'Wins', 'Losses', one entry per
        label/value pair found in the 'player-overview__col' blocks,
        'Appearances' and 'Nationality'. A field whose element is missing is
        left out, and "Skipping incomplete stat detail." is printed for it.
    """
    soup = BeautifulSoup(html_source, 'html.parser')
    details = {}

    def text_of(css_class):
        """Return the stripped text of the first element with css_class, or None."""
        element = soup.find(class_=css_class)
        return None if element is None else element.text.strip()

    def copy_field(key, css_class):
        """Store the element's text under key, or report that it is missing."""
        text = text_of(css_class)
        if text is None:
            print("Skipping incomplete stat detail.")
        else:
            details[key] = text

    # The first name is optional, the last name is required.
    last_name = text_of('player-header__name-last')
    if last_name is None:
        print("Skipping incomplete stat detail.")
    else:
        first_name = text_of('player-header__name-first') or ''
        # strip() keeps single-name players from getting a leading space.
        details['Name'] = f"{first_name} {last_name}".strip()

    copy_field('Wins', 'allStatContainer js-all-stat-container statwins')
    copy_field('Losses', 'allStatContainer js-all-stat-container statlosses')

    # Personal information: one label/value pair per overview column.
    for column in soup.find_all("div", class_='player-overview__col'):
        title_element = column.find(class_='player-overview__label')
        value_element = column.find(class_='player-overview__info')
        if title_element is None or value_element is None:
            print("Skipping incomplete stat detail.")
            continue
        details[title_element.text.strip()] = value_element.text.strip()

    # Per the original author's note: the 'Appearances' value in the personal
    # information covers all seasons, so it is overwritten here with the
    # current-season figure from the stats container.
    copy_field('Appearances', 'allStatContainer js-all-stat-container statappearances')
    copy_field('Nationality', 'player-overview__player-country')

    return details

Get Defenders' Statistics Info

In [56]:
def getDefenderInfo(html_source, dict):
    """Read a defender's stat values from a player page and append one CSV row.

    Args:
        html_source: HTML text of the player page.
        dict: Mapping of fields already collected for the player. It is
            updated in place with every stat title/value found on the page.
            The name shadows the built-in; it is kept so existing keyword
            callers continue to work.

    Returns:
        None. One row is appended to '../data/Stats_csv/Defender_stats.csv'
        (relative to the current working directory). The header is written
        first when the file is missing or empty, and keys that are not CSV
        columns are ignored.
    """
    field_names = ['Name','Nationality','Date of Birth','Height','Club','Position'
                    ,'Appearances','Wins','Losses'
                    ,'Clean sheets', 'Goals Conceded' , 'Tackles','Tackle success %','Last man tackles','Blocked shots','Interceptions','Clearances','Headed Clearance','Clearances off line','Recoveries','Duels won','Duels lost','Successful 50/50s','Aerial battles won','Aerial battles lost','Own goals','Errors leading to goal'
                    ,'Assists','Passes','Passes per match','Big Chances Created','Crosses','Cross accuracy %','Through balls','Accurate long balls'
                    ,'Yellow cards','Red cards','Fouls','Offsides'
                    ,'Goals','Headed goals','Goals with right foot','Goals with left foot','Hit woodwork']

    soup = BeautifulSoup(html_source, 'html.parser')

    # Skill statistics: the first child of each block is the stat name and
    # the <span> inside it holds the value.
    for stat in soup.find_all("div", class_="player-stats__stat-value"):
        try:
            title = stat.contents[0].strip()
            value = stat.find("span").text.strip()
        except (AttributeError, IndexError, TypeError):
            # An empty block (IndexError), a missing <span> (AttributeError) or
            # a non-text first child (TypeError) skips one stat, not the run.
            print("Skipping incomplete stat detail.")
            continue
        dict[title] = value

    file_path = Path('../data/Stats_csv/Defender_stats.csv')
    # Create the output folder on first use so the append below cannot fail
    # with FileNotFoundError.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Decide before opening: opening in append mode creates the file.
    is_file_empty = not file_path.exists() or file_path.stat().st_size == 0

    with file_path.open('a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names, extrasaction="ignore")
        if is_file_empty:
            writer.writeheader()  # Write header only if the file is empty
        writer.writerow(dict)  # Write the player's row

Get Forwards' Statistics Info

In [57]:
def getForwardInfo(html_source, dict):
    """Read a forward's stat values from a player page and append one CSV row.

    Args:
        html_source: HTML text of the player page.
        dict: Mapping of fields already collected for the player. It is
            updated in place with each stat whose title is one of the CSV
            columns. The name shadows the built-in; it is kept so existing
            keyword callers continue to work.

    Returns:
        None. One row is appended to '../data/Stats_csv/Forward_stats.csv'
        (relative to the current working directory). The header is written
        first when the file is missing or empty.
    """
    field_names = ['Name','Nationality','Date of Birth','Height','Club','Position'
                    ,'Appearances','Wins','Losses'
                    ,'Goals', 'Goals per match' , 'Headed goals','Goals with right foot','Goals with left foot','Penalties scored','Freekicks scored','Shots','Shots on target','Shooting accuracy %','Hit woodwork','Big chances missed'
                    ,'Assists','Passes','Passes per match','Big Chances Created','Crosses'
                    ,'Yellow cards','Red cards','Fouls','Offsides'
                    ,'Tackles','Blocked shots','Interceptions','Clearances','Headed Clearance']

    soup = BeautifulSoup(html_source, 'html.parser')

    # Skill statistics: the first child of each block is the stat name and
    # the <span> inside it holds the value.
    for stat in soup.find_all("div", class_="player-stats__stat-value"):
        try:
            title = stat.contents[0].strip()
            value = stat.find("span").text.strip()
        except (AttributeError, IndexError, TypeError):
            # An empty block (IndexError), a missing <span> (AttributeError) or
            # a non-text first child (TypeError) skips one stat, not the run.
            print("Skipping incomplete stat detail.")
            continue
        if title in field_names:
            dict[title] = value

    file_path = Path('../data/Stats_csv/Forward_stats.csv')
    # Create the output folder on first use so the append below cannot fail
    # with FileNotFoundError.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Decide before opening: opening in append mode creates the file.
    is_file_empty = not file_path.exists() or file_path.stat().st_size == 0

    # Write to csv file
    with file_path.open('a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names, extrasaction="ignore")
        if is_file_empty:
            writer.writeheader()  # Write header only if the file is empty
        writer.writerow(dict)  # Write the player's row
        

Get Goalkeepers' Statistics Info

In [58]:
def getGoalkeeperInfo(html_source, dict):
    """Read a goalkeeper's stat values from a player page and append one CSV row.

    Args:
        html_source: HTML text of the player page.
        dict: Mapping of fields already collected for the player. It is
            updated in place with each stat whose title is one of the CSV
            columns. The name shadows the built-in; it is kept so existing
            keyword callers continue to work.

    Returns:
        None. One row is appended to '../data/Stats_csv/Goalkeeper_stats.csv'
        (relative to the current working directory). The header is written
        first when the file is missing or empty.
    """
    field_names = ['Name','Nationality','Date of Birth','Height','Club','Position'
                    ,'Appearances','Wins','Losses'
                    ,'Saves', 'Penalties Saved' , 'Punches','High Claims','Catches','Sweeper clearances','Throw outs','Goal Kicks'
                    ,'Clean sheets','Goals Conceded','Errors leading to goal','Own goals'
                    ,'Yellow cards','Red cards','Fouls'
                    ,'Goals','Assists','Passes','Passes per match','Accurate long balls']

    soup = BeautifulSoup(html_source, 'html.parser')

    # Skill statistics: the first child of each block is the stat name and
    # the <span> inside it holds the value.
    for stat in soup.find_all("div", class_="player-stats__stat-value"):
        try:
            title = stat.contents[0].strip()
            value = stat.find("span").text.strip()
        except (AttributeError, IndexError, TypeError):
            # An empty block (IndexError), a missing <span> (AttributeError) or
            # a non-text first child (TypeError) skips one stat, not the run.
            print("Skipping incomplete stat detail.")
            continue
        if title in field_names:
            dict[title] = value

    file_path = Path('../data/Stats_csv/Goalkeeper_stats.csv')
    # Create the output folder on first use so the append below cannot fail
    # with FileNotFoundError.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Decide before opening: opening in append mode creates the file.
    is_file_empty = not file_path.exists() or file_path.stat().st_size == 0

    with file_path.open('a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names, extrasaction="ignore")
        if is_file_empty:
            writer.writeheader()  # Write header only if the file is empty
        writer.writerow(dict)  # Write the player's row

Get Midfielders' Statistics Info

In [59]:
def getMidfielderInfo(html_source, dict):
    """Read a midfielder's stat values from a player page and append one CSV row.

    Args:
        html_source: HTML text of the player page.
        dict: Mapping of fields already collected for the player. It is
            updated in place with each stat whose title is one of the CSV
            columns. The name shadows the built-in; it is kept so existing
            keyword callers continue to work.

    Returns:
        None. One row is appended to '../data/Stats_csv/Midfielder_stats.csv'
        (relative to the current working directory). The header is written
        first when the file is missing or empty.
    """
    # 'Fouls' is listed once. A second entry would emit a duplicate CSV column.
    # NOTE(review): a Midfielder_stats.csv written with a header that repeats
    # 'Fouls' has one more column than the rows appended here; regenerate such
    # a file rather than appending to it.
    field_names = ['Name','Nationality','Date of Birth','Height','Club','Position'
                    ,'Appearances','Wins','Losses'
                    ,'Goals', 'Goals per match' , 'Headed goals','Goals with right foot','Goals with left foot','Penalties scored','Freekicks scored','Shots','Shots on target','Shooting accuracy %','Hit woodwork','Big chances missed'
                    ,'Assists','Passes','Passes per match','Big Chances Created','Crosses','Cross accuracy %','Through balls','Accurate long balls'
                    ,'Yellow cards','Red cards','Fouls','Offsides'
                    ,'Tackles','Tackle success %','Blocked shots','Interceptions','Clearances','Headed Clearance','Recoveries','Duels won','Duels lost','Successful 50/50s','Aerial battles won','Aerial battles lost','Errors leading to goal']

    soup = BeautifulSoup(html_source, 'html.parser')

    # Skill statistics: the first child of each block is the stat name and
    # the <span> inside it holds the value.
    for stat in soup.find_all("div", class_="player-stats__stat-value"):
        try:
            title = stat.contents[0].strip()
            value = stat.find("span").text.strip()
        except (AttributeError, IndexError, TypeError):
            # An empty block (IndexError), a missing <span> (AttributeError) or
            # a non-text first child (TypeError) skips one stat, not the run.
            print("Skipping incomplete stat detail.")
            continue
        if title in field_names:
            dict[title] = value

    file_path = Path('../data/Stats_csv/Midfielder_stats.csv')
    # Create the output folder on first use so the append below cannot fail
    # with FileNotFoundError.
    file_path.parent.mkdir(parents=True, exist_ok=True)
    # Decide before opening: opening in append mode creates the file.
    is_file_empty = not file_path.exists() or file_path.stat().st_size == 0

    with file_path.open('a', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=field_names, extrasaction="ignore")
        if is_file_empty:
            writer.writeheader()  # Write header only if the file is empty
        writer.writerow(dict)  # Write the player's row

Write to Stats_csv

In [60]:
# Specify the folder containing the HTML files
folder_path = '../data/players'

# Map each value of the 'Position' field to the function that writes its CSV row
position_handlers = {
    'Defender': getDefenderInfo,
    'Forward': getForwardInfo,
    'Goalkeeper': getGoalkeeperInfo,
    'Midfielder': getMidfielderInfo,
}

# NOTE: the handlers open their CSV files in append mode, so running this cell
# again adds every player a second time.

# isdir() also rejects a path that exists but is a regular file, which
# os.listdir() below could not read.
if not os.path.isdir(folder_path):
    print("Folder does not exist.")
else:
    # sorted() gives a reproducible row order; os.listdir() returns entries
    # in arbitrary order.
    for file_name in sorted(os.listdir(folder_path)):
        # Process only files with a .html extension
        if not file_name.endswith('.html'):
            continue
        file_path = os.path.join(folder_path, file_name)

        # Open and read the HTML file
        with open(file_path, 'r', encoding='utf-8') as file:
            html_content = file.read()

        # Read Player overview info
        PlayerInfo = getPlayerDetails(html_content)

        # Read players' stats. A page without a 'Position' entry raises
        # KeyError and is reported; a position with no handler is skipped
        # silently.
        try:
            handler = position_handlers.get(PlayerInfo['Position'])
            if handler is not None:
                handler(html_content, PlayerInfo)
        except (AttributeError, KeyError):
            print("Skipping incomplete stat detail.")

Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping incomplete stat detail.
Skipping i