In [81]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [82]:
# Root of the website; the relative player links are appended to it.
BASE_URL = 'https://www.sofascore.com'

# JSON endpoint templates: fill the {id} placeholder with str.format(id=...).
ATTRIBUTE_OVERVIEW_API = 'https://api.sofascore.com/api/v1/player/{id}/attribute-overviews'
LAST_YEAR_SUMMARY_API = 'https://api.sofascore.com/api/v1/player/{id}/last-year-summary'

# Headers passed to every requests.get call in this notebook.
# The value is a desktop Chrome User-Agent string (presumably so the site
# answers as it would for a regular browser -- TODO confirm it is required).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/111.0.0.0 Safari/537.36'
    ),
}

### Lưu dữ liệu

In [83]:
# Export data to a CSV file
def create_csv_file(data, filename, header=True, mode='w'):
    """Write ``data`` to ``filename`` as CSV, without the index column.

    Args:
        data: Anything accepted by ``pd.DataFrame`` (e.g. a list of dicts).
        filename: Destination path handed to ``DataFrame.to_csv``.
        header: Forwarded to ``to_csv``; whether the column names are written.
        mode: File mode forwarded to ``to_csv`` ('w' overwrites, 'a' appends).
    """
    pd.DataFrame(data).to_csv(filename, mode=mode, header=header, index=False)

### Lấy dữ liệu thông tin cơ bản

In [84]:
def get_profile_info(url, timeout=30):
    """Scrape the basic profile fields from a player page.

    Args:
        url: Absolute URL of the player page.
        timeout: Seconds forwarded to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        A dict with the keys ``name``, ``market_value``, ``club``,
        ``country``, ``league``, ``nationality``, ``birth_date``, ``age``,
        ``height``, ``preferred_foot``, ``position`` and ``shirt_number``
        (all stripped strings), or ``None`` when the response status is not
        200 or any element the parser relies on is missing.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # NOTE(review): these class names look auto-generated; if every player
    # suddenly yields None, verify the selectors against the live markup.
    # select_one() yields None when nothing matches, whereas indexing the
    # list returned by select() would raise IndexError on an empty result.
    tournament_tag = soup.select_one("ul.sc-4da3a94c-0.cECKqR")
    name_tag = soup.select_one("h2.Text.gcxwef")
    club_tag = soup.select_one("div.Text.jzzmLj")
    market_value_tag = soup.select_one("div.Text.iumpyb")

    info_titles = soup.select("div.Text.kQQxFk")
    info_values = soup.select("div.Text.hnfikr")

    required_tags = (tournament_tag, name_tag, club_tag, market_value_tag)
    if any(tag is None for tag in required_tags):
        return None

    tournament_elements = tournament_tag.find_all('a')

    # NOTE(review): set() de-duplicates the matched tags before counting;
    # the thresholds are kept exactly as they were -- confirm they are still
    # the right minimums for the page layout.
    if len(set(info_titles)) < 7 or len(set(info_values)) < 6 or len(tournament_elements) < 3:
        return None

    # The nationality text sits in a <span> nested inside a <div>; bail out
    # instead of raising AttributeError when that nesting is absent.
    nationality_div = info_values[0].find('div')
    nationality_span = nationality_div.find('span') if nationality_div is not None else None
    if nationality_span is None:
        return None

    return {
        "name": name_tag.text.strip(),
        "market_value": market_value_tag.text.strip(),
        "club": club_tag.text.strip(),
        # Links 1 and 2 of the tournament list are read as country and league.
        "country": tournament_elements[1].text.strip(),
        "league": tournament_elements[2].text.strip(),
        "nationality": nationality_span.text.strip(),
        # The birth date is read from the *title* element at index 1, while
        # the matching value element at the same index supplies the age.
        "birth_date": info_titles[1].text.strip(),
        "age": info_values[1].text.strip(),
        "height": info_values[2].text.strip(),
        "preferred_foot": info_values[3].text.strip(),
        "position": info_values[4].text.strip(),
        "shirt_number": info_values[5].text.strip(),
    }

# get_profile_info('https://www.sofascore.com/player/serhou-guirassy/328027')

### Lấy dữ liệu thông số thi đấu của cầu thủ

In [85]:
def get_detail_info(id, timeout=30):
    """Fetch a player's attribute ratings and last-year rating summary.

    Args:
        id: Player identifier substituted into both API URL templates.
        timeout: Seconds forwarded to ``requests.get`` so that a stalled
            connection raises instead of blocking forever.

    Returns:
        A dict with the keys ``attacking``, ``technical``, ``tactical``,
        ``defending``, ``creativity``, ``saves``, ``anticipation``,
        ``ball_distribution``, ``aerial``, ``avg_score_last_12_months`` and
        ``appearances_last_12_months``. Attributes that do not apply to the
        player's position stay at 0. Returns ``None`` when either request
        does not answer 200 or the payload lacks the expected data.
    """
    attribute_response = requests.get(
        ATTRIBUTE_OVERVIEW_API.format(id=id), headers=headers, timeout=timeout
    )
    if attribute_response.status_code != 200:
        return None

    summary_response = requests.get(
        LAST_YEAR_SUMMARY_API.format(id=id), headers=headers, timeout=timeout
    )
    if summary_response.status_code != 200:
        return None

    attribute_data = attribute_response.json()
    summary_data = summary_response.json()

    # .get() rather than [] so that a payload without the expected key is
    # treated like any other unusable response (None) instead of a KeyError.
    attribute = pd.DataFrame(attribute_data.get('playerAttributeOverviews') or [])
    if len(attribute) == 0 or 'position' not in attribute.columns:
        return None
    # Only the first overview entry is used.
    current_attribute = attribute.iloc[0]

    # API column name -> output key, depending on whether the position is 'G'.
    if current_attribute['position'] == 'G':
        column_to_key = {
            'tactical': 'tactical',
            'saves': 'saves',
            'anticipation': 'anticipation',
            'ballDistribution': 'ball_distribution',
            'aerial': 'aerial',
        }
    else:
        column_to_key = {
            'attacking': 'attacking',
            'technical': 'technical',
            'tactical': 'tactical',
            'defending': 'defending',
            'creativity': 'creativity',
        }
    if any(column not in attribute.columns for column in column_to_key):
        return None

    summary = pd.DataFrame(summary_data.get('summary') or [])
    if len(summary) == 0 or 'value' not in summary.columns:
        return None
    ratings = summary['value'].astype(float)
    # Keep only values inside [0, 10] (bounds inclusive); presumably other
    # values are not match ratings -- TODO confirm against the API.
    ratings = ratings[ratings.between(0.0, 10.0)]

    # Start every attribute at 0 (in the output column order), then fill in
    # the ones that apply to this position.
    detail = dict.fromkeys(
        (
            'attacking', 'technical', 'tactical', 'defending', 'creativity',
            'saves', 'anticipation', 'ball_distribution', 'aerial',
        ),
        0,
    )
    for column, key in column_to_key.items():
        detail[key] = current_attribute[column]

    # The mean of an empty selection is NaN, so the average is NaN when no
    # value survived the range filter (appearances is then 0).
    detail['avg_score_last_12_months'] = round(ratings.mean(), 2)
    detail['appearances_last_12_months'] = len(ratings)
    return detail

# get_detail_info(328027)

### Main

In [86]:
# Load the player links collected from the home page and derive, for each
# one, the absolute page URL and the player id (last path segment).
player_links = pd.read_csv("./raw_data/player_links.csv")
player_links['url'] = BASE_URL + player_links['player_link']
player_links['id'] = player_links['player_link'].str.split('/').str[-1]

# Half-open range of row positions scraped by this run; change it to process
# another batch.
BATCH_START, BATCH_END = 10, 20
OUTPUT_CSV = 'players_data.csv'

players_data = []
# Visit each player link of the batch; players for which either lookup
# returns None are skipped.
for index, row in player_links.iloc[BATCH_START:BATCH_END].iterrows():
    print(f"Processing {index} : {row['id']} ...")
    player_detail = get_detail_info(row['id'])
    if player_detail is None:
        continue
    player_profile = get_profile_info(row['url'])
    if player_profile is None:
        continue

    player_data = {**player_profile, **player_detail}
    players_data.append(player_data)

# Batches are appended to the same file, so write the header only when the
# file is new or empty (otherwise every run would insert another header row
# in the middle of the data), and skip the write when nothing was collected.
if players_data:
    output_has_rows = os.path.isfile(OUTPUT_CSV) and os.path.getsize(OUTPUT_CSV) > 0
    create_csv_file(players_data, OUTPUT_CSV, header=not output_has_rows, mode='a')

Processing 10 : 874576 ...
Processing 11 : 901471 ...
Processing 12 : 830885 ...
Processing 13 : 1412474 ...
Processing 14 : 1117022 ...
Processing 15 : 1485206 ...
Processing 16 : 1109356 ...
Processing 17 : 1482389 ...
Processing 18 : 933158 ...
Processing 19 : 871788 ...
