In [2]:
import os

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
# Root of the SofaScore website; player page paths are appended to it.
BASE_URL = 'https://www.sofascore.com'

# Per-player JSON endpoints; the `{id}` placeholder is filled in with str.format.
ATTRIBUTE_OVERVIEW_API = 'https://api.sofascore.com/api/v1/player/{id}/attribute-overviews'
LAST_YEAR_SUMMARY_API = 'https://api.sofascore.com/api/v1/player/{id}/last-year-summary'

# Desktop Chrome User-Agent string, kept in its own constant for readability.
_CHROME_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36'

# Headers passed along with each requests.get call in this notebook.
headers = {'User-Agent': _CHROME_USER_AGENT}

### Lưu dữ liệu

In [4]:
### Export data to a CSV file
def create_csv_file(data, filename, header=True):
  """Append ``data`` as rows to the CSV file ``filename``.

  Args:
    data: Anything accepted by ``pd.DataFrame`` (here: a list of dicts).
    filename: Path of the CSV file, or a writable buffer accepted by
      ``DataFrame.to_csv``.
    header: Forwarded to ``DataFrame.to_csv`` when the target is new or
      empty. When ``filename`` is a path that already holds content the
      header is suppressed so it is not repeated in the middle of the file.

  Returns:
    None. Nothing is written when ``data`` produces a frame with no columns.
  """
  df = pd.DataFrame(data)
  # A frame without columns carries no data; skip it instead of touching the file.
  if df.shape[1] == 0:
    return

  # The file is opened in append mode, so writing the header on every call
  # would repeat it between data rows. Only paths can be checked for existing
  # content; buffers keep the caller's `header` value unchanged.
  is_path = isinstance(filename, (str, os.PathLike))
  has_content = is_path and os.path.exists(filename) and os.path.getsize(filename) > 0
  df.to_csv(filename, mode='a', index=False, header=False if has_content else header)

### Lấy dữ liệu thông tin cơ bản

In [6]:
def get_profile_info(url, timeout=30):
    """Scrape a player's basic profile from their SofaScore page.

    Args:
        url: Absolute URL of the player page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``name``, ``club``, ``market_value``,
        ``nationality``, ``birth_date``, ``age``, ``height``,
        ``preferred_foot``, ``position`` and ``shirt_number`` (all stripped
        text), or ``None`` when the request fails, the status is not 200, or
        any expected element is missing from the page.
    """
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # NOTE(review): these look like build-generated CSS class names, so they
    # presumably stop matching whenever the site is redeployed - confirm
    # before a new crawl.
    name_tag = soup.select_one("h2.sc-gFqAkR.rPsHj")
    club_tag = soup.select_one("div.sc-gFqAkR.cDBfTX")
    market_value_tag = soup.select_one("div.sc-gFqAkR.cQUimj")
    info_titles = soup.select("div.sc-gFqAkR.crobRI")
    info_values = soup.select("div.sc-gFqAkR.jYYdYz")

    required_tags = (name_tag, club_tag, market_value_tag)
    if any(tag is None for tag in required_tags):
        return None
    if len(info_titles) < 6 or len(info_values) < 6:
        return None

    # The nationality sits in a <span> nested inside a <div>; either level may
    # be absent, in which case the profile is treated as incomplete.
    nationality_container = info_values[0].find('div')
    nationality_tag = nationality_container.find('span') if nationality_container is not None else None
    if nationality_tag is None:
        return None

    return {
        "name": name_tag.text.strip(),
        "club": club_tag.text.strip(),
        "market_value": market_value_tag.text.strip(),
        "nationality": nationality_tag.text.strip(),
        # The birth date is read from the *title* element of the second info
        # item; its value element holds the age.
        "birth_date": info_titles[1].text.strip(),
        "age": info_values[1].text.strip(),
        "height": info_values[2].text.strip(),
        "preferred_foot": info_values[3].text.strip(),
        "position": info_values[4].text.strip(),
        "shirt_number": info_values[5].text.strip(),
    }

# get_profile_info('https://www.sofascore.com/player/scott-carson/716')

{'name': 'Scott Carson',
 'club': 'Manchester City',
 'market_value': '210K €',
 'nationality': 'ENG',
 'birth_date': '3 Sept 1985',
 'age': '38 yrs',
 'height': '188 cm',
 'preferred_foot': 'Right',
 'position': 'G',
 'shirt_number': '33'}

### Lấy dữ liệu thông số thi đấu của cầu thủ

In [123]:
# API column -> output key, for the attributes read when position == 'G'.
_GOALKEEPER_ATTRIBUTES = {
    'tactical': 'tactical',
    'saves': 'saves',
    'anticipation': 'anticipation',
    'ballDistribution': 'ball_distribution',
    'aerial': 'aerial',
}

# API column -> output key, for the attributes read for every other position.
_OUTFIELD_ATTRIBUTES = {
    'attacking': 'attacking',
    'technical': 'technical',
    'tactical': 'tactical',
    'defending': 'defending',
    'creativity': 'creativity',
}


def get_detail_info(id, timeout=30):
    """Fetch a player's attribute ratings and last-year summary from the API.

    Args:
        id: Player id substituted into the two API URLs. (The name shadows
            the builtin ``id`` but is kept for backward compatibility.)
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A dict with the keys ``attacking``, ``technical``, ``tactical``,
        ``defending``, ``creativity``, ``saves``, ``anticipation``,
        ``ball_distribution``, ``aerial`` (attributes that do not apply to
        the player's position stay 0), ``avg_score_last_12_months`` and
        ``appearances_last_12_months``; or ``None`` when a request fails,
        a response is not 200 / not valid JSON, or expected data is missing.
    """
    try:
        attribute_response = requests.get(
            ATTRIBUTE_OVERVIEW_API.format(id=id), headers=headers, timeout=timeout
        )
        if attribute_response.status_code != 200:
            return None

        summary_response = requests.get(
            LAST_YEAR_SUMMARY_API.format(id=id), headers=headers, timeout=timeout
        )
        if summary_response.status_code != 200:
            return None

        attribute_data = attribute_response.json()
        summary_data = summary_response.json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or a body that is not valid JSON.
        return None

    if not isinstance(attribute_data, dict) or not isinstance(summary_data, dict):
        return None

    # `.get(...) or []` turns a missing/null key into an empty frame, which the
    # length check below reports as "no data" instead of raising KeyError.
    attribute = pd.DataFrame(attribute_data.get('playerAttributeOverviews') or [])
    if len(attribute) == 0 or 'position' not in attribute.columns:
        return None
    # Only the first overview row is used - presumably the most recent one;
    # confirm against the API.
    current_attribute = attribute.iloc[0]

    result = {
        "attacking": 0,
        "technical": 0,
        "tactical": 0,
        "defending": 0,
        "creativity": 0,
        "saves": 0,
        "anticipation": 0,
        "ball_distribution": 0,
        "aerial": 0,
    }

    if current_attribute['position'] == 'G':
        wanted = _GOALKEEPER_ATTRIBUTES
    else:
        wanted = _OUTFIELD_ATTRIBUTES
    if any(column not in attribute.columns for column in wanted):
        return None
    for column, key in wanted.items():
        result[key] = current_attribute[column]

    summary = pd.DataFrame(summary_data.get('summary') or [])
    if len(summary) == 0 or 'value' not in summary.columns:
        return None
    # Non-numeric entries become NaN and are dropped by the range filter
    # instead of aborting the whole crawl with a ValueError.
    ratings = pd.to_numeric(summary['value'], errors='coerce')
    ratings = ratings[ratings.between(0.0, 10.0)]

    # If no rating falls inside [0, 10] the mean is NaN and the count is 0.
    result["avg_score_last_12_months"] = round(ratings.mean(), 2)
    result["appearances_last_12_months"] = len(ratings)
    return result

# get_detail_info(610766)

    

### Main

In [124]:
# Load the player paths collected from the home page and derive URL and id
player_links = pd.read_csv("../crawl_link/player_links.csv", names=["path"])
player_links['url'] = BASE_URL + player_links['path']
# The id is the last '/'-separated segment of the path
player_links['id'] = player_links['path'].str.split('/').str[-1]

players_data = []
# Visit every player link
for player_url, player_id in zip(player_links['url'], player_links['id']):
    print(f"Processing {player_id} ...")
    try:
        player_detail = get_detail_info(player_id)
        if player_detail is None:
            continue
        player_profile = get_profile_info(player_url)
        if player_profile is None:
            continue
    except requests.RequestException as error:
        # One failed request must not abort the crawl: results are only
        # written after the loop, so an uncaught error would discard them all.
        print(f"Skipping {player_id}: {error}")
        continue

    players_data.append({**player_profile, **player_detail})

# NOTE: create_csv_file appends, so re-running this cell adds the rows to an
# existing players_data.csv rather than replacing it.
if players_data:
    create_csv_file(players_data, 'players_data.csv', header=True)

Processing 839956 ...
Processing 944656 ...
Processing 189061 ...
Processing 934386 ...
Processing 70996 ...
Processing 859765 ...
Processing 331209 ...
Processing 827606 ...
Processing 136710 ...
Processing 945122 ...
Processing 1136731 ...
Processing 855835 ...
Processing 1065216 ...
Processing 964994 ...
Processing 318941 ...
Processing 44614 ...
Processing 152077 ...
Processing 149663 ...
Processing 383560 ...
Processing 254491 ...
Processing 125274 ...
Processing 716 ...
Processing 108579 ...
Processing 21626 ...
Processing 1048888 ...
Processing 1010231 ...
Processing 293519 ...
Processing 35166 ...
Processing 259117 ...
Processing 187433 ...
Processing 280441 ...
Processing 184661 ...
Processing 1084730 ...
Processing 355492 ...
Processing 1142251 ...
Processing 1410926 ...
Processing 843665 ...
Processing 803031 ...
Processing 847030 ...
Processing 896569 ...
Processing 798583 ...
Processing 980418 ...
Processing 246999 ...
Processing 146101 ...
Processing 158233 ...
Processing