In [99]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [100]:
# Root of the public site; the page URLs below are derived from it.
BASE_URL = 'https://www.sofascore.com'
UEFA_RANKINGS_URL = f'{BASE_URL}/football/rankings/uefa'

# Ranking endpoints whose responses are decoded as JSON by the scrapers.
COUNTRY_RANKINGS_URL = 'https://api.sofascore.com/api/v1/rankings/season/2024/type/1'
CLUB_RANKINGS_URL = 'https://api.sofascore.com/api/v1/rankings/type/9'

# Prefix for team pages; a per-club '/<slug>/<id>' path is appended to it.
TEAM_URL = f'{BASE_URL}/team/football'

# Request headers passed to every requests.get call in this notebook.
headers = {
  'User-Agent': (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/111.0.0.0 Safari/537.36'
  ),
}

### Lưu dữ liệu

In [101]:
### Write data to a CSV file
def create_csv_file(data, filename, header=True, mode='a'):
  """Write ``data`` to ``filename`` as CSV without the index, appending by default.

  When appending to a file that already has content, the header row is
  suppressed so that repeated runs do not leave a second header line in the
  middle of the file (which would later be read back as a data row).

  Args:
    data: Anything accepted by ``pd.DataFrame`` (e.g. a list of dicts or a
      list of scalars).
    filename: Destination passed to ``DataFrame.to_csv``.
    header: Passed to ``DataFrame.to_csv``; forced to ``False`` when
      appending to an existing, non-empty file.
    mode: File mode passed to ``DataFrame.to_csv``. Defaults to ``'a'``
      (append); pass ``'w'`` to overwrite.
  """
  import os  # local import: keeps this helper independent of the imports cell

  df = pd.DataFrame(data)

  # Only path-like destinations can be checked for existing content.
  is_path = isinstance(filename, (str, os.PathLike))
  appending_to_content = (
    'a' in mode
    and is_path
    and os.path.isfile(filename)
    and os.path.getsize(filename) > 0
  )

  df.to_csv(
    filename,
    mode=mode,
    index=False,
    header=False if appending_to_content else header,
  )

### Lấy danh sách các nước với các hệ số thích hợp

In [102]:
def scrape_country_rankings(country_rankings_url=COUNTRY_RANKINGS_URL, threshold=0.75, timeout=30):
  """Fetch the country ranking JSON and write the selected entries to ``country_rankings.csv``.

  Entries are visited in the order returned by the endpoint. The first entry
  is always kept. A later entry is kept only when its ``points`` are more
  than ``threshold`` below the most recently kept entry and its tournament
  name has not been kept already.

  Args:
    country_rankings_url: URL whose JSON response has a ``rankings`` list.
    threshold: Minimum drop in points, relative to the last kept entry,
      required to keep a further entry.
    timeout: Seconds passed to ``requests.get`` so a stalled connection
      raises instead of hanging the notebook.

  Returns:
    None. On a non-200 response a message is printed and nothing is written.
  """
  response = requests.get(country_rankings_url, headers=headers, timeout=timeout)
  if response.status_code != 200:
    print('Failed to fetch page')
    return

  rankings = pd.DataFrame(response.json()['rankings'])
  countries = []
  seen_names = set()  # names already kept; avoids rebuilding a list per row

  for _, row in rankings.iterrows():
    tournament = row['uniqueTournament']
    is_first = not countries
    if is_first or (
      row['points'] < countries[-1]['points'] - threshold
      and tournament['name'] not in seen_names
    ):
      countries.append({
        'name': tournament['name'],
        'country': tournament['category']['country']['name'],
        'points': row['points'],
      })
      seen_names.add(tournament['name'])

  create_csv_file(countries, 'country_rankings.csv')

# scrape_country_rankings()

### Lấy danh sách các câu lạc bộ thuộc các nước đã chọn và có hệ số thích hợp

In [103]:
def scrape_club_rankings(club_rankings_url=CLUB_RANKINGS_URL, threshold=0.5, timeout=30):
  """Fetch the club ranking JSON and write clubs from the selected countries to ``club_rankings.csv``.

  A club is kept when its country name appears in the ``country`` column of
  ``country_rankings.csv`` and no club with the same name has been kept yet.
  Each kept club records its name, country, points and a ``/<slug>/<id>``
  path.

  Args:
    club_rankings_url: URL whose JSON response has a ``rankings`` list.
    threshold: Currently not used by this function; kept so existing callers
      that pass it keep working.
    timeout: Seconds passed to ``requests.get`` so a stalled connection
      raises instead of hanging the notebook.

  Returns:
    None. On a non-200 response a message is printed and nothing is written.
  """
  response = requests.get(club_rankings_url, headers=headers, timeout=timeout)
  if response.status_code != 200:
    print('Failed to fetch page')
    return

  rankings = pd.DataFrame(response.json()['rankings'])

  # Loop-invariant lookup, built once instead of once per row.
  selected_countries = set(pd.read_csv('country_rankings.csv')['country'])

  clubs = []
  seen_names = set()  # club names already kept

  for _, row in rankings.iterrows():
    team = row['team']
    if team['country']['name'] in selected_countries and team['name'] not in seen_names:
      clubs.append({
        'name': team['name'],
        'country': team['country']['name'],
        'points': row['points'],
        'path': '/' + team['slug'] + '/' + str(team['id'])
      })
      seen_names.add(team['name'])

  create_csv_file(clubs, 'club_rankings.csv')

# scrape_club_rankings()

### Lấy link của các cầu thủ

In [104]:
def scrape_player_pages(team_base_url=TEAM_URL, player_div_class='sc-fqkvVR fSBPoD', timeout=30):
  """Collect player links from each club page and write them to ``player_links.csv``.

  Club paths are read from the ``path`` column of ``club_rankings.csv`` and
  appended to ``team_base_url``. For every ``div`` on a club page whose class
  matches ``player_div_class``, the ``href`` of its first anchor is recorded.
  Clubs whose page cannot be fetched are reported and skipped so that links
  already collected are still written.

  Args:
    team_base_url: Prefix prepended to each club path.
    player_div_class: Class string used to locate player ``div`` elements.
      NOTE(review): the default looks like a generated class name and may
      stop matching if the site markup changes; confirm against the live page.
    timeout: Seconds passed to ``requests.get`` for each club page.

  Returns:
    None. The links are written without a header row.
  """
  link_players = []

  clubs = pd.read_csv('club_rankings.csv')
  club_urls = team_base_url + clubs['path']

  for club_url in club_urls:
    print('Scraping club:', club_url)
    try:
      response = requests.get(club_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
      # A timeout or connection error on one club must not discard the rest.
      print('Failed to fetch page:', exc)
      continue

    # Same status check as the ranking scrapers: do not parse error pages.
    if response.status_code != 200:
      print('Failed to fetch page')
      continue

    soup = BeautifulSoup(response.content, 'html.parser')
    for player in soup.find_all('div', class_=player_div_class):
      anchor = player.find('a')
      href = anchor.get('href') if anchor else None
      if href:  # skip anchors without an href instead of recording None
        link_players.append(href)

  filename = 'player_links.csv'
  create_csv_file(link_players, filename, header=False)

# scrape_player_pages()

### Main

In [105]:
# Run the pipeline in order; each later stage reads the CSV written by the one before it:
#   1. countries and their points
#   2. clubs and their points
#   3. player links for the selected clubs
for run_stage in (scrape_country_rankings, scrape_club_rankings, scrape_player_pages):
  run_stage()
print("Scraping player links completed.")

Scraping club: https://www.sofascore.com/team/football/manchester-city/17
Scraping club: https://www.sofascore.com/team/football/fc-bayern-munchen/2672
Scraping club: https://www.sofascore.com/team/football/real-madrid/2829
Scraping club: https://www.sofascore.com/team/football/paris-saint-germain/1644
Scraping club: https://www.sofascore.com/team/football/liverpool/44
Scraping club: https://www.sofascore.com/team/football/inter/2697
Scraping club: https://www.sofascore.com/team/football/rb-leipzig/36360
Scraping club: https://www.sofascore.com/team/football/chelsea/38
Scraping club: https://www.sofascore.com/team/football/roma/2702
Scraping club: https://www.sofascore.com/team/football/manchester-united/35
Scraping club: https://www.sofascore.com/team/football/borussia-dortmund/2673
Scraping club: https://www.sofascore.com/team/football/barcelona/2817
Scraping club: https://www.sofascore.com/team/football/atletico-madrid/2836
Scraping club: https://www.sofascore.com/team/football/sevi