## Import các thư viện cần thiết cho việc thu thập dữ liệu

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Khai báo các giá trị đường dẫn hỗ trợ việc thu thập thông tin

In [2]:
# Root of the public website; HTML team pages are built from it.
BASE_URL = 'https://www.sofascore.com'
# NOTE(review): the name says UEFA but the path is the FIFA rankings page, and
# none of the visible cells read this constant - confirm before relying on it.
UEFA_RANKINGS_URL = BASE_URL + '/football/rankings/fifa'

# Root shared by every JSON API endpoint below.
_API_ROOT = 'https://api.sofascore.com/api/v1'

# Endpoints without placeholders are requested as-is; the others are templates
# that the scraping functions fill in with str.format().
COUNTRY_RANKINGS_URL = _API_ROOT + '/rankings/type/2'
COUNTRY_CATEGORY_IDS_URL = _API_ROOT + '/sport/football/categories'
TOURNAMENT_COUNTRY_URL = _API_ROOT + '/category/{country_category_id}/unique-tournaments'
TOURNAMENT_SEASON_ID_URL = _API_ROOT + '/unique-tournament/{tournament_id}/featured-events'
CLUB_TOURNAMENT_URL = (
  _API_ROOT
  + '/unique-tournament/{tournament_id}'
  + '/season/{tournament_season_id}/standings/total'
)

# HTML page of a single club; filled in with the club's slug and numeric id.
TEAM_URL = BASE_URL + '/team/football/{team_slug}/{team_id}'

# Request headers sent with every HTTP call (a desktop Chrome User-Agent).
headers = {
  'User-Agent': (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/111.0.0.0 Safari/537.36'
  ),
}

### Lưu dữ liệu

In [3]:
def create_csv_file(data, filename, header=True):
  """Export ``data`` to a CSV file.

  Args:
    data: Anything ``pd.DataFrame`` accepts (for example a list of dicts or
      an existing DataFrame).
    filename: Destination path of the CSV file; it is overwritten.
    header: Passed through to ``DataFrame.to_csv``; ``True`` writes the
      column names as the first line.
  """
  # The DataFrame index is never written, only the data columns.
  pd.DataFrame(data).to_csv(path_or_buf=filename, header=header, index=False)

### Lấy danh sách các nước trên thế giới cùng hệ số của nước đó

In [4]:
def scrape_country_rankings(country_rankings_url=COUNTRY_RANKINGS_URL, *, timeout=30):
  """Download the country ranking table and save it as ``country_rankings.csv``.

  Args:
    country_rankings_url: Rankings API endpoint to query.
    timeout: Seconds to wait for the server before giving up. Without it a
      stalled connection would block the notebook indefinitely.

  Returns:
    None. On a network error or a non-200 response a message is printed and
    no file is written.
  """
  try:
    response = requests.get(country_rankings_url, headers=headers, timeout=timeout)
  except requests.RequestException:
    # Treat connection problems like any other failed fetch.
    print('Failed to fetch country rankings page')
    return
  if response.status_code != 200:
    print('Failed to fetch country rankings page')
    return

  countries = []
  # The payload is already a list of dicts, so walk it directly.
  for entry in response.json()['rankings']:
    team = entry['team']
    countries.append({
      'rank': team['ranking'],
      'name': team['name'],
      'name_code': team['nameCode'],
      'points': entry.get('points'),
      'slug': team['slug'],
    })

  # Explicit columns keep the header line even when the list is empty, so the
  # later stages can still load the file with pd.read_csv.
  columns = ['rank', 'name', 'name_code', 'points', 'slug']
  create_csv_file(pd.DataFrame(countries, columns=columns), 'country_rankings.csv')

### Lấy id các nước trong danh sách phân loại các nước

In [5]:
def scrape_country_category_ids(country_category_ids_url=COUNTRY_CATEGORY_IDS_URL, *, timeout=30):
  """Add a ``category_id`` column to ``country_rankings.csv``.

  Every category returned by the API whose ``name`` equals a country name in
  the CSV contributes its ``id``. Countries without a match keep an empty
  value.

  Args:
    country_category_ids_url: Categories API endpoint to query.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    None. On a network error or a non-200 response a message is printed and
    the CSV is left untouched.
  """
  try:
    response = requests.get(country_category_ids_url, headers=headers, timeout=timeout)
  except requests.RequestException:
    print('Failed to fetch country category page')
    return
  if response.status_code != 200:
    print('Failed to fetch country category page')
    return

  categories = response.json()['categories']

  country_rankings = pd.read_csv('country_rankings.csv')
  # Create the column up front: the next stage reads it for every row and
  # would fail with KeyError if no category matched at all.
  if 'category_id' not in country_rankings.columns:
    country_rankings['category_id'] = float('nan')

  # Set membership is O(1); the name list would be rescanned per category.
  known_names = set(country_rankings['name'])

  for category in categories:
    name = category.get('name')
    if name in known_names:
      country_rankings.loc[country_rankings['name'] == name, 'category_id'] = category['id']

  create_csv_file(country_rankings, 'country_rankings.csv')

### Lấy liên kết đến mùa giải hiện tại của giải đấu lớn nhất của các nước

In [6]:
def scrape_tournament_country(tournament_country_url=TOURNAMENT_COUNTRY_URL, *, timeout=30):
  """Attach one tournament (name, slug, id) to each country in the CSV.

  For every row of ``country_rankings.csv`` that has a ``category_id`` the
  country's tournament groups are fetched, and the first tournament of a
  group is stored in the ``tournament_name``, ``tournament_slug`` and
  ``tournament_id`` columns. The file is rewritten once at the end.

  Args:
    tournament_country_url: URL template with a ``{country_category_id}``
      placeholder.
    timeout: Seconds to wait for each request before giving up.

  Returns:
    None. Countries whose request fails are reported and skipped.
  """
  countries = pd.read_csv('country_rankings.csv')
  for _, row in countries.iterrows():
    # Series.get tolerates a CSV that has no category_id column at all.
    category_id = row.get('category_id')
    if pd.isnull(category_id):
      continue
    # IDs come back from the CSV as floats (the column holds NaN), so strip
    # the ".0" before building the URL.
    url = tournament_country_url.format(country_category_id=str(int(category_id)))

    try:
      response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
      # One unreachable country must not discard the progress made so far.
      print('Failed to fetch tournament country page')
      continue
    if response.status_code != 200:
      print('Failed to fetch tournament country page')
      continue

    same_country = countries['name'] == row['name']  # computed once per country
    # NOTE(review): when several groups are returned the last one wins, as in
    # the previous implementation - confirm that this is the intended pick.
    for group in response.json().get('groups') or []:
      tournaments = group.get('uniqueTournaments')
      if not tournaments:
        # An empty group has no first tournament to record.
        continue
      first = tournaments[0]
      countries.loc[same_country, 'tournament_name'] = first['name']
      countries.loc[same_country, 'tournament_slug'] = first['slug']
      countries.loc[same_country, 'tournament_id'] = first['id']

  create_csv_file(countries, 'country_rankings.csv')

### Lấy id của mùa giải hiện tại của các giải đấu

In [7]:
def scrape_tournament_season_id(tournament_season_id_url=TOURNAMENT_SEASON_ID_URL, *, timeout=30):
  """Store a season id for every tournament listed in the CSV.

  For each row of ``country_rankings.csv`` that has a ``tournament_id`` the
  tournament's featured events are fetched, and the season id of the first
  event is written to the ``tournament_season_id`` column. The file is
  rewritten once at the end.

  Args:
    tournament_season_id_url: URL template with a ``{tournament_id}``
      placeholder.
    timeout: Seconds to wait for each request before giving up.

  Returns:
    None. Tournaments whose request fails are reported and skipped.
  """
  countries = pd.read_csv('country_rankings.csv')
  for _, row in countries.iterrows():
    # Series.get tolerates a CSV that has no tournament_id column at all.
    tournament_id = row.get('tournament_id')
    if pd.isnull(tournament_id):
      continue
    url = tournament_season_id_url.format(tournament_id=str(int(tournament_id)))

    try:
      response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
      print('Failed to fetch tournament season id page')
      continue
    if response.status_code != 200:
      print('Failed to fetch tournament season id page')
      continue

    events = response.json().get('featuredEvents') or []
    if not events:
      continue

    # Only the first featured event is consulted. It may lack a season
    # entirely, which previously surfaced as a TypeError on a NaN value.
    season = events[0].get('season')
    if isinstance(season, dict) and 'id' in season:
      countries.loc[countries['name'] == row['name'], 'tournament_season_id'] = season['id']

  create_csv_file(countries, 'country_rankings.csv')

### Lấy danh sách các Câu lạc bộ thuộc các giải đấu đã chọn

In [8]:
def scrape_club_tournament(club_tournament_url=CLUB_TOURNAMENT_URL, *, timeout=30):
  """Collect the clubs of every selected tournament into ``club_tournament.csv``.

  For each row of ``country_rankings.csv`` that has a tournament id, season id
  and tournament name, the standings are fetched and every row of the first
  standings table becomes one club record.

  Args:
    club_tournament_url: URL template with ``{tournament_id}`` and
      ``{tournament_season_id}`` placeholders.
    timeout: Seconds to wait for each request before giving up.

  Returns:
    None. Tournaments whose request fails are reported and skipped.
  """
  countries = pd.read_csv('country_rankings.csv')
  clubs = []
  for _, row in countries.iterrows():
    # Series.get tolerates columns that an earlier stage never created.
    tournament_id = row.get('tournament_id')
    tournament_season_id = row.get('tournament_season_id')
    tournament_name = row.get('tournament_name')
    if pd.isnull(tournament_id) or pd.isnull(tournament_season_id) or pd.isnull(tournament_name):
      continue

    url = club_tournament_url.format(
      tournament_id=str(int(tournament_id)),
      tournament_season_id=str(int(tournament_season_id)),
    )
    try:
      response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
      # One unreachable tournament must not discard the clubs gathered so far.
      print('Failed to fetch club tournament page')
      continue
    if response.status_code != 200:
      print('Failed to fetch club tournament page')
      continue

    standings = response.json().get('standings') or []
    if not standings:
      # Nothing to read; previously this raised KeyError on 'rows'.
      continue

    # Only the first standings table is used.
    for entry in standings[0].get('rows') or []:
      team = entry['team']
      clubs.append({
        'country': row['name'],
        'tournament': tournament_name,
        'name': team['name'],
        # Descriptive fields are optional; a gap should not abort the run.
        'short_name': team.get('shortName'),
        'name_code': team.get('nameCode'),
        'position': entry.get('position'),
        'slug': team['slug'],
        'id': team['id'],
      })

  # Explicit columns keep the header line even when no club was collected, so
  # the file stays loadable with pd.read_csv.
  columns = ['country', 'tournament', 'name', 'short_name', 'name_code', 'position', 'slug', 'id']
  create_csv_file(pd.DataFrame(clubs, columns=columns), 'club_tournament.csv')

### Lấy link của các cầu thủ

In [9]:
def scrape_player_links(team_base_url=TEAM_URL, *, timeout=30):
  """Collect player page links for every club into ``player_links.csv``.

  Each club in ``club_tournament.csv`` with a name, slug and id has its team
  page downloaded. The ``href`` of the first ``<a>`` inside every matching
  player ``<div>`` is recorded together with the club name.

  Args:
    team_base_url: URL template with ``{team_slug}`` and ``{team_id}``
      placeholders.
    timeout: Seconds to wait for each request before giving up.

  Returns:
    None. Clubs whose page cannot be fetched are reported and skipped.
  """
  link_players = []

  clubs = pd.read_csv('club_tournament.csv')

  for _, club in clubs.iterrows():
    if pd.isnull(club['slug']) or pd.isnull(club['id']) or pd.isnull(club['name']):
      continue
    # Normalise the id through int() so a float such as 42.0 does not end up
    # in the URL.
    url = team_base_url.format(team_slug=club['slug'], team_id=str(int(club['id'])))

    try:
      response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
      # One unreachable club must not discard the links gathered so far.
      print('Failed to fetch player links page')
      continue
    if response.status_code != 200:
      print('Failed to fetch player links page')
      continue
    soup = BeautifulSoup(response.content, 'html.parser')

    # NOTE(review): 'gDjnsl' looks like a generated CSS class name, so this
    # selector may stop matching when the site is redeployed - verify.
    for box in soup.find_all('div', class_='Box gDjnsl'):
      anchor = box.find('a')
      if anchor:
        link_players.append({
          'club': club['name'],
          'player_link': anchor.get('href')
        })

  # Explicit columns keep the header line even when no link was found.
  filename = 'player_links.csv'
  create_csv_file(pd.DataFrame(link_players, columns=['club', 'player_link']), filename)

### Main

In [10]:
# Run the whole pipeline. The order matters: every stage after the first
# reads a CSV file written by an earlier stage.
for pipeline_step in (
  scrape_country_rankings,      # writes country_rankings.csv
  scrape_country_category_ids,  # adds category_id
  scrape_tournament_country,    # adds tournament_name / _slug / _id
  scrape_tournament_season_id,  # adds tournament_season_id
  scrape_club_tournament,       # writes club_tournament.csv
  scrape_player_links,          # writes player_links.csv
):
  pipeline_step()

Failed to fetch club tournament page
Failed to fetch club tournament page
Failed to fetch club tournament page
Failed to fetch player links page
Failed to fetch player links page
