In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import json

In [2]:
# Chrome options: hide the usual automation markers and send a desktop
# user-agent string.
chrome_options = Options()
chrome_options.add_argument("--disable-blink-features=AutomationControlled")
chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")

# Scrape parameters, kept together so they can be tuned in one place.
PAGE_SIZE = 20   # rows requested per API call (the `limit` query parameter)
MAX_PAGES = 29   # upper bound on pages fetched per statistics group
STATS_URL = (
    "https://www.sofascore.com/api/v1/unique-tournament/17/season/52186/statistics"
    "?limit={limit}&order=-rating&offset={offset}&accumulation=total&group={group}"
)

# One output file, "<group>.json", is written per entry in this list.
groups = ['summary', 'attack', 'defence', 'passing', 'goalkeeper']

driver = webdriver.Chrome(options=chrome_options)

# try/finally guarantees the browser is closed even if a navigation error, a
# file-write error or a kernel interrupt aborts the scrape part-way through.
try:
    # Load the home page before calling the API.
    # NOTE(review): presumably this establishes cookies/session state that the
    # API endpoint expects -- confirm it is actually required.
    driver.get("https://www.sofascore.com/")
    time.sleep(5)

    for group in groups:
        all_results = []
        print(f"Scraping group: {group}")

        # Walk the paginated endpoint, at most MAX_PAGES pages of PAGE_SIZE rows.
        for page in range(1, MAX_PAGES + 1):
            offset = (page - 1) * PAGE_SIZE
            url = STATS_URL.format(limit=PAGE_SIZE, offset=offset, group=group)

            driver.get(url)
            time.sleep(3)

            # The JSON payload is read from the <pre> element of the rendered page.
            try:
                json_content = driver.find_element("tag name", 'pre').text
                data = json.loads(json_content)
                page_results = data['results']
                all_results.extend(page_results)
            except Exception as e:
                # Best effort: report, keep the page for debugging, and stop
                # this group; whatever was collected so far is still saved.
                print(f"Error on page {page}: {str(e)}")
                # The group is part of the file name so that a failure in a
                # later group does not overwrite an earlier group's dump.
                with open(f'error_{group}_page_{page}.html', 'w', encoding='utf-8') as f:
                    f.write(driver.page_source)
                break

            if not page_results:
                # An empty page means the listing is exhausted; requesting
                # further offsets would only return more empty pages.
                print(f"Page {page} returned no results; stopping.")
                break

            print(f"Page {page} scraped successfully.")

        with open(f"{group}.json", 'w', encoding='utf-8') as f:
            json.dump(all_results, f, ensure_ascii=False, indent=4)
        print(f"Saved {len(all_results)} entries to {group}.json\n")
finally:
    driver.quit()

Scraping group: summary
Page 1 scraped successfully.
Page 2 scraped successfully.
Page 3 scraped successfully.
Page 4 scraped successfully.
Page 5 scraped successfully.
Page 6 scraped successfully.
Page 7 scraped successfully.
Page 8 scraped successfully.
Page 9 scraped successfully.
Page 10 scraped successfully.
Page 11 scraped successfully.
Page 12 scraped successfully.
Page 13 scraped successfully.
Page 14 scraped successfully.
Page 15 scraped successfully.
Page 16 scraped successfully.
Page 17 scraped successfully.
Page 18 scraped successfully.
Page 19 scraped successfully.
Page 20 scraped successfully.
Page 21 scraped successfully.
Page 22 scraped successfully.
Page 23 scraped successfully.
Page 24 scraped successfully.
Page 25 scraped successfully.
Page 26 scraped successfully.
Page 27 scraped successfully.
Page 28 scraped successfully.
Page 29 scraped successfully.
Saved 570 entries to summary.json

Scraping group: attack
Page 1 scraped successfully.
Page 2 scraped successfully.

In [3]:
import csv
from collections import defaultdict

In [4]:
# Stat fields taken from each statistics group. The keys double as the base
# names of the "<group>.json" files that are read further down.
GROUP_STRUCTURE = {
    "summary": [
        "goals", "expectedGoals", "successfulDribbles", "tackles",
        "assists", "accuratePassesPercentage", "rating",
    ],
    "attack": [
        "goals", "expectedGoals", "bigChancesMissed", "successfulDribbles",
        "totalShots", "goalConversionPercentage", "rating",
    ],
    "defence": [
        "tackles", "interceptions", "clearances", "errorLeadToGoal", "rating",
    ],
    "passing": [
        "bigChancesCreated", "assists", "accuratePasses",
        "accuratePassesPercentage", "keyPasses", "rating",
    ],
    "goalkeeper": [
        "saves", "cleanSheet", "penaltySave", "savedShotsFromInsideTheBox",
        "runsOut", "rating",
    ],
}

# Every stat except 'rating', de-duplicated and kept in first-seen order
# (dict keys preserve insertion order, so fromkeys acts as an ordered set).
ordered_stats = list(dict.fromkeys(
    stat
    for group_stats in GROUP_STRUCTURE.values()
    for stat in group_stats
    if stat != 'rating'
))
seen_stats = set(ordered_stats)

# CSV column order: identity columns, then the stats, with 'rating' last.
fieldnames = ['player.name', 'player.id', 'team.name', *ordered_stats, 'rating']

# One merged record per player id, filled in from every group file.
players = defaultdict(dict)

# Process each group's JSON file in turn.
for group, wanted_stats in GROUP_STRUCTURE.items():
    try:
        with open(f"{group}.json", "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # A missing group file is skipped; the other groups are still merged.
        print(f"Warning: {group}.json not found")
        continue

    for entry in data:
        player_id = entry["player"]["id"]
        record = players[player_id]

        # Copy only the stats this group is responsible for, ignoring
        # absent/None values so they never overwrite a value already merged.
        for stat in wanted_stats:
            if entry.get(stat) is None:
                continue
            record[stat] = entry[stat]

        # Identity fields are refreshed from every file the player appears in.
        record["player.name"] = entry["player"]["name"]
        record["player.id"] = player_id
        record["team.name"] = entry["team"]["name"]

# Highest rating first; players without a rating are sorted as 0.
sorted_players = list(players.values())
sorted_players.sort(key=lambda record: record.get("rating", 0), reverse=True)

# Write the CSV, one row per player, with columns in `fieldnames` order.
with open("player_stats.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(
        {col: player.get(col) for col in fieldnames}
        for player in sorted_players
    )

print("CSV file created successfully: player_stats.csv")

CSV file created successfully: player_stats.csv


In [5]:
import pandas as pd

In [6]:
# Load the merged per-player table written by the CSV export step.
df = pd.read_csv('player_stats.csv')

# Derive a cleaned frame without touching `df`: a missing expectedGoals
# value is replaced with 0.
cleaned_df = df.assign(expectedGoals=df["expectedGoals"].fillna(0))

# Preview the first rows, then report the remaining NaN count per column.
print(cleaned_df.head())
print("total NaN:")
print(cleaned_df.isna().sum())

       player.name  player.id          team.name  goals  expectedGoals  \
0            Rodri     827606    Manchester City      8           4.07   
1   Arijanet Murić     888971            Burnley      0           0.00   
2  Kevin De Bruyne      70996    Manchester City      4           2.41   
3       Phil Foden     859765    Manchester City     19          10.34   
4  Bruno Fernandes     288205  Manchester United     10          10.01   

   successfulDribbles  tackles  assists  accuratePassesPercentage  \
0                  42       70        9                     92.46   
1                   0        1        0                     67.49   
2                  10       14       10                     83.83   
3                  47       31        8                     89.10   
4                  19       68        8                     79.38   

   bigChancesMissed  ...  errorLeadToGoal  bigChancesCreated  accuratePasses  \
0                 2  ...                1                  8