## DATA COLLECTION 1
From 'https://www.whoscored.com/Statistics'

Imports

In [None]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Loading the page and dismissing the cookie-consent pop-up

In [None]:
# Start a Chrome session and open the WhoScored statistics page.
driver = webdriver.Chrome()
# NOTE(review): implicit waits are combined with the explicit WebDriverWait
# below. Selenium's documentation warns that mixing the two can lead to
# unpredictable wait times; the values are kept here so the session
# configuration seen by the following cells stays the same.
driver.implicitly_wait(3)
driver.get("https://www.whoscored.com/Statistics")
wait = WebDriverWait(driver, 5)

# Dismiss the cookie-consent dialog when it is shown. A dialog that never
# appears only makes the wait time out, so scraping continues either way.
try:
    accept_cookies_button = wait.until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '#qc-cmp2-ui > div.qc-cmp2-footer.qc-cmp2-footer-overlay.qc-cmp2-footer-scrolled > div > button.css-1wc0q5e > span')))
    print("Cookies button found.")
    accept_cookies_button.click()
    # Wait for the dialog that contains the clicked button (#qc-cmp2-ui) to
    # disappear, so it cannot cover the controls clicked later on.
    wait.until(EC.invisibility_of_element((By.CSS_SELECTOR, '#qc-cmp2-ui')))
    print("Cookies pop-up clicked")
except TimeoutException:
    print("Cookies pop-up not found or not dismissed in time; continuing.")

# Raise the implicit wait only after the invisibility check above: a lookup
# for an element that has been removed would otherwise block for the full
# implicit-wait duration before the check can succeed.
driver.implicitly_wait(10)

# The page counts as loaded once the first statistics tab link is present.
wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(1) > a'))
)

Scrolling helper, used before each click on a "next page" button

In [None]:
def scroll_to_percentage(percentage, pause=3):
    """Scroll the browser window to a fraction of the page height.

    Operates on the module-level ``driver``.

    Args:
        percentage: Fraction of ``document.body.scrollHeight`` to scroll to,
            e.g. ``0.3`` for 30 % of the page height.
        pause: Seconds to sleep after scrolling so that content has time to
            load. Defaults to 3, the delay that used to be hard-coded.
    """
    total_height = driver.execute_script("return document.body.scrollHeight")
    target_y = total_height * percentage
    # Hand the offset over as a script argument rather than formatting it
    # into the JavaScript source.
    driver.execute_script("window.scrollTo(0, arguments[0]);", target_y)
    time.sleep(pause)

Method for extracting team data on the summary page

In [None]:
def extract_data():
    """Scrape the summary-table rows currently shown on the page.

    Looks up rows 1-20 of ``#top-team-stats-summary-content`` through the
    module-level ``wait``. A row is recorded only when all six of its cells
    were found and read, so every column has the same length. A row with a
    missing cell is skipped and the failing selector is printed.

    Returns:
        pandas.DataFrame with the columns 'Teams', 'League', 'Goals',
        'Shots PG', 'Possesion %' and 'Pass Success%'; every value is the
        cell text exactly as shown on the page.
    """
    # Output column -> cell selector, relative to one table row.
    cell_selectors = {
        'Teams': 'td.col12-lg-2.col12-m-3.col12-s-4.col12-xs-5.grid-abs.overflow-text > a',
        'League': 'td:nth-child(2) > a',
        'Goals': 'td.goal',
        'Shots PG': 'td.shotsPerGame',
        'Possesion %': 'td.possession',
        'Pass Success%': 'td.passSuccess',
    }
    columns = {name: [] for name in cell_selectors}

    for row_number in range(1, 21):
        row_selector = f'#top-team-stats-summary-content > tr:nth-child({row_number})'
        selector = row_selector
        try:
            # Read the whole row before storing anything, so a failure
            # half-way through cannot leave the columns misaligned.
            row_values = {}
            for name, cell_selector in cell_selectors.items():
                selector = f'{row_selector} > {cell_selector}'
                cell = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
                row_values[name] = cell.text
        except Exception as e:
            # Best effort: report the selector that actually failed and move
            # on to the next row.
            print(f"No se pudo encontrar el elemento con el selector {selector}: {e}")
            continue

        for name, value in row_values.items():
            columns[name].append(value)

    return pd.DataFrame(columns)

For loop for extracting each page of team data

In [None]:
# Scrape five pages of the summary table. Page 1 is already displayed; each
# later page is reached by clicking the "next" control.
summary_pages = []

for page_number in range(1, 6):
    if page_number > 1:
        scroll_to_percentage(0.3)
        try:
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#next')))
            next_button.click()
            # NOTE(review): this tab link was already present before the
            # click, so the wait presumably does not detect the table
            # refresh; the sleep below is what gives the new rows time to
            # render.
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(1) > a')))
            time.sleep(2)
        except Exception as e:
            # Stop paging but keep the pages collected so far.
            print(f"No se pudo hacer clic en el botón 'Next' o no se cargó la página: {e}")
            break

    df_page = extract_data()
    summary_pages.append(df_page)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
df_summary = pd.concat(summary_pages, ignore_index=True) if summary_pages else pd.DataFrame()

Fixing team names

In [None]:
# Strip the leading "<digits>. " prefix (presumably the table rank) from
# every team name.
df_summary['Teams'] = df_summary['Teams'].str.replace(
    pat=r'^\d+\.\s+',
    repl='',
    regex=True,
)

Switching page to offensive stats

In [None]:
# Open the offensive statistics view: wait until the third entry of the
# stats tab list is clickable, then click it.
offensive_button = wait.until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(3) > a')
    )
)
offensive_button.click()

Method for extracting offensive data for each team

In [None]:
def extract_offense():
    """Scrape the offensive-table rows currently shown on the page.

    All lookups go through the module-level ``wait`` and are scoped to
    ``#statistics-team-table-offensive``.

    Returns:
        pandas.DataFrame with the columns 'Teams', 'Links', 'League',
        'Shots On Target PG', 'Dribbles won PG' and 'Fouled PG'. 'Links'
        holds the ``href`` of each team link; the other values are cell text.

    Raises:
        ValueError: raised by pandas when the located columns differ in
            length.
    """
    table = '#statistics-team-table-offensive'
    body = f'{table} #top-team-stats-summary-content'

    def find_all(selector):
        # Wait until at least one matching element is present; return all.
        return wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, selector)))

    def texts(selector):
        return [elem.text for elem in find_all(selector)]

    # Read name and href from the same elements so both columns always come
    # from one snapshot of the table.
    team_links = find_all(f'{table} .team-link')
    links = [elem.get_attribute('href') for elem in team_links]
    teams = [elem.text for elem in team_links]

    df_offensive = pd.DataFrame({
        'Teams': teams,
        'Links': links,
        'League': texts(f'{table} .tournament-link'),
        'Shots On Target PG': texts(f'{body} .shotOnTargetPerGame'),
        'Dribbles won PG': texts(f'{body} .dribbleWonPerGame'),
        'Fouled PG': texts(f'{body} .foulGivenPerGame'),
    })

    return df_offensive

For loop for extracting each page of offensive team data

In [None]:
# Scrape five pages of the offensive table. Page 1 is already displayed;
# each later page is reached through the table's own "next" control.
offensive_pages = []

for page_number in range(1, 6):
    if page_number > 1:
        scroll_to_percentage(0.3)
        try:
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#statistics-team-paging-offensive #next')))
            next_button.click()
            # NOTE(review): this tab link was already present before the
            # click, so the wait presumably does not detect the table
            # refresh; the sleep below is what gives the new rows time to
            # render.
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(1) > a')))
            time.sleep(2)
        except Exception as e:
            # Stop paging but keep the pages collected so far.
            print(f"Could not click the 'Next' button or the page did not load: {e}")
            break

    # Extract data from the current page
    df_page_off = extract_offense()
    offensive_pages.append(df_page_off)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
df_offensive = pd.concat(offensive_pages, ignore_index=True) if offensive_pages else pd.DataFrame()

df_offensive

Fixing team names

In [None]:
# Strip the leading "<digits>. " prefix (presumably the table rank) from
# every team name.
df_offensive['Teams'] = df_offensive['Teams'].str.replace(
    pat=r'^\d+\.\s+',
    repl='',
    regex=True,
)

Switching page to defensive stats

In [None]:
# Open the defensive statistics view: wait until the second entry of the
# stats tab list is clickable, then click it.
defensive_button = wait.until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(2) > a')
    )
)
defensive_button.click()

Method for extracting defensive team data

In [None]:
def extract_defensive():
    """Scrape the defensive-table rows currently shown on the page.

    Every column is located through the module-level ``wait`` and read as
    cell text.

    Returns:
        pandas.DataFrame with the columns 'Teams', 'League', 'Tackles',
        'Shots Allowed PG', 'Interceptions PG', 'Offsides PG' and 'Fouls PG'.
    """
    def cell_texts(css):
        # Wait until at least one element matches, then collect the text of all.
        located = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, css)))
        return [cell.text for cell in located]

    conceded = cell_texts('#statistics-team-table-defensive #top-team-stats-summary-content .shotsConcededPerGame')
    tackle_values = cell_texts('#statistics-team-table-defensive #top-team-stats-summary-content .tacklePerGame')
    team_names = cell_texts('#statistics-team-table-defensive .team-link')
    league_names = cell_texts('#statistics-team-table-defensive .tournament-link')
    offside_values = cell_texts('#statistics-team-table-defensive #top-team-stats-summary-content .offsideGivenPerGame')
    foul_values = cell_texts('#statistics-team-table-defensive #top-team-stats-summary-content  .foulsPerGame')
    interception_values = cell_texts('#statistics-team-table-defensive #top-team-stats-summary-content  .interceptionPerGame')

    frame_columns = {
        'Teams': team_names,
        'League': league_names,
        'Tackles': tackle_values,
        'Shots Allowed PG': conceded,
        'Interceptions PG': interception_values,
        'Offsides PG': offside_values,
        'Fouls PG': foul_values,
    }
    return pd.DataFrame(frame_columns)

For loop for extracting each page of defensive team data

In [None]:
# Scrape five pages of the defensive table. Page 1 is already displayed;
# each later page is reached through the table's own "next" control.
defensive_pages = []

for page_number in range(1, 6):
    if page_number > 1:
        scroll_to_percentage(0.3)
        try:
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#statistics-team-paging-defensive #next')))
            next_button.click()
            # NOTE(review): this tab link was already present before the
            # click, so the wait presumably does not detect the table
            # refresh; the sleep below is what gives the new rows time to
            # render.
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(1) > a')))
            time.sleep(2)
        except Exception as e:
            # Stop paging but keep the pages collected so far.
            print(f"Could not click the 'Next' button or the page did not load: {e}")
            break

    # Extract data from the current page
    df_page_def = extract_defensive()
    defensive_pages.append(df_page_def)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
df_defensive = pd.concat(defensive_pages, ignore_index=True) if defensive_pages else pd.DataFrame()

Fixing team names

In [None]:
# Strip the leading "<digits>. " prefix (presumably the table rank) from
# every team name.
df_defensive['Teams'] = df_defensive['Teams'].str.replace(
    pat=r'^\d+\.\s+',
    repl='',
    regex=True,
)

Switching page to expected goals

In [None]:
# Open the expected-goals view: wait until the fourth entry of the stats tab
# list is clickable, then click it.
xg_button = wait.until(
    EC.element_to_be_clickable(
        (By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(4) > a')
    )
)
xg_button.click()

Method for extracting expected goals stats

In [None]:
def extract_XG():
    """Scrape the expected-goals table rows currently shown on the page.

    Every column is located through the module-level ``wait`` and read as
    cell text.

    Returns:
        pandas.DataFrame with the columns 'Teams', 'Leagues',
        'Expected Goals', 'Diference between Exp Goals and Goals' and
        'Expected Goals Per Shot'.
    """
    def cell_texts(css):
        # Wait until at least one element matches, then collect the text of all.
        located = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, css)))
        return [cell.text for cell in located]

    team_names = cell_texts('#statistics-team-table-xg .team-link')
    league_names = cell_texts('#statistics-team-table-xg .tournament-link')
    xg_values = cell_texts('#statistics-team-table-xg #top-team-stats-summary-content .xG')
    xg_diff_values = cell_texts('#statistics-team-table-xg #top-team-stats-summary-content .xGDiff')
    xg_shot_values = cell_texts('#statistics-team-table-xg #top-team-stats-summary-content .xGPerShot')

    # NOTE(review): this frame names its league column 'Leagues', while the
    # other extractors in this notebook use 'League' - confirm before merging.
    frame_columns = {
        'Teams': team_names,
        'Leagues': league_names,
        'Expected Goals': xg_values,
        'Diference between Exp Goals and Goals': xg_diff_values,
        'Expected Goals Per Shot': xg_shot_values,
    }
    return pd.DataFrame(frame_columns)

Collecting every page of expected goals data for the teams

In [None]:
# Scrape five pages of the expected-goals table. Page 1 is already
# displayed; each later page is reached through the table's own "next"
# control.
xg_pages = []

for page_number in range(1, 6):
    if page_number > 1:
        scroll_to_percentage(0.3)
        try:
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#statistics-team-paging-xg #next')))
            next_button.click()
            time.sleep(3)
            # NOTE(review): this tab link was already present before the
            # click, so the wait presumably does not detect the table
            # refresh; the sleeps around it are what give the new rows time
            # to render.
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#top-team-stats-options > li:nth-child(1) > a')))
            time.sleep(3)
        except Exception as e:
            # Stop paging but keep the pages collected so far.
            print(f"Could not click the 'Next' button or the page did not load: {e}")
            break

    # Extract data from the current page
    df_page_xg = extract_XG()
    xg_pages.append(df_page_xg)

# Concatenate once at the end instead of re-copying the growing frame on
# every iteration.
df_xg = pd.concat(xg_pages, ignore_index=True) if xg_pages else pd.DataFrame()

df_xg

In [None]:
# Strip the leading "<digits>. " prefix (presumably the table rank) from
# every team name.
df_xg['Teams'] = df_xg['Teams'].str.replace(
    pat=r'^\d+\.\s+',
    repl='',
    regex=True,
)