## Scraping the data

In [2]:
import pandas as pd
import time
import json

In [4]:
def generate_seasons(start_year, end_year):
    """Build the list of season labels to scrape.

    Each label has the form ``"<year>-<year + 1>"``; one label is produced
    for every year from ``start_year`` through ``end_year`` inclusive.
    An empty list is returned when ``start_year`` is greater than ``end_year``.
    """
    return [
        f"{first_year}-{first_year + 1}"
        for first_year in range(start_year, end_year + 1)
    ]

# First calendar year of the earliest and of the latest season to scrape.
start_year, end_year = 2014, 2023

# One "<year>-<year + 1>" label per season in the inclusive range above.
seasons = generate_seasons(start_year, end_year)
print(seasons)

['2014-2015', '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024']


In [5]:
def scrape_seasons(seasons):
    """Download the fixtures table of each season from fbref.com and save it as CSV.

    For every season label the table whose HTML id is ``sched_<season>_9_1``
    is read from the season's "Scores and Fixtures" page and written to
    ``./data/<first four chars>-<chars 7..8>.csv`` (for ``"2014-2015"`` that
    is ``./data/2014-15.csv``). The function pauses 30 seconds after every
    season, including the last one.
    """
    for season in seasons:
        page_url = ''.join((
            'https://fbref.com/en/comps/9/',
            season,
            '/schedule/',
            season,
            '-Premier-League-Scores-and-Fixtures',
        ))
        table_id = "sched_" + season + "_9_1"
        matching_tables = pd.read_html(page_url, attrs={"id": table_id})
        csv_path = f"./data/{season[:4]}-{season[7:9]}.csv"
        matching_tables[0].to_csv(csv_path)
        # Pause between requests before moving on to the next season.
        time.sleep(30)

# scrape_seasons(seasons)

In [None]:
def scrape_season_stats(url):
    """Read the ``matchlogs_for`` table from ``url``.

    Returns the first matching table as a DataFrame, or ``None`` when that
    table is empty or when reading fails for any reason (the failure is
    printed rather than raised).
    """
    try:
        tables = pd.read_html(url, attrs={"id": "matchlogs_for"})
        match_log = tables[0]
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None

    if match_log.empty:
        return None
    return match_log

def scrape_teams_stats(seasons, squad_id, team_name):
    """Scrape one team's shooting match logs for several seasons into one CSV.

    A match-log URL is built for every season, each page is fetched with
    ``scrape_season_stats`` (followed by a 30 second pause), and the
    successfully scraped tables are concatenated. The top level of the
    column index is dropped before the result is written to
    ``../data/shooting_data/<team_name>.csv``. When no season yields data a
    message is printed and nothing is written.
    """
    page_urls = [
        ''.join((
            'https://fbref.com/en/squads/',
            squad_id,
            '/',
            season,
            '/matchlogs/c9/shooting/',
            team_name,
            '-Match-Logs-Premier-League',
        ))
        for season in seasons
    ]

    season_frames = []
    for page_url in page_urls:
        season_frame = scrape_season_stats(page_url)
        # The pause happens whether or not the request succeeded.
        time.sleep(30)
        if season_frame is not None:
            season_frames.append(season_frame)

    if not season_frames:
        print(f"No valid data for team {team_name} in seasons {seasons}")
        return

    combined = pd.concat(season_frames, ignore_index=False)
    combined = combined.droplevel(level=0, axis=1)
    combined.to_csv('../data/shooting_data/' + team_name + '.csv')

def scrape_all_teams_stats(seasons, team_ids):
    """Run ``scrape_teams_stats`` for every entry of ``team_ids``.

    ``team_ids`` maps a team name (used in URLs and file names) to its squad
    id. The function pauses 30 seconds after each team and an additional
    120 seconds once, after the fourth team.
    """
    for position, (team_name, squad_id) in enumerate(team_ids.items()):
        scrape_teams_stats(seasons, squad_id, team_name)
        time.sleep(30)
        # Extra one-off pause after the fourth team (zero-based position 3).
        if position == 3:
            time.sleep(120)

# Load the team mapping from disk. The context manager closes the file handle
# as soon as parsing finishes; the previous json.load(open(...)) left it open.
# NOTE(review): presumably maps team name -> squad id, as consumed by
# scrape_all_teams_stats in the commented-out call below -- confirm.
with open('../encoders/team_ids.json', encoding='utf-8') as team_ids_file:
    team_ids = json.load(team_ids_file)
# team_ids.items()
# scrape_all_teams_stats(seasons, team_ids)

In [None]:
# Load the training-team mapping. The context manager closes the file handle
# as soon as parsing finishes; the previous json.load(open(...)) left it open.
with open('../encoders/training_teams.json', encoding='utf-8') as training_teams_file:
    teams = json.load(training_teams_file)

def rolling_stats(df, team_name):
    """Add rolling averages and home/away labels to one team's match log.

    ``df`` is modified in place and also returned. Rows without a ``Date``
    are dropped. For each of GF, GA, Sh, SoT, PK and PKatt a
    ``<stat>_rolling`` column holds the mean of the three preceding rows
    (the current row is excluded, so the first three rows are NaN).
    ``HomeTeam`` / ``AwayTeam`` are filled from ``team_name`` and
    ``Opponent`` according to ``Venue``, and ``Date`` is parsed as
    ``%Y-%m-%d``.

    When ``Venue`` contains both 'Home' and 'Away', the rolling values are
    also copied into ``<stat>_rolling_h`` on home rows and
    ``<stat>_rolling_a`` on away rows. Otherwise an error message is printed
    and the frame is returned without those columns.
    """
    df.dropna(subset=['Date'], inplace=True)

    stat_cols = ["GF", "GA", "Sh", "SoT", "PK", "PKatt"]
    avg_cols = [stat + "_rolling" for stat in stat_cols]

    # closed='left' keeps the current match out of its own average.
    df[avg_cols] = df[stat_cols].rolling(3, closed='left').mean()

    is_home = df['Venue'] == 'Home'
    is_away = df['Venue'] == 'Away'

    df.loc[is_home, 'HomeTeam'] = team_name
    df.loc[is_home, 'AwayTeam'] = df['Opponent']
    df.loc[is_away, 'HomeTeam'] = df['Opponent']
    df.loc[is_away, 'AwayTeam'] = team_name
    df['Date'] = pd.to_datetime(df['Date'], format="%Y-%m-%d")

    # Both venue values must occur at least once before splitting by venue.
    if not (is_home.any() and is_away.any()):
        print("Error: 'Venue' column does not contain expected values.")
        return df

    for suffix, venue_rows in (("_h", is_home), ("_a", is_away)):
        venue_cols = [avg_col + suffix for avg_col in avg_cols]
        df.loc[venue_rows, venue_cols] = df.loc[venue_rows, avg_cols].values

    return df

def merge_rolling_stats(teams):
    """Combine every team's rolling-stat match log into one row per fixture.

    ``teams`` is iterated by key; each key names the CSV file
    ``shooting_data/<key>.csv`` and the corresponding value is the team name
    handed to ``rolling_stats``. The per-team frames are stacked and then
    grouped by ``Date``, ``HomeTeam`` and ``AwayTeam``, taking the first
    value of each column within a group.
    """
    per_team_frames = [
        rolling_stats(pd.read_csv('shooting_data/'+ team + '.csv'), teams[team])
        for team in teams
    ]

    stacked = pd.concat(per_team_frames, ignore_index=False)
    fixture_keys = ['Date', 'HomeTeam', 'AwayTeam']
    return stacked.groupby(fixture_keys, as_index=False).first()

# merge_rolling_stats(teams)

In [8]:
def scrape_season_stats(url):
    """Fetch the table with HTML id ``matchlogs_for`` from ``url``.

    Returns the table as a DataFrame. ``None`` is returned when the table
    has no rows, or when anything goes wrong while reading it, in which case
    the error is printed instead of being raised.
    """
    try:
        scraped = pd.read_html(url, attrs={"id": "matchlogs_for"})[0]
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")
        return None
    else:
        return None if scraped.empty else scraped

def scrape_teams_stats(seasons, squad_id, team_name):
    """Scrape a team's shooting match logs and export them to the 2024 data folder.

    One URL per season is requested through ``scrape_season_stats`` with a
    10 second pause after each request. Seasons that return data are
    concatenated, the top level of the column index is dropped, and the
    result is saved to ``../data/shooting_data_2024/<team_name>.csv``. If no
    season returns data, a message is printed and no file is written.
    """
    page_urls = [
        ''.join((
            'https://fbref.com/en/squads/',
            squad_id,
            '/',
            season,
            '/matchlogs/c9/shooting/',
            team_name,
            '-Match-Logs-Premier-League',
        ))
        for season in seasons
    ]

    collected = []
    for page_url in page_urls:
        season_table = scrape_season_stats(page_url)
        # Pause after every request, successful or not.
        time.sleep(10)
        if season_table is None:
            continue
        collected.append(season_table)

    if not collected:
        print(f"No valid data for team {team_name} in seasons {seasons}")
        return

    merged = pd.concat(collected, ignore_index=False).droplevel(level=0, axis=1)
    merged.to_csv('../data/shooting_data_2024/' + team_name + '.csv')
    print(f"Exported {team_name} to /data/shooting_data_2024/{team_name}.csv")

def scrape_all_teams_stats(seasons, team_ids):
    """Scrape shooting stats for every team in ``team_ids``.

    ``team_ids`` maps a team name to its squad id; each pair is passed to
    ``scrape_teams_stats``. A single extra 10 second pause is taken after
    the fourth team.
    """
    for position, (team_name, squad_id) in enumerate(team_ids.items()):
        scrape_teams_stats(seasons, squad_id, team_name)
        # One-off pause after the fourth team (zero-based position 3).
        if position == 3:
            time.sleep(10)

# Team name -> squad id for the 2024-25 teams. The keys are hyphenated because
# scrape_teams_stats inserts them verbatim into the match-log URL and the
# output file name; the values fill the squad-id segment of that URL.
team_ids_2024_25 = {
    'Arsenal': '18bb7c10',
    'Aston-Villa': '8602292d',
    'Bournemouth': '4ba7cbea',
    'Brentford': 'cd051869',
    'Brighton-and-Hove-Albion': 'd07537b9',
    'Chelsea': 'cff3d9bb',
    'Crystal-Palace': '47c64c55',
    'Everton': 'd3fd31cc',
    'Fulham': 'fd962109',
    'Ipswich-Town': 'b74092de',
    'Leicester-City': 'a2d435b3',
    'Liverpool': '822bd0ba',
    'Manchester-City': 'b8fd03ef',
    'Manchester-United': '19538871',
    'Newcastle-United': 'b2b47a98',
    'Nottingham-Forest': 'e4a775cb',
    'Southampton': '33c895d4',
    'Tottenham-Hotspur': '361ca564',
    'West-Ham': '7c21e445',
    'Wolverhampton-Wanderers': '8cec06e1',
    }
# team_ids.items()
# Single-element season list: generate_seasons(2024, 2024) yields ['2024-2025'].
season_24 = generate_seasons(2024,2024)   
# scrape_all_teams_stats(season_24, team_ids_2024_25)

In [None]:
import requests
from bs4 import BeautifulSoup

# Step 1: Define the login URL and the target URL
login_url = "https://www.superbru.com/login"
target_url = "https://www.superbru.com/premierleague_predictor/global.php#tab=global-leaderboard"

# Upper bound, in seconds, on each HTTP call. Without a timeout requests
# waits indefinitely on a stalled connection.
request_timeout = 30

# Step 2: Start a session so cookies from the login page carry over to the POST
session = requests.Session()

# Step 3: Get the login page to fetch any CSRF tokens (if applicable)
# NOTE(review): the page is parsed into `soup`, but no token is read from it
# or added to the payload below -- confirm whether the site requires one.
login_page = session.get(login_url, timeout=request_timeout)
soup = BeautifulSoup(login_page.text, 'html.parser')

# Step 4: Prepare login credentials and form data
payload = {
    "username": "",
    "password": "",
}

# Step 5: Send POST request to login
response = session.post(login_url, data=payload, timeout=request_timeout)

# Check if login was successful
if response.status_code == 200 and "Welcome" in response.text:  # Adjust based on the website
    print("Login successful!")
else:
    print("Login failed!")
    # Stop here with a non-zero status. exit() is the interactive helper
    # installed by the site module and reports success (status 0).
    raise SystemExit(1)

# Step 6: Access the target page


Login successful!
None


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_top_points(username="", password=""):
    """Log in to Superbru in a Chrome browser and read points off the leaderboard table.

    Args:
        username: Text typed into the ``email-superbru`` field. Defaults to
            the empty string, matching the previously hard-coded value.
        password: Text typed into the ``password-superbru`` field. Defaults
            to the empty string.

    Returns:
        A ``(top_global_points, top_global_250_points)`` tuple of strings:
        the text of the last cell in the third table row and in the last
        table row, respectively.
        NOTE(review): which leaderboard ranks those rows correspond to is
        implied only by the variable names -- confirm against the page.

    Raises:
        IndexError: if the table has fewer than three rows or the selected
            rows contain no ``td`` cells.
        Selenium lookup errors propagate if the login form or the table
        cannot be found. The browser is closed in every case.
    """
    target_url = "https://www.superbru.com/premierleague_predictor/global.php#tab=global-leaderboard"

    # Step 1: Set up WebDriver
    driver = webdriver.Chrome()  # Or use another WebDriver
    try:
        driver.get(target_url)

        # Dismissing the cookie popup is best effort: failures are reported
        # and the flow continues.
        try:
            cookie_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "#qc-cmp2-ui > div.qc-cmp2-footer.qc-cmp2-footer-overlay.qc-cmp2-footer-scrolled > div > button.css-13brqst"))
            )
            cookie_button.click()
            print("Cookie popup dismissed.")
        except Exception as e:
            print("No cookie popup found or error occurred:", e)

        # Clicking the active tab's link is likewise best effort.
        try:
            anchor_tag = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "body > main > div > div > div > div > div > div > div > div.entry > div.tab-bar > ul > li.tab-control.active > a"))
            )
            anchor_tag.click()
            print("Clicked the anchor tag.")
        except Exception as e:
            print("Anchor tag not found or timeout:", e)

        # Step 2: Log in (if needed)
        driver.find_element(By.ID, "email-superbru").send_keys(username)
        driver.find_element(By.ID, "password-superbru").send_keys(password)
        driver.find_element(By.XPATH, "/html/body/main/div/div/div/div/div/div/div/div[3]/div[2]/div[1]/form/div[3]/input").submit()

        # Step 3: Wait for the table to load and scrape it
        time.sleep(1)
        table = driver.find_element(By.XPATH, "/html/body/main/div/div[2]/div/div[3]/div[2]/div/div[2]/table")
        html = table.get_attribute("outerHTML")
    finally:
        # Always release the browser, even when a lookup above raised.
        driver.quit()

    # Use BeautifulSoup to parse the table. Only <td> cells are collected, so
    # a row made solely of <th> cells becomes an empty list.
    soup = BeautifulSoup(html, "html.parser")
    rows = [[cell.text for cell in row.find_all("td")] for row in soup.find_all("tr")]
    print(rows)
    top_global_points = rows[2][-1]
    top_global_250_points = rows[-1][-1]
    print(top_global_points, top_global_250_points)

    return top_global_points, top_global_250_points