In [None]:
# # Comment out and Install necessary libraries 
# pip install pandas
# pip install selenium
# pip install scikit-learn
# pip install requests 
# pip install lxml 

# 1. Data Collection

In [20]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [2]:
import requests
from lxml import html
import pandas as pd

## Extract Ligue 1 League Data

In [61]:

def get_ligue1_data(seasons):
    base_url = 'https://www.ligue1.com/ranking?seasonId={season}&StatsActiveTab=0'
    
    all_data = []
    
    for season in seasons:
        season_url = base_url.format(season=season.replace("/", "-"))
        response = requests.get(season_url)
        
        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve data for Ligue 1 {season}. Status code: {response.status_code}")
            continue

        tree = html.fromstring(response.content)
        
        # XPath to extract the table rows
        rows = tree.xpath('//div[@class="classement-table-body"]/ul/li')
        
        if not rows:
            print(f"Failed to find table for Ligue 1 {season}.")
            continue

        for row in rows:
            position = row.xpath('.//div[contains(@class, "GeneralStats-item--position")]/text()')
            team = row.xpath('.//div[contains(@class, "GeneralStats-item--club")]//span[contains(@class, "desktop-item")]/text()')
            points = row.xpath('.//div[contains(@class, "GeneralStats-item--points")]/text()')
            played = row.xpath('.//div[contains(@class, "GeneralStats-item") and not(contains(@class, "GeneralStats-item--points"))][3]/text()')
            won = row.xpath('.//div[contains(@class, "GeneralStats-item RankPage-mobileHide")][1]/text()')
            drawn = row.xpath('.//div[contains(@class, "GeneralStats-item RankPage-mobileHide")][2]/text()')
            lost = row.xpath('.//div[contains(@class, "GeneralStats-item RankPage-mobileHide")][3]/text()')
            gf = row.xpath('.//div[contains(@class, "GeneralStats-item RankPage-mobileHide")][4]/text()')
            ga = row.xpath('.//div[contains(@class, "GeneralStats-item RankPage-mobileHide")][5]/text()')
            diff = row.xpath('.//div[@class="GeneralStats-item"][2]/text()')
            form_elements = row.xpath('.//div[contains(@class, "GeneralStats-item RankPage-mobileHide forme")]/span')
            
            # Use default values if elements are not found
            position = position[0].strip() if position else 'N/A'
            team = team[0].strip() if team else 'N/A'
            points = points[0].strip() if points else 'N/A'
            played = played[0].strip() if played else 'N/A'
            won = won[0].strip() if won else 'N/A'
            drawn = drawn[0].strip() if drawn else 'N/A'
            lost = lost[0].strip() if lost else 'N/A'
            gf = gf[0].strip() if gf else 'N/A'
            ga = ga[0].strip() if ga else 'N/A'
            diff = diff[0].strip() if diff else 'N/A'
            
            # Check for class attribute in form elements
            form = ''.join([span.attrib['class'].split(' ')[1] if 'class' in span.attrib else '' for span in form_elements]) if form_elements else 'N/A'

            all_data.append([season, position, team, points, played, won, drawn, lost, gf, ga, diff, form])
    
    headers = ['Season', 'Position', 'Team', 'Points', 'Played', 'Won', 'Drawn', 'Lost', 'GF', 'GA', 'Diff', 'Form']
    df = pd.DataFrame(all_data, columns=headers)
    csv_filename = 'Ligue1_all_seasons.csv'
    df.to_csv(csv_filename, index=False)
    print(f"Saved data for all seasons of Ligue 1 to {csv_filename}")

seasons = [f"{year}-{year+1}" for year in range(2003, 2024)]
get_ligue1_data(seasons)

Saved data for all seasons of Ligue 1 to Ligue1_all_seasons_1.csv


## Extract Bundesliga League Data

In [64]:
def get_bundesliga_data(seasons):
    base_url = 'https://www.bundesliga.com/en/bundesliga/table'
    
    all_data = []
    
    for season in seasons:
        season_url = f'{base_url}?seasonId={season.replace("/", "-")}'
        response = requests.get(season_url)
        
        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve data for Bundesliga {season}. Status code: {response.status_code}")
            continue

        tree = html.fromstring(response.content)
        
        # XPath to extract the table rows
        rows = tree.xpath('//tbody[@class="ng-star-inserted"]/tr')
        
        if not rows:
            print(f"Failed to find table for Bundesliga {season}.")
            continue

        for row in rows:
            # print(html.tostring(row, pretty_print=True).decode())  # Print the row's HTML content for debugging

            position = row.xpath('.//td[contains(@class, "rank")]/span/text()')
            team = row.xpath('.//td[contains(@class, "team")]/div/span[@class="d-none d-sm-inline-block"]/text()')
            points = row.xpath('.//td[contains(@class, "pts")]/span/text()')
            played = row.xpath('.//td[contains(@class, "matches")]/span/text()')
            won = row.xpath('.//td[contains(@class, "wins")]/span/text()')
            drawn = row.xpath('.//td[contains(@class, "draws")]/span/text()')
            lost = row.xpath('.//td[contains(@class, "losses")]/span/text()')
            goals = row.xpath('.//td[contains(@class, "goals")]/span/text()')
            diff = row.xpath('.//td[contains(@class, "difference")]/span/text()')
            
            # Split goals into GF and GA
            gf, ga = goals[0].split(':') if goals else ('N/A', 'N/A')

            # Use default values if elements are not found
            position = position[0].strip() if position else 'N/A'
            team = team[0].strip() if team else 'N/A'
            points = points[0].strip() if points else 'N/A'
            played = played[0].strip() if played else 'N/A'
            won = won[0].strip() if won else 'N/A'
            drawn = drawn[0].strip() if drawn else 'N/A'
            lost = lost[0].strip() if lost else 'N/A'
            gf = gf.strip() if gf else 'N/A'
            ga = ga.strip() if ga else 'N/A'
            diff = diff[0].strip() if diff else 'N/A'

            # print(f"Extracted Data - Position: {position}, Team: {team}, Points: {points}, Played: {played}, Won: {won}, Drawn: {drawn}, Lost: {lost}, GF: {gf}, GA: {ga}, Diff: {diff}")  # Debug print

            all_data.append([season, position, team, points, played, won, drawn, lost, gf, ga, diff])
    
    headers = ['Season', 'Position', 'Team', 'Points', 'Played', 'Won', 'Drawn', 'Lost', 'GF', 'GA', 'Diff']
    df = pd.DataFrame(all_data, columns=headers)
    csv_filename = 'bundesliga_data_2.csv'
    df.to_csv(csv_filename, index=False)
    print(f"Saved data for all seasons of Bundesliga to {csv_filename}")

seasons = [f"{year}/{year+1}" for year in range(2003, 2023)]
get_bundesliga_data(seasons)

Saved data for all seasons of Bundesliga to bundesliga_data_2.csv


# Extract Serie A League Data

In [None]:
import csv
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [1]:
# Setup Selenium WebDriver with Firefox options
firefox_options = Options()
firefox_options.binary_location = r"C:\Program Files\Mozilla Firefox\firefox.exe"  # Adjust the path to where Firefox is installed
firefox_options.add_argument("--headless")
firefox_options.add_argument("--disable-gpu")
firefox_options.add_argument("--no-sandbox")
firefox_options.add_argument("--disable-dev-shm-usage")

# Correct path to your geckodriver executable
gecko_driver_path = "./geckodriver.exe"

# Initialize WebDriver
service = Service(gecko_driver_path)
driver = webdriver.Firefox(service=service, options=firefox_options)

# List of seasons to scrape
seasons = [f"{year}-{str(year+1)[2:]}" for year in range(2002, 2024)]

# URL for the Serie A table
url = "https://www.legaseriea.it/en/serie-a/classifica"

# Initialize a list to store the extracted data
all_data = []

try:
    # Open the URL
    driver.get(url)
    
    for season in seasons:
        # Wait for the season dropdown to be available
        wait = WebDriverWait(driver, 10)
        season_select = wait.until(EC.presence_of_element_located((By.NAME, "season")))
        
        # Use JavaScript to set the value of the dropdown
        driver.execute_script("arguments[0].value = arguments[1];", season_select, season)
        
        # Trigger change event for the dropdown
        driver.execute_script("arguments[0].dispatchEvent(new Event('change'));", season_select)
        
        # Wait for the table to be reloaded
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "hm-tbody")))
        
        # Find the table body
        table = driver.find_element(By.CLASS_NAME, "hm-tbody")
        
        # Find all rows in the table
        rows = table.find_elements(By.TAG_NAME, "tr")
        
        # Extract headers manually since the headers do not load with the rows
        # headers = ['POS', 'Club', 'PTS', 'P', 'W', 'D', 'L', 'GF', 'GA', 'GD', 'Trend']
        headers = ["Position", "Team", "Points","Played", "Won", "Drawn", "Lost", "GF", "GA", "Diff", "Trend"]

        for row in rows:
            # Extract columns from each row
            columns = row.find_elements(By.TAG_NAME, "td")
            data = [column.text.strip() for column in columns if column.text.strip()]
            if data:
                data.append(season)  # Add the season to the data row
                all_data.append(data)
finally:
    # Close the WebDriver
    driver.quit()

# Save the data to a CSV file
csv_file = "serie_a_data_d.csv"
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Write the headers
    writer.writerow(headers + ['Season'])
    # Write the data
    writer.writerows(all_data)

print(f"Data has been successfully saved to {csv_file}")

Data has been successfully saved to serie_a_data_d.csv


# Extract Premier League Data

In [None]:
# Setup Selenium WebDriver with Firefox options
firefox_options = Options()
firefox_options.binary_location = r"C:\Program Files\Mozilla Firefox\firefox.exe"  # Adjust the path to where Firefox is installed
firefox_options.add_argument("--headless")
firefox_options.add_argument("--disable-gpu")
firefox_options.add_argument("--no-sandbox")
firefox_options.add_argument("--disable-dev-shm-usage")

# Correct path to your geckodriver executable
gecko_driver_path = "./geckodriver.exe"

# Initialize WebDriver
service = Service(gecko_driver_path)
driver = webdriver.Firefox(service=service, options=firefox_options)

# List of seasons to scrape
seasons = [
    "2023/24", "2022/23", "2021/22", "2020/21", "2019/20", "2018/19", "2017/18", "2016/17",
    "2015/16", "2014/15", "2013/14", "2012/13", "2011/12", "2010/11", "2009/10", "2008/09",
    "2007/08", "2006/07", "2005/06", "2004/05", "2003/04", "2002/03"
]

# URL for the Premier League table
url = "https://www.premierleague.com/tables"

# Initialize a list to store the extracted data
all_data = []

def open_dropdown(wait, retries=3):
    for _ in range(retries):
        try:
            dropdown_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "div#dd-compSeasons"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", dropdown_button)
            driver.execute_script("arguments[0].click();", dropdown_button)
            print("Dropdown opened")
            return True
        except Exception as e:
            print(f"Failed to open dropdown: {e}")
            time.sleep(2)
    return False

def select_season(wait, season):
    for _ in range(3):  # Try up to 3 times
        try:
            season_option = wait.until(
                EC.element_to_be_clickable((By.XPATH, f"//li[@data-option-name='{season}']"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", season_option)
            driver.execute_script("arguments[0].click();", season_option)
            print(f"Season {season} selected.")
            return True
        except Exception as e:
            print(f"Attempt to select season {season} failed: {e}")
            time.sleep(2)
    return False

try:
    driver.get(url)
    
    # Accept cookie consent if necessary
    try:
        accept_cookies_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler"))
        )
        accept_cookies_button.click()
        print("Cookie consent accepted.")
    except Exception as e:
        print("No cookie consent banner found or unable to click.")
    
    for season in seasons:
        print(f"Processing season: {season}")
        
        # Open the dropdown to select the season
        wait = WebDriverWait(driver, 20)
        if not open_dropdown(wait):
            print(f"Failed to open dropdown for season {season}. Skipping season.")
            continue

        # Wait for the season option to be visible and click it
        if not select_season(wait, season):
            print(f"Failed to select season {season} after multiple attempts.")
            continue

        # Wait for the table to load
        time.sleep(5)  # Adjust the sleep time as needed for the table to load

        # Locate the table
        try:
            table = wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, "table"))
            )
        except Exception as e:
            print(f"Failed to locate table for season {season}: {e}")
            continue

        # Extract rows
        try:
            rows = table.find_element(By.TAG_NAME, "tbody").find_elements(By.TAG_NAME, "tr")
        except Exception as e:
            print(f"Failed to extract rows for season {season}: {e}")
            continue

        for row in rows:
            columns = row.find_elements(By.TAG_NAME, "td")
            if len(columns) > 10:  # Ensure that there are enough columns
                data = [
                    columns[0].text.strip(),  # Position
                    columns[1].text.strip(),  # Club
                    columns[2].text.strip(),  # Played
                    columns[3].text.strip(),  # Won
                    columns[4].text.strip(),  # Drawn
                    columns[5].text.strip(),  # Lost
                    columns[6].text.strip(),  # GF
                    columns[7].text.strip(),  # GA
                    columns[8].text.strip(),  # GD
                    columns[9].text.strip()   # Points
                ]
                data.append(season)  # Add the season to the data row
                all_data.append(data)

        print(f"Data for season {season} extracted successfully.")

finally:
    # Close the WebDriver
    driver.quit()

# Save the data to a CSV file
csv_file = "premier_league_data.csv"
with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerow(["Position", "Club", "Played", "Won", "Drawn", "Lost", "GF", "GA", "GD", "Points", "Season"])
    writer.writerows(all_data)

print(f"Data saved to {csv_file}")

## Load Data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:
# Step 1: Load data from CSV files
premier_league_data = pd.read_csv('premier_league_data.csv')
ligue1_data = pd.read_csv('ligue1_data.csv')
bundesliga_data = pd.read_csv('bundesliga_data.csv')
serie_a_data = pd.read_csv('serie_a_data.csv')

In [5]:
print(premier_league_data.columns)
print(ligue1_data.columns)
print(bundesliga_data.columns)
print(serie_a_data.columns)

Index(['Position', 'Club', 'Played', 'Won', 'Drawn', 'Lost', 'GF', 'GA', 'GD',
       'Points', 'Season'],
      dtype='object')
Index(['Season', 'Position', 'Team', 'Points', 'Played', 'Won', 'Drawn',
       'Lost', 'GF', 'GA', 'Diff', 'Form'],
      dtype='object')
Index(['Season', 'Position', 'Team', 'Points', 'Played', 'Won', 'Drawn',
       'Lost', 'GF', 'GA', 'Diff'],
      dtype='object')
Index(['Position', 'Team', 'Points', 'Played', 'Won', 'Drawn', 'Lost', 'GF',
       'GA', 'Diff', 'Trend', 'Season'],
      dtype='object')


In [6]:
premier_league_data.isna().sum()

Position    104
Club        104
Played      104
Won         104
Drawn       104
Lost        104
GF          104
GA          104
GD          104
Points      104
Season        0
dtype: int64

In [7]:
premier_league_data.dropna(inplace=True)

In [8]:
ligue1_data.isna().sum()

Season      0
Position    0
Team        0
Points      0
Played      0
Won         0
Drawn       0
Lost        0
GF          0
GA          0
Diff        0
Form        0
dtype: int64

In [9]:
bundesliga_data.isna().sum()

Season      0
Position    0
Team        0
Points      0
Played      0
Won         0
Drawn       0
Lost        0
GF          0
GA          0
Diff        0
dtype: int64

In [10]:
serie_a_data.isna().sum()

Position    0
Team        0
Points      0
Played      0
Won         0
Drawn       0
Lost        0
GF          0
GA          0
Diff        0
Trend       0
Season      0
dtype: int64

# 2. Data Preprocessing

In [11]:
# Add a column to identify the league
premier_league_data['League'] = 'Premier League'
ligue1_data['League'] = 'Ligue 1'
bundesliga_data['League'] = 'Bundesliga'
serie_a_data['League'] = 'Serie A'

In [12]:
# Standardize column names and drop irrelevant columns
def standardize_columns(df):
    relevant_columns = ['Position', 'Team', 'Played', 'Won', 'Drawn', 'Lost', 'GF', 'GA', 'GD', 'Points', 'Season', 'League']
    df = df.rename(columns={
        'Club': 'Team',
        'Diff': 'GD'
    })
    return df[relevant_columns]

In [13]:
premier_league_data = standardize_columns(premier_league_data)
ligue1_data = standardize_columns(ligue1_data)
bundesliga_data = standardize_columns(bundesliga_data)
serie_a_data = standardize_columns(serie_a_data)

In [14]:
# Combine all data into a single DataFrame
all_data = pd.concat([premier_league_data, ligue1_data, bundesliga_data, serie_a_data], ignore_index=True)

In [15]:
# Save the combined data to a CSV file
all_data.to_csv('combined_league_data.csv', index=False)

In [16]:
# Load the combined data
all_data = pd.read_csv('combined_league_data.csv')

In [17]:
all_data.isna().sum()

Position    0
Team        0
Played      0
Won         0
Drawn       0
Lost        0
GF          0
GA          0
GD          0
Points      0
Season      0
League      0
dtype: int64

In [18]:
# Display the combined data structure
print(all_data.head())

   Position               Team  Played   Won  Drawn  Lost    GF    GA    GD  \
0       1.0    Manchester City    38.0  28.0    7.0   3.0  96.0  34.0  62.0   
1       2.0            Arsenal    38.0  28.0    5.0   5.0  91.0  29.0  62.0   
2       3.0          Liverpool    38.0  24.0   10.0   4.0  86.0  41.0  45.0   
3       4.0        Aston Villa    38.0  20.0    8.0  10.0  76.0  61.0  15.0   
4       5.0  Tottenham Hotspur    38.0  20.0    6.0  12.0  74.0  61.0  13.0   

   Points   Season          League  
0    91.0  2023/24  Premier League  
1    89.0  2023/24  Premier League  
2    82.0  2023/24  Premier League  
3    68.0  2023/24  Premier League  
4    66.0  2023/24  Premier League  


# 3. AI Model

In [19]:
# Function to train model and predict top 5 teams
def train_and_predict(data, league_name):
    league_data = data[data['League'] == league_name]
    X = league_data[['Points', 'GF', 'GA', 'GD']]
    y = league_data['Position']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Predict top 5 teams for the upcoming season using latest season's data
    latest_season_data = league_data[league_data['Season'] == league_data['Season'].max()]
    X_latest = latest_season_data[['Points', 'GF', 'GA', 'GD']]
    latest_season_data['Predicted Position'] = model.predict(X_latest)
    
    # Sort by predicted position and get top 5 teams
    top_5_teams = latest_season_data.sort_values(by='Predicted Position').head(5)
    
    return top_5_teams[['Team', 'Points', 'GF', 'GA', 'GD', 'Predicted Position']]

In [21]:
# Predict top 5 teams for each league
leagues = ['Premier League', 'Ligue 1', 'Bundesliga', 'Serie A']
predictions = {}

for league in leagues:
    predictions[league] = train_and_predict(all_data, league)

## Display Predictions

In [22]:
for league, top_5 in predictions.items():
    print(f"Top 5 teams for {league} in the upcoming season 2024-25:")
    print(top_5[['Team', 'Points', 'GF', 'GA', 'GD', 'Predicted Position']])
    print("\n")

Top 5 teams for Premier League in the upcoming season 2024-25:
                Team  Points    GF    GA    GD  Predicted Position
0    Manchester City    91.0  96.0  34.0  62.0                 1.0
1            Arsenal    89.0  91.0  29.0  62.0                 2.0
2          Liverpool    82.0  86.0  41.0  45.0                 3.0
3        Aston Villa    68.0  76.0  61.0  15.0                 5.0
4  Tottenham Hotspur    66.0  74.0  61.0  13.0                 5.0


Top 5 teams for Ligue 1 in the upcoming season 2024-25:
                    Team  Points    GF    GA    GD  Predicted Position
760  PARIS SAINT-GERMAIN    76.0  81.0  33.0  48.0                 1.0
761            AS MONACO    67.0  68.0  42.0  26.0                 2.0
762    STADE BRESTOIS 29    61.0  53.0  34.0  19.0                 3.0
763           LOSC LILLE    59.0  52.0  34.0  18.0                 4.0
765   OLYMPIQUE LYONNAIS    53.0  49.0  55.0  -6.0                 6.0


Top 5 teams for Bundesliga in the upcoming season