In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import io
from time import sleep
from functools import reduce

In [None]:
# Starting calendar year of the season this notebook processes; the code below
# uses it to build the league URL, the "Season" column value and the output file name.
season = 2023

In [None]:
# Read the season CSV (its first column becomes the index) into a one-element list of frames.
li = [pd.read_csv("../premier_league_2023.csv", index_col=0)]

# Stack the frames row-wise and renumber the rows from 0.
matches = pd.concat(li, axis=0, ignore_index=True)

# Show (rows, columns) of the loaded data.
matches.shape

In [None]:
def get_premier_league_url(season_begin_year: int, current_season_begin_year: int = 2023) -> str:
	"""Build the fbref.com Premier League stats URL for one season.

	Args:
		season_begin_year: Calendar year in which the wanted season starts.
		current_season_begin_year: Starting year of the season that is currently
			in progress. That season is served from the undated URL; earlier
			seasons use a "YYYY-YYYY" path. Defaults to 2023.

	Returns:
		The URL of the league stats page for the requested season.

	Raises:
		ValueError: If ``season_begin_year`` is later than ``current_season_begin_year``.
	"""
	default_url_prefix = "https://fbref.com/en/comps/9/"
	default_url_suffix = "Premier-League-Stats"

	if season_begin_year > current_season_begin_year:
		raise ValueError("Year " + str(season_begin_year) + " didn't happen yet!")
	if season_begin_year == current_season_begin_year:
		return default_url_prefix + default_url_suffix

	# Past seasons repeat the "YYYY-YYYY" span as a path segment and as a page-name prefix.
	season_span = f"{season_begin_year}-{season_begin_year + 1}"
	return f"{default_url_prefix}{season_span}/{season_span}-{default_url_suffix}"

In [None]:
def get_team_urls(premier_league_url: str):
	"""Download a league page and collect the squad-page URLs it links to.

	Args:
		premier_league_url: URL of the league stats page to request.

	Returns:
		A list of absolute ``https://fbref.com`` URLs, one for every link inside
		the first ``table.stats_table`` whose href contains "squad". Returns
		``None`` when the response status is not 200; in that case the status
		code and, if present, the ``Retry-After`` header value are printed.
	"""
	premier_league_data = requests.get(premier_league_url)

	if premier_league_data.status_code != 200:
		print(premier_league_data.status_code)
		# Retry-After is optional and may be an HTTP date instead of a number of
		# seconds, so read it with .get() and print it as received.
		retry_after = premier_league_data.headers.get('Retry-After')
		if retry_after is not None:
			print(retry_after)
		return None

	premier_league_soup = BeautifulSoup(premier_league_data.text)
	teams_table = premier_league_soup.select('table.stats_table')[0]

	# Anchors without an href yield None; skip them instead of failing on the "in" test.
	hrefs = (anchor.get("href") for anchor in teams_table.find_all('a'))
	team_urls = [f"https://fbref.com{href}" for href in hrefs if href and "squad" in href]

	return team_urls


In [None]:
def write_list_into_csv(all_matches: list, name: str):
	"""Concatenate match DataFrames and write the result to a CSV file.

	Column names are lower-cased on every frame before concatenating.
	``pd.concat`` aligns columns by exact label, so lower-casing only afterwards
	would turn e.g. "Date" and "date" into two separate, half-empty columns
	that both end up named "date".

	Args:
		all_matches: DataFrames to stack row-wise; column labels must be strings.
		name: Path of the CSV file to write. Missing values are written as "NaN".
	"""
	normalized_frames = [
		frame.rename(columns=lambda column: column.lower())
		for frame in all_matches
	]
	match_df = pd.concat(normalized_frames)
	match_df.to_csv(name, na_rep='NaN')

In [None]:
# Order by date, keep only rows that have a date, a result and a goals-for value,
# then discard every row whose date string contains "2024".
matches = (
	matches
	.sort_values("date")
	.dropna(subset=['date', 'result', 'gf'])
	.loc[lambda rows: ~rows['date'].str.contains('2024')]
)

# Start the list of frames to be written out with the filtered existing matches.
all_matches = [matches]

In [None]:
# Scrape schedule, shooting and goalkeeping match logs for the teams of the season
# and append one merged frame per team to all_matches.
team_urls = get_team_urls(get_premier_league_url(season))
if team_urls is None:
	# get_team_urls returns None when the league page does not answer with 200;
	# stop here with a clear message instead of a TypeError from the loop below.
	raise RuntimeError("Could not fetch the team URLs for season " + str(season))

season_label = str(season) + "-" + str(season + 1)

# index is the 1-based position of the team, used for the progress output.
for index, team_url in enumerate(team_urls, start=1):
	sleep(4)  # pause before each request to fbref.com
	team_name = team_url.split("/")[-1].replace("-Stats", "").replace("-", " ")

	print("Current year: " + str(season) + " in interval " + season_label)
	print("Current team: " + team_name + " being " + str(index) + "/" + str(len(team_urls)))
	print(team_url)

	team_data = requests.get(team_url)
	# Fail on an HTTP error status instead of trying to parse an error page.
	team_data.raise_for_status()
	team_soup = BeautifulSoup(team_data.text)

	team_links = [l.get("href") for l in team_soup.find_all('a')]

	team_links_shooting = [l for l in team_links if l and 'matchlogs/c9/shooting' in l] # Shooting
	team_links_keeper = [l for l in team_links if l and 'matchlogs/c9/keeper' in l] # Goalkeeping

	schedule_html = pd.read_html(io.StringIO(team_data.text), match="Scores & Fixtures")[0]

	sleep(4)
	shooting_data = requests.get(f"https://fbref.com{team_links_shooting[0]}")
	shooting_data.raise_for_status()
	shooting_html = pd.read_html(io.StringIO(shooting_data.text), match="Shooting")[0]
	# The table has two header rows; keep only the inner level as column names.
	shooting_html.columns = shooting_html.columns.droplevel()

	sleep(4)
	keeper_data = requests.get(f"https://fbref.com{team_links_keeper[0]}")
	keeper_data.raise_for_status()
	keeper_html = pd.read_html(io.StringIO(keeper_data.text), match="Goalkeeping")[0]
	keeper_html.columns = keeper_html.columns.droplevel()

	dfs = [schedule_html, shooting_html, keeper_html]

	# Outer-join the three tables on 'Date'. Columns that already exist on the
	# left side get the '_drop' suffix on the right side and are removed afterwards.
	team_data_end = reduce(lambda left,right: pd.merge(left,right,on='Date', suffixes=('', '_drop'), how='outer'), dfs)
	# Match the suffix exactly so a column that merely contains "drop" is kept.
	duplicate_columns = [col for col in team_data_end.columns if str(col).endswith('_drop')]
	team_data_end = team_data_end.drop(columns=duplicate_columns)

	team_data_end["Season"] = season_label
	team_data_end["Team"] = team_name

	all_matches.append(team_data_end)
	# NOTE(review): this stops after the first team, so only one team is scraped
	# per run. It looks like a debugging leftover; confirm before removing.
	break

# Display the frame that was appended last.
all_matches[-1]

In [None]:
# Output file name derived from the season, e.g. "premier_league_2023new.csv" for season 2023.
name = f"premier_league_{season}new.csv"
write_list_into_csv(all_matches=all_matches, name=name)

In [None]:
# Collect, across every DataFrame in all_matches, the index labels of rows that
# repeat an earlier row of the same frame. Building a set keeps each label once.
all_duplicate_indices = list(set(
    label
    for frame in all_matches
    for label in frame[frame.duplicated()].index.tolist()
))

# Report the collected labels.
print("Indices of duplicate values:", all_duplicate_indices)
