# import libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import random
import pandas as pd
import re
import numpy as np
import time
import jupytext


# Extracting all leagues titles and urls

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
           "Accept-Language": "en-US,en;q=0.5"}

In [3]:
Countries_Name = ['England', 'Germany', 'Italy', 'France', 'Spain']
Countries_1stLeagueName = ['premier-league', 'bundesliga', 'serie-a', 'ligue-1', 'laliga']
Countries_Url = ['https://www.transfermarkt.com/premier-league/startseite/wettbewerb/GB1/plus/?saison_id=20',
             'https://www.transfermarkt.com/bundesliga/startseite/wettbewerb/L1/plus/?saison_id=20',
             'https://www.transfermarkt.com/serie-a/startseite/wettbewerb/IT1/plus/?saison_id=20',
             'https://www.transfermarkt.com/ligue-1/startseite/wettbewerb/FR1/plus/?saison_id=20',
             'https://www.transfermarkt.com/laliga/startseite/wettbewerb/ES1']

# Request all leagues pages and extract clubs urls and titles

In [4]:
links = []
titles = []
countries = []

for i, url in enumerate(Countries_Url):
    country = Countries_Name[i]
    for i in range(15,22):
        response = requests.get(url=url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('div', class_='responsive-table')

        if table:
            rows = table.find_all('td', class_='hauptlink no-border-links')

            for row in rows:
                team_link = row.find('a')
                
                if team_link:
                    team_href = team_link.get('href')
                    team_title = team_link.get('title')
                    if team_title not in titles:
                        links.append(team_href)
                        titles.append(team_title)
                        countries.append(country)
                        
        time.sleep(random.random())

In [5]:
print(len(dict(zip(titles,links))))
print('*'*50)
dict(zip(titles,links))

98
**************************************************


{'Manchester City': '/manchester-city/startseite/verein/281/saison_id/2022',
 'Chelsea FC': '/fc-chelsea/startseite/verein/631/saison_id/2022',
 'Arsenal FC': '/fc-arsenal/startseite/verein/11/saison_id/2022',
 'Liverpool FC': '/fc-liverpool/startseite/verein/31/saison_id/2022',
 'Manchester United': '/manchester-united/startseite/verein/985/saison_id/2022',
 'Tottenham Hotspur': '/tottenham-hotspur/startseite/verein/148/saison_id/2022',
 'Newcastle United': '/newcastle-united/startseite/verein/762/saison_id/2022',
 'West Ham United': '/west-ham-united/startseite/verein/379/saison_id/2022',
 'Leicester City': '/leicester-city/startseite/verein/1003/saison_id/2022',
 'Aston Villa': '/aston-villa/startseite/verein/405/saison_id/2022',
 'Wolverhampton Wanderers': '/wolverhampton-wanderers/startseite/verein/543/saison_id/2022',
 'Southampton FC': '/fc-southampton/startseite/verein/180/saison_id/2022',
 'Brighton & Hove Albion': '/brighton-amp-hove-albion/startseite/verein/1237/saison_id/20

In [6]:
ids = [re.search(r'verein/(.*?)/saison_id', element).group(1) for element in links]
link_titles = [re.search(r'(.*?)/startseite', element).group(1) for element in links]

# Generating cup pages for every club

In [7]:
main_url = 'https://www.transfermarkt.com'
extension = '/erfolge/verein/'

Victories_page_links = []

for i in range(len(links)):
    page_link = main_url + link_titles[i] + extension + ids[i]
    Victories_page_links.append(page_link)

In [8]:
Victories_page_links

['https://www.transfermarkt.com/manchester-city/erfolge/verein/281',
 'https://www.transfermarkt.com/fc-chelsea/erfolge/verein/631',
 'https://www.transfermarkt.com/fc-arsenal/erfolge/verein/11',
 'https://www.transfermarkt.com/fc-liverpool/erfolge/verein/31',
 'https://www.transfermarkt.com/manchester-united/erfolge/verein/985',
 'https://www.transfermarkt.com/tottenham-hotspur/erfolge/verein/148',
 'https://www.transfermarkt.com/newcastle-united/erfolge/verein/762',
 'https://www.transfermarkt.com/west-ham-united/erfolge/verein/379',
 'https://www.transfermarkt.com/leicester-city/erfolge/verein/1003',
 'https://www.transfermarkt.com/aston-villa/erfolge/verein/405',
 'https://www.transfermarkt.com/wolverhampton-wanderers/erfolge/verein/543',
 'https://www.transfermarkt.com/fc-southampton/erfolge/verein/180',
 'https://www.transfermarkt.com/brighton-amp-hove-albion/erfolge/verein/1237',
 'https://www.transfermarkt.com/fc-everton/erfolge/verein/29',
 'https://www.transfermarkt.com/notti

In [9]:
len(Victories_page_links)

98

# Request every cup page

In [11]:
soups = []

for url in Victories_page_links:
    response = requests.get(url=url, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    soups.append(soup)
    time.sleep(random.random())

# Extracting cup tags for each club

In [12]:
cup_tags = []
for soup in soups:
    cup_tags.append(soup.select('div.large-6.columns div.box'))

In [13]:
# transforming '16/17' to '2016'
def year_transformer(raw_year):
    year = raw_year.split('/')[0]
    if len(year) == 2:
        if int(year) <=22:
            return '20' + year
        else:
            return '19' + year
    elif len(year) == 4:
        return year

# Find all cups for each cup page and club from cup_tags

In [14]:
result = []
for i, club_tag in enumerate(cup_tags):
    country = countries[i]
    club_name = titles[i]
    club_id = ids[i]
    for tag in club_tag:
        cup_name = tag.find('img').get('title')
        years_part = tag.find('div', class_='erfolg_infotext_box').get_text()
        years = [year_transformer(element) for element in years_part.split() if not element.__contains__(',')]
        for year in years:
            year = int(year)
            result.append({'nation': country, 'club_id':club_id, 'club_name':club_name, 'cup_name':cup_name, 'win_year_from':year, 'win_year_to':year+1})
    


# Convert and export results

In [15]:
df = pd.DataFrame(result)
df.to_csv('cup_winners.csv', index=False)

In [16]:
df_selection = df[(df.win_year_from >=2015) & (df.win_year_to<=2022)].reset_index(drop=True)
df_selection

Unnamed: 0,nation,club_id,club_name,cup_name,win_year_from,win_year_to
0,England,281,Manchester City,English Champion,2021,2022
1,England,281,Manchester City,English Champion,2020,2021
2,England,281,Manchester City,English Champion,2018,2019
3,England,281,Manchester City,English Champion,2017,2018
4,England,281,Manchester City,FA Cup Winner,2018,2019
...,...,...,...,...,...,...
161,Spain,368,Sevilla FC,Europa League Winner,2015,2016
162,Spain,331,CA Osasuna,Spanish 2nd tier champion,2018,2019
163,Spain,714,RCD Espanyol Barcelona,Spanish 2nd tier champion,2020,2021
164,Spain,367,Rayo Vallecano,Spanish 2nd tier champion,2017,2018


In [17]:
df_selection.to_csv('cup_winners_selected.csv', index=False)

In [18]:
!jupytext --to py -o cup_crawler.py cup_crawler.ipynb

[jupytext] Reading cup_crawler.ipynb in format ipynb
[jupytext] Writing cup_crawler.py (destination file replaced)
