### 크롤링 코드

In [6]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time

# Start URL: the Premier League standings ("tables") page
URL = 'https://www.premierleague.com/tables'

# Option intended to keep the browser window from closing ("detach")
# NOTE(review): the scraping code below calls driver.quit() in its finally
# block, which presumably closes the window regardless — confirm whether this
# option is still wanted.
chrome_options = Options()
chrome_options.add_experimental_option("detach", True)

# Create the Chrome driver with the options above
driver = webdriver.Chrome(options=chrome_options)

# Season label, exactly as stored in the dropdown's data-option-name attribute.
SEASON = "2023/24"
# Number of <td> cells the 'tr > td' selection yields per club; only the
# first len(FIELDS) cells of each group are read.
CELLS_PER_CLUB = 13
# Output keys for the leading cells of each club, in table order.
FIELDS = ("Rank", "Club", "Played", "Won", "Drawn", "Lost",
          "GF", "GA", "GD", "Points")

try:
    # Maximize the window, then open the site
    driver.maximize_window()
    driver.get(URL)

    # Click the cookie-consent "accept" button
    accept_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="onetrust-accept-btn-handler"]'))
    )
    accept_button.click()
    print('쿠키 창 제거 완료!!')
    # Fixed pause kept from the original flow (presumably lets the overlay
    # finish closing before the next click — TODO confirm it is still needed)
    time.sleep(5)

    # Click the advert's close button
    x_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="advertClose"]'))
    )
    x_button.click()
    print('x 버튼 클릭 완료!!')
    time.sleep(5)

    # Open the season-selection dropdown
    dropdown_current = WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, '[data-dropdown-current="compSeasons"]'))
    )
    dropdown_current.click()
    print('드롭다운 클릭 완료!!')
    time.sleep(5)

    # Wait until the dropdown entries are present in the DOM
    dropdown_options = WebDriverWait(driver, 20).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.dropdownList li'))
    )

    # Pick the entry for SEASON. Fail loudly when it is missing instead of
    # reporting success and timing out later on the URL-change wait.
    season_option = next(
        (opt for opt in dropdown_options
         if opt.get_attribute("data-option-name") == SEASON),
        None,
    )
    if season_option is None:
        raise RuntimeError(f"season option {SEASON!r} not found in the dropdown")
    season_option.click()
    print(f'드롭다운에서 {SEASON} 선택 완료!!')

    # Explicitly wait for the URL to change; otherwise the table of the
    # previously shown season would be scraped.
    WebDriverWait(driver, 10).until(EC.url_changes(URL))
    print('URL 변경 완료!!')

    # Wait until the table is loaded
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.tableContainer table'))
    )

    # Parse the rendered page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    tbody = soup.find('tbody', {'class': 'league-table__tbody isPL'})
    if tbody is None:
        raise RuntimeError("league table body not found in the page source")
    tds = tbody.select('tr > td')
    if len(tds) % CELLS_PER_CLUB:
        # A partial trailing group would index past the end of the list or
        # yield a misaligned record, so reject the unexpected layout up front.
        raise RuntimeError(
            f"expected a multiple of {CELLS_PER_CLUB} table cells, got {len(tds)}"
        )

    # One dict per club, built from each group of CELLS_PER_CLUB cells
    clubs_info = []
    for start in range(0, len(tds), CELLS_PER_CLUB):
        texts = [td.get_text() for td in tds[start:start + len(FIELDS)]]
        # Keep only the first whitespace-separated token of the rank cell
        texts[0] = texts[0].split()[0]
        clubs_info.append(dict(zip(FIELDS, texts)))

    # Print the result
    for info in clubs_info:
        print(info)
except TimeoutException as e:
    print("Timeout occurred:", e)

finally:
    # Shut the driver down whether or not scraping succeeded
    driver.quit()


쿠키 창 제거 완료!!
x 버튼 클릭 완료!!
드롭다운 클릭 완료!!
드롭다운에서 2023/24 완료!!
URL 변경 완료!!
{'Rank': '1', 'Club': '     Manchester City MCI  ', 'Played': '38', 'Won': '28', 'Drawn': '7', 'Lost': '3', 'GF': '96', 'GA': '34', 'GD': '62', 'Points': '91'}
{'Rank': '2', 'Club': '     Arsenal ARS  ', 'Played': '38', 'Won': '28', 'Drawn': '5', 'Lost': '5', 'GF': '91', 'GA': '29', 'GD': '62', 'Points': '89'}
{'Rank': '3', 'Club': '     Liverpool LIV  ', 'Played': '38', 'Won': '24', 'Drawn': '10', 'Lost': '4', 'GF': '86', 'GA': '41', 'GD': '45', 'Points': '82'}
{'Rank': '4', 'Club': '     Aston Villa AVL  ', 'Played': '38', 'Won': '20', 'Drawn': '8', 'Lost': '10', 'GF': '76', 'GA': '61', 'GD': '15', 'Points': '68'}
{'Rank': '5', 'Club': '     Tottenham Hotspur TOT  ', 'Played': '38', 'Won': '20', 'Drawn': '6', 'Lost': '12', 'GF': '74', 'GA': '61', 'GD': '13', 'Points': '66'}
{'Rank': '6', 'Club': '     Chelsea CHE  ', 'Played': '38', 'Won': '18', 'Drawn': '9', 'Lost': '11', 'GF': '77', 'GA': '63', 'GD': '14', 'Poin

### 데이터프레임 만드는 코드

In [3]:
import pandas as pd
# Build the standings table from the scraped records, indexed by 'Rank',
# and trim the whitespace surrounding each scraped club name.
df = pd.DataFrame(clubs_info).set_index('Rank')
df = df.assign(Club=df['Club'].str.strip())

# Save the cleaned table as CSV
df.to_csv('data/EPL_DATA_2023_24.csv')

# Last expression of the cell, so the notebook renders the DataFrame
df

Unnamed: 0_level_0,Club,Played,Won,Drawn,Lost,GF,GA,GD,Points
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Manchester City MCI,38,28,7,3,96,34,62,91
2,Arsenal ARS,38,28,5,5,91,29,62,89
3,Liverpool LIV,38,24,10,4,86,41,45,82
4,Aston Villa AVL,38,20,8,10,76,61,15,68
5,Tottenham Hotspur TOT,38,20,6,12,74,61,13,66
6,Chelsea CHE,38,18,9,11,77,63,14,63
7,Newcastle United NEW,38,18,6,14,85,62,23,60
8,Manchester United MUN,38,18,6,14,57,58,-1,60
9,West Ham United WHU,38,14,10,14,60,74,-14,52
10,Crystal Palace CRY,38,13,10,15,57,58,-1,49


In [4]:
# Load the CSV at data/EPL2022-2024.csv into a DataFrame.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column holding club
# slugs — presumably the CSV was saved with an unnamed index column; consider
# naming it or passing index_col=0 after checking downstream use.
epl=pd.read_csv('data/EPL2022-2024.csv')
# Last expression of the cell, so the notebook renders the DataFrame
epl

Unnamed: 0.1,Unnamed: 0,LEAGUE,SEASON,"GROSS P/W (GBP, K)","GROSS P/Y (GBP, K)","ADJ. GROSS (GBP, K)","KEEPER (GBP, K)","DEFENSE (GBP, K)","MIDFIELD (GBP, K)","FORWARD (GBP, K)"
0,arsenal,Premier League,2023-2024,3298,171496,171496,11180,47476,31460,81380
1,arsenal,Premier League,2022-2023,2558,133016,133016,8060,38896,37960,48100
2,aston-villa,Premier League,2023-2024,2213,115070,115070,9360,42640,29640,33430
3,aston-villa,Premier League,2022-2023,2123,110390,110390,9880,35620,30160,34730
4,bournemouth,Premier League,2023-2024,1083,56290,56290,6500,14456,11934,23400
5,bournemouth,Premier League,2022-2023,957,49764,49764,5044,17368,9672,17680
6,brentford,Premier League,2023-2024,859,44642,44642,3900,16952,11310,12480
7,brentford,Premier League,2022-2023,698,36296,36296,2600,13936,9100,10660
8,brighton,Premier League,2023-2024,1240,64480,64480,3380,19240,12870,28990
9,brighton,Premier League,2022-2023,863,44876,44876,2860,13780,13676,14560
