## Web-Scraping
---

In [1]:
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Show every column when a DataFrame is rendered (None lifts the column cap).
pd.options.display.max_columns = None

### Data Ingestion
---

In [2]:
# Scrape the paginated listing and stack the first HTML table of every page
# into a single frame named `df`.
N_PAGES = 10          # pages are requested as 1..N_PAGES
REQUEST_TIMEOUT = 30  # seconds; stops a stalled connection from hanging the kernel
REQUEST_DELAY = 1     # seconds to pause after each request
PAGE_URL = "https://www.totalcorner.com/league/view/73/end/page:{page}?copy=yes"

page_frames = []
for page in range(1, N_PAGES + 1):
    url = PAGE_URL.format(page=page)
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Explicit check instead of `assert`: assertions are removed under `python -O`
    # and carry no information about which page failed.
    if response.status_code != 200:
        raise requests.HTTPError(
            f"unexpected HTTP {response.status_code} for {response.url}",
            response=response,
        )

    # Wrap the markup in StringIO: passing literal HTML to `read_html` is deprecated.
    # Every column is cast to str — presumably so pages whose inferred dtypes
    # differ still concatenate cleanly (TODO confirm).
    page_frames.append(pd.read_html(StringIO(response.text))[0].astype(str))
    time.sleep(REQUEST_DELAY)

# Concatenate once at the end instead of re-copying the growing frame on every page.
df = pd.concat(page_frames, ignore_index=True)
df.head()

Unnamed: 0,start time,Unnamed: 1,Home,Home Goal,Away Goal,Away,Handicap,Home Corner,Away Corner,HT Home Corner,HT Away Corner,Asian Corn.,Corner O/U,Goals,Goals O/U,Home Attack,Away Attack,Home Shots,Away Shots
0,2022-11-05 05:00:00,Full,Consadole Sapporo,4,3,Shimizu S-Pulse,0.0,5,8,3,4,9.0,Over,3.0,Over,98,88,18,16
1,2022-11-05 05:00:00,Full,Kashima Antlers,0,0,Gamba Osaka,0.0,4,2,1,2,8.5,Under,"2.5, 3.0",Under,159,84,12,1
2,2022-11-05 05:00:00,Full,Urawa Red Diamonds,1,1,Avispa Fukuoka,"0.0, -0.5",2,1,0,1,9.0,Under,"2.0, 2.5",Under,131,106,16,8
3,2022-11-05 05:00:00,Full,Kashiwa Reysol,1,2,Shonan Bellmare,"0.0, +0.5",1,11,0,5,8.5,Over,"2.0, 2.5",Over,108,84,11,12
4,2022-11-05 05:00:00,Full,FC Tokyo,2,3,Kawasaki Frontale,"+0.5, +1.0",8,2,1,2,9.0,Over,"2.5, 3.0",Over,139,74,19,5


### Data Cleansing
---

In [3]:
# Keep only the columns of interest and give them snake_case names.
# The mapping's insertion order defines the output column order.
COLUMN_NAMES = {
    "start time": "date",
    "Home": "home",
    "Away": "away",
    "Home Goal": "goals_home",
    "Away Goal": "goals_away",
    "Handicap": "handicap",
}
# Rename first, then select by the *new* names: `rename` skips labels that are
# not present, so re-running this cell on the already-cleaned `df` is a no-op
# rather than a KeyError on the original column names.
df = df.rename(columns=COLUMN_NAMES)[list(COLUMN_NAMES.values())]
df.head()

Unnamed: 0,date,home,away,goals_home,goals_away,handicap
0,2022-11-05 05:00:00,Consadole Sapporo,Shimizu S-Pulse,4,3,0.0
1,2022-11-05 05:00:00,Kashima Antlers,Gamba Osaka,0,0,0.0
2,2022-11-05 05:00:00,Urawa Red Diamonds,Avispa Fukuoka,1,1,"0.0, -0.5"
3,2022-11-05 05:00:00,Kashiwa Reysol,Shonan Bellmare,1,2,"0.0, +0.5"
4,2022-11-05 05:00:00,FC Tokyo,Kawasaki Frontale,2,3,"+0.5, +1.0"


### Data Export
---

In [4]:
# Persist the cleaned table as Parquet.
OUTPUT_PATH = Path("../data/j1_league.parquet")
# `to_parquet` does not create missing folders, so make sure the target
# directory exists (e.g. on a fresh checkout) before writing.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUTPUT_PATH)