# Scrape Data from sumo db

In this notebook we use requests, Beautiful Soup and pandas to scrape the banzuke (rankings) and hoshitori (tournament results) from Sumo DB and store them locally for further processing.

## MVP

For the MVP we only need the rankings and results from the previous two tournaments.

### Banzuke format (note that Banzuke_text does not include tournament results; they must be taken from the Banzuke.aspx page)
sample URL
http://sumodb.sumogames.de/Banzuke.aspx?b=202009

URL template
http://sumodb.sumogames.de/Banzuke.aspx?b=yyyymm

In [1]:
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
url = 'http://sumodb.sumogames.de/Banzuke.aspx?b=202009'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status surfaces HTTP errors here instead of as a confusing parse
# failure further down.
res = requests.get(url, timeout=30)
res.raise_for_status()

# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns), so results could differ between environments.
# html5lib is the same parser the read_html calls in this notebook use.
soup = BeautifulSoup(res.content, 'html5lib')

In [3]:
# Keep only the tables that carry the "banzuke" CSS class.
a = soup.find_all('table', 'banzuke')
# pandas 2.1+ deprecates passing a literal HTML string to read_html, so the
# markup is wrapped in a file-like StringIO object.
dfs = pd.read_html(StringIO(str(a)), flavor = 'html5lib')

In [4]:
# Stack the first three parsed tables into one frame and preview the top rows.
df = pd.concat(dfs[:3])
df.head()

Unnamed: 0,Result,East,Rank,West,Result.1
0,0-0-15,Hakuho,Y,Kakuryu,0-0-15
1,10-5,Asanoyama,O,Takakeisho,12-3 J
2,13-2 YSK ↑,Shodai,S,Mitakeumi,8-7
3,5-10 ↓,Daieisho,S,,
4,4-11 ↓,Okinoumi,K,Endo,3-9-3 ↓


In [5]:
def retrieve_results(soup):
    """
    Turn a parsed banzuke page into a single data frame for further processing.

    Parameters
    ----------
    soup : object exposing ``find_all`` (e.g. a BeautifulSoup document)
        Parsed HTML of a banzuke page.

    Returns
    -------
    pandas.DataFrame or str
        The first three ``table.banzuke`` tables concatenated (makuuchi
        through makushita). The string ``'None'`` is returned when the page
        has no such table, or when none of the resulting column names
        contains "East" or "West"; callers use that string to skip the page.
    """
    tables = soup.find_all('table', 'banzuke')
    if not tables:
        # read_html raises ValueError when the markup contains no table;
        # report "no results" through the same sentinel as below instead.
        return 'None'

    # Join the markup of each table rather than str()-ing the whole result
    # list (which adds list brackets and commas around the HTML), and wrap it
    # in StringIO because pandas 2.1+ deprecates literal HTML string input.
    html = ''.join(str(table) for table in tables)
    dfs = pd.read_html(StringIO(html), flavor='html5lib')
    df = pd.concat(dfs[0:3])  # concats makuuchi through makushita
    df.columns = df.columns.astype('str')

    # `not` instead of `~`: bitwise inversion only acts as a logical negation
    # on numpy booleans (for a plain Python bool, ~True is -2, which is truthy).
    if not df.columns.str.contains('East|West').any():
        return 'None'

    return df

# Try the helper on the `soup` parsed earlier in the notebook.
retrieve_results(soup)

Unnamed: 0,Result,East,Rank,West,Result.1
0,0-0-15,Hakuho,Y,Kakuryu,0-0-15
1,10-5,Asanoyama,O,Takakeisho,12-3 J
2,13-2 YSK ↑,Shodai,S,Mitakeumi,8-7
3,5-10 ↓,Daieisho,S,,
4,4-11 ↓,Okinoumi,K,Endo,3-9-3 ↓
...,...,...,...,...,...
55,4-3,Sadanoryu,Ms56,Tokunomusashi,4-3
56,0-0-7 ↓,Inoue,Ms57,Fukamiyama,4-3
57,2-5 ↓,Izumigawa,Ms58,Oyamatoumi,2-5 ↓
58,5-2,Ito,Ms59,Itadaki,4-3


In [6]:
def scrape_hoshitori(year = 2019):
    """
    Scrape all the banzukes for the given year and get the hoshitori back.

    One Banzuke.aspx page is requested for each calendar month (01-12). A page
    is skipped when ``retrieve_results`` does not return a data frame for it.
    The URL of every page that is kept is printed as progress output.

    Parameters
    ----------
    year : int or str
        Year placed in front of the two-digit month in the ``b=`` query value.

    Returns
    -------
    pandas.DataFrame
        The frames of all kept pages stacked with a fresh index, plus a
        "year" column and a zero-padded "month" column. An empty frame is
        returned when no page yielded results.
    """
    url = 'http://sumodb.sumogames.de/Banzuke.aspx?b={}{}'
    months = [str(m).zfill(2) for m in range(1, 13)]

    hoshitori = []

    for month in months:
        r = url.format(year, month)
        # Timeout so one stalled request cannot hang the whole scrape.
        res = requests.get(r, timeout=30)
        # Explicit parser: otherwise bs4 picks whichever one is installed.
        soup = BeautifulSoup(res.content, 'html5lib')
        hoshi = retrieve_results(soup)
        # Anything that is not a frame (e.g. the 'None' sentinel string)
        # means there is nothing to collect for this month.
        if not isinstance(hoshi, pd.DataFrame):
            continue
        print(r)
        hoshi['year'] = year
        hoshi['month'] = month
        hoshitori.append(hoshi)

    if not hoshitori:
        # pd.concat raises ValueError on an empty list; give callers an
        # empty frame instead.
        return pd.DataFrame()

    return pd.concat(hoshitori, ignore_index=True)

# Try the scraper on 2020; the URL of each page that produced results is printed.
scrape_hoshitori(year = 2020)

http://sumodb.sumogames.de/Banzuke.aspx?b=202001


http://sumodb.sumogames.de/Banzuke.aspx?b=202003


http://sumodb.sumogames.de/Banzuke.aspx?b=202007


http://sumodb.sumogames.de/Banzuke.aspx?b=202009


http://sumodb.sumogames.de/Banzuke.aspx?b=202011


Unnamed: 0,Result,East,Rank,West,Result.1,year,month
0,1-3-11,Hakuho,Y,Kakuryu,1-4-10,2020,01
1,11-4,Takakeisho,O,Goeido,5-10,2020,01
2,10-5,Asanoyama,S,Takayasu,6-9 ↓,2020,01
3,5-10 ↓,Abi,K,Daieisho,7-8 ↓,2020,01
4,9-6 S ↑,Endo,M1,Myogiryu,5-10,2020,01
...,...,...,...,...,...,...,...
473,3-4,Shoji,Ms56,Chiyoraizan,3-4,2020,11
474,1-6,Takaryu,Ms57,Hananofuji,5-2,2020,11
475,2-5,Kitadaichi,Ms58,Tosamidori,4-3,2020,11
476,5-2,Kirinofuji,Ms59,Tanabe,5-2,2020,11


In [7]:
# Scrape one year of results and write them to a CSV named after that year.
year = 2020
hoshitori = scrape_hoshitori(year=year)
csv_name = 'hoshitori_{}.csv'.format(year)
hoshitori.to_csv(csv_name, index=False)

http://sumodb.sumogames.de/Banzuke.aspx?b=202001


http://sumodb.sumogames.de/Banzuke.aspx?b=202003


http://sumodb.sumogames.de/Banzuke.aspx?b=202007


http://sumodb.sumogames.de/Banzuke.aspx?b=202009


http://sumodb.sumogames.de/Banzuke.aspx?b=202011
