In [None]:
import bs4
import pandas
import re
import requests
import datetime
import os
from typing import Tuple, Optional, Dict

from mma.glm.merge_data import DATA_PATH, load_fighters_df, load_merged_df

In [None]:
def parse_id_slug(href: str) -> Tuple[str, str]:
    """Split a fighter link into its ``(id, slug)`` pair.

    The last two ``/``-separated segments of ``href`` are taken as the id and
    the slug, in that order. The id is round-tripped through ``int`` so that
    it is returned in canonical decimal form (e.g. leading zeros dropped).

    Raises:
        ValueError: if ``href`` has fewer than two segments or the id segment
            is not an integer.
    """
    # Slicing keeps at most the final two segments; unpacking fails with
    # ValueError when there are not enough of them.
    fighter_id, slug = href.split('/')[-2:]
    normalized_id = str(int(fighter_id))
    return normalized_id, slug

def parse_event_id(event_url: str) -> str:
    """Return the event id embedded in an ESPN fight-center URL.

    Example input:
        'https://www.espn.com/mma/fightcenter/_/id/600022153/league/ufc'

    The id is the third ``/``-separated segment counted from the end; the two
    segments after it are ignored. The id is validated by converting it to
    ``int`` and is returned as a string.

    Raises:
        ValueError: if the URL has fewer than three segments or the id
            segment is not an integer.
    """
    # Keep only the trailing three segments; the last two are discarded.
    event_id, _second_last, _last = event_url.split('/')[-3:]
    return str(int(event_id))

def parse_event_id_opt(row_td) -> Optional[str]:
    """Extract the event id from a table cell, if the cell links to an event.

    Args:
        row_td: A parsed table cell; only ``row_td.find('a')`` and the
            resulting element's ``.get('href')`` are used.

    Returns:
        The event id as a string (whatever ``parse_event_id`` returns for the
        link's ``href``), or ``None`` when the cell has no ``<a>`` element or
        the element has no (or an empty) ``href``.

    Raises:
        ValueError: propagated from ``parse_event_id`` when the ``href`` does
            not have the expected shape.
    """
    a = row_td.find('a')
    if not a:
        return None
    href = a.get('href')
    if not href:
        return None
    # parse_event_id returns a str, hence Optional[str] rather than Optional[int].
    return parse_event_id(href)

def map_rows(header, row) -> Dict:
    """Turn one table body row into a flat record.

    Args:
        header: Column names, paired positionally with the row's ``<td>``
            cells (extra names or extra cells are dropped by ``zip``).
        row: A parsed ``<tr>`` element exposing ``find_all('td')``.

    Returns:
        A dict mapping each header name to the stripped text of its cell,
        plus three derived keys:
        ``opponentEspnId`` / ``opponentEspnSlug`` parsed from the link in the
        second cell, and ``eventEspnId`` parsed from the link in the third
        cell (``None`` when that cell has no usable link).
    """
    cells = list(row.find_all('td'))
    cell_texts = [cell.get_text().strip() for cell in cells]
    record = dict(zip(header, cell_texts))

    # The second cell must contain the opponent link; the third may or may
    # not contain an event link.
    opponent_href = cells[1].find('a').get('href')
    opponent_id, opponent_slug = parse_id_slug(opponent_href)
    record.update({
        'opponentEspnId': opponent_id,
        'opponentEspnSlug': opponent_slug,
        'eventEspnId': parse_event_id_opt(cells[2]),
    })
    return record

def make_rows(table, espn_id) -> pandas.DataFrame:
    """Convert a parsed HTML table into a DataFrame tagged with a fighter id.

    Args:
        table: A parsed ``<table>`` element exposing ``find_all('tr')``. Its
            first ``<tr>`` is treated as the header and the rest as data.
        espn_id: Value written to every row's ``espnId`` column.

    Returns:
        One row per data ``<tr>`` (as produced by ``map_rows``) with an
        additional ``espnId`` column.

    Raises:
        ValueError: if the table contains no ``<tr>`` at all.
    """
    header_tr, *body_trs = table.find_all('tr')

    column_names = []
    for th in header_tr.find_all('th'):
        column_names.append(th.get_text().strip())

    records = [map_rows(column_names, tr) for tr in body_trs]
    return pandas.DataFrame(records).assign(espnId=espn_id)

In [None]:
# Lowercase ASCII alphabet, one list entry per letter.
letters = [*'abcdefghijklmnopqrstuvwxyz']
len(letters)

In [None]:
# id_to_slug = {}
# for letter in letters:
#     response = requests.get(f'http://www.espn.com/mma/fighters?search={letter}')
#     soup = bs4.BeautifulSoup(response.text)
#     trs = soup.find('table').find_all('tr')
#     hrefs = [tr.find('a').get('href') for tr in trs[2:]]
#     id_to_slug.update(dict([parse_id_slug(hr) for hr in hrefs]))
    
# espn_fighters = pandas.DataFrame([{'espnId': str(espn_id), 'espnSlug': slug} for espn_id, slug in id_to_slug.items()])
# espn_fighters.head(1)
# espn_fighters.to_csv(f'{DATA_PATH}/espn_fighters.csv', index=False)

In [None]:
# Root directory for scraped stats; the scrape loop writes one sub-directory
# per ESPN fighter id underneath it.
espn_dir = f'{DATA_PATH}/espn_fighters'
os.makedirs(espn_dir, exist_ok=True)

# Names already present in the output directory; the scrape loop skips any
# fighter id found here so the crawl can be resumed.
scraped_ids = set(os.listdir(espn_dir))

# Load the fighter roster from the CSV written by the (commented-out) crawl
# cell, so `espn_ids` is defined on a fresh kernel instead of relying on a
# variable left over from an earlier session. Ids are read as strings so they
# compare equal to the directory names in `scraped_ids`.
espn_fighters = pandas.read_csv(
    f'{DATA_PATH}/espn_fighters.csv',
    dtype={'espnId': str},
)
espn_ids = espn_fighters['espnId']

In [None]:


# Seconds to wait for ESPN before abandoning a single request; without a
# timeout `requests.get` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT_S = 30

# Fighter ids whose fetch/parse/write raised; inspect after the loop finishes.
error_ids = set()
for i, espn_id in enumerate(espn_ids):

    # Resume support: skip fighters that already have an output directory.
    if str(espn_id) in scraped_ids:
        continue

    if (i+1) % 100 == 0:
        print(f'fetching for {espn_id=} [{i+1}/{len(espn_ids)}]')

    url = f'https://www.espn.com/mma/fighter/stats/_/id/{espn_id}'
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        # Surface HTTP failures as errors instead of letting an error page be
        # mistaken for "this fighter has no stats tables".
        response.raise_for_status()
        soup = bs4.BeautifulSoup(response.text)
        try:
            distance, clinch, ground = soup.find_all('table')
        except ValueError:
            # Page does not have exactly three tables: nothing to save.
            continue

        # Build every frame before touching the filesystem so that a parse
        # failure never leaves a partially-filled directory, which would be
        # treated as "already scraped" on the next run.
        tables = {'distance': distance, 'clinch': clinch, 'ground': ground}
        frames = {
            name: make_rows(table, espn_id)
            for name, table in tables.items()
        }

        fighter_dir = f'{espn_dir}/{espn_id}'
        os.makedirs(fighter_dir, exist_ok=True)
        for name, frame in frames.items():
            frame.to_csv(f'{fighter_dir}/{name}.csv', index=False)
    except Exception as exc:
        # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still
        # stops the crawl; record the id and report why it failed.
        error_ids.add(espn_id)
        print(f'failed for {espn_id=}: {exc!r}')