In [2]:
from tarfile import RECORDSIZE
import gazpacho

In [3]:
# Wikipedia page that lists the current swimming world records.
URL = "https://en.wikipedia.org/wiki/List_of_world_records_in_swimming"
# Download the page; `html` is the raw page text that later cells measure and slice.
html = gazpacho.get(URL)

In [4]:
# Size check: number of characters in the downloaded page.
print(len(html))

572990


In [5]:
# Peek at the first 500 characters to confirm an HTML document came back.
html[:500]

'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 skin-theme-clientpref-day vector-sticky-header-enabled wp25easte'

In [6]:
# Peek at the last 500 characters to confirm the document is complete (ends with </html>).
html[-500:]

'anization","name":"Contributors to Wikimedia projects"},"publisher":{"@type":"Organization","name":"Wikimedia Foundation, Inc.","logo":{"@type":"ImageObject","url":"https:\\/\\/www.wikimedia.org\\/static\\/images\\/wmf-hor-googpub.png"}},"datePublished":"2007-03-15T21:20:10Z","dateModified":"2026-01-17T08:19:18Z","image":"https:\\/\\/upload.wikimedia.org\\/wikipedia\\/commons\\/a\\/ae\\/Caeleb_Dressel_before_winning_100_fly_%2842769914221%29.jpg","headline":"Wikimedia list article"}</script>\n</body>\n</html>'

In [7]:
# Offset of the first "<table" in the raw HTML (str.find returns -1 if there is none).
table_start = html.find("<table")
# Preview the 500 characters starting at that offset to see the table's markup.
html[table_start:table_start+500]

'<table class="wikitable sortable" style="font-size: 95%;">\n<caption>\n</caption>\n<tbody><tr>\n<th>Event\n</th>\n<th style="width:4em" class="unsortable">Time\n</th>\n<th class="unsortable">\n</th>\n<th>Name</th>\n<th>Nationality</th>\n<th>Date</th>\n<th>Meet</th>\n<th>Location\n</th>\n<th class="unsortable">Ref\n</th></tr>\n\n<tr>\n<td><span data-sort-value="01&#160;!"><a href="/wiki/World_record_progression_50_metres_freestyle" title="World record progression 50 metres freestyle">50m freestyle</a></span>\n</td>\n<'

In [8]:
# Wrap the raw HTML in a gazpacho Soup so elements can be searched by tag.
soup = gazpacho.Soup(html)

In [9]:
# Sanity check: show the type of the parsed object.
type(soup)

gazpacho.soup.Soup

In [10]:
# Collect the page's <table> elements; later cells enumerate and index this result.
# NOTE(review): no `mode` is passed, so this relies on gazpacho's default mode
# returning a list when several elements match - presumably a single match
# would not come back as a list; mode="all" would make it explicit. Confirm.
tables = soup.find("table")

In [11]:
# Survey every table: print its index, how many <tr> rows it has, and how many
# <td> cells its last row holds, to help pick out the record tables.
for n, table in enumerate(tables):
    rows = table.find("tr", mode="all")
    # The last row is used as the sample for the column count.
    cols = rows[-1].find("td", mode="all")
    print(f"{n}->{len(rows)} rows,{len(cols)} columns")

0->21 rows,9 columns
1->21 rows,9 columns
2->3 rows,9 columns
3->25 rows,9 columns
4->27 rows,9 columns
5->4 rows,9 columns
6->25 rows,11 columns
7->1 rows,2 columns
8->7 rows,1 columns
9->5 rows,1 columns
10->8 rows,1 columns


In [12]:
# Walk the first table and print the event name and record time of each row.
table = tables[0]
# [1:] skips the first <tr>, which is the header row made of <th> cells.
for row in table.find("tr", mode="all")[1:]:
    cols = row.find("td", mode="all")
    # Column 0 is the event name, column 1 the record time.
    event = cols[0].text
    time = cols[1].text
    print(f"the event is {event} at {time}")

the event is 50m freestyle at 20.91
the event is 100m freestyle at 46.40
the event is 200m freestyle at 1:42.00
the event is 400m freestyle at 3:39.96
the event is 800m freestyle at 7:32.12
the event is 1500m freestyle at 14:30.67
the event is 50m backstroke at 23.55
the event is 100m backstroke at 51.60
the event is 200m backstroke at 1:51.92
the event is 50m breaststroke at 25.95
the event is 100m breaststroke at 56.88
the event is 200m breaststroke at 2:05.48
the event is 50m butterfly at 22.27
the event is 100m butterfly at 49.45
the event is 200m butterfly at 1:50.34
the event is 200m individual medley at 1:52.69
the event is 400m individual medley at 4:02.50
the event is 4 × 100 m freestyle relay at 3:08.24
the event is 4 × 200 m freestyle relay at 6:58.55
the event is 4 × 100 m medley relay at 3:26.78


In [13]:
# Indexes into `tables` of the four record tables to keep, paired
# position-by-position with the course label each one is stored under.
RECORDS = (0, 1, 3, 4)
# NOTE(review): "Woman" is probably meant to be "Women"; the labels are left
# untouched because they become the top-level keys written to records.json.
COURSES = ("LC Men", "LC Woman", "SC Men", "SC Woman")

# records[course][event] -> record time, kept as the text scraped from the page.
records = {}
# The index is named `table_index` so it does not rebind `table` (a Soup
# element in the earlier cells) to a plain integer.
for table_index, course in zip(RECORDS, COURSES):
    records[course] = {}
    # The first <tr> of each table is the header row, so start from the second.
    for row in tables[table_index].find("tr", mode="all")[1:]:
        cols = row.find("td", mode="all")
        # A row without both an event cell and a time cell (for example one
        # made only of <th> cells) cannot be recorded; skip it instead of
        # failing with an IndexError.
        if not cols or len(cols) < 2:
            continue
        event = cols[0].text
        time = cols[1].text
        # Relay events are deliberately left out of the lookup.
        if "relay" not in event:
            records[course][event] = time

In [14]:
# Stroke abbreviations (as used in swim file names such as "Mike-15-100m-Fly.txt")
# mapped to the stroke names used in the record tables' event titles.
conversions = dict(
    Free="freestyle",
    Back="backstroke",
    Breast="breaststroke",
    Fly="butterfly",
    IM="individual medley",
)

In [15]:
# Worked example: derive the records lookup key from a swim file name.
event = "Mike-15-100m-Fly.txt"
# Keep only the last two "-"-separated fields (distance, stroke). A slice is
# used instead of `*_, distance, stroke = ...` because binding `_` at the top
# level of a notebook shadows IPython's "previous output" variable.
distance, stroke = event.removesuffix(".txt").split("-")[-2:]
# Key format matches the event titles in the record tables, e.g. "100m butterfly".
lookup = f"{distance} {conversions[stroke]}"
lookup

'100m butterfly'

In [19]:
# json is imported here, at first use, rather than with the imports at the top.
import json

# Persist the scraped records as JSON in the current working directory;
# the file is overwritten if it already exists ("w" mode).
with open("records.json", "w") as jf:
    json.dump(records, jf)

In [21]:
def event_lookup(file_name):
    """Turn a swim file name into the event title used as a records key.

    The last two "-"-separated fields of the name (after dropping a trailing
    ".txt", if present) are taken as the distance and the stroke abbreviation,
    e.g. "Mike-15-100m-Fly.txt" -> "100m butterfly".

    Raises:
        ValueError: if the name has fewer than two "-"-separated fields.
        KeyError: if the stroke abbreviation is not one of
            Free, Back, Breast, Fly or IM.
    """
    stroke_names = {
        "Free": "freestyle",
        "Back": "backstroke",
        "Breast": "breaststroke",
        "Fly": "butterfly",
        "IM": "individual medley",
    }
    stem = file_name.removesuffix(".txt")
    # Only the trailing two fields matter; any leading fields are ignored.
    distance, stroke = stem.split("-")[-2:]
    return " ".join((distance, stroke_names[stroke]))

In [22]:
# Read the saved records back from disk into `record`, a separate name from
# the in-memory `records` built while scraping.
with open("records.json") as jf:
    record = json.load(jf)

In [24]:
# End-to-end check: look up the "LC Men" record time for the event named by a swim file.
record["LC Men"][event_lookup("Mike-15-100m-Fly.txt")]

'49.45'