In [50]:
import re
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
import time

# Selenium to fetch the page
driver = webdriver.Chrome()
url = driver.get("https://en.wikipedia.org/wiki/List_of_Testudines_families")
time.sleep(10)

soup = BeautifulSoup(driver.page_source, 'html.parser')

table = soup.find('table', {'class': 'wikitable'})

header_rows = table.find_all('tr')[:2]

print(header_rows)
driver.quit() 

[<tr>
<td align="center" bgcolor="#BBBBFF" colspan="5"><b><a href="/wiki/Cryptodira" title="Cryptodira">Cryptodira</a></b> – 11 families, 74 genera, over 200 species
</td></tr>, <tr>
<th>Family<sup class="reference" id="cite_ref-7"><a href="#cite_note-7"><span class="cite-bracket">[</span>7<span class="cite-bracket">]</span></a></sup></th>
<th>Genera<sup class="reference" id="cite_ref-8"><a href="#cite_note-8"><span class="cite-bracket">[</span>8<span class="cite-bracket">]</span></a></sup></th>
<th>Common name(s)</th>
<th>Example species</th>
<th>Example image
</th></tr>]


In [51]:
combined_headers = []
for i in range(max(len(row.find_all('th')) for row in header_rows)):  # Handle varying lengths
    header_parts = []
    for row in header_rows:
        cells = row.find_all('th')
        header_parts.append(cells[i].text.strip() if i < len(cells) else '')
    combined_headers.append(' '.join(header_parts).strip())

# Clean headers
combined_headers = [re.sub(r'\[\d+\]', '', h) for h in combined_headers]

# Insert "Year Discovered"
insert_position = combined_headers.index('Family') + 1  # Use clean header for position
combined_headers.insert(insert_position, 'Year Discovered')

print("Combined Headers:", combined_headers)

Combined Headers: ['Family', 'Year Discovered', 'Genera', 'Common name(s)', 'Example species', 'Example image']


In [53]:
data = []
for row in table.find_all('tr')[2:]:  # Skip header rows
    cells = row.find_all('td')
    row_data = [cell.text.strip() for cell in cells]
    if row_data:
        family_col = row_data[combined_headers.index('Family')]
        match = re.search(r'\b(18|19|20)\d{2}\b', family_col)  
        year_discovered = match.group() if match else 'Unknown'
        row_data.insert(insert_position, year_discovered)  
        data.append(row_data)

for row in data:
    while len(row) < len(combined_headers):  
        row.append('')
    while len(row) > len(combined_headers):  
        row.pop()

df = pd.DataFrame(data, columns=combined_headers)
output_file = 'turtle_families.csv'
df.to_csv(output_file, index=False, encoding='utf-8')

print(f"Data written to {output_file}")

Data written to turtle_families.csv


In [54]:
df

Unnamed: 0,Family,Year Discovered,Genera,Common name(s),Example species,Example image
0,"CarettochelyidaeBoulenger, 1887",1887,1.0,Pig-nosed turtle,Pig-nosed turtle (Carettochelys insculpta),
1,"CheloniidaeOppel, 1811",1811,6.0,Sea turtles,Green sea turtle (Chelonia mydas),
2,"ChelydridaeGray, 1831",1831,2.0,Snapping turtles,Alligator snapping turtle (Macrochelys temminc...,
3,"DermatemydidaeGray, 1870",1870,1.0,Central American river turtle,Central American river turtle (Dermatemys mawii),
4,"DermochelyidaeFitzinger, 1843",1843,1.0,Leatherback sea turtle,Leatherback sea turtle (Dermochelys coriacea),
5,"EmydidaeRafinesque, 1815",1815,12.0,"Pond turtles, terrapins, and sliders",Red-eared slider (Trachemys scripta elegans),
6,"GeoemydidaeTheobald, 1868",1868,24.0,"Asian leaf turtles, roofed turtles, and Asian ...",Amboina box turtle (Cuora amboinensis),
7,"KinosternidaeAgassiz, 1857",1857,4.0,Mud and musk turtles,Common musk turtle (Sternotherus odoratus),
8,"PlatysternidaeGray, 1869",1869,1.0,Big-headed turtle,Big-headed turtle (Platysternon megacephalum),
9,"TestudinidaeBatsch, 1788",Unknown,12.0,Tortoises,Aldabra giant tortoise (Geochelone gigantea),


In [57]:
df['Family'] = df['Family'].str.replace(r'\d+', '', regex=True) 
df['Family'] = df['Family'].str.replace(r',', '', regex=True)
df = df.drop(index=11)

df

Unnamed: 0,Family,Year Discovered,Genera,Common name(s),Example species,Example image
0,CarettochelyidaeBoulenger,1887,1,Pig-nosed turtle,Pig-nosed turtle (Carettochelys insculpta),
1,CheloniidaeOppel,1811,6,Sea turtles,Green sea turtle (Chelonia mydas),
2,ChelydridaeGray,1831,2,Snapping turtles,Alligator snapping turtle (Macrochelys temminc...,
3,DermatemydidaeGray,1870,1,Central American river turtle,Central American river turtle (Dermatemys mawii),
4,DermochelyidaeFitzinger,1843,1,Leatherback sea turtle,Leatherback sea turtle (Dermochelys coriacea),
5,EmydidaeRafinesque,1815,12,"Pond turtles, terrapins, and sliders",Red-eared slider (Trachemys scripta elegans),
6,GeoemydidaeTheobald,1868,24,"Asian leaf turtles, roofed turtles, and Asian ...",Amboina box turtle (Cuora amboinensis),
7,KinosternidaeAgassiz,1857,4,Mud and musk turtles,Common musk turtle (Sternotherus odoratus),
8,PlatysternidaeGray,1869,1,Big-headed turtle,Big-headed turtle (Platysternon megacephalum),
9,TestudinidaeBatsch,Unknown,12,Tortoises,Aldabra giant tortoise (Geochelone gigantea),
