In [38]:
from bs4 import BeautifulSoup
import json
import os




In [15]:
def _child_text(parent, tag, css_class):
    """Return the stripped text of the first matching descendant, or None if absent."""
    node = parent.find(tag, class_=css_class)
    return node.get_text(strip=True) if node is not None else None


def _as_key(label):
    """Turn a human-readable label into a lowercase, underscore-separated dict key."""
    return label.lower().replace(' ', '_')


def extract_f1_race_data(html_content):
    """Parse one saved race page into a plain dict of strings, dicts and lists.

    Args:
        html_content: The page markup as a string.

    Returns:
        A dict with:
          * ``'title'`` - text of the ``<title>`` tag, or ``"Unknown"`` if the
            page has no ``<title>``;
          * ``'race_name'`` - text of the ``<h1>`` inside ``div#info-pane``
            (only present when that heading exists);
          * one key per ``div.stats`` entry inside ``div#info-pane``, named
            after the entry's ``div.title`` text (lowercased, spaces replaced
            by underscores) and holding its ``div.content`` text;
          * ``'performance'`` - dict built the same way from every
            ``div.stats-block`` (``div.title`` -> ``div.value``);
          * ``'results'`` - dict mapping the ``id`` of the nearest preceding
            ``<div>`` that has an ``id`` to a list of row dicts
            (column header -> cell text) for each ``div.table-wrapper``.

        Elements that lack an expected child (for example a stats block with
        no ``div.value`` or a table with no ``<thead>``/``<tbody>``) are
        skipped individually so the rest of the page is still returned.
        If parsing fails for any other reason the function does not raise;
        it returns ``{"error": "<exception message>"}``.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        race_data = {}

        # Page title. get_text() is used rather than .string because .string
        # is None for an empty <title> or one containing nested markup.
        title_tag = soup.title
        race_data['title'] = title_tag.get_text(strip=True) if title_tag is not None else "Unknown"

        # Basic race details from the info pane.
        race_info = soup.find('div', {'id': 'info-pane'})
        if race_info is not None:
            heading = race_info.find('h1')
            if heading is not None:
                race_data['race_name'] = heading.get_text(strip=True)

            for stat in race_info.find_all('div', class_='stats'):
                label = _child_text(stat, 'div', 'title')
                content = _child_text(stat, 'div', 'content')
                if label is None or content is None:
                    continue  # incomplete entry: skip it, keep the rest
                race_data[_as_key(label)] = content

        # Driver and team performance figures.
        race_data['performance'] = {}
        for block in soup.find_all('div', class_='stats-block'):
            label = _child_text(block, 'div', 'title')
            value = _child_text(block, 'div', 'value')
            if label is None or value is None:
                continue
            race_data['performance'][_as_key(label)] = value

        # Session result tables, grouped by the id of the nearest preceding
        # <div> that carries an id attribute.
        race_data['results'] = {}
        for table in soup.find_all('div', class_='table-wrapper'):
            session_header = table.find_previous('div', {'id': True})
            if session_header is None:
                continue
            session_rows = race_data['results'].setdefault(session_header['id'], [])

            thead = table.find('thead')
            tbody = table.find('tbody')
            if thead is None or tbody is None:
                continue  # not a well-formed results table

            headers = [th.get_text(strip=True) for th in thead.find_all('th')]
            for row in tbody.find_all('tr'):
                cells = [td.get_text(strip=True) for td in row.find_all('td')]
                # zip() truncates to the shorter sequence, so surplus headers
                # or cells are dropped rather than raising.
                session_rows.append(dict(zip(headers, cells)))

        return race_data

    except Exception as e:
        # Deliberate best-effort: callers receive an error record instead of
        # an exception so that a batch run can continue.
        return {"error": str(e)}

In [37]:


def process_all_races(directory_path, output_base_path):
    """Convert every saved race page under ``directory_path`` to a JSON file.

    The input tree is walked as ``<directory_path>/<year>/<race>/<page>.html``;
    entries that are not directories at the year or race level, and files not
    ending in ``.html``, are ignored. Each page is passed to
    ``extract_f1_race_data`` and the result is written to
    ``<output_base_path>/<year>/<race>.json``, where ``<race>`` is the race
    folder name with ``os.path.splitext`` applied (so anything after a final
    dot in the folder name is dropped). Year output folders are created as
    needed.

    Because the output name depends only on the race folder, several ``.html``
    files in one race folder write to the same JSON file. Entries are visited
    in sorted order, so the alphabetically last page is the one that remains.

    Failures while reading, extracting or writing a page are printed and do
    not stop the run. A result consisting solely of an ``"error"`` key is
    still written, but is also reported on stdout.

    Args:
        directory_path: Root folder containing one sub-folder per year.
        output_base_path: Root folder for the generated JSON files.

    Returns:
        None. A summary line is printed when the walk finishes.
    """
    # sorted(): os.listdir() returns entries in arbitrary order.
    for year_dir in sorted(os.listdir(directory_path)):
        year_path = os.path.join(directory_path, year_dir)
        if not os.path.isdir(year_path):
            continue

        for race_name in sorted(os.listdir(year_path)):
            race_path = os.path.join(year_path, race_name)
            if not os.path.isdir(race_path):
                continue

            for file_name in sorted(os.listdir(race_path)):
                if not file_name.endswith(".html"):
                    continue

                html_path = os.path.join(race_path, file_name)
                try:
                    with open(html_path, 'r', encoding='utf-8') as html_file:
                        html_content = html_file.read()
                    race_data = extract_f1_race_data(html_content)

                    # The extractor reports failure as a dict whose only key
                    # is "error"; surface that instead of saving it silently.
                    if isinstance(race_data, dict) and set(race_data) == {"error"}:
                        print(f"Extraction failed for {html_path}: {race_data['error']}")

                    # Create output directory structure
                    output_year_path = os.path.join(output_base_path, year_dir)
                    os.makedirs(output_year_path, exist_ok=True)

                    # Save race data as JSON
                    output_file = os.path.join(
                        output_year_path, f"{os.path.splitext(race_name)[0]}.json"
                    )
                    with open(output_file, 'w', encoding='utf-8') as json_file:
                        json.dump(race_data, json_file, indent=4, ensure_ascii=False)
                except Exception as e:
                    # Best-effort batch: report the exact file and carry on.
                    print(f"Error processing {html_path}: {e}")

    print(f"Processed races saved in: {output_base_path}")

# Example run: read the saved pages from the input tree and write one JSON
# file per race into the output tree.
directory_path = "f1_all_seasons_html"
output_base_path = "f1_all_seasons_results"
process_all_races(
    directory_path=directory_path,
    output_base_path=output_base_path,
)

Processed races saved in: f1_all_seasons_results
