In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Base URL for the races category (first listing page). Later listing pages
# are addressed as '<base_url>/page/<n>' by scrape_irunfar_race_articles, so
# the value is written without a trailing slash.
base_url = 'https://www.irunfar.com/category/races'

# Function to get the soup object for a given URL
def get_soup(url, timeout=30):
    """Download *url* and parse the response body as HTML.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection, which would freeze the whole scrape.

    Returns:
        A ``BeautifulSoup`` object (built with ``html.parser``) when the
        server answers with status 200, otherwise ``None``. Network-level
        failures (DNS errors, refused connections, timeouts, ...) are
        reported on stdout and also yield ``None`` so that one bad page
        does not abort a long multi-page run.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport errors like any other failed page: report and skip.
        print(f"Failed to retrieve page {url}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to retrieve page {url}")
        return None

    return BeautifulSoup(response.content, 'html.parser')

# Function to scrape the main race page for article links
def scrape_article_links(url):
    """Return the first link found inside each <article> element of a page.

    Args:
        url: Address of the listing page to inspect.

    Returns:
        A list of ``href`` values, one per ``<article>`` that contains at
        least one ``<a>`` tag with an ``href`` attribute, in document order.
        An empty list is returned when the page could not be retrieved.
    """
    page = get_soup(url)
    if page is None:
        return []

    # One candidate per <article>: its first anchor carrying an href, or None.
    first_anchors = (
        entry.find('a', href=True) for entry in page.find_all('article')
    )
    return [anchor['href'] for anchor in first_anchors if anchor]

# Function to scrape a single article page for its content
def _clean_text(tag):
    """Return the text of *tag* with every run of whitespace collapsed to one space."""
    # get_text(strip=True) without a separator concatenates the pieces around
    # inline tags ("to <a>x</a>" -> "tox"); taking the raw text and normalising
    # whitespace keeps the words apart exactly where the markup had spacing.
    return ' '.join(tag.get_text().split())


def scrape_article_content(article_url):
    """Fetch one article page and extract its title and body text.

    The title is taken from the first of ``<h1>``, ``<h2>`` or ``<title>``
    that actually contains text, so an empty heading no longer hides a usable
    fallback. The body is the text of every ``<p>`` inside the first
    ``<div class="entry-content">``, joined with single spaces.

    Args:
        article_url: Address of the article page.

    Returns:
        ``{'Header': <title>, 'Body': <content>}`` on success, or ``None``
        when the page cannot be retrieved, no non-empty title is found, or
        the ``entry-content`` container is missing (the last two cases are
        reported on stdout).
    """
    soup = get_soup(article_url)
    if soup is None:
        return None

    # Attempt to scrape the article title, preferring the most specific tag.
    title = None
    for tag_name in ('h1', 'h2', 'title'):
        tag = soup.find(tag_name)
        if tag is None:
            continue
        candidate = _clean_text(tag)
        if candidate:
            title = candidate
            break

    if not title:
        print(f"Title not found for {article_url}")
        return None

    # Scrape the article content (assuming it's in 'entry-content' class)
    content_div = soup.find('div', class_='entry-content')
    if content_div is None:
        print(f"Content not found for {article_url}")
        return None

    content = ' '.join(_clean_text(p) for p in content_div.find_all('p'))

    # Return the scraped article with title and content
    return {'Header': title, 'Body': content}

# Main scraping logic
def scrape_irunfar_race_articles(base_url, max_pages=60):
    """Walk the paginated listing and scrape every "results" article.

    Args:
        base_url: Address of the first listing page; page ``n`` (for
            ``n > 1``) is requested as ``<base_url>/page/<n>``.
        max_pages: Number of listing pages to visit, starting at page 1.

    Returns:
        A list of the dictionaries returned by ``scrape_article_content``
        for each link whose URL contains ``'results'`` and that was scraped
        successfully, in the order the links were encountered.
    """
    collected = []

    for page_num in range(1, max_pages + 1):
        print(f"Scraping page {page_num}")

        # The first page lives at the base URL itself; the rest are numbered.
        if page_num == 1:
            listing_url = base_url
        else:
            listing_url = f'{base_url}/page/{page_num}'

        for article_url in scrape_article_links(listing_url):
            # Only scrape articles with "results" in the URL
            if 'results' not in article_url:
                continue

            print(f"Scraping article: {article_url}")
            scraped = scrape_article_content(article_url)
            if scraped:
                collected.append(scraped)

    return collected

# Store results into a DataFrame and save as CSV with "Header" and "Body"
def save_articles_to_csv(articles, file_name='irunfar_races_articles_results.csv'):
    """Write the scraped articles to a CSV file.

    Args:
        articles: List of dictionaries carrying ``'Header'`` and ``'Body'``
            entries.
        file_name: Destination path of the CSV file.

    The file has exactly the columns ``Header`` and ``Body`` (in that order)
    and no index column. A one-line summary is printed once it is written.
    """
    column_order = ['Header', 'Body']
    table = pd.DataFrame(data=articles, columns=column_order)
    table.to_csv(file_name, index=False)
    print(f"Saved {len(articles)} articles to {file_name}")

# Main Execution
# Scrape the listing pages under base_url, then write everything that was
# collected to the default CSV file of save_articles_to_csv.
if __name__ == "__main__":
    # max_pages caps how many listing pages are visited.
    articles = scrape_irunfar_race_articles(base_url, max_pages=60)  # Adjust max_pages as needed
    save_articles_to_csv(articles)


Scraping page 1
Scraping article: https://www.irunfar.com/2024-grand-trail-des-templiers-results-caitlin-fielder-and-thomas-cardin-victorious
Scraping article: https://www.irunfar.com/2024-mountain-running-world-cup-finals-trail-race-results
Scraping article: https://www.irunfar.com/2024-mountain-running-world-cup-finals-vk-results
Scraping article: https://www.irunfar.com/mammoth-trail-fest-26k-results-joyce-muthoni-njeru-and-elhousine-elazzaoui-dominate
Scraping article: https://www.irunfar.com/2024-run-rabbit-run-100-mile-results
Scraping article: https://www.irunfar.com/2024-utmb-results
Scraping article: https://www.irunfar.com/2024-ccc-results-hawks-wins-again-mccann-victorious-in-step-up-in-distance
Scraping article: https://www.irunfar.com/2024-occ-results-hemming-and-yao-win-utmb-50k-world-series-final
Scraping page 2
Scraping article: https://www.irunfar.com/2024-tds-results
Scraping article: https://www.irunfar.com/2024-sierre-zinal-results
Scraping article: https://www.irun