In [3]:
pip install requests beautifulsoup4 pandas

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Scrape article URLs from the hub page: https://www.cyclingnews.com/tour-de-france/ (from 27.06.2025)
import requests
from bs4 import BeautifulSoup
import time

def get_tour_de_france_article_links():
    """
    Scrapes all unique article links from the first 4 pages of the 
    CyclingNews Tour de France section using a predefined list of URLs.

    Returns:
        list: A list of unique article URLs.
    """
    # Use the exact page URLs you provided
    page_urls = [
        "https://www.cyclingnews.com/tour-de-france/",
        "https://www.cyclingnews.com/tour-de-france/page/2/",
        "https://www.cyclingnews.com/tour-de-france/page/3/",
        "https://www.cyclingnews.com/tour-de-france/page/4/"
    ]
    
    unique_article_urls = set()
    
    print("Starting to scrape article links...")

    # Set a user-agent header to mimic a real browser visit
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    for url in page_urls:
        print(f"Scraping page: {url}")
        
        try:
            # Make the HTTP request
            response = requests.get(url, headers=headers, timeout=15)
            # This will raise an error if the page is not found (e.g., 404)
            response.raise_for_status()
            
            # Parse the HTML content
            soup = BeautifulSoup(response.content, 'html.parser')
            
            # Find all <a> tags with the specific class 'article-link'
            links_on_page = soup.find_all('a', class_='article-link')
            
            if not links_on_page:
                print("  --> No links with class 'article-link' found on this page.")
                continue

            found_count = 0
            for link in links_on_page:
                # Ensure the link tag has an 'href' attribute
                if 'href' in link.attrs:
                    article_url = link['href']
                    # The URLs are already absolute, so we can add them directly
                    unique_article_urls.add(article_url)
                    found_count += 1
            
            print(f"  --> Found {found_count} article links on this page.")

            # Be polite and wait a moment before scraping the next page
            time.sleep(1)

        except requests.exceptions.RequestException as e:
            print(f"  --> Failed to fetch or process page {url}. Error: {e}")
            # Continue to the next URL even if one fails
            continue
            
    return list(unique_article_urls)

# --- Main script execution ---
if __name__ == '__main__':
    # Call the function to get the links
    scraped_links = get_tour_de_france_article_links()
    
    if scraped_links:
        print("\n-------------------------------------------")
        print(f"✅ Success! Found a total of {len(scraped_links)} unique article links.")
        print("-------------------------------------------\n")
        
        print("Here is a sample of the links found:")
        # Print the first 15 links as a sample
        for i, link in enumerate(scraped_links[:15]):
            print(f"{i+1:02d}: {link}")
    else:
        print("\n-------------------------------------------")
        print("❌ No article links were found. Please check the website structure and class names again.")
        print("-------------------------------------------")

Starting to scrape article links...
Scraping page: https://www.cyclingnews.com/tour-de-france/
  --> Found 10 article links on this page.
Scraping page: https://www.cyclingnews.com/tour-de-france/page/2/
  --> Found 10 article links on this page.
Scraping page: https://www.cyclingnews.com/tour-de-france/page/3/
  --> Found 10 article links on this page.
Scraping page: https://www.cyclingnews.com/tour-de-france/page/4/
  --> Found 10 article links on this page.

-------------------------------------------
✅ Success! Found a total of 40 unique article links.
-------------------------------------------

Here is a sample of the links found:
01: https://www.cyclingnews.com/news/3-2-1-cest-partiiieeee-remco-evenepoel-begins-tour-de-france-preparation/
02: https://www.cyclingnews.com/news/it-is-not-a-game-it-is-a-fight-lotto-ceo-gives-insight-into-sponsorship-struggle-after-finding-temporary-tour-de-france-solution/
03: https://www.cyclingnews.com/news/compared-to-last-year-we-have-taken-anot

In [5]:
pip install pandas nltk

Note: you may need to restart the kernel to use updated packages.


In [6]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\raclo\AppData\Roaming\nltk_data...


True

In [7]:
# Scrape and analyse articles
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# --- This function is from our previous success ---
def get_tour_de_france_article_links():
    """
    Scrapes all unique article links from the first 4 pages of the 
    CyclingNews Tour de France section.
    """
    page_urls = [
        "https://www.cyclingnews.com/tour-de-france/",
        "https://www.cyclingnews.com/tour-de-france/page/2/",
        "https://www.cyclingnews.com/tour-de-france/page/3/",
        "https://www.cyclingnews.com/tour-de-france/page/4/"
    ]
    unique_article_urls = set()
    print("Step 1: Starting to scrape article links...")
    headers = {'User-Agent': 'Mozilla/5.0'}

    for url in page_urls:
        try:
            response = requests.get(url, headers=headers, timeout=15)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            links_on_page = soup.find_all('a', class_='article-link')
            for link in links_on_page:
                if 'href' in link.attrs:
                    unique_article_urls.add(link['href'])
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            print(f"  --> Warning: Could not process page {url}. Error: {e}")
            
    print(f"-> Found {len(unique_article_urls)} unique article links.")
    return list(unique_article_urls)

def scrape_and_analyze_article(article_url, analyzer):
    """
    Scrapes the content of a single article, analyzes its sentiment,
    and returns the structured data.
    """
    try:
        response = requests.get(article_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # --- Scrape the key information ---
        headline = soup.find('h1').get_text(strip=True) if soup.find('h1') else "N/A"
        pub_date_tag = soup.find('time', class_='published-date')
        pub_date = pub_date_tag['datetime'] if pub_date_tag else "N/A"
        
        article_body_div = soup.find('div', id='article-body')
        if article_body_div:
            # Combine the text from all paragraphs into a single string
            paragraphs = article_body_div.find_all('p')
            full_text = "\n".join([p.get_text(strip=True) for p in paragraphs])
        else:
            full_text = ""

        # --- Perform Sentiment Analysis ---
        # We analyze the headline and the body text together for a more accurate score
        text_to_analyze = f"{headline}. {full_text}"
        
        if not text_to_analyze.strip():
            return None # Skip if there's no text to analyze

        # Get the polarity scores from VADER
        sentiment_scores = analyzer.polarity_scores(text_to_analyze)
        compound_score = sentiment_scores['compound']

        return {
            'headline': headline,
            'publication_date': pub_date,
            'url': article_url,
            'compound_sentiment': compound_score, # The score we care about
            'full_text': full_text
        }

    except requests.exceptions.RequestException as e:
        print(f"  --> Failed to fetch or process article {article_url}. Error: {e}")
        return None

# --- Main script execution ---
if __name__ == '__main__':
    # Get the list of links first
    article_links = get_tour_de_france_article_links()
    
    if not article_links:
        print("No article links found. Exiting.")
    else:
        print("\nStep 2: Initializing sentiment analyzer and scraping content...")
        
        # Initialize the VADER sentiment analyzer
        sia = SentimentIntensityAnalyzer()
        
        all_articles_data = []
        total_links = len(article_links)

        # Loop through each link, scrape, and analyze
        for i, link in enumerate(article_links):
            print(f"  ({i+1}/{total_links}) Scraping and analyzing: {link}")
            article_data = scrape_and_analyze_article(link, sia)
            
            # Add the data to our list if scraping was successful
            if article_data:
                all_articles_data.append(article_data)
            
            time.sleep(0.5) # Be polite

        if not all_articles_data:
            print("\nCould not scrape content from any of the articles.")
        else:
            # Convert the list of dictionaries into a pandas DataFrame
            df = pd.DataFrame(all_articles_data)
            
            print("\n-------------------------------------------")
            print(f"✅ Success! Scraped and analyzed {len(df)} articles.")
            print("-------------------------------------------\n")

            # Display a sample of the final DataFrame
            print("Here is a sample of your data with sentiment scores:")
            print(df[['headline', 'publication_date', 'compound_sentiment']].head().to_string())

            # Save the complete DataFrame to a CSV file for your project
            output_filename = "tour_de_france_articles_with_sentiment.csv"
            df.to_csv(output_filename, index=False, encoding='utf-8-sig')
            
            print(f"\nAll data has been saved to '{output_filename}'")
            print("You can now open this file in Excel, Python (with pandas), or any other analysis tool.")

Step 1: Starting to scrape article links...
-> Found 40 unique article links.

Step 2: Initializing sentiment analyzer and scraping content...
  (1/40) Scraping and analyzing: https://www.cyclingnews.com/news/3-2-1-cest-partiiieeee-remco-evenepoel-begins-tour-de-france-preparation/
  (2/40) Scraping and analyzing: https://www.cyclingnews.com/news/it-is-not-a-game-it-is-a-fight-lotto-ceo-gives-insight-into-sponsorship-struggle-after-finding-temporary-tour-de-france-solution/
  (3/40) Scraping and analyzing: https://www.cyclingnews.com/news/compared-to-last-year-we-have-taken-another-big-step-visma-management-rate-jonas-vingegaard-as-favourite-for-tour-de-france/
  (4/40) Scraping and analyzing: https://www.cyclingnews.com/news/tough-irregular-very-long-remco-evenepoel-prepares-for-tour-de-france-with-col-de-la-loze-recon/
  (5/40) Scraping and analyzing: https://www.cyclingnews.com/news/no-bad-feeling-towards-uno-x-after-tour-de-france-non-selection-says-alexander-kristoff/
  (6/40) Scr

In [10]:
df_loaded = pd.read_csv(output_filename, encoding='utf-8-sig')
df_loaded.head(40)

Unnamed: 0,headline,publication_date,url,compound_sentiment,full_text
0,"'3-2-1, C’est Partiiieeee' - Remco Evenepoel b...",,https://www.cyclingnews.com/news/3-2-1-cest-pa...,0.916,Remco Evenepoelhas kicked off his preparation ...
1,"'It is not a game, it is a fight' – Lotto CEO ...",,https://www.cyclingnews.com/news/it-is-not-a-g...,0.9967,Amid the struggle for sponsorship among lower-...
2,"'Compared to last year, we have taken another ...",,https://www.cyclingnews.com/news/compared-to-l...,0.9828,Visma-Lease A Bike manager Richard Plugge has ...
3,"'Tough, irregular, very long' – Remco Evenepoe...",,https://www.cyclingnews.com/news/tough-irregul...,0.9133,The topTour de Francecontenders are currently ...
4,'No bad feeling' towards Uno-X after Tour de F...,,https://www.cyclingnews.com/news/no-bad-feelin...,0.9606,"Alexander Kristoffhas said that ""there's no ba..."
5,'It's all out of respect' – Visma-Lease a Bike...,,https://www.cyclingnews.com/news/its-all-out-o...,0.9996,The Grand Tour rivalry betweenVisma-Lease a Bi...
6,Tour de France sprinters prepare for opening d...,,https://www.cyclingnews.com/news/tour-de-franc...,0.9983,With the Critérium du Dauphiné in the books an...
7,Relegation Watch 2025: Tour de France will be ...,,https://www.cyclingnews.com/news/relegation-wa...,0.9938,For full size graphicclick here.\nWith the Cri...
8,'There's more at stake than just a Tour de Fra...,,https://www.cyclingnews.com/news/theres-more-a...,0.9968,Dylan Groenewegen has thrown his hat into the ...
9,'I’m gutted' – Stevie Williams to miss Tour de...,,https://www.cyclingnews.com/news/im-gutted-ste...,0.2303,Stevie Williamshas revealed he will miss the T...


In [None]:
# Link articles to cyclists and aggregate the sentiment for each cyclist
import pandas as pd

# Assume 'df' is the DataFrame from the previous step
# df = pd.read_csv('tour_de_france_articles_with_sentiment.csv')

# Step 1: Create your list of top cyclists to track (can be much longer)
# For simplicity, we'll just use a few names and last names.
cyclists_to_track = {
    "Tadej Pogačar": ["Pogačar", "Tadej Pogačar"],
    "Jonas Vingegaard": ["Vingegaard", "Jonas Vingegaard"],
    "Remco Evenepoel": ["Evenepoel", "Remco Evenepoel"],
    "Adam Yates": ["Adam Yates"],
    "Carlos Rodríguez": ["Rodríguez", "Carlos Rodríguez"],
    



}

# Step 2: Link articles to cyclists
cyclist_mentions = []
for index, article in df.iterrows():
    text_to_search = article['headline'] + " " + article['full_text']
    
    for cyclist_name, search_terms in cyclists_to_track.items():
        for term in search_terms:
            if term in text_to_search:
                # If found, add a record for this mention
                cyclist_mentions.append({
                    'cyclist': cyclist_name,
                    'sentiment': article['compound_sentiment'],
                    'url': article['url']
                })
                # Break after the first match to avoid double-counting in the same article
                break

# Create a new DataFrame from the mentions
df_mentions = pd.DataFrame(cyclist_mentions)

# Step 3: Aggregate the data to create rankings
if not df_mentions.empty:
    # Group by cyclist and calculate the mean sentiment and number of mentions
    cyclist_ranking = df_mentions.groupby('cyclist').agg(
        average_sentiment=('sentiment', 'mean'),
        mention_count=('sentiment', 'count')
    ).reset_index()

    # --- Step 4: Rank the cyclists ---
    
    # Rank by most positive sentiment (for cyclists with at least 2 mentions)
    top_10_positive = cyclist_ranking[cyclist_ranking['mention_count'] > 1].sort_values(
        by='average_sentiment', ascending=False
    ).head(10)

    # Rank by most mentioned
    top_10_mentioned = cyclist_ranking.sort_values(
        by='mention_count', ascending=False
    ).head(10)

    print("--- Top Cyclists by Average Sentiment Score ---")
    print(top_10_positive.to_string(index=False))

    print("\n--- Top Cyclists by Number of Mentions ---")
    print(top_10_mentioned.to_string(index=False))

else:
    print("No mentions of the tracked cyclists were found in the articles.")

In [17]:
# Link articles to cyclists and aggregate the sentiment for each cyclist
import pandas as pd

# Assume 'df' is the DataFrame from the previous step
# df = pd.read_csv('tour_de_france_articles_with_sentiment.csv')

# Step 1: Create your list of top cyclists to track (can be much longer)
# For simplicity, we'll just use a few names and last names.
cyclists_to_track = {
    "Tadej Pogačar": ["Pogačar", "Tadej Pogačar"],
    "Jonas Vingegaard": ["Vingegaard", "Jonas Vingegaard"],
    "Remco Evenepoel": ["Evenepoel", "Remco Evenepoel"],
    "Adam Yates": ["Adam Yates"],
    "João Almeida": ["Almeida", "João Almeida"],
    "Primož Roglič": ["Roglič", "Primož Roglič"],
    "Enric Mas": ["Mas", "Enric Mas"],
    "Richard Carapaz": ["Carapaz", "Richard Carapaz"],
    "Ben O'Connor": ["O'Connor", "Ben O'Connor"],
    "Simon Yates": ["Simon Yates"],
    "David Gaudu": ["Gaudu", "David Gaudu"],
    "Guillaume Martin": ["Martin", "Guillaume Martin"],
    "Emanuel Buchmann": ["Buchmann", "Emanuel Buchmann"],
    "Sepp Kuss": ["Kuss", "Sepp Kuss"],
    "Aleksandr Vlasov": ["Vlasov", "Aleksandr Vlasov"],
    "Matteo Jorgenson": ["Jorgenson", "Matteo Jorgenson"],
    "Neilson Powless": ["Powless", "Neilson Powless"],
    "Oscar Onley": ["Onley", "Oscar Onley"],
    "Mattias Skjelmose": ["Skjelmose", "Mattias Skjelmose"]
    }

# Step 2: Link articles to cyclists
cyclist_mentions = []
for index, article in df.iterrows():
    text_to_search = article['headline'] + " " + article['full_text']
    
    for cyclist_name, search_terms in cyclists_to_track.items():
        for term in search_terms:
            if term in text_to_search:
                # If found, add a record for this mention
                cyclist_mentions.append({
                    'cyclist': cyclist_name,
                    'sentiment': article['compound_sentiment'],
                    'url': article['url']
                })
                # Break after the first match to avoid double-counting in the same article
                break

# Create a new DataFrame from the mentions
df_mentions = pd.DataFrame(cyclist_mentions)

# Step 3: Aggregate the data to create rankings
if not df_mentions.empty:
    # Group by cyclist and calculate the mean sentiment and number of mentions
    cyclist_ranking = df_mentions.groupby('cyclist').agg(
        average_sentiment=('sentiment', 'mean'),
        mention_count=('sentiment', 'count')
    ).reset_index()

    # --- Step 4: Rank the cyclists ---
    
    # Rank by most positive sentiment (for cyclists with at least 2 mentions)
    top_10_positive = cyclist_ranking[cyclist_ranking['mention_count'] > 1].sort_values(
        by='average_sentiment', ascending=False
    ).head(10)

    # Rank by most mentioned
    top_10_mentioned = cyclist_ranking.sort_values(
        by='mention_count', ascending=False
    ).head(10)

    print("--- Top Cyclists by Average Sentiment Score ---")
    print(top_10_positive.to_string(index=False))

    print("\n--- Top Cyclists by Number of Mentions ---")
    print(top_10_mentioned.to_string(index=False))

else:
    print("No mentions of the tracked cyclists were found in the articles.")

--- Top Cyclists by Average Sentiment Score ---
          cyclist  average_sentiment  mention_count
      Simon Yates           0.999175              4
     Ben O'Connor           0.998350              2
        Enric Mas           0.997467              3
     João Almeida           0.997180              5
       Adam Yates           0.995500              3
Mattias Skjelmose           0.990060              5
    Primož Roglič           0.984710             10
  Remco Evenepoel           0.982215             20
        Sepp Kuss           0.967960              5
 Matteo Jorgenson           0.958883              6

--- Top Cyclists by Number of Mentions ---
          cyclist  average_sentiment  mention_count
    Tadej Pogačar           0.901788             24
 Jonas Vingegaard           0.887045             22
  Remco Evenepoel           0.982215             20
    Primož Roglič           0.984710             10
 Matteo Jorgenson           0.958883              6
     João Almeida       

In [20]:
df_mentions.head(15)

Unnamed: 0,cyclist,sentiment,url
0,Tadej Pogačar,0.916,https://www.cyclingnews.com/news/3-2-1-cest-pa...
1,Jonas Vingegaard,0.916,https://www.cyclingnews.com/news/3-2-1-cest-pa...
2,Remco Evenepoel,0.916,https://www.cyclingnews.com/news/3-2-1-cest-pa...
3,Tadej Pogačar,0.9828,https://www.cyclingnews.com/news/compared-to-l...
4,Jonas Vingegaard,0.9828,https://www.cyclingnews.com/news/compared-to-l...
5,Tadej Pogačar,0.9133,https://www.cyclingnews.com/news/tough-irregul...
6,Jonas Vingegaard,0.9133,https://www.cyclingnews.com/news/tough-irregul...
7,Remco Evenepoel,0.9133,https://www.cyclingnews.com/news/tough-irregul...
8,Primož Roglič,0.9133,https://www.cyclingnews.com/news/tough-irregul...
9,Matteo Jorgenson,0.9133,https://www.cyclingnews.com/news/tough-irregul...


In [21]:
# Rank by Most Positive Coverage (Average Sentiment)
# First, filter for riders with more than one mention to get more reliable scores
reliable_ranking = cyclist_ranking[cyclist_ranking['mention_count'] > 1]

# Now, sort by the average sentiment score
top_10_by_sentiment = reliable_ranking.sort_values(
    by='average_sentiment', 
    ascending=False
).head(10)

print("--- Top 10 Riders by Most Positive Average Sentiment ---")
print(top_10_by_sentiment)

--- Top 10 Riders by Most Positive Average Sentiment ---
              cyclist  average_sentiment  mention_count
16        Simon Yates           0.999175              4
1        Ben O'Connor           0.998350              2
4           Enric Mas           0.997467              3
7        João Almeida           0.997180              5
0          Adam Yates           0.995500              3
9   Mattias Skjelmose           0.990060              5
12      Primož Roglič           0.984710             10
13    Remco Evenepoel           0.982215             20
15          Sepp Kuss           0.967960              5
8    Matteo Jorgenson           0.958883              6


In [22]:
# Rank by media prominence
# Sort by the number of times each rider was mentioned
top_10_by_mentions = cyclist_ranking.sort_values(
    by='mention_count', 
    ascending=False
).head(10)

print("\n--- Top 10 Riders by Media Prominence (Most Mentions) ---")
print(top_10_by_mentions)


--- Top 10 Riders by Media Prominence (Most Mentions) ---
              cyclist  average_sentiment  mention_count
17      Tadej Pogačar           0.901788             24
6    Jonas Vingegaard           0.887045             22
13    Remco Evenepoel           0.982215             20
12      Primož Roglič           0.984710             10
8    Matteo Jorgenson           0.958883              6
7        João Almeida           0.997180              5
15          Sepp Kuss           0.967960              5
9   Mattias Skjelmose           0.990060              5
16        Simon Yates           0.999175              4
4           Enric Mas           0.997467              3


In [23]:
# Rank by combined score (Sentiment + Prominence)
# Create the combined score
cyclist_ranking['combined_score'] = cyclist_ranking['average_sentiment'] * cyclist_ranking['mention_count']

# Sort by this new combined score
top_10_by_combined_score = cyclist_ranking.sort_values(
    by='combined_score', 
    ascending=False
).head(10)

print("\n--- Top 10 Riders by Combined Score (Sentiment * Mentions) ---")
print(top_10_by_combined_score)


--- Top 10 Riders by Combined Score (Sentiment * Mentions) ---
              cyclist  average_sentiment  mention_count  combined_score
17      Tadej Pogačar           0.901788             24         21.6429
13    Remco Evenepoel           0.982215             20         19.6443
6    Jonas Vingegaard           0.887045             22         19.5150
12      Primož Roglič           0.984710             10          9.8471
8    Matteo Jorgenson           0.958883              6          5.7533
7        João Almeida           0.997180              5          4.9859
9   Mattias Skjelmose           0.990060              5          4.9503
15          Sepp Kuss           0.967960              5          4.8398
16        Simon Yates           0.999175              4          3.9967
4           Enric Mas           0.997467              3          2.9924
