In [None]:
import requests
import time
import tqdm
import pandas as pd
import os
import json
from bs4 import BeautifulSoup

In [None]:
# Small trial scrape: fetch the first result page(s) of the Jyllands-Posten
# search for a couple of terms and store date, headline and category of each
# hit in a CSV file.

# Set the option to display DataFrame columns without truncation
pd.set_option('display.max_colwidth', None)

# List of search terms
search_terms = [
    "Klimaforandringer", "CO2-udledning"
]

# Number of pages to scrape for each term
num_pages = 1

# Seconds to wait for the server before giving up on a request; without a
# timeout, requests.get() can block indefinitely.
request_timeout = 30

# One list per output column. Every scraped hit appends to all four lists at
# once, so the columns always have equal length and the DataFrame constructor
# cannot fail on ragged input.
data = {"Term": [], "Date": [], "Headline": [], "Category": []}

for term in search_terms:
    for page in range(1, num_pages + 1):
        url = f'https://jyllands-posten.dk/soeg/?term={term}&sort=newest&page={page}'

        try:
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")

            date_elements = soup.find_all("time", class_="pl-1")
            headline_elements = soup.find_all("h2", class_="font-article-heading text-lg font-medium leading-snug group-hover:underline md:text-2xl")
            category_elements = soup.find_all("span", class_="pr-1")

            if not date_elements:
                print(f"No dates found for {term} on page {page}.")
            if not headline_elements:
                print(f"No headlines found for {term} on page {page}.")
            if not category_elements:
                print(f"No category found for {term} on page {page}.")

            # The three element lists are matched by position. If their
            # lengths differ, only the leading complete rows are kept and a
            # warning is printed.
            # NOTE(review): with differing counts the positional pairing may
            # also be off for the kept rows - inspect such pages manually.
            element_counts = (len(date_elements), len(headline_elements), len(category_elements))
            if len(set(element_counts)) > 1:
                print(
                    f"Element counts differ for {term} on page {page} "
                    f"(dates, headlines, categories): {element_counts}. Keeping only complete rows."
                )

            for date_element, headline_element, category_element in zip(
                date_elements, headline_elements, category_elements
            ):
                data["Term"].append(term)
                data["Date"].append(date_element.text.strip())
                data["Headline"].append(headline_element.text.strip())
                data["Category"].append(category_element.text.strip())

        except requests.RequestException as e:
            print("Error fetching the web page:", e)
        except Exception as e:
            print("An error occurred:", e)

# Create a pandas DataFrame from the combined dictionary
df = pd.DataFrame(data)

df.to_csv('jyllands_posten.csv', index=False)

# Preview the result; this has to be the last expression of the cell to be
# rendered by the notebook.
df.head()

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import locale

# Scrape the Jyllands-Posten search results (newest first) for every search
# term, page by page, until a hit older than `target_date` is reached, then
# save date, headline and category of all collected hits as a CSV file.

# Set the locale to Danish for month name parsing
# (raises locale.Error if the da_DK locale is not installed on the system).
locale.setlocale(locale.LC_TIME, 'da_DK.utf8')

# Set the option to display DataFrame columns without truncation
pd.set_option('display.max_colwidth', None)

# List of search terms
search_terms = [
    "CO2-mål", "CO2-reduktion", "CO2-udledning", "CO2-udledninger",
    "global opvarmning", "havstigning", "havstigninger", "klimaaftryk",
    "klimaangst", "klimaforandring", "klimaforandringer", "klimakrise",
    "klimakrisen", "klimasynder"
]

data = {"Term": [], "Date": [], "Headline": [], "Category": []}

# Hits dated before this day are not collected.
target_date = datetime.strptime("1. januar 2011", "%d. %B %Y")
target_date_str = target_date.strftime("%d. %B %Y")  # not used in this cell

# Your custom User-Agent string
headers = {
    'User-Agent': 'Student from Copenhagen University - vmn430@alumni.ku.dk'
}

# Seconds to wait for the server before giving up on a single request.
request_timeout = 30
# Number of consecutive failed requests for one page before the term is
# abandoned; prevents an endless retry loop when the site is unreachable.
max_failed_requests = 3

for term in search_terms:
    page = 1
    failed_requests = 0

    while True:
        url = f'https://jyllands-posten.dk/soeg/?term={term}&sort=newest&page={page}'

        try:
            # Include the custom headers in your request
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print("Error fetching the web page:", e)
            failed_requests += 1
            if failed_requests >= max_failed_requests:
                print(f"Giving up on {term} after {failed_requests} failed requests for page {page}.")
                break
            # Pause before retrying the same page so the server is not hammered.
            time.sleep(0.5)
            continue
        failed_requests = 0

        soup = BeautifulSoup(response.content, "html.parser")

        date_elements = soup.find_all("time", class_="pl-1")
        headline_elements = soup.find_all("h2", class_="font-article-heading text-lg font-medium leading-snug group-hover:underline md:text-2xl")
        category_elements = soup.find_all("span", class_="pr-1")

        if not date_elements:
            # A page without any hits is treated as the end of the results
            # for this term.
            print(f"No dates found for {term} on page {page}.")
            break

        # Set as soon as a hit older than target_date is seen on this page.
        # Because results are requested newest-first, everything after it is
        # older as well, so paging for this term can stop.
        reached_target_date = False

        for date_element, headline_element, category_element in zip(date_elements, headline_elements, category_elements):
            date_str = date_element.text.strip()
            try:
                date = datetime.strptime(date_str, "%d. %B %Y")
            except ValueError:
                # Skip a hit whose date is not in the "<day>. <month> <year>"
                # form instead of aborting the whole page.
                print(f"Skipping unparseable date {date_str!r} for {term} on page {page}.")
                continue

            if date < target_date:
                reached_target_date = True
                break

            data["Term"].append(term)
            # Store the parsed datetime so the column does not need a second,
            # locale-dependent parse later.
            data["Date"].append(date)
            data["Headline"].append(headline_element.text.strip())
            data["Category"].append(category_element.text.strip())
            print(f"Appended: {term} | {date_str}")  # Debugging

        if reached_target_date:
            break

        page += 1  # Move to the next page

        # Introduce a delay of 0.5 second before making the next request
        time.sleep(0.5)

# Create a pandas DataFrame from the combined dictionary
df = pd.DataFrame(data)

# Sort the DataFrame by Date (to_datetime guarantees a datetime column even
# when nothing was collected)
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date", ascending=True)

df.to_csv('jyllands_posten_data.csv', index=False)


In [17]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import time
import locale

# Scrape the Jyllands-Posten search results (newest first) for every search
# term, page by page, until a hit older than `target_date` is reached or
# several consecutive pages contain no hits, then save date, headline and
# category of all collected hits as a CSV file.

# Set the locale to Danish for month name parsing
# (raises locale.Error if the da_DK locale is not installed on the system).
locale.setlocale(locale.LC_TIME, 'da_DK.utf8')

# Set the option to display DataFrame columns without truncation
pd.set_option('display.max_colwidth', None)

# List of search terms
search_terms = [
    "CO2-mål", "CO2-reduktion", "CO2-udledning", "CO2-udledninger",
    "global opvarmning", "havstigning", "havstigninger", "klimaaftryk",
    "klimaangst", "klimaforandring", "klimaforandringer", "klimakrise",
    "klimakrisen", "klimasynder"
]

data = {"Term": [], "Date": [], "Headline": [], "Category": []}

# Hits dated before this day are not collected.
target_date = datetime.strptime("1. januar 2011", "%d. %B %Y")

# Your custom User-Agent string
headers = {
    'User-Agent': 'Student from Copenhagen University - vmn430@alumni.ku.dk'
}

# Seconds to wait for the server before giving up on a single request.
request_timeout = 30
# Number of consecutive failed requests for one page before the term is
# abandoned; prevents an endless retry loop when the site is unreachable.
max_failed_requests = 3

for term in search_terms:
    page = 1
    no_dates_found_counter = 0  # Counter to keep track of consecutive pages with no dates found
    failed_requests = 0  # Consecutive failed requests for the current page

    while True:
        url = f'https://jyllands-posten.dk/soeg/?term={term}&sort=newest&page={page}'

        try:
            response = requests.get(url, headers=headers, timeout=request_timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print("Error fetching the web page:", e)
            failed_requests += 1
            if failed_requests >= max_failed_requests:
                print(f"Giving up on {term} after {failed_requests} failed requests for page {page}.")
                break
            # Pause before retrying the same page so the server is not hammered.
            time.sleep(0.5)
            continue
        failed_requests = 0

        soup = BeautifulSoup(response.content, "html.parser")

        date_elements = soup.find_all("time", class_="pl-1")
        headline_elements = soup.find_all("h2", class_="font-article-heading text-lg font-medium leading-snug group-hover:underline md:text-2xl")
        category_elements = soup.find_all("span", class_="pr-1")

        if not date_elements:
            print(f"No dates found for {term} on page {page}.")
            no_dates_found_counter += 1
            if no_dates_found_counter >= 5:  # Adjust this threshold as needed
                print(f"Stopping search for {term} after {no_dates_found_counter} consecutive pages with no dates found.")
                break
            # Nothing to collect on this page; try the next one after the
            # usual delay.
            page += 1
            time.sleep(0.5)
            continue

        # Reset counter if dates are found
        no_dates_found_counter = 0

        # Set as soon as a hit older than target_date is seen on this page.
        # Because results are requested newest-first, everything after it is
        # older as well, so paging for this term can stop. Tracking this per
        # page avoids reading a date left over from an earlier page or term.
        reached_target_date = False

        for date_element, headline_element, category_element in zip(date_elements, headline_elements, category_elements):
            date_str = date_element.text.strip()
            try:
                date = datetime.strptime(date_str, "%d. %B %Y")
            except ValueError:
                # Skip a hit whose date is not in the "<day>. <month> <year>"
                # form instead of aborting the whole page.
                print(f"Skipping unparseable date {date_str!r} for {term} on page {page}.")
                continue

            if date < target_date:
                reached_target_date = True
                break

            data["Term"].append(term)
            # Store the parsed datetime so the column does not need a second,
            # locale-dependent parse later.
            data["Date"].append(date)
            data["Headline"].append(headline_element.text.strip())
            data["Category"].append(category_element.text.strip())
            print(f"Appended: {term} | {date_str}")  # Debugging

        if reached_target_date:
            break

        page += 1

        # Introduce a delay of 0.5 second before making the next request
        time.sleep(0.5)

# Create a pandas DataFrame from the combined dictionary
df = pd.DataFrame(data)

# Sort the DataFrame by Date (to_datetime guarantees a datetime column even
# when nothing was collected)
df["Date"] = pd.to_datetime(df["Date"])
df = df.sort_values("Date", ascending=True)

df.to_csv('jyllands_posten_data.csv', index=False)


Appended: CO2-mål | 13. august 2023
Appended: CO2-mål | 10. august 2023
Appended: CO2-mål | 10. august 2023
Appended: CO2-mål | 9. august 2023
Appended: CO2-mål | 7. august 2023
Appended: CO2-mål | 7. august 2023
Appended: CO2-mål | 7. august 2023
Appended: CO2-mål | 3. august 2023
Appended: CO2-mål | 1. august 2023
Appended: CO2-mål | 27. juli 2023
Appended: CO2-mål | 27. juli 2023
Appended: CO2-mål | 26. juli 2023
Appended: CO2-mål | 25. juli 2023
Appended: CO2-mål | 20. juli 2023
Appended: CO2-mål | 20. juli 2023
Appended: CO2-mål | 13. juli 2023
Appended: CO2-mål | 8. juli 2023
Appended: CO2-mål | 7. juli 2023
Appended: CO2-mål | 28. juni 2023
Appended: CO2-mål | 27. juni 2023
Appended: CO2-mål | 22. juni 2023
Appended: CO2-mål | 21. juni 2023
Appended: CO2-mål | 20. juni 2023
Appended: CO2-mål | 19. juni 2023
Appended: CO2-mål | 19. juni 2023
Appended: CO2-mål | 15. juni 2023
Appended: CO2-mål | 15. juni 2023
Appended: CO2-mål | 15. juni 2023
Appended: CO2-mål | 14. juni 2023
Appe