In [12]:
import os
import requests
from bs4 import BeautifulSoup
import re
import json
from datetime import datetime

url = "https://www.esgtoday.com/germany-walks-back-call-to-scrap-the-eus-supply-chain-sustainability-law/"
slug = "germany_csddd_2025-05-26"
base_path = "C:/Users/Neha/Desktop/Stratalignproject/data/news"

# File paths
html_path = os.path.join(base_path, "raw_html", f"{slug}.html")
text_path = os.path.join(base_path, "parsed_text", f"{slug}.txt")
meta_path = os.path.join(base_path, "metadata", f"{slug}.json")

# Ensure folders exist
os.makedirs(os.path.dirname(html_path), exist_ok=True)
os.makedirs(os.path.dirname(text_path), exist_ok=True)
os.makedirs(os.path.dirname(meta_path), exist_ok=True)



In [13]:

# Scrape using beautifulsoup
response = requests.get(url)
if response.status_code != 200:
    raise Exception(f"Failed to fetch the page: {response.status_code}")

# Save raw HTML
with open(html_path, "w", encoding="utf-8") as f:
    f.write(response.text)

# Parse and extract article text
soup = BeautifulSoup(response.content, "html.parser")
paragraphs = soup.find_all("p")
text = "\n".join(p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 40)
text




'Germany’s federal government softened its language on calls to eliminate the EU’s new law requiring companies to address their negative impacts on human rights and the environment across their value chains – the Corporate Sustainability Due Diligence Directive (CSDDD) – with a government spokesperson clarifying that the government’s position is to “de-bureaucratize” and “streamline” the regulation.\nThe clarification, provided in a press conference by government spokesperson Stefan Kornelius, followscomments earlier this monthby German Chancellor Friedrich Merz to European Commission President Ursula von der Leyen saying that he expects the EU to “cancel this directive,” referring to the CSDDD.\nShortly after Merz’s comments, French PresidentEmmanuel Macron similarly calledto push the CSDDD regulation “out of the table,” adding that France’s position was “very aligned now with Chancellor Merz.”\nThe CSDDD was initiallyproposed by the European Commissionin February 2022, setting out ob

In [14]:
# Preprocess the text
def clean(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s\.,\-]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

cleaned_text = clean(text)

# Save cleaned text
with open(text_path, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

# Save metadata
metadata = {
    "url": url,
    "title": soup.title.string.strip() if soup.title else "",
    "scraped_on": datetime.today().strftime("%Y-%m-%d"),
    "source": "ESG Today",
    "file_slug": slug
}

with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2)



In [15]:
metadata

{'url': 'https://www.esgtoday.com/germany-walks-back-call-to-scrap-the-eus-supply-chain-sustainability-law/',
 'title': 'Germany Walks Back Call to Scrap the EU’s Supply Chain Sustainability Law - ESG Today',
 'scraped_on': '2025-05-27',
 'source': 'ESG Today',
 'file_slug': 'germany_csddd_2025-05-26'}

In [16]:
cleaned_text

'germanys federal government softened its language on calls to eliminate the eus new law requiring companies to address their negative impacts on human rights and the environment across their value chains the corporate sustainability due diligence directive csddd with a government spokesperson clarifying that the governments position is to de-bureaucratize and streamline the regulation. the clarification, provided in a press conference by government spokesperson stefan kornelius, followscomments earlier this monthby german chancellor friedrich merz to european commission president ursula von der leyen saying that he expects the eu to cancel this directive, referring to the csddd. shortly after merzs comments, french presidentemmanuel macron similarly calledto push the csddd regulation out of the table, adding that frances position was very aligned now with chancellor merz. the csddd was initiallyproposed by the european commissionin february 2022, setting out obligations for companies 

In [None]:
#trying newspaper3k

In [7]:
!pip install newspaper3k

Collecting newspaper3k
  Downloading newspaper3k-0.2.8-py3-none-any.whl (211 kB)
Collecting jieba3k>=0.35.1
  Downloading jieba3k-0.35.1.zip (7.4 MB)
Collecting feedparser>=5.2.1
  Downloading feedparser-6.0.11-py3-none-any.whl (81 kB)
Collecting tinysegmenter==0.3
  Downloading tinysegmenter-0.3.tar.gz (16 kB)
Collecting feedfinder2>=0.0.4
  Downloading feedfinder2-0.0.4.tar.gz (3.3 kB)
Collecting sgmllib3k
  Downloading sgmllib3k-1.0.0.tar.gz (5.8 kB)
Building wheels for collected packages: tinysegmenter, feedfinder2, jieba3k, sgmllib3k
  Building wheel for tinysegmenter (setup.py): started
  Building wheel for tinysegmenter (setup.py): finished with status 'done'
  Created wheel for tinysegmenter: filename=tinysegmenter-0.3-py3-none-any.whl size=13553 sha256=6c16dde0d94893ee68fe1855e553e01a29b2d1963772f3ef67705789099019cb
  Stored in directory: c:\users\neha\appdata\local\pip\cache\wheels\94\ad\df\a2a01300cea47d5695f242f7e925a805970106fd9e4b151468
  Building wheel for feedfinder2 (s

In [8]:
from newspaper import Article
from urllib.parse import urlparse

In [9]:
url = "https://www.esgtoday.com/ecb-warns-eu-against-removing-80-of-companies-from-mandatory-sustainability-reporting/"
slug = urlparse(url).path.strip("/").replace("-", "_").split("/")[-1]
slug += "_" + datetime.today().strftime("%Y-%m-%d")
slug

'ecb_warns_eu_against_removing_80_of_companies_from_mandatory_sustainability_reporting_2025-05-27'

In [10]:
#extract article
article = Article(url)
article.download()
article.parse()

In [17]:
article.authors

['Mark Segal']

In [18]:
article.images

{'https://secure.gravatar.com/avatar/21d81d20c4f3982cf0d942af2f7067935b36db66df71acba8da3cf4550654487?s=30&d=mm&r=g',
 'https://secure.gravatar.com/avatar/21d81d20c4f3982cf0d942af2f7067935b36db66df71acba8da3cf4550654487?s=96&d=mm&r=g',
 'https://www.esgtoday.com/wp-content/plugins/bloom/images/premade-image-07.png',
 'https://www.esgtoday.com/wp-content/plugins/bloom/images/premade-image-16.png',
 'https://www.esgtoday.com/wp-content/uploads/2022/07/ESG-Today-logo-sml.png',
 'https://www.esgtoday.com/wp-content/uploads/2025/01/300x250-1.jpg',
 'https://www.esgtoday.com/wp-content/uploads/2025/04/sidebar-image-May-2025-1.jpg',
 'https://www.esgtoday.com/wp-content/uploads/2025/05/Blackrock-1024x619.jpg',
 'https://www.esgtoday.com/wp-content/uploads/2025/05/EBA2-1024x649.jpg',
 'https://www.esgtoday.com/wp-content/uploads/2025/05/EBA2.jpg',
 'https://www.esgtoday.com/wp-content/uploads/2025/05/ECB2.jpg',
 'https://www.esgtoday.com/wp-content/uploads/2025/05/EU-Council3.jpg',
 'https://w

In [11]:
# Save raw HTML
html_path = os.path.join(base_path, "raw_html", f"{slug}.html")
with open(html_path, "w", encoding="utf-8") as f:
    f.write(article.html)

# Save cleaned text
text_path = os.path.join(base_path, "parsed_text", f"{slug}.txt")
with open(text_path, "w", encoding="utf-8") as f:
    f.write(article.text)

# Save metadata
meta = {
    "url": url,
    "title": article.title,
    "authors": article.authors,
    "published_date": str(article.publish_date) if article.publish_date else None,
    "scraped_on": datetime.today().strftime("%Y-%m-%d"),
    "source": "ESG Today",
    "file_slug": slug
}

meta_path = os.path.join(base_path, "metadata", f"{slug}.json")
with open(meta_path, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)