# Greenwashing detector

## Data Preparation

Load the PDF reports and extract their text.

In [32]:
import pdfplumber
import spacy
import os
import json
import re

In [33]:
# Load the spaCy English pipeline; process_report uses it for sentence
# splitting (doc.sents) and named-entity extraction (sent.ents).
nlp = spacy.load("en_core_web_md")
# Raise spaCy's limit on the number of characters per text, because
# process_report passes a whole cleaned report to nlp() in a single call.
nlp.max_length = 2_000_000

In [34]:
# pdf_dir: directory the PDF reports are read from (it is not created here).
# output_dir: directory the per-report JSON files are written to; it is
# created if missing, and an already existing directory is not an error.
pdf_dir = "reports"
output_dir = "reports_data"
os.makedirs(output_dir, exist_ok=True)

In [35]:
# Extract text from pdf
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of all pages of the PDF at *pdf_path*.

    Each page's text is followed by a newline. Pages for which
    ``extract_text()`` yields nothing (``None`` or an empty string) are
    skipped, so a PDF without extractable text produces ``""``.
    """
    with pdfplumber.open(pdf_path) as document:
        # Consume the generator inside the ``with`` block, while the PDF
        # is still open.
        page_texts = (page.extract_text() for page in document.pages)
        return "".join(f"{content}\n" for content in page_texts if content)

In [36]:
# Clean text
def clean_text(text):
    """Normalize raw PDF text into a single line of running text.

    Steps, in order:

    1. Blank out lines that contain only a number (page numbers).
    2. Collapse runs of newlines into a single newline.
    3. Re-join words that were hyphenated across a line break.
    4. Replace the remaining line breaks with spaces.
    5. Collapse runs of spaces and strip leading/trailing whitespace.

    Page numbers are removed *before* blank lines are collapsed. Removing
    them afterwards leaves an empty line behind that the line-break join
    does not touch, so the output would keep a stray blank line at every
    page boundary and a word hyphenated across a page break would not be
    re-joined.

    Args:
        text: Raw text as extracted from the PDF.

    Returns:
        The cleaned text.
    """
    # page numbers: [ \t] instead of \s keeps the match on one line
    text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)

    # blank lines (including those left behind by removed page numbers)
    text = re.sub(r'\n{2,}', '\n', text)

    # hyphenation at line end: "sustain-\nability" -> "sustainability"
    text = re.sub(r'-\n', '', text)

    # line break
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # reduce whitespaces
    text = re.sub(r' +', ' ', text)

    return text.strip()

In [37]:
def process_report(pdf_file, company, year):
    """Turn one PDF report into a JSON file of sentence records.

    The PDF ``pdf_file`` inside ``pdf_dir`` is extracted and cleaned, then
    split into sentences with the module-level ``nlp`` pipeline. Sentences
    shorter than 20 characters (after stripping) are dropped. Every kept
    sentence becomes a dict with the keys ``company``, ``year``,
    ``sentence``, ``entities`` (``(text, label)`` pairs) and ``label``
    (always ``"unlabeled"``). The list is written to
    ``<output_dir>/<company lower-cased>_<year>.json`` and a summary line
    is printed.
    """
    raw_text = extract_text_from_pdf(os.path.join(pdf_dir, pdf_file))
    doc = nlp(clean_text(raw_text))

    # Pair each sentence span with its stripped text so the text is
    # computed once and can be used for both the filter and the record.
    stripped_sents = ((span, span.text.strip()) for span in doc.sents)
    sentences = [
        {
            "company": company,
            "year": year,
            "sentence": stripped,
            "entities": [(ent.text, ent.label_) for ent in span.ents],
            "label": "unlabeled",
        }
        for span, stripped in stripped_sents
        if len(stripped) >= 20
    ]

    target_name = f"{company.lower()}_{year}.json"
    with open(os.path.join(output_dir, target_name), "w", encoding="utf-8") as handle:
        json.dump(sentences, handle, ensure_ascii=False, indent=2)

    print(f"Processed: {company} {year} - {len(sentences)} sentences")

In [38]:
# Reports to process, one tuple per report:
# (PDF file name inside pdf_dir, company name, report year).
# NOTE(review): the file names mix ".pdf" and ".PDF" extensions; presumably
# this mirrors the files on disk. Verify on case-sensitive file systems.
reports = [
    ("2020_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2020),
    ("2021_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2021),
    ("2022_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2022),
    ("2023_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2023),
    ("BMW_Group_Report_2020.PDF", "BMW", 2020),
    ("BMW_Group_Report_2021.pdf", "BMW", 2021),
    ("BMW_Group_Report_2022.PDF", "BMW", 2022),
    ("BMW-Group-Report-2023.pdf", "BMW", 2023),
    ("BMW-Group-Report-2024.pdf", "BMW", 2024),
    ("mercedes-benz-sustainability-report-2020.pdf", "Mercedes-Benz", 2020),
    ("mercedes-benz-sustainability-report-2021.pdf", "Mercedes-Benz", 2021),
    ("mercedes-benz-sustainability-report-2022.pdf", "Mercedes-Benz", 2022),
    ("mercedes-benz-sustainability-report-2023.pdf", "Mercedes-Benz", 2023),
    ("mercedes-benz-sustainability-report-2024.pdf", "Mercedes-Benz", 2024)
]

In [None]:
# Run the extraction pipeline once for every configured report.
for pdf_file, company, year in reports:
    process_report(pdf_file=pdf_file, company=company, year=year)

## Labeling

In [20]:
import csv

In [21]:
# input_dir: directory whose *.json files (one per report) are read for labeling.
# output_file: path of the combined CSV that receives all labeled sentences.
input_dir = "reports_data"
output_file = "labeled_data.csv"

In [22]:
def auto_label(sentence):
    """Assign a heuristic traffic-light label to a sentence.

    The sentence is lower-cased and checked against keyword rules in
    priority order; the first matching rule wins:

    * ``"red"``: contains one of the vague/image-claim phrases.
    * ``"yellow"``: contains one of the intention/long-term phrases.
    * ``"green"``: mentions a reduction together with a percentage,
      mentions renewable energy sources, or combines a "by 20.." deadline
      with a reduction/transition verb.
    * ``"unlabeled"``: no rule matched.

    The short keywords "cut" and "wind" are matched on word boundaries.
    Plain substring tests would also fire on words such as "executive",
    "execution", "window" or "windshield" and mislabel those sentences as
    green.

    Args:
        sentence: The sentence text to classify.

    Returns:
        One of ``"red"``, ``"yellow"``, ``"green"`` or ``"unlabeled"``.
    """
    s = sentence.lower()

    # red light
    red_keywords = ["climate neutral", "we care", "we believe", "we are green", "carbon free", "sustainable image"]
    if any(kw in s for kw in red_keywords):
        return "red"

    # yellow light
    yellow_keywords = ["aim to", "strive to", "plan to", "commitment to", "by 2050", "long-term vision"]
    if any(kw in s for kw in yellow_keywords):
        return "yellow"

    # r"\bcut" matches "cut", "cuts", "cutting" but not "exeCUTive";
    # r"\bwind\b" matches the word "wind" but not "window"/"windshield".
    mentions_reduction = "reduce" in s or re.search(r"\bcut", s) is not None
    mentions_wind = re.search(r"\bwind\b", s) is not None

    # green light
    if mentions_reduction and "%" in s:
        return "green"
    if "renewable energy" in s or "solar" in s or mentions_wind:
        return "green"
    if "by 20" in s and (mentions_reduction or "transition" in s):
        return "green"

    return "unlabeled"

In [23]:
# Label every sentence of every per-report JSON file in input_dir and write
# the result to one CSV file (output_file).
all_rows = []

# os.listdir() returns names in arbitrary order; sorting makes the row order
# of the CSV reproducible across runs and machines.
for file_name in sorted(os.listdir(input_dir)):
    if not file_name.endswith(".json"):
        continue

    file_path = os.path.join(input_dir, file_name)
    # Only the load needs the file handle; close it before processing.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for entry in data:
        sentence = entry["sentence"]
        label = auto_label(sentence)

        row = {
            "company": entry["company"],
            "year": entry["year"],
            "sentence": sentence,
            # Flatten the (text, label) entity pairs into "text (LABEL), ..."
            "entities": ", ".join([f"{e[0]} ({e[1]})" for e in entry.get("entities", [])]),
            "label": label
        }
        all_rows.append(row)

# newline="" lets the csv module control line endings itself.
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["company", "year", "sentence", "entities", "label"])
    writer.writeheader()
    writer.writerows(all_rows)

print(f"{len(all_rows)} Sätze automatisch gelabelt")

64684 Sätze automatisch gelabelt
