# Greenwashing detector

## Data Preparation

In [2]:
import pdfplumber
import spacy
import os
import json

In [9]:
# Load the spaCy English pipeline; the rest of the notebook uses it for
# sentence segmentation (doc.sents) and named-entity recognition (sent.ents).
nlp = spacy.load("en_core_web_md")
# Raise the maximum number of characters nlp() accepts per document so a whole
# report can be parsed in a single call.
# NOTE(review): spaCy's documented default is 1,000,000 characters; very long
# texts increase memory use -- confirm 2M is enough for the largest report.
nlp.max_length = 2_000_000

In [10]:
# Input/output locations (relative paths, resolved against the working directory)
pdf_dir = "reports"  # folder the source PDF reports are read from
output_dir = "reports_data"  # folder the per-report JSON files are written to
# Create the output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [11]:
def extract_text_from_pdf(pdf_path) -> str:
    """Return the extractable text of a PDF, one page after another.

    Each page is read with pdfplumber's ``extract_text()``. Pages for which
    that call yields nothing (``None`` or an empty string) are skipped; every
    remaining page's text is followed by a single newline.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenated page texts, or ``""`` if no page produced any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Join once instead of growing a string with "+=" page by page. The
        # generator is consumed here, while the PDF is still open.
        return "".join(page_text + "\n" for page_text in page_texts if page_text)

In [13]:
def process_report(pdf_file, company, year):
    """Split one PDF report into sentence records and save them as JSON.

    The PDF is read from ``pdf_dir``, its text is parsed with the module-level
    ``nlp`` pipeline, and every sentence whose stripped text has at least 20
    characters becomes one record holding the company, the year, the sentence,
    its named entities as ``(text, label)`` pairs and the placeholder label
    ``"unlabeled"``. The records are written to
    ``<output_dir>/<company lower-cased>_<year>.json`` and a one-line summary
    is printed.

    Args:
        pdf_file: File name of the report inside ``pdf_dir``.
        company: Company name stored in every record and used in the file name.
        year: Reporting year stored in every record and used in the file name.
    """
    source_path = os.path.join(pdf_dir, pdf_file)
    parsed = nlp(extract_text_from_pdf(source_path))

    # Pair each sentence span with its stripped text so the length filter and
    # the record share the same cleaned string.
    stripped_spans = ((span, span.text.strip()) for span in parsed.sents)
    records = [
        {
            "company": company,
            "year": year,
            "sentence": cleaned,
            "entities": [(ent.text, ent.label_) for ent in span.ents],
            "label": "unlabeled",
        }
        for span, cleaned in stripped_spans
        if len(cleaned) >= 20  # shorter fragments are dropped
    ]

    target_path = os.path.join(output_dir, f"{company.lower()}_{year}.json")
    with open(target_path, "w", encoding="utf-8") as handle:
        json.dump(records, handle, ensure_ascii=False, indent=2)

    print(f"Processed: {company} {year} - {len(records)} sentences")

In [18]:
# Reports to process, one (pdf file name, company, year) tuple each.
# The file name is joined onto pdf_dir; company and year are stored in every
# sentence record and form the output file name.
# NOTE(review): the BMW entries mix ".PDF"/".pdf" and "_"/"-" -- presumably
# these mirror the downloaded files exactly; verify on a case-sensitive
# file system.
reports = [
    ("2020_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2020),
    ("2021_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2021),
    ("2022_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2022),
    ("2023_Volkswagen_Sustainability_Report.pdf", "Volkswagen", 2023),
    ("BMW_Group_Report_2020.PDF", "BMW", 2020),
    ("BMW_Group_Report_2021.pdf", "BMW", 2021),
    ("BMW_Group_Report_2022.PDF", "BMW", 2022),
    ("BMW-Group-Report-2023.pdf", "BMW", 2023),
    ("BMW-Group-Report-2024.pdf", "BMW", 2024),
    ("mercedes-benz-sustainability-report-2020.pdf", "Mercedes-Benz", 2020),
    ("mercedes-benz-sustainability-report-2021.pdf", "Mercedes-Benz", 2021),
    ("mercedes-benz-sustainability-report-2022.pdf", "Mercedes-Benz", 2022),
    ("mercedes-benz-sustainability-report-2023.pdf", "Mercedes-Benz", 2023),
    ("mercedes-benz-sustainability-report-2024.pdf", "Mercedes-Benz", 2024)
]

In [19]:
# Process every configured report in order. There is no error handling here,
# so an exception raised for one report stops the remaining ones.
for pdf_file, company, year in reports:
    process_report(pdf_file, company, year)

Processed: Volkswagen 2020 - 2124 sentences
Processed: Volkswagen 2021 - 2222 sentences


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value


Processed: Volkswagen 2022 - 2450 sentences


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value


Processed: Volkswagen 2023 - 2608 sentences


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P1' is an invalid float value


Processed: BMW 2020 - 5055 sentences
Processed: BMW 2021 - 5413 sentences
Processed: BMW 2022 - 5213 sentences
Processed: BMW 2023 - 5399 sentences
Processed: BMW 2024 - 7922 sentences


Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value
Cannot set gray stroke color because /'P0' is an invalid float value


Processed: Mercedes-Benz 2020 - 4195 sentences
Processed: Mercedes-Benz 2021 - 5997 sentences
Processed: Mercedes-Benz 2022 - 5140 sentences
Processed: Mercedes-Benz 2023 - 4429 sentences
Processed: Mercedes-Benz 2024 - 6517 sentences
