In [1]:
import spacy
import re
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))

def preprocess(text):
    # Remove headers/footers
    text = re.sub(r"\n\s*\n", "\n", text)  
    text = re.sub(r"\bPage \d+\b", "", text)
    
    # spaCy pipeline
    doc = nlp(text)
    tokens = [
        token.lemma_.lower() 
        for token in doc 
        if not token.is_stop 
        and not token.is_punct 
        and len(token.text) > 2
    ]
    
    # ESG-specific keyword filtering
    esg_keywords = {"environment", "social", "governance", "carbon", "diversity"}
    
    # Keep ESG keywords and the first 100 tokens
    result_tokens = []
    for t in tokens:
        if t in esg_keywords or len(result_tokens) < 100:
            result_tokens.append(t)
    
    return " ".join(result_tokens)

Text Preprocessing 

In [2]:
import spacy
import re
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words("english"))

def preprocess(text):
    # Remove headers/footers
    text = re.sub(r"\n\s*\n", "\n", text)  
    text = re.sub(r"\bPage \d+\b", "", text)
    
    # spaCy pipeline
    doc = nlp(text)
    tokens = [
        token.lemma_.lower() 
        for token in doc 
        if not token.is_stop 
        and not token.is_punct 
        and len(token.text) > 2
    ]
    
    # ESG-specific keyword filtering
    esg_keywords = {"environment", "social", "governance", "carbon", "diversity"}
    
    # Keep ESG keywords and the first 100 tokens
    result_tokens = []
    for t in tokens:
        if t in esg_keywords or len(result_tokens) < 100:
            result_tokens.append(t)
    
    return " ".join(result_tokens)

Fine-Tuning BERT for ESG Sentiment


In [3]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset

# Load pre-trained model
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=3  # Positive, Neutral, Negative
)

# Create a custom Dataset class
class ESGDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Example data
train_texts = ["This company reduced carbon emissions", "Poor governance practices", "Employee diversity improved"]
train_labels = [0, 2, 0]  # 0: Positive, 1: Neutral, 2: Negative
val_texts = ["Environmental concerns remain", "Board oversight is adequate", "No significant social impact"]
val_labels = [2, 1, 1]  # 0: Positive, 1: Neutral, 2: Negative

# Create dataset objects
train_dataset = ESGDataset(train_texts, train_labels, tokenizer)
val_dataset = ESGDataset(val_texts, val_labels, tokenizer)

# Define training arguments - Fixed version
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    save_steps=100,  # Explicitly set
    eval_steps=100   # Explicitly set
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Start training
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.13606
2,No log,1.17529
3,No log,1.201091


TrainOutput(global_step=3, training_loss=1.0588812033335369, metrics={'train_runtime': 14.4723, 'train_samples_per_second': 0.622, 'train_steps_per_second': 0.207, 'total_flos': 16299990126.0, 'train_loss': 1.0588812033335369, 'epoch': 3.0})

In [5]:
#SQL Database Integration


In [6]:
import sqlite3

# Connect to SQLite database
conn = sqlite3.connect("esg.db")

# SQL Schema
create_table_query = """
CREATE TABLE IF NOT EXISTS esg_reports (
    report_id INTEGER PRIMARY KEY,
    company TEXT,
    year INTEGER,
    environmental_sentiment REAL,
    social_sentiment REAL,
    governance_sentiment REAL,
    full_text TEXT
);
"""
conn.execute(create_table_query)

# Commit and close
conn.commit()
conn.close()

print("Database and table created successfully!")


Database and table created successfully!


In [7]:
import pandas as pd
import plotly.express as px

# Sample DataFrame
df = pd.DataFrame({
    "company": ["Apple", "Microsoft", "Google"],
    "avg_env": [0.85, 0.72, 0.78],  # Environmental sentiment
    "avg_soc": [0.80, 0.68, 0.75]   # Social sentiment
})

# Create bar chart
fig = px.bar(df, x="company", y=["avg_env", "avg_soc"], 
             title="ESG Sentiment by Category (2023)",
             labels={"value": "Sentiment Score", "variable": "Category"},
             barmode="group")  # Groups bars for better comparison

# Show the plot
fig.show()
