In [None]:
import os
import pandas as pd

Core Questions (3 total):

1. "Should employee business travel be classified as Scope 1 or Scope 3? Explain the
reasoning and describe how I can calculate my business travel emissions?"
2. “Are my scope 2 emissions calculation valid according to the Greenhouse Gas
Protocol?”
3. How do my scope 1 & 2 emissions compare with other companies (refer to peer
reports in the data sources) in my industry, and what insights can I derive from this
comparison?

Bonus Questions:

1. "What is our highest emitting Scope 3 category and what specific activities contribute
to it?"
2. “Which suppliers should I prioritise to engage for emissions reduction efforts?”
3. "Generate a summary report of our total emissions by scope with key insights"

In [None]:
scope1_df = pd.read_csv('data/raw/scope1.csv')
scope2_df = pd.read_csv('data/raw/scope2.csv')
scope3_df = pd.read_csv('data/raw/scope3.csv')


scope1_df['scope'] = 'scope1'
scope2_df['scope'] = 'scope2'
scope3_df['scope'] = 'scope3'
## Improvement: Some columns are referring to same data but in different names, should be combined.
 # ex: scope1_df['Activity_Type'] = scope2_df['Energy_Type'] = scope3_df['Activity_Description']
scope_comb_df = pd.concat([scope1_df, scope2_df, scope3_df])

In [None]:
scope1_df.head()

## 1. Emission Analyzer

In [None]:
totals = {}
totals['scope1'] = scope1_df['CO2e_Tonnes'].sum()
totals['scope2'] = scope2_df['CO2e_Tonnes'].sum()
totals['scope3'] = scope3_df['CO2e_Tonnes'].sum()
totals['total'] = sum(totals.values())

### Get relevant information from vectorDB

In [3]:
import PyPDF2
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings

from langchain_community.vectorstores import FAISS
from typing import List, Dict
import re

In [None]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

# For Gemini, you need a Google API key (set as GOOGLE_API_KEY)
import os
chunk_size = 1000
chunk_overlap = 200
api_key = os.getenv("GOOGLE_API_KEY")

rec_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len
)

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=api_key)

In [20]:
doc_name, path = 'ghg_protocol', 'data/raw/ghg-protocol-revised.pdf'

doc = fitz.open(path)

In [5]:
doc

Document('data/raw/ghg-protocol-revised.pdf')

In [7]:
def _clean_text(text: str) -> str:
        """Clean extracted text"""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s\-.,;:!?()]', '', text)
        return text.strip()

In [None]:
text = ""
for page_num, page in enumerate(doc):
    page_text = page.get_text()
    # Clean and format text
    page_text = _clean_text(page_text)
    text += f"\n--- Page {page_num + 1} ---\n{page_text}\n"

# Split into chunks
chunks = rec_text_splitter.split_text(text)

# Add metadata
documents = []
for i, chunk in enumerate(chunks):
    documents.append({
        "content": chunk,
        "metadata": {
            "source": doc_name,
            "chunk_id": i,
            "total_chunks": len(chunks)
        }
    })

texts = [doc["content"] for doc in documents]
metadatas = [doc["metadata"] for doc in documents]

In [24]:
documents[0]

{'content': '--- Page 1 ---\nA Corporate Accounting and Reporting Standard R E V I S E D E D I T I O N The Greenhouse Gas Protocol  390  370  350  330  310  290  270 ppm 1000 1500 2000 Year: W O R L D R E S O U R C E S I N S T I T U T E',
 'metadata': {'source': 'ghg_protocol', 'chunk_id': 0, 'total_chunks': 586}}

In [25]:
documents[-1]

{'content': '--- Page 116 ---\nW O R L D R E S O U R C E S I N S T I T U T E 10 G Street, NE (Suite 800) Washington, DC 20002 USA Tel: (1 202) 729 76 00 Fax: (1 202) 729 76 10 E-mail: sepinfo  wri.org Internet: www.wri.org 4, chemin de Conches 1231 Conches-Geneva Switzerland Tel: (41 22) 839 31 00 Fax: (41 22) 839 31 31 E-mail: info  wbcsd.org Internet: www.wbcsd.org',
 'metadata': {'source': 'ghg_protocol', 'chunk_id': 585, 'total_chunks': 586}}

In [None]:
vector_stores = {}
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas,
    index_type="Flat"
)

TypeError: FAISS.__init__() got an unexpected keyword argument 'index_type'

In [None]:
vector_stores[doc_name] = vector_store

In [None]:
"""
## Improvements:
- Some irrelevant text is included in the text.
- Ignore table of contents and unimportant texts in margins of pdf.

ex:
--- Page 3 ---
2 6 10 16 24 34 40 48 58 62 68 74 86 88 90 92 95 96 103 104 Table of Contents G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E S T A N D A R D Introduction The Greenhouse Gas Protocol Initiative Chapter 1 GHG Accounting and Reporting Principles Chapter 2 Business Goals and Inventory Design Chapter 3 Setting Organizational Boundaries Chapter 4 Setting Operational Boundaries Chapter 5 Tracking Emissions Over Time Chapter 6 Identifying and Calculating GHG Emissions Chapter 7 Managing Inventory Quality Chapter 8 Accounting for GHG Reductions Chapter 9 Reporting GHG Emissions Chapter 10 Verification of GHG Emissions Chapter 11 Setting GHG Targets Appendix A Accounting for Indirect Emissions from Electricity Appendix B Accounting for Sequestered Atmospheric Carbon Appendix C Overview of GHG Programs Appendix D Industry Sectors and Scopes Acronyms Glossary References Contributors

"""
print(text)

### Summary from emissions data

In [None]:
# scope_comb_df.groupby(['Category', 'scope'])[['CO2e_Tonnes']].sum()
top_k = 3
emitters_by_cat = scope3_df.groupby(['Category'])[['CO2e_Tonnes']].sum().reset_index()
top3_emitters_by_cat = emitters_by_cat.sort_values(by='CO2e_Tonnes', ascending=False).iloc[:top_k]

summary = f"""
Total Emissions Overview:
- Scope 1: {totals.get('scope1', 0):.2f} tCO2e
- Scope 2: {totals.get('scope2', 0):.2f} tCO2e  
- Scope 3: {totals.get('scope3', 0):.2f} tCO2e
- Total: {totals.get('total', 0):.2f} tCO2e

Data Records:
- Scope 1: {len(scope1_df)} entries
- Scope 2: {len(scope2_df)} entries
- Scope 3: {len(scope3_df)} entries
"""
# Add top categories if available
if not top3_emitters_by_cat.empty:
    summary += "\n\nTop3 Scope 3 Categories:\n"
    ## Improvements: Avoid loop, print whole dataframe at once or pass markdown table.
    for row in top3_emitters_by_cat.itertuples():
        cat = getattr(row, 'Category')
        emissions = getattr(row, 'CO2e_Tonnes')
        summary += f"- {cat}: {emissions:.2f} tCO2e\n"

print(summary)

## 2. Document Processor
Read PDF then store in VectorDB

## 4. QA and actionable suggestion measurements

How can Agent give suggestions:
App1:
1. We can use external (real-time) sources like News, regulations, etc... to retrieve information about changing trends/regulations
2. feed this as context 
