In [1]:
import os
import numpy as np
import pandas as pd
import faiss

from langchain.chains import LLMChain


Core Questions (3 total):

1. "Should employee business travel be classified as Scope 1 or Scope 3? Explain the
reasoning and describe how I can calculate my business travel emissions?"
2. "Are my Scope 2 emissions calculations valid according to the Greenhouse Gas
Protocol?"
3. "How do my Scope 1 & 2 emissions compare with other companies in my industry (refer to the peer
reports in the data sources), and what insights can I derive from this
comparison?"

Bonus Questions:

1. "What is our highest emitting Scope 3 category and what specific activities contribute
to it?"
2. "Which suppliers should I prioritise to engage for emissions reduction efforts?"
3. "Generate a summary report of our total emissions by scope with key insights"

In [None]:
# Load the raw emissions tables, one CSV per scope.
scope1_df = pd.read_csv('data/raw/scope1.csv')
scope2_df = pd.read_csv('data/raw/scope2.csv')
scope3_df = pd.read_csv('data/raw/scope3.csv')

# Tag every row with its scope so the origin is still known after concatenation.
scope1_df['scope'] = 'scope1'
scope2_df['scope'] = 'scope2'
scope3_df['scope'] = 'scope3'

# TODO: some columns hold the same kind of data under different names and should be
# harmonised before combining, e.g.
#   scope1_df['Activity_Type'] ~ scope2_df['Energy_Type'] ~ scope3_df['Activity_Description']
# ignore_index=True gives the combined frame one fresh 0..n-1 index; without it the
# row labels of the three inputs are repeated and label-based lookups become ambiguous.
scope_comb_df = pd.concat([scope1_df, scope2_df, scope3_df], ignore_index=True)

# Sum of the CO2e_Tonnes column per scope, plus the grand total across all three.
totals = {
    'scope1': scope1_df['CO2e_Tonnes'].sum(),
    'scope2': scope2_df['CO2e_Tonnes'].sum(),
    'scope3': scope3_df['CO2e_Tonnes'].sum(),
}
totals['total'] = sum(totals.values())

In [None]:
# Preview the first rows of the Scope 1 table to check its columns and sample values.
scope1_df.head()

## 1. Emission Analyzer

### Get relevant information from vectorDB

In [2]:
import PyPDF2
import fitz
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.vectorstores import FAISS
from typing import List, Dict
import re

In [3]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

# Wrap the Gemini chat model for use in LangChain chains.
# os.environ[...] raises KeyError immediately if GOOGLE_API_KEY is not set.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=os.environ['GOOGLE_API_KEY'],
    temperature=0.1
)

In [14]:
def _clean_text(text: str) -> str:
        """Clean extracted text"""
        # Remove excessive whitespace
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s\-.,;:!?()]', '', text)
        return text.strip()

# Chunking parameters, measured in characters (length_function=len below).
chunk_size = 1000
chunk_overlap = 200

# Embedding model used when the vector store is built.
api_key = os.getenv("GOOGLE_API_KEY")
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=api_key,
)

# Document to ingest: short name (used as metadata / registry key) and file path.
doc_name = 'peer2'
path = 'data/raw/peer2_emissions_report.pdf'
doc = fitz.open(path)

rec_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
)
# Filled with one record per chunk in a later cell.
documents = []

# Extract and clean the text of each page, prefixing it with a 1-based page marker.
page_sections = []
for page_num, page in enumerate(doc):
    page_text = _clean_text(page.get_text())
    page_sections.append(f"\n--- Page {page_num + 1} ---\n{page_text}\n")
text = "".join(page_sections)

In [22]:
# Scratch step: walk the document until the page at 0-based index 10 and leave it
# bound to `page` for the inspection cells that follow. If the document has fewer
# than 11 pages the loop simply ends with `page` bound to the last page.
for page_num, page in enumerate(doc):
    if page_num == 10:
        break

In [25]:
# List the images referenced by the current `page` (one tuple per image).
page.get_images()

[(863, 0, 328, 179, 8, 'ICCBased', '', 'Im87', 'FlateDecode'),
 (863, 0, 328, 179, 8, 'ICCBased', '', 'Im87', 'FlateDecode')]

In [29]:
import pytesseract

In [34]:
# Render the current page to a pixmap and serialise it as PNG bytes.
# NOTE(review): presumably intended as OCR input for pytesseract — confirm.
a = page.get_pixmap().tobytes("png")

In [21]:
# Collect basic metadata for every image referenced by the current page.
# Each entry of page.get_images() is a tuple starting with
# (xref, smask, width, height, ...).
# NOTE(review): `images` was not defined anywhere in the visible notebook; it is
# taken from the current `page` here — confirm this is the intended source.
images = page.get_images()
image_infos = []
# enumerate() supplies the running index; the original unpacked each 9-tuple
# directly into two names, which raises ValueError.
for img_indx, img in enumerate(images):
    xref = img[0]
    img_info = {
        "index": img_indx,
        "xref": xref,
        "width": img[2],
        "height": img[3],
    }
    image_infos.append(img_info)

[(7, 0, 400, 113, 8, 'ICCBased', '', 'Im1', 'FlateDecode')]

In [None]:
"""
## Improvements: improve Chunking
- Some irrelevant text is included in the text.
- Ignore table of contents and unimportant texts in margins of pdf.

ex:
--- Page 3 ---
2 6 10 16 24 34 40 48 58 62 68 74 86 88 90 92 95 96 103 104 Table of Contents G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E S T A N D A R D G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E G U I D A N C E S T A N D A R D Introduction The Greenhouse Gas Protocol Initiative Chapter 1 GHG Accounting and Reporting Principles Chapter 2 Business Goals and Inventory Design Chapter 3 Setting Organizational Boundaries Chapter 4 Setting Operational Boundaries Chapter 5 Tracking Emissions Over Time Chapter 6 Identifying and Calculating GHG Emissions Chapter 7 Managing Inventory Quality Chapter 8 Accounting for GHG Reductions Chapter 9 Reporting GHG Emissions Chapter 10 Verification of GHG Emissions Chapter 11 Setting GHG Targets Appendix A Accounting for Indirect Emissions from Electricity Appendix B Accounting for Sequestered Atmospheric Carbon Appendix C Overview of GHG Programs Appendix D Industry Sectors and Scopes Acronyms Glossary References Contributors

"""
chunks = rec_text_splitter.split_text(text)

# Build one record per chunk with its provenance metadata.
# `documents` is rebuilt from scratch (not appended to) so that re-running this
# cell does not add the same chunks a second time.
documents = [
    {
        "content": chunk,
        "metadata": {
            "source": doc_name,
            "chunk_id": i,
            "total_chunks": len(chunks)
        }
    }
    for i, chunk in enumerate(chunks)
]

In [None]:
# Split the chunk records into the parallel lists that FAISS.from_texts takes.
texts = [entry["content"] for entry in documents]
metadatas = [entry["metadata"] for entry in documents]

# Registry of vector stores keyed by document name. Keep whatever is already
# registered: re-creating the dict here would drop the stores built for other
# documents (a later cell reads vector_stores['ghg_protocol']) each time this
# cell is re-run for a new document.
try:
    vector_stores
except NameError:
    vector_stores = {}

# Embedding dimensionality; only needed by the manual index selection that is
# commented out below.
emb_dim = len(embeddings.embed_query("get_dimension"))
# index_type = "FlatL2"
# if index_type == "FlatL2":
#         index = faiss.IndexFlatL2(emb_dim)
# elif index_type == "FlatIP":
#     index = faiss.IndexFlatIP(emb_dim)
# elif index_type == "HNSW":
#     index = faiss.IndexHNSWFlat(emb_dim, 32)  # M=32 neighbors
# else:
#     raise ValueError(f"Unknown index_type: {index_type}")

# Embed every chunk and build the FAISS store for the current document.
vector_store = FAISS.from_texts(
    texts=texts,
    embedding=embeddings,
    metadatas=metadatas
)

### Summary from emissions data

In [None]:
# Register the freshly built store under the current document's name.
vector_stores[doc_name] = vector_store

In [None]:
question1 = """Should employee business travel be classified as Scope 1 or Scope 3? 
Explain the reasoning and describe how I can calculate my business travel emissions?"""

# Retrieve the 3 chunks most similar to the question from the GHG Protocol store.
# NOTE(review): this needs a store registered under the key 'ghg_protocol'; the
# ingestion cells visible in this notebook use doc_name = 'peer2' — confirm they
# were also run for the GHG Protocol document.
ghg_context = vector_stores['ghg_protocol'].similarity_search(question1, k=3)

In [None]:
from langchain.prompts import ChatPromptTemplate
# Prompt template for the Q&A chain. It has three placeholders that must be
# supplied when the prompt is formatted or invoked:
#   {ghg_context}  - retrieved GHG Protocol passages
#   {data_summary} - text summary of the emissions data
#   {question}     - the user's question
qa_prompt = ChatPromptTemplate.from_template("""
You are an expert sustainability consultant specializing in GHG emissions analysis.

Context from GHG Protocol:
{ghg_context}

Emissions Data Summary:
{data_summary}

User Question: {question}

Please provide a detailed, accurate answer based on the GHG Protocol guidelines and the emissions data.
If the question involves calculations, show your work step by step.
""")

In [None]:
# Print the text of the three retrieved context chunks, in rank order.
for rank in range(3):
    print(ghg_context[rank].page_content)

### Summary from emissions data

## Improvements: Make retrieved chunks start and end on sentence boundaries so they do not read as cut off.
 # Query translation: query decomposition (multi-query), HyDE.
 # Coarse-to-fine retrieval: first read the table of contents to identify the few chapters
  # that are likely to cover the list of Scope 3 categories, then retrieve the top-k chunks from those chapters only.

companys scope 1 and scope 2 emissions  They contribute to the companys GHG risk exposure  They are deemed critical by key stakeholders (e.g., feedback from customers, suppliers, investors, or civil society)  There are potential emissions reductions that could be undertaken or influenced by the company. The following examples may help decide which scope 3 categories are relevant to the company.  If fossil fuel or electricity is required to use the companys products, product use phase emissions may be a relevant category to report. This may be espe- cially important if the company can influence product design attributes (e.g., energy efficiency) or customer behavior in ways that reduce GHG emissions during the use of the products. G U I D A N C E Set ting Operational Boundaries C H A P T E R 4 30 FIGURE 5. Accounting of emissions from leased assets Parent Company Company A Scope 1 Scope 1 Scope 2 Scope 3 Leased car fleet (selected consolidation criterion applies) Leased building


GHG emissions Scope 3 is an optional reporting category that allows for the treatment of all other indirect emissions. Scope 3 emissions are a consequence of the activities of the company, but occur from sources not owned or controlled by the company. Some examples of scope 3 activities are extraction and production of purchased materials; transportation of purchased fuels; and use of sold products and services. C H A P T E R 4 Setting Operational Boundaries 25 S T A N D A R D FIGURE 2. Organizational and operational boundaries of a company Parent Company Company A Ship fleet Leased building Direct and indirect emissions Car fleet Power generation unit Leased factory Owned Controlled building Owned Controlled building Company B Company C Company D ORGANIZATIONAL BOUNDARIES OPERATIONAL BOUNDARIES


C H A P T E R 4 Setting Operational Boundaries 31  Outsourced activities are often candidates for scope 3 emissions assessments. It may be particularly important to include these when a previously outsourced activity contributed significantly to a companys scope 1 or scope 2 emissions.  If GHG-intensive materials represent a significant fraction of the weight or composition of a product used or manufactured (e.g., cement, aluminum), companies may want to examine whether there are opportunities to reduce their consumption of the product or to substitute less GHG-intensive materials.  Large manufacturing companies may have significant emissions related to transporting purchased materials to centralized production facilities.  Commodity and consumer product companies may want to account for GHGs from transporting raw materials, products, and waste.  Service sector companies may want to report on emis- sions from employee business travel; this emissions source is not as likely to be


 --------------- try2 :
companys scope 1 and scope 2 emissions  They contribute to the companys GHG risk exposure  They are deemed critical by key stakeholders (e.g., feedback from customers, suppliers, investors, or civil society)  There are potential emissions reductions that could be undertaken or influenced by the company. The following examples may help decide which scope 3 categories are relevant to the company.  If fossil fuel or electricity is required to use the companys products, product use phase emissions may be a relevant category to report. This may be espe- cially important if the company can influence product design attributes (e.g., energy efficiency) or customer behavior in ways that reduce GHG emissions during the use of the products. G U I D A N C E Set ting Operational Boundaries C H A P T E R 4 30 FIGURE 5. Accounting of emissions from leased assets Parent Company Company A Scope 1 Scope 1 Scope 2 Scope 3 Leased car fleet (selected consolidation criterion applies) Leased building


GHG emissions Scope 3 is an optional reporting category that allows for the treatment of all other indirect emissions. Scope 3 emissions are a consequence of the activities of the company, but occur from sources not owned or controlled by the company. Some examples of scope 3 activities are extraction and production of purchased materials; transportation of purchased fuels; and use of sold products and services. C H A P T E R 4 Setting Operational Boundaries 25 S T A N D A R D FIGURE 2. Organizational and operational boundaries of a company Parent Company Company A Ship fleet Leased building Direct and indirect emissions Car fleet Power generation unit Leased factory Owned Controlled building Owned Controlled building Company B Company C Company D ORGANIZATIONAL BOUNDARIES OPERATIONAL BOUNDARIES


C H A P T E R 4 Setting Operational Boundaries 31  Outsourced activities are often candidates for scope 3 emissions assessments. It may be particularly important to include these when a previously outsourced activity contributed significantly to a companys scope 1 or scope 2 emissions.  If GHG-intensive materials represent a significant fraction of the weight or composition of a product used or manufactured (e.g., cement, aluminum), companies may want to examine whether there are opportunities to reduce their consumption of the product or to substitute less GHG-intensive materials.  Large manufacturing companies may have significant emissions related to transporting purchased materials to centralized production facilities.  Commodity and consumer product companies may want to account for GHGs from transporting raw materials, products, and waste.  Service sector companies may want to report on emis- sions from employee business travel; this emissions source is not as likely to be

In [None]:
# Preview the fully rendered prompt before sending it to the model.
# NOTE(review): `ghg_context_txt` and `summary` are assigned in cells further down
# this notebook, so on a fresh kernel this cell only works after those have run.
print(qa_prompt.format(ghg_context=ghg_context_txt, data_summary=summary, question=question1))

In [None]:
# Number the retrieved chunks (1-based) and concatenate them into one context string.
ghg_context_txt = "".join(
    f"{rank}. {chunk.page_content} \n\n"
    for rank, chunk in enumerate(ghg_context, start=1)
)

### Summary from emissions data

In [None]:
def df_to_text(grouped_df, value_cols=("CO2e_Tonnes",), decimal_places=2):
    """
    Convert a grouped DataFrame into plain text, one line per row.

    The index is moved into regular columns first, so grouping keys are
    included. Each line lists the non-value columns in frame order, followed by
    the value columns in the order given, formatted as ``name: value`` and
    joined with ``", "``. Every line ends with a newline.

    Args:
        grouped_df (pd.DataFrame): Grouped DataFrame (any grouping columns /
            index levels). It is not modified.
        value_cols (str | sequence of str): Column(s) to display as values.
            A single column name may be passed as a plain string.
        decimal_places (int): Decimal places the value columns are rounded to.

    Returns:
        str: Formatted text; empty string if the frame has no rows.

    Raises:
        KeyError: If a name in ``value_cols`` is not a column of the frame.
    """
    if isinstance(value_cols, str):
        value_cols = [value_cols]
    value_cols = list(value_cols)

    # reset_index() returns a new frame, so the caller's data is left untouched.
    frame = grouped_df.reset_index()
    frame[value_cols] = frame[value_cols].round(decimal_places)

    # Everything that is not a value column is treated as a grouping column.
    group_cols = [col for col in frame.columns if col not in value_cols]
    ordered_cols = group_cols + value_cols
    all_cols = list(frame.columns)

    lines = []
    # Plain tuples + explicit column labels instead of attribute access on
    # namedtuples: itertuples() renames columns that are not valid Python
    # identifiers (e.g. "Fuel Type"), which made getattr(row, col) fail.
    for values in frame.itertuples(index=False, name=None):
        record = dict(zip(all_cols, values))
        lines.append(", ".join(f"{col}: {record[col]}" for col in ordered_cols))

    return "".join(line + "\n" for line in lines)

In [None]:
# Scope 1: total CO2e_Tonnes per (Facility, Activity_Type, Fuel_Type) combination.
scope1_grouped_df = scope1_df.groupby(['Facility', 'Activity_Type', "Fuel_Type"])[['CO2e_Tonnes']].sum()
# Scope 2: per (Facility, Energy_Type), total CO2e_Tonnes and the mean of
# Renewable_Percentage (an unweighted mean over the rows of each group).
scope2_grouped_df = scope2_df.groupby(['Facility', 'Energy_Type']).agg(
    CO2e_Tonnes=('CO2e_Tonnes', 'sum'),  
    Renewable_Percentage=('Renewable_Percentage', 'mean')
)

In [None]:
# Scope 3: total CO2e_Tonnes per (Category, Activity_Description) pair ...
scope3_grp_cate_activity_df = scope3_df.groupby(['Category', 'Activity_Description'])[['CO2e_Tonnes']].sum()
# ... and per Supplier.
scope3_grp_supplier_df = scope3_df.groupby(['Supplier'])[['CO2e_Tonnes']].sum()

In [None]:
# scope_comb_df.groupby(['Category', 'scope'])[['CO2e_Tonnes']].sum()
# Number of highest-emitting Scope 3 categories to list in the summary.
top_k = 3
# Total CO2e_Tonnes per Scope 3 category, with Category as a regular column.
emitters_by_cat = scope3_df.groupby(['Category'])[['CO2e_Tonnes']].sum().reset_index()
# Keep the top_k categories with the largest totals (largest first).
top3_emitters_by_cat = emitters_by_cat.sort_values(by='CO2e_Tonnes', ascending=False).iloc[:top_k]

summary = f"""
Total Emissions Overview:
- Scope 1: {totals.get('scope1', 0):.2f} tCO2e
- Scope 2: {totals.get('scope2', 0):.2f} tCO2e  
- Scope 3: {totals.get('scope3', 0):.2f} tCO2e
- Total: {totals.get('total', 0):.2f} tCO2e

<Scope 1 emissions summary by facility, activity type, and fuel type>
{df_to_text(scope1_grouped_df, decimal_places=2)}
</Scope 1 emissions summary by facility, activity type, and fuel type>

<Scope 2 emissions summary by facility and renewable energy percentage>
{df_to_text(scope2_grouped_df, value_cols=["CO2e_Tonnes", "Renewable_Percentage"], decimal_places=2)}
</Scope 2 emissions summary by facility and renewable energy percentage>

<Scope 3 emissions summary by category and activity>
{df_to_text(scope3_grp_cate_activity_df, decimal_places=2)}
</Scope 3 emissions summary by category and activity>

<Scope 3 emissions summary by supplier>
{df_to_text(scope3_grp_supplier_df, decimal_places=2)}
</Scope 3 emissions summary by supplier>

Data Records:
- Scope 1: {len(scope1_df)} entries
- Scope 2: {len(scope2_df)} entries
- Scope 3: {len(scope3_df)} entries
"""
# Append the highest-emitting Scope 3 categories, if there are any.
if not top3_emitters_by_cat.empty:
    # The header follows top_k so it stays correct when the cut-off is changed
    # (it renders as "Top3" for top_k = 3, exactly as before).
    summary += f"\n\nTop{top_k} Scope 3 Categories:\n"
    # One bullet per category, built in a single pass over the two columns.
    summary += "".join(
        f"- {cat}: {emissions:.2f} tCO2e\n"
        for cat, emissions in zip(
            top3_emitters_by_cat['Category'],
            top3_emitters_by_cat['CO2e_Tonnes'],
        )
    )

print(summary)

# Answer1 - t1

As an expert sustainability consultant specializing in GHG emissions analysis, I can definitively answer your question regarding employee business travel classification and calculation based on GHG Protocol guidelines.

---

### Classification of Employee Business Travel Emissions

Employee business travel should be classified as **Scope 3 emissions**.

**Reasoning based on GHG Protocol:**

The provided GHG Protocol context explicitly supports this classification:
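[### Improvements: Context 2 only supports the answer indirectly; would a chunk that states the classification outright work better? -> try hybrid search (BM25 + semantic).
  Context 3 is a poor retrieval result (the chunk is cut off mid-sentence).]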
*   **Context 2** states: "Scope 3 emissions are a consequence of the activities of the company, but occur from sources not owned or controlled by the company."
*   **Context 3** further clarifies: "Service sector companies may want to report on emissions from employee business travel; this emissions source is not as likely to be [significant for other sectors, implied]."

When employees travel for business, they typically use services provided by third parties (e.g., airlines, train companies, rental car agencies, hotels). The company does not own or control the aircraft, trains, vehicles, or hotel buildings that generate these emissions. Therefore, these emissions are indirect and fall under Scope 3.

---

### How to Calculate Business Travel Emissions
[### Improvements: This could be made sequential: ask the first question (classification) first, then use its answer to select the dataset to
 summarize (in this case Scope 3), and finally use that summary to answer the second question (calculation).]

Calculating business travel emissions involves collecting activity data and applying appropriate emission factors. Here's a step-by-step guide:

**1. Data Collection:**
Gather detailed information on all employee business travel activities for the reporting period. This typically includes:

Your company's Scope 3 emissions are substantial, totaling **374,485.51 tCO2e** across 1000 entries. While "employee business travel" is not explicitly listed among your "Top 3 Scope 3 Categories," it is a common and important category for many companies, particularly those in the service sector. Given the large number of Scope 3 entries, it is highly probable that employee business travel is already included within your reported Scope 3, or it should be if not currently accounted for.

By following these steps, you can accurately quantify your employee business travel emissions and ensure compliance with GHG Protocol guidelines.
Output is truncated. View as a scrollable element or open in a text editor. Adjust cell output settings...

Answer1 - t2
"""Employee business travel should be classified as **Scope 3** emissions.

### Reasoning for Classification

According to the GHG Protocol guidelines provided:

*   **Scope 1 emissions** are direct emissions from sources that are owned or controlled by the company. Examples include emissions from company-owned vehicles (like the fleet operations in your data) or stationary combustion at owned facilities.
*   **Scope 2 emissions** are indirect emissions from the generation of purchased electricity, steam, heat, or cooling consumed by the company.
*   **Scope 3 emissions** are all other indirect emissions that are a consequence of the company's activities but occur from sources *not owned or controlled* by the company.

The provided GHG Protocol context explicitly mentions: "Service sector companies may want to report on emissions from employee business travel; this emissions source is not as likely to be..." (implying it's a relevant Scope 3 category).

Employee business travel typically involves activities such as air travel, hotel stays, or use of rental cars, which are services provided by third parties (airlines, hotels, car rental companies). The company does not own or directly control the aircraft, hotels, or rental vehicles used for these activities. Therefore, the emissions generated from these activities are indirect and fall under Scope 3.

### How to Calculate Business Travel Emissions

To calculate your business travel emissions, you need to gather activity data related to employee travel and apply appropriate emission factors.

**General Calculation Method:**

1.  **Collect Activity Data:** This involves tracking the distance traveled by mode of transport (e.g., air miles, train miles, car miles), the type of fuel consumed (if applicable for rental cars), or the number of nights spent in hotels.
    *   For air travel: Distance flown (e.g., passenger-miles or tonne-miles) by flight class (economy, business, first) and route type (short-haul, long-haul).
    *   For rail travel: Distance traveled by train type.
    *   For road travel (rental cars, taxis): Distance traveled or fuel consumed.
2.  **Apply Emission Factors:** Use relevant emission factors for each mode of transport. These factors convert activity data (e.g., miles, liters of fuel) into CO2e emissions. Emission factors are typically sourced from government agencies, industry bodies, or specialized databases (e.g., DEFRA, EPA, ICAO).
    *   Example: (Total miles flown by employees) x (CO2e emission factor per mile for air travel) = Total air travel emissions.

**Calculation from Your Provided Emissions Data:**

Your provided data already includes a breakdown of Scope 3 emissions by activity, specifically listing "Business Travel - Air" across various categories. To calculate your total business travel emissions from this data, you need to sum the `CO2e_Tonnes` for all entries with the `Activity_Description` of "Business Travel - Air".

Let's sum these values:

*   Category 1: 3230.54 tCO2e
*   Category 10: 2245.73 tCO2e
*   Category 11: 1836.18 tCO2e
*   Category 12: 2446.51 tCO2e
*   Category 13: 4271.45 tCO2e
*   Category 14: 2866.76 tCO2e
*   Category 15: 1582.76 tCO2e
*   Category 2: 2632.44 tCO2e
*   Category 3: 4874.98 tCO2e
*   Category 4: 836.95 tCO2e
*   Category 5: 2748.65 tCO2e
*   Category 6: 4414.45 tCO2e
*   Category 7: 3395.58 tCO2e
*   Category 8: 2473.86 tCO2e
*   Category 9: 6455.23 tCO2e

**Total Business Travel Emissions (Air):**
3230.54 + 2245.73 + 1836.18 + 2446.51 + 4271.45 + 2866.76 + 1582.76 + 2632.44 + 4874.98 + 836.95 + 2748.65 + 4414.45 + 3395.58 + 2473.86 + 6455.23 = **46317.87 tCO2e** [###Improvements: summation error. It should be 46312.07]

Based on your provided data, your total employee business travel (air) emissions are **46,317.87 tCO2e**. This figure represents a significant portion of your overall Scope 3 emissions and should be a key focus for reduction strategies.
"""



In [None]:
# LangChain Expression Language (LCEL): pipe the formatted prompt into the model.
chain = qa_prompt | llm
# Run the chain with the retrieved context, the data summary and the question.
response = chain.invoke({
    'ghg_context':ghg_context_txt,
    'data_summary':summary,
    'question':question1}
)



In [None]:
print(question1)
print("---")
# The chain ends in a chat model, which returns a message object rather than a
# string; print only its text. Falls back to the object itself if it has no
# `.content` attribute (e.g. when the chain output is already a plain string).
print(getattr(response, "content", response))

In [None]:
# Cross-check of the total quoted in the LLM answer: select every row whose second
# index level (Activity_Description) is 'Business Travel - Air', round each value to
# 2 decimals, then sum.
np.round(scope3_grp_cate_activity_df.loc[(slice(None), 'Business Travel - Air'), :], 2).sum()

In [None]:
# Display the raw object returned by the chain.
response

# Q2

In [None]:
    
def assess_scope2_validity(scope2_data: pd.DataFrame) -> Dict:
    """
    Rule-based check of whether the Scope 2 data can support a GHG Protocol
    style calculation.

    Checks performed (each failed check adds one entry to ``issues``):
    1. Basic data structure: an empty frame is rejected immediately.
    2. Required column ``CO2e_Tonnes`` is present.
    3. Location-based method: ``Emission_Factor`` and ``Consumption_Amount``
       are both present and contain no negative values.
    4. Market-based method: ``Renewable_Percentage`` is present and its mean is
       greater than zero.
    5. Data quality: no negative values in ``CO2e_Tonnes``.

    Scoring: the score starts at 100 and loses 25 points per issue (floored at
    0), so it lies in [0, 100]. The data is reported as valid only when the
    score reaches the threshold of 100, i.e. when no issue was found.

    Not checked here: missing (NaN) values in the numeric columns (a comparison
    with NaN is False, so they pass the negative-value checks) and whether
    electricity, steam, heating and cooling are all covered.

    Args:
        scope2_data: Scope 2 activity data, one row per record.

    Returns:
        Dict with the keys ``is_valid`` (bool), ``score`` (int),
        ``issues`` (list of str), ``location_based_possible`` (bool),
        ``market_based_possible`` (bool) and ``summary`` (str).
    """
    issues = []
    validity_threshold = 100
    points_per_issue = 25

    # Check 1: Basic data structure. The result carries the same keys as the
    # regular return value so callers can read them unconditionally.
    if scope2_data.empty:
        return {
            "is_valid": False,
            "score": 0,
            "issues": ["No Scope 2 data provided"],
            "location_based_possible": False,
            "market_based_possible": False,
            "summary": "Invalid: No data to validate"
        }

    # Check 2: Required columns/data for GHG Protocol
    # TODO: handle synonymous column names (e.g. 'CO2e_Tonnes' vs 'emissions_tco2e'),
    # either by normalising them up front or by telling the user which name to use.
    required_cols = ['CO2e_Tonnes']
    scope2_cols = scope2_data.columns
    missing_cols = [col for col in required_cols if col not in scope2_cols]
    if missing_cols:
        issues.append(f"Missing required columns: {', '.join(missing_cols)}")

    # Check 3: location-based emissions need an emission factor and a consumption amount.
    location_based_possible = (
        'Emission_Factor' in scope2_cols and 'Consumption_Amount' in scope2_cols
    )
    if location_based_possible:
        has_negative_factor = (scope2_data['Emission_Factor'] < 0).any()
        has_negative_consumption = (scope2_data['Consumption_Amount'] < 0).any()
        # A negative value in EITHER column is a problem (the message says "or");
        # requiring both to be negative let single-column errors through.
        if has_negative_factor or has_negative_consumption:
            issues.append("Negative values in Emission_Factor or Consumption_Amount columns")
    else:
        issues.append("Cannot compute location-based emissions: missing Emission_Factor or Consumption_Amount")

    # Check 4: market-based emissions need renewable-energy information.
    market_based_possible = 'Renewable_Percentage' in scope2_cols
    if market_based_possible:
        renewable_avg = scope2_data['Renewable_Percentage'].mean()
        # `not (avg > 0)` instead of `avg <= 0` so that a column with only missing
        # values (mean is NaN) is reported as having no renewable data as well.
        if not renewable_avg > 0:
            issues.append("No renewable energy data available for market-based calculation")
    else:
        issues.append("Cannot compute market-based emissions: missing Renewable_Percentage data")

    # Check 5: Data quality - no negative emissions
    if 'CO2e_Tonnes' in scope2_cols:
        negative_emissions = int((scope2_data['CO2e_Tonnes'] < 0).sum())
        if negative_emissions > 0:
            issues.append(f"{negative_emissions} records have negative emissions")

    # Simple score: 25 points off per issue, never below zero.
    score = max(0, 100 - len(issues) * points_per_issue)
    is_valid = score >= validity_threshold
    if is_valid:
        summary = "Valid: Scope 2 emissions meets GHG Protocol requirements"
    else:
        summary = f"Invalid: {len(issues)} issues found that need addressing"

    return {
        "is_valid": is_valid,
        "score": score,
        "issues": issues,
        "location_based_possible": location_based_possible,
        "market_based_possible": market_based_possible,
        "summary": summary
    }

In [None]:
question2 = "Are my scope 2 emissions calculation valid according to the Greenhouse Gas Protocol?"

# NOTE(review): `agent` is not defined in the visible part of this notebook — confirm
# where it comes from before running this cell on a fresh kernel.
validation_result = agent.validate_scope2_emissions()

# Show issues if any
# NOTE(review): this loop reads .severity / .description / .recommendation from each
# issue, so it expects issue objects; assess_scope2_validity() in this notebook returns
# plain strings in 'issues' and would not work here. `Markdown` is also not imported in
# the visible cells (presumably IPython.display.Markdown — confirm).
if 'issues' in validation_result and validation_result['issues']:
    display(Markdown("\n**Issues Found:**"))
    for issue in validation_result['issues']:
        display(Markdown(f"- **{issue.severity.upper()}:** {issue.description}"))
        display(Markdown(f"  - *Recommendation:* {issue.recommendation}"))

In [None]:
# Run the rule-based Scope 2 validation on the loaded Scope 2 table.
validation_result = assess_scope2_validity(scope2_df)

In [None]:
question2 = "Are my scope 2 emissions calculation valid according to the Greenhouse Gas Protocol?"
# Keyword-style rewrite of the question, used as the retrieval query in place of the
# original wording.
question2_revised = "Scope 2 accounting methods location-based market-based requirements"

# Retrieve the 3 most similar chunks from the store registered as 'ghg_protocol'.
q2_context = vector_stores['ghg_protocol'].similarity_search(question2_revised, k=3)

In [None]:
# Inspect the top-ranked retrieved chunk.
q2_context[0]

In [None]:
# Flatten the retrieved chunks into plain dicts (text + source document).
# q2_context comes from similarity_search(), which returns Document objects without
# scores, so iterate over the documents directly; (document, score) pairs are only
# produced by similarity_search_with_score().
[
    {
        "content": ctx_doc.page_content,
        "source": ctx_doc.metadata.get("source", doc_name),
    }
    for ctx_doc in q2_context
]

# Q3

In [None]:
# Show the per-scope totals and the grand total computed when the data was loaded.
totals

In [None]:
question3 = """How do my scope 1 & 2 emissions compare with other companies in my industry, 
and what insights can I derive from this comparison?"""
# NOTE(review): `agent` and `Markdown` are not defined/imported in the visible part of
# this notebook — confirm where they come from before running on a fresh kernel.
# The code below reads the keys 'comparison_table' and 'insights' from the result.
comparison = agent.compare_with_peers()

display(Markdown(f"### Question: {question3}"))
display(Markdown("### Comparison Results:"))

# Display comparison table, with the per-column minimum highlighted in green and the
# per-column maximum in red (axis=0 means the extremes are taken column-wise).
comparison_df = pd.DataFrame(comparison['comparison_table'])
display(comparison_df.style.highlight_min(axis=0, color='lightgreen').highlight_max(axis=0, color='lightcoral'))

# Display insights
display(Markdown("\n### Key Insights:"))
for insight in comparison['insights']:
    display(Markdown(f"- {insight}"))

# Create visualization: one stacked bar per company, one segment per scope.
import plotly.graph_objects as go

fig = go.Figure()
# The table is expected to have a 'Company' column plus one column per entry in `scopes`.
companies = comparison_df['Company'].tolist()
# NOTE(review): the question is about Scope 1 & 2, but Scope 3 is plotted as well — confirm intended.
scopes = ['Scope 1', 'Scope 2', 'Scope 3']

for scope in scopes:
    fig.add_trace(go.Bar(
        name=scope,
        x=companies,
        y=comparison_df[scope].tolist()
    ))

fig.update_layout(
    title="Emissions Comparison with Industry Peers",
    xaxis_title="Company",
    yaxis_title="Emissions (tCO2e)",
    barmode='stack'
)

fig.show()

## 4. QA and actionable suggestion measurements

How the agent can give actionable suggestions:
Approach 1:
1. Use external (real-time) sources such as news and regulatory updates to retrieve information about changing trends and regulations.
2. Feed the retrieved information to the LLM as additional context.
