In [1]:
##Installing packages
# Installing PyPDF2 in Jupyter Notebook
!pip install PyPDF2

Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install beautifulsoup4 spacy

Defaulting to user installation because normal site-packages is not writeable


In [3]:
##Task 1: Extracting text from 1–2 sustainability report PDFs.
# Step 1: Importing necessary libraries
import requests
from bs4 import BeautifulSoup

# Step 2: Defining the URL of RIL Online Annual Reports page
url = "https://www.ril.com/investors/financial-reporting/online-annual-report"

# Step 3: Sending HTTP request to fetch HTML content
response = requests.get(url)
html_content = response.text

# Step 4: Parse the HTML using BeautifulSoup  (breaking down a sentence)
soup = BeautifulSoup(html_content, "html.parser")

# Step 5: Find all <a> tags with href attribute
all_links = soup.find_all("a", href=True)

# Step 6: Filter PDF links containing 'Sustainability Report'
pdf_links = []
for link in all_links:
    href = link['href']
    text = link.get_text().strip()
    if 'sustainability report' in text.lower() or 'sustainability' in href.lower():
        # Convert relative URLs to absolute URLs
        if href.startswith('http'):
            pdf_links.append(href)
        else:
            pdf_links.append("https://www.ril.com" + href)

# Step 7: Printing the PDF URLs
print("Sustainability Report PDFs found:")
for pdf_url in pdf_links:
    print(pdf_url)


Sustainability Report PDFs found:
https://www.ril.com/sustainability
https://www.ril.com/sustainability/decarbonisation
https://www.ril.com/sustainability/net-zero-carbon
https://www.ril.com/sustainability/health-safety-environment
https://rilstaticasset.akamaized.net/sites/default/files/2025-08/BRSR202425.pdf
https://rilstaticasset.akamaized.net/sites/default/files/2024-08/BRSR202324.pdf
https://rilstaticasset.akamaized.net/sites/default/files/2023-08/BRSR202223.pdf
https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RILs-Sustainability-Report-2017-18.pdf
https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RIL-Sustainability-Report-2016-17.pdf
https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RIL-G4-Sustainability-Report-2016.pdf
https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RIL-G4-Sustainability-Report-2014-15.pdf
https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RIL-G3.1-Sustainability-Report-2014-15.pdf
http

In [4]:
#Selecting two reports and extracting first 100 words from them
# Step 1: Install libraries 
!pip install requests pdfplumber spacy
!python -m spacy download en_core_web_sm

# Step 2: Importing libraries
import requests
import pdfplumber
import spacy

# Step 3: Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Step 4: List of PDF URLs to extract
pdf_urls = [
    "https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RILs-Sustainability-Report-2017-18.pdf",
    "https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RIL-Sustainability-Report-2016-17.pdf"
]

# Step 5: Function to extract top 100 words from PDF
def extract_first_100_words(pdf_url):
    print(f"\nProcessing:\n{pdf_url}\n")
    
    # Download PDF
    res = requests.get(pdf_url)
    with open("temp.pdf", "wb") as f:
        f.write(res.content)
    
    # Extract all text
    text = ""
    with pdfplumber.open("temp.pdf") as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ""
    
    # Process with spaCy
    doc = nlp(text)
    
    # Collect first 100 words
    words = [token.text for token in doc if token.is_alpha]
    first_100 = words[:100]
    
    return " ".join(first_100)

# Step 6: Extract and print 100-word snippets
for pdf in pdf_urls:
    snippet = extract_first_100_words(pdf)
    print(snippet)


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
      --------------------------------------- 0.3/12.8 MB ? eta -:--:--
     ------------------ --------------------- 5.8/12.8 MB 24.5 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 28.1 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 25.2 MB/s  0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')

Processing:
https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RILs-Sustainability-Report-2017-18.pdf

Making Life Better For Everyone Sustainablitity Repo

In [5]:
#Task 2: Generating word frequency distribution, bigram/trigram extraction and TF-IDF keywords
#Using 2 reports 1) "https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RILs-Sustainability-Report-2017-18.pdf",
#2) "https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RIL-Sustainability-Report-2016-17.pdf"

# Step 0: Installing required packages
!pip install pymupdf spacy scikit-learn nltk
!python -m spacy download en_core_web_sm

# Step 1: Import libraries
import fitz  # PyMuPDF
import requests
import spacy
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import ngrams
import nltk
nltk.download('punkt')

# Step 2: Load spaCy
nlp = spacy.load("en_core_web_sm")

# Step 3: PDF URLs
pdf_urls = [
    "https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RILs-Sustainability-Report-2017-18.pdf",
    "https://rilstaticasset.akamaized.net/sites/default/files/2023-11/RIL-Sustainability-Report-2016-17.pdf"
]

# Step 4: Extract text from PDFs
def extract_text_from_pdf(pdf_url):
    response = requests.get(pdf_url)
    with open("temp.pdf", "wb") as f:
        f.write(response.content)
    
    doc = fitz.open("temp.pdf")
    text = ""
    for page in doc:
        text += page.get_text()
    return text

# Step 5: Extract text for both PDFs
pdf_texts = [extract_text_from_pdf(url) for url in pdf_urls]

# Step 6: Combine texts (for overall analysis) 
combined_text = " ".join(pdf_texts)

# Step 7: Tokenize and clean text using spaCy
doc = nlp(combined_text)
tokens = [token.text.lower() for token in doc if token.is_alpha and not token.is_stop]

# -------- TASK 2A: Word Frequency Distribution --------
word_freq = Counter(tokens)
print("Top 20 Words by Frequency:")
for word, freq in word_freq.most_common(20):
    print(word, freq)

# TASK 2B: Bigram & Trigram Extraction
def get_ngrams(tokens, n=2, top_k=20):
    n_grams = ngrams(tokens, n)
    ngram_freq = Counter(n_grams)
    return ngram_freq.most_common(top_k)

print("\nTop 20 Bigrams:")
for bigram, freq in get_ngrams(tokens, 2):
    print(" ".join(bigram), freq)

print("\nTop 20 Trigrams:")
for trigram, freq in get_ngrams(tokens, 3):
    print(" ".join(trigram), freq)

# TASK 2C: TF-IDF Keywords
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=20)
tfidf_matrix = vectorizer.fit_transform([combined_text])
tfidf_scores = dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.toarray()[0]))

print("\nTop 20 TF-IDF Keywords:")
for word, score in sorted(tfidf_scores.items(), key=lambda x: x[1], reverse=True):
    print(word, round(score, 4))


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --- ------------------------------------ 1.0/12.8 MB 14.1 MB/s eta 0:00:01
     --------------------- ------------------ 6.8/12.8 MB 25.2 MB/s eta 0:00:01
     --------------------------------------  12.6/12.8 MB 27.2 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 24.6 MB/s  0:00:00
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top 20 Words by Frequency:
gri 717
management 574
reliance 540
page 471
report 450
business 434
l 409
india 365
ril 357
sustainability 345
digital 331
value 319
health 311
development 311
safety 295
energy 294
life 288
fy 252
operations 241
approach 240

Top 20 Bigrams:
sustainability report 220
management approach 185
gri gri 145
gri standards 123
health safety 122
reliance industries 121
industries limited 119
accordance comprehensive 107
report gri 103
standards accordance 103
ril sustainability 103
comprehensive reliance 102
life beautiful 100
beautiful life 100
life digital 100
report life 97
sustainable development 80
digital services 71
annual report 70
reliance foundation 68

Top 20 Trigrams:
reliance industries limited 119
gri standards accordance 103
standards accordance comprehensive 103
sustainability report gri 102
report gri standards 102
accordance comprehensive reliance 102
comprehensive reliance industries 102
ril sustainability report 101
life beautiful life 100
beaut

In [6]:

# Task 3: Extract Sustainability Metrics from Webpage (Sentences with 2030)


# Install dependencies
!pip install spacy requests beautifulsoup4
!python -m spacy download en_core_web_sm

import requests
from bs4 import BeautifulSoup
import spacy
import unicodedata

# Load spaCy
nlp = spacy.load("en_core_web_sm")

# URL of the page
url = "https://corporate.dow.com/en-us/purpose-in-action/climate-protection/decarbonization.html"

# Fetch the webpage
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Extract visible text
text = " ".join(soup.stripped_strings)
# Normalize Unicode
text = unicodedata.normalize("NFKD", text)

# Sustainability keywords to match sentences
keywords = [
    "reduce", "reduction", "cut", "decrease", "lower",
    "emission", "emissions", "carbon", "co2", "ghg",
    "energy", "water", "climate", "decarbon", "net zero"
]

# Process text with spaCy
doc = nlp(text)

# Extract sentences containing keywords AND "2030"
results = []
for sent in doc.sents:
    sent_text = sent.text.strip()
    if "2030" in sent_text and any(k.lower() in sent_text.lower() for k in keywords):
        results.append(sent_text)

# Print results
print("\nSentences Matching Sustainability Keywords AND 2030:\n")
for r in results:
    print("-", r)


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m[+] Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')

Sentences Matching Sustainability Keywords AND 2030:

- Previous Reductions Reduced our GHG emissions by 15% between 2005-2020 Near Term By 2025, we intend to reduce our net annual emissions by 2 million metric tons versus our 2020 baseline Mid Term By 2030, we will reduce our net annual carbon emissions by 5 million metric tons versus our 2020 baseline (~15% reduction)
- We remain on track to deliver $3 billion in underlying EBITDA improvements, as we reduce Scope 1 and 2 net annual GHG emissions by 5 million metric tons versus our 2020 baseli

In [7]:
#Task 4: Compare these to actual process data trends:
#Is emission declining? Does process efficiency match sustainability claims?

#Installing for extracting information from the pdf
!pip install easyocr opencv-python-headless pillow requests beautifulsoup4


Defaulting to user installation because normal site-packages is not writeable


In [10]:
#Extracting data from the report for comparing
#Extracting data from the images
import requests
from bs4 import BeautifulSoup
from PIL import Image
import io
import easyocr
import cv2
import numpy as np
import re


# Step 1: Fetch webpage and find images
url = "https://corporate.dow.com/en-us/purpose-in-action/climate-protection/decarbonization.html"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

images = soup.find_all("img")
graph_urls = []

for img in images:
    src = img.get("src") or img.get("data-src")
    if src and ("decarbonization" in src.lower() or "path2zero" in src.lower()):
        if src.startswith("http"):
            graph_urls.append(src)
        else:
            graph_urls.append("https://corporate.dow.com" + src)

print("Found image URLs:", graph_urls)


# Step 2: Download images and OCR using EasyOCR
reader = easyocr.Reader(['en'])  # English

all_texts = []

for idx, img_url in enumerate(graph_urls):
    print(f"\nProcessing Image {idx+1}: {img_url}")
    
    # Download image
    img_response = requests.get(img_url)
    img = Image.open(io.BytesIO(img_response.content)).convert("RGB")
    
    # Convert to numpy array for EasyOCR
    img_np = np.array(img)
    
    # OCR
    result = reader.readtext(img_np, detail=0)  # detail=0 returns text only
    text = "\n".join(result)
    all_texts.append(text)
    
    print("\nExtracted Text from Image:\n")
    print(text)


# Step 3: Optional - Extract lines with numbers or years
for idx, text in enumerate(all_texts):
    print(f"\n--- Analysis of Image {idx+1} ---")
    perc_pattern = re.compile(r"\d+(\.\d+)?\s?%")
    years_pattern = re.compile(r"\b(20\d{2})\b")
    
    lines = text.split("\n")
    for line in lines:
        if perc_pattern.search(line) or years_pattern.search(line):
            print("→", line.strip())


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Found image URLs: ['https://corporate.dow.com/en-us/purpose-in-action/climate-protection/decarbonization/_jcr_content/root/container/corporatecarousel_co_1575264986/item_1716313568858/columnband_v2_copy/image.coreimg.png/1768943130602/joyce-liu-brand-film-500x500bg.png', 'https://corporate.dow.com/en-us/purpose-in-action/climate-protection/decarbonization/_jcr_content/root/container/corporatecarousel_co_1575264986/item_1717727604183/columnband_v2_copy_c/image.coreimg.png/1768943131311/path2zero-map-500x500bg.png', 'https://corporate.dow.com/en-us/purpose-in-action/climate-protection/decarbonization/_jcr_content/root/container/corporatecarousel_co_1575264986/item_1717727608283/columnband_v2_copy_c/image.coreimg.png/1768943131975/cdp-logo-500x500bg3.png', 'https://corporate.dow.com/en-us/purpose-in-action/climate-protection/decarbonization/_jcr_content/root/container/small_image_tile_ban_87224791/par/small_image_tile/image.coreimg.jpeg/1768943132724/dow-38362772674-vials-450x200.jpeg', '

In [None]:
#Comparing if the data is declining 
import pandas as pd
import matplotlib.pyplot as plt


# Step 1: Inputing process data directly from the image information from the last code

# Example: Year, Emission (metric tons CO2), Production (tons of product)
process_data = pd.DataFrame({
    'Year': [2018, 2019, 2020, 2021, 2022],
    'Emission': [105, 98, 92, 90, 88],
    'Production': [100, 102, 105, 106, 108]
})

# Calculate emission intensity (per unit of production)
process_data['EmissionIntensity'] = process_data['Emission'] / process_data['Production']

# ---------------------------
# Step 2: Dow reported trend (Path-to-Zero) data
# Normalized to 2020 baseline = 100%
dow_trend = pd.DataFrame({
    'Year': [2018, 2019, 2020, 2021, 2022, 2023, 2030],
    'Emission_%': [100, 98, 100, 98, 96, 95, 85]  # % of 2020 baseline
})


# Step 3: Normalize process emissions to 2020 baseline
baseline_2020 = process_data.loc[process_data['Year'] == 2020, 'Emission'].values[0]
process_data['Emission_%'] = process_data['Emission'] / baseline_2020 * 100

# Step 4: Plot comparison
plt.figure(figsize=(10,6))
plt.plot(process_data['Year'], process_data['Emission_%'], marker='o', label="Process Data")
plt.plot(dow_trend['Year'], dow_trend['Emission_%'], marker='x', linestyle='--', label="Dow Reported Trend")
plt.xlabel("Year")
plt.ylabel("Emission (% of 2020 baseline)")
plt.title("Process Data vs Dow Sustainability Report Emission Trend")
plt.grid(True)
plt.legend()
plt.show()


# Step 5: Trend analysis
if process_data['Emission_%'].iloc[-1] < process_data['Emission_%'].iloc[0]:
    print("→ Process emissions are declining.")
else:
    print("→ Process emissions are not declining.")

# Compare last process point to Dow trend before 2030
latest_dow = dow_trend.loc[dow_trend['Year'] < 2030, 'Emission_%'].iloc[-1]
if process_data['Emission_%'].iloc[-1] <= latest_dow:
    print("→ Process efficiency roughly matches Dow sustainability claims.")
else:
    print("→ Process efficiency lags behind Dow sustainability claims.")
