In [1]:
get_ipython().system('pip install fitz')
get_ipython().system('pip install pymupdf')
get_ipython().system('pip install torch')
get_ipython().system('pip install git+https://github.com/huggingface/transformers accelerate')
get_ipython().system('pip install qwen-vl-utils')
get_ipython().system('pip install langdetect')

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ngp8_c1j
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-ngp8_c1j
  Resolved https://github.com/huggingface/transformers to commit e3d99ec2f58e0e2a4df6b2b41152fdfb3f92a52f
  Installing build dependencies ... [?2done
[?25h  Getting requirements to build wheel ... [?25done
[?25h  Preparing metadata (pyproject.toml) ... [?25done


In [1]:
# Required libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import requests
import fitz  # PyMuPDF
import re
import time
import os
import pandas as pd  # 📊 Import pandas for table display
from langdetect import detect
from pathlib import Path

# Track total execution time
overall_start = time.time()

# Step 1: Set up the model
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16, 
    device_map="auto"
)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 2: Define multiple PDF URLs
pdf_urls = [
    "https://arxiv.org/pdf/2211.02001.pdf",  # Add more PDFs here
    "https://arxiv.org/pdf/2304.03271",
    "https://advanced.onlinelibrary.wiley.com/doi/epdf/10.1002/advs.202100707",
]

# Step 3: Define the single question
question = "What are the carbon emissions linked to AI training?"

# Step 4: Extract text from PDFs
def extract_text_from_pdf(pdf_path, max_pages=1, chunk_size=800):
    doc = fitz.open(pdf_path)
    text_chunks = []
    text = ""
    for i, page in enumerate(doc):
        if i >= max_pages:
            break
        text += page.get_text()
        while len(text) > chunk_size:
            text_chunks.append(text[:chunk_size + 300])  # Overlap for coherence
            text = text[chunk_size:]
    text_chunks.append(text)
    return text_chunks

# Step 5: Generate answers
def generate_answer(question, text_chunks):
    all_responses = []
    for i, chunk in enumerate(text_chunks):
        print(f"\n🔍 Analyzing Chunk {i+1}/{len(text_chunks)}...")

        input_text = (
            f"Context: {chunk}\n"
            f"Question: {question}\n"
            "Please provide a short, precise answer focusing on numerical data only. "
            "If no relevant information is found, reply with 'No data available'.\n"
        )
        inputs = tokenizer(input_text, return_tensors="pt").to(device)

        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=32,  
                num_beams=5,    
                temperature=0.3,
                no_repeat_ngram_size=2,
                early_stopping=True
            )

        response = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
        all_responses.append(response)

    return all_responses

# Step 6: Extract key sentences with proper number handling
def extract_key_sentences(responses):
    key_sentences = []
    for response in responses:
        if detect(response) == 'en':
            sentences = response.split('. ')
            for sentence in sentences:
                if re.search(r'\b\d+(\.\d+)?\b', sentence):
                    if any(keyword in sentence.lower() for keyword in ["co2", "carbon dioxide", "emission", "tonnes", "kg","co2eq","carbon footprint","bloom","training"]):
                        sentence = re.sub(r'(Question|Output Format|Please provide).*?\. ', '', sentence, flags=re.IGNORECASE)
                        sentence = re.sub(r'\s+', ' ', sentence).strip()
                        key_sentences.append(sentence)
    return key_sentences


# Step 7: Summarize extracted answers
def summarize_responses(responses):
    summary_text = " ".join(responses)
    summary_prompt = (
        f"Summarize the following text in about 100 words, focusing on CO2 emissions and numerical data:\n"
        f"{summary_text}\n"
    )
    summary_inputs = tokenizer(summary_prompt, return_tensors="pt")
    with torch.no_grad():
        summary_ids = model.generate(
            **summary_inputs,
            max_new_tokens=150,
            num_beams=5,
            temperature=0.3,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
    summary = re.sub(r'(Question|Output Format|Please provide).*?\. ', '', summary, flags=re.IGNORECASE)
    summary = re.sub(r'\s+', ' ', summary).strip()
    return summary

# Step 9: Process each PDF and store data
pdf_data = []

for pdf_url in pdf_urls:
    print(f"\n📄 Processing PDF: {pdf_url}")

    # Download PDF
    pdf_name = pdf_url.split("/")[-1]
    response = requests.get(pdf_url)
    with open(pdf_name, 'wb') as f:
        f.write(response.content)

    # Extract text
    text_chunks = extract_text_from_pdf(pdf_name)

    print(f"\n🔍 Processing: {question}")
    
    # Generate and process responses
    responses = generate_answer(question, text_chunks)
    filtered_responses = extract_key_sentences(responses)
    summary = summarize_responses(filtered_responses)  # Get summary instead of top 10

    # Store results
    pdf_data.append([pdf_name, question, summary])

    # Remove the downloaded PDF to save space
    os.remove(pdf_name)

# Convert to DataFrame and save
df_results = pd.DataFrame(pdf_data, columns=["PDF Name", "Question", "Summary"])

print(df_results)

# End timing
overall_end = time.time()
total_duration = overall_end - overall_start
print(f"\n⏱️ Total execution time: {total_duration / 60:.2f} minutes")


  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.73s/it]
Some parameters are on the meta device because they were offloaded to the cpu.



📄 Processing PDF: https://arxiv.org/pdf/2211.02001.pdf

🔍 Processing: What are the carbon emissions linked to AI training?

🔍 Analyzing Chunk 1/5...

🔍 Analyzing Chunk 2/5...

🔍 Analyzing Chunk 3/5...

🔍 Analyzing Chunk 4/5...

🔍 Analyzing Chunk 5/5...





📄 Processing PDF: https://arxiv.org/pdf/2304.03271

🔍 Processing: What are the carbon emissions linked to AI training?

🔍 Analyzing Chunk 1/6...

🔍 Analyzing Chunk 2/6...

🔍 Analyzing Chunk 3/6...

🔍 Analyzing Chunk 4/6...

🔍 Analyzing Chunk 5/6...

🔍 Analyzing Chunk 6/6...

📄 Processing PDF: https://advanced.onlinelibrary.wiley.com/doi/epdf/10.1002/advs.202100707

🔍 Processing: What are the carbon emissions linked to AI training?

🔍 Analyzing Chunk 1/1...
         PDF Name                                           Question  \
0  2211.02001.pdf  What are the carbon emissions linked to AI tra...   
1      2304.03271  What are the carbon emissions linked to AI tra...   
2  advs.202100707  What are the carbon emissions linked to AI tra...   

                                             Summary  
0  Summarize the following text in about 100 word...  
1  Summarize the following text in about 100 word...  
2  Summarize the following text in about 100 word...  

⏱️ Total execution time: 8.6

In [2]:
# Define a safe directory
save_path = "/data" if os.path.exists("/data") else os.getcwd()  # Fallback to the current working directory

# Ensure the directory exists
os.makedirs(save_path, exist_ok=True)

# Define the file path
csv_file = os.path.join(save_path, "AI_Training_Emissions_Summary.csv")

# Export the DataFrame
df_results.to_csv(csv_file, index=False)

print(f"✅ Results saved at: {csv_file}")


✅ Results saved at: /home/onyxia/work/AI_Training_Emissions_Summary.csv


In [3]:
# Ensure all text is visible
pd.set_option("display.max_colwidth", None)

# Print the summary content
print(df_results[["PDF Name", "Question", "Summary"]])


         PDF Name                                              Question  \
0  2211.02001.pdf  What are the carbon emissions linked to AI training?   
1      2304.03271  What are the carbon emissions linked to AI training?   
2  advs.202100707  What are the carbon emissions linked to AI training?   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [4]:
# List of model names to look for (modify this as needed)
model_names = [
    "BLOOM",
    "GPT-3",
    "T5",
    "BERT",
    "XLNet",
    "RoBERTa",
    "GPT-2",
    "Transformer-XL",
    "Albert",
    "Megatron"
]

# Function to extract model names and CO2 emissions from summaries
def extract_model_emissions_from_summary(df, model_names):
    model_emissions = []

    # Improved pattern to capture multiple emissions per model
    pattern = re.compile(
        r"(?P<model>" + "|".join(model_names) + r").*?(?:approximately|around|about)?\s?(?P<emission>\d+(\.\d+)?)\s?(?:tonnes|tonne|t)\s?(?:CO2|CO2eq|CO₂)?",
        re.IGNORECASE
    )

    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        pdf_name = row["PDF Name"]
        summary = row["Summary"]

        if isinstance(summary, str):  # Ensure valid text
            sentences = re.split(r'(?<=[.!?]) +', summary)  # Sentence splitting
            
            for sentence in sentences:
                if any(model in sentence for model in model_names):  # Check if sentence mentions a model
                    print(f"\n🔎 Analyzing Sentence: {sentence}")

                    matches = pattern.findall(sentence)  # Extract emissions
                    if matches:
                        for match in matches:
                            model = match[0]  # Extracted model name
                            emission = float(match[1])  # Extracted CO2 emission value
                            model_emissions.append({
                                "PDF Name": pdf_name,
                                "Model Name": model,
                                "CO2 Emission (Tonnes)": emission,
                                "Context": sentence
                            })
                            print(f"✅ Model Found: {model} with Emission: {emission} Tonnes")
                    else:
                        print("⚠️ Model mentioned but no emissions found.")
    
    return model_emissions

# Extract model emissions from the summary column
model_emissions_list = extract_model_emissions_from_summary(df_results, model_names)

# Convert to DataFrame
df_emissions = pd.DataFrame(model_emissions_list)

# Display the DataFrame
print("\n📊 CO2 Emissions by Model with Context:")
print(df_emissions)



🔎 Analyzing Sentence: Summarize the following text in about 100 words, focusing on CO2 emissions and numerical data: In the present article, we aim to quantify the carbon footprint of BLOOM, a 176-billion parameter language model, across its life cycle We estimate that BLOOM’s ﬁnal training emitted approximately 24.7 tonnes of CO2eq if we consider only the dynamic power consumption, and 50.5 tonnes if we account for all processes ranging from equipment manufacturing to energy-based operational consumption We conclude with a discussion regarding the difﬁculty of precisely estimating the carbon footprint of ML models and future research directions that can contribute towards improving carbon emissions reporting.
✅ Model Found: BLOOM with Emission: 24.7 Tonnes

🔎 Analyzing Sentence: Summarize the following text in about 100 words, focusing on CO2 emissions and numerical data: For example, training the GPT-3 language model in Microsoft’s state-of-the-art U.S In this paper, we provide a pr

In [6]:
# Define a safe directory
save_path = "/data" if os.path.exists("/data") else os.getcwd()  # Fallback to the current working directory

# Ensure the directory exists
os.makedirs(save_path, exist_ok=True)

# Define the file path
csv_file = os.path.join(save_path, "CO2_Emissions_By_Model.csv")

# Export the DataFrame
df_emissions.to_csv(csv_file, index=False)

print(f"✅ CO2 emissions data saved at: {csv_file}")


✅ CO2 emissions data saved at: /home/onyxia/work/CO2_Emissions_By_Model.csv
