In [1]:
get_ipython().system('pip install fitz')
get_ipython().system('pip install pymupdf')
get_ipython().system('pip install torch')
get_ipython().system('pip install git+https://github.com/huggingface/transformers accelerate')
get_ipython().system('pip install qwen-vl-utils')
get_ipython().system('pip install langdetect')

Collecting fitz
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Downloading configobj-5.0.9-py2.py3-none-any.whl.metadata (3.2 kB)
Collecting configparser (from fitz)
  Downloading configparser-7.1.0-py3-none-any.whl.metadata (5.4 kB)
Collecting httplib2 (from fitz)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting nibabel (from fitz)
  Downloading nibabel-5.3.2-py3-none-any.whl.metadata (9.1 kB)
Collecting nipype (from fitz)
  Downloading nipype-1.9.2-py3-none-any.whl.metadata (6.8 kB)
Collecting pandas (from fitz)
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pyxnat (from fitz)
  Downloading pyxnat-1.6.3-py3-none-any.whl.metadata (5.4 kB)
Collecting scipy (from fitz)
  Downloading scipy-1.15.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting pyparsing!=3.0.0,!=3.0.1,!=3.0.2,!=3.0.3,<4,>=2.4.2 (fr

In [1]:
# Required libraries
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import requests
import fitz  # PyMuPDF
import re
import time
import os
import pandas as pd  # Import pandas for table display
from langdetect import detect
from pathlib import Path

# Track total execution time
overall_start = time.time()

# Step 1: Set up the model
model_name = "Qwen/Qwen2.5-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16, 
    device_map="auto"
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Step 2: Define PDF URLs
pdf_urls = [
    "https://arxiv.org/pdf/2211.02001.pdf",  
]

# Step 3: Define the questions and corresponding keywords
questions_keywords = {
    "What is the estimated electricity consumption linked to training AI models?": ["electricity", "power consumption", "kWh", "energy usage","Wh","TWh"],
    "What is the estimated carbon emission linked to training AI models?": ["co2", "carbon", "emission", "tonnes", "kg", "co2eq", "carbon footprint"],
    "What is the power usage effectiveness (PUE) for total facility power linked to AI compared to ICT equipment power?": ["pue", "power usage effectiveness", "total facility power", "ICT equipment power"],
    "What is the carbon usage effectiveness (CUE): the ratio of total CO2 emissions caused by total data centers energy consumption to the energy consumption of ICT equipment?": ["cue", "carbon usage effectiveness", "CO2 emissions", "data center energy consumption", "ICT equipment energy"]
}

# Step 4: Extract text from PDFs and retrieve title
def extract_text_from_pdf(pdf_path, max_pages=10, chunk_size=800):
    doc = fitz.open(pdf_path)
    text_chunks = []
    text = ""
    pdf_title = doc.metadata.get("title", "Unknown Title")
    
    for i, page in enumerate(doc):
        if i >= max_pages:
            break
        text += page.get_text()
        while len(text) > chunk_size:
            text_chunks.append(text[:chunk_size + 300])  # Overlap for coherence
            text = text[chunk_size:]
    text_chunks.append(text)
    
    # If no metadata title, extract first line from first page as fallback
    if pdf_title == "Unknown Title" and text_chunks:
        first_lines = text_chunks[0].split('\n')
        pdf_title = first_lines[0] if first_lines else "Unknown Title"
    
    return text_chunks, pdf_title

# Step 5: Generate answers
def generate_answer(question, text_chunks):
    all_responses = []
    for i, chunk in enumerate(text_chunks):
        print(f"\n🔍 Analyzing Chunk {i+1}/{len(text_chunks)} for question: {question}")
        input_text = (
            f"Context: {chunk}\n"
            f"Question: {question}\n"
            "Provide a concise answer with numerical data only. "
            "If no relevant information is found, reply with 'No data available'.\n"
        )
        inputs = tokenizer(input_text, return_tensors="pt").to(device)
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=32,  
                num_beams=5,    
                temperature=0.3,
                no_repeat_ngram_size=2,
                early_stopping=True
            )
        response = tokenizer.decode(generated_ids[0], skip_special_tokens=True).strip()
        all_responses.append(response)
    return all_responses

# Step 6: Extract key sentences
def extract_key_sentences(responses, keywords):
    key_sentences = []
    for response in responses:
        if detect(response) == 'en':
            sentences = response.split('. ')
            for sentence in sentences:
                if re.search(r'\b\d+(\.\d+)?\b', sentence) and any(
                    keyword in sentence.lower() for keyword in keywords
                ):
                    sentence = re.sub(r'\s+', ' ', sentence).strip()
                    key_sentences.append(sentence)
    return key_sentences

# Step 7: Summarize extracted answers
def summarize_responses(responses):
    summary_text = " ".join(responses)
    summary_prompt = (
        f"Summarize in around 100 words, focusing on CO2 emissions and numerical data:\n"
        f"{summary_text}\n"
    )
    summary_inputs = tokenizer(summary_prompt, return_tensors="pt")
    with torch.no_grad():
        summary_ids = model.generate(
            **summary_inputs,
            max_new_tokens=100,
            num_beams=5,
            temperature=0.3,
            no_repeat_ngram_size=2,
            early_stopping=True
        )
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True).strip()
    summary = re.sub(r'\s+', ' ', summary).strip()
    return summary

# Step 8: Process each PDF and store data
pdf_data = []
for pdf_url in pdf_urls:
    print(f"\n📄 Processing PDF: {pdf_url}")
    pdf_name = pdf_url.split("/")[-1]
    response = requests.get(pdf_url)
    with open(pdf_name, 'wb') as f:
        f.write(response.content)
    
    text_chunks, pdf_title = extract_text_from_pdf(pdf_name)
    print(f"📖 Extracted Title: {pdf_title}")
    
    for question, keywords in questions_keywords.items():
        print(f"\n❓ Processing question: {question}")
        responses = generate_answer(question, text_chunks)
        filtered_responses = extract_key_sentences(responses, keywords)
        summary = summarize_responses(filtered_responses)
        pdf_data.append([pdf_title, question, summary])
    os.remove(pdf_name)

# Convert to DataFrame and save
df_results = pd.DataFrame(pdf_data, columns=["PDF Title", "Question", "Summary"])
print(df_results)

# End timing
overall_end = time.time()
total_duration = overall_end - overall_start
print(f"\n⏱️ Total execution time: {total_duration / 60:.2f} minutes")


  from .autonotebook import tqdm as notebook_tqdm
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.02s/it]
Some parameters are on the meta device because they were offloaded to the cpu.



📄 Processing PDF: https://arxiv.org/pdf/2211.02001.pdf
📖 Extracted Title: 

❓ Processing question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 1/53 for question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 2/53 for question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 3/53 for question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 4/53 for question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 5/53 for question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 6/53 for question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 7/53 for question: What is the estimated electricity consumption linked to training AI models?

🔍 Analyzing Chunk 8/53 fo



OutOfMemoryError: CUDA out of memory. Tried to allocate 668.00 MiB. GPU 0 has a total capacity of 14.57 GiB of which 392.75 MiB is free. Process 2174434 has 14.18 GiB memory in use. Of the allocated memory 12.98 GiB is allocated by PyTorch, and 1.08 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
# Define a safe directory
save_path = "/data" if os.path.exists("/data") else os.getcwd()  # Fallback to the current working directory

# Ensure the directory exists
os.makedirs(save_path, exist_ok=True)

# Define the file path
csv_file = os.path.join(save_path, "AI_Training_Emissions_Summary.csv")

# Export the DataFrame
df_results.to_csv(csv_file, index=False)

print(f"✅ Results saved at: {csv_file}")

✅ Results saved at: /home/onyxia/work/AI_Training_Emissions_Summary.csv


In [4]:
# Ensure all text is visible
pd.set_option("display.max_colwidth", None)

# Print the summary content
print(df_results[["Question", "Summary"]])


                                                                                                                                                                     Question  \
0                                                                                                 What is the estimated electricity consumption linked to training AI models?   
1                                                                                                         What is the estimated carbon emission linked to training AI models?   
2                                                          What is the power usage effectiveness (PUE) for total facility power linked to AI compared to ICT equipment power?   
3  What is the carbon usage effectiveness (CUE): the ratio of total CO2 emissions caused by total data centers energy consumption to the energy consumption of ICT equipment?   

                                                                                                                  