In [None]:
!pip install datasets transformers

Collecting datasets
  Downloading datasets-3.3.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.0-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForQuestionAnswering
from datasets import load_dataset
import torch

First, we process the (preprocessed) CSV file. The aim is to generate a question-answer-context structure so that the data can be used by the final chatbot. The columns that provide the most important information (e.g. policy description, country) are used to create the context column. To generate questions, we use an LLM that is suitable for question generation. In the next step, we apply an LLM that performs well at answer generation.


In [None]:
#@title QA structure for CSV file

# Load dataset
# The CSV is read with ";" as the field separator. Later statements in this
# cell access the rows through dataset["train"].
dataset = load_dataset("csv", data_files="/content/climate_policies_final.csv", sep=";")

# Step 1: Create a context column from CSV
def create_context(example):
    """Build the free-text ``context`` field for one CSV row.

    The context is four labelled lines: country, policy title, policy
    description and policy instrument.

    Args:
        example: Mapping for a single row. The keys ``country``,
            ``policy_title``, ``policy_description`` and
            ``policy_instrument`` are read if present.

    Returns:
        The same ``example`` object, with ``example["context"]`` set.
    """
    def field(key, default=""):
        # ``dict.get(key, default)`` only falls back when the key is absent.
        # A key that is present but holds ``None`` (e.g. a missing value in
        # the source data) must use the default too, otherwise the literal
        # text "None" ends up in the context.
        value = example.get(key)
        return default if value is None else value

    example["context"] = (
        f"Country: {field('country')}\n"
        f"Policy Title: {field('policy_title')}\n"
        f"Policy Description: {field('policy_description')}\n"
        f"Policy Instrument: {field('policy_instrument', 'N/A')}"
    )
    return example

# Add the "context" field to every row.
dataset = dataset.map(create_context)

# Step 2: Question Generation using LLM
# Load the tokenizer and the seq2seq model for the question-generation checkpoint.
qg_model_name = "valhalla/t5-base-qg-hl"
qg_tokenizer = AutoTokenizer.from_pretrained(qg_model_name)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(qg_model_name)

# Use the GPU when CUDA is available, otherwise the CPU. `device` is also read
# by generate_question / generate_answer to place their input tensors.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qg_model.to(device)

def generate_question(example):
    """Generate one question for a row's context with the question-generation model.

    Args:
        example: Row mapping; ``example["context"]`` is the text to ask about.

    Returns:
        The same ``example``, with the decoded question stored under
        ``"question"``.
    """
    # The tokenized prompt is capped at 512 tokens; longer prompts are truncated.
    encoded = qg_tokenizer(
        f"generate question: {example['context']}",
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Sampling is enabled, so the question can differ between runs unless a
    # seed is fixed elsewhere.
    sampling_options = dict(
        max_length=64,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1,
    )
    generated = qg_model.generate(encoded["input_ids"], **sampling_options)

    # Only one sequence is requested, so the first output is the question.
    example["question"] = qg_tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    return example

# Add the "question" field to every row.
dataset = dataset.map(generate_question)

# Step 3: Answer Generation using a model fine-tuned for question answering
# The answer model is loaded as a seq2seq model and moved to the same device
# as the question-generation model.
qa_model_name = "google/flan-t5-base"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(qa_model_name)
qa_model.to(device)

def generate_answer(example):
    """Generate an answer for a row's question, conditioned on its context.

    Args:
        example: Row mapping with a ``"context"`` entry and, optionally, a
            ``"question"`` entry (an empty string is used when it is missing).

    Returns:
        The same ``example``, with the decoded answer stored under ``"answer"``.
    """
    # NOTE(review): the question and instruction come after the context and the
    # whole prompt is truncated to 512 tokens (presumably from the right), so
    # for long contexts they may be cut off - confirm this is acceptable.
    prompt_parts = [
        f"Context:\n{example['context']}\n\n",
        f"Question: {example.get('question', '')}\n\n",
        "Provide a detailed answer discussing the long-term technological, economic, social, environmental, ",
        "and geopolitical impacts by 2100.",
    ]
    encoded = qa_tokenizer("".join(prompt_parts), return_tensors="pt", max_length=512, truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Sampling is enabled, so the answer can differ between runs unless a seed
    # is fixed elsewhere.
    sampling_options = dict(
        max_length=150,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1,
    )
    generated = qa_model.generate(encoded["input_ids"], **sampling_options)

    example["answer"] = qa_tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    return example

# Add the "answer" field to every row.
dataset = dataset.map(generate_answer)

#Print out the results
# Preview at most the first 10 rows; capping at the split size avoids an
# IndexError when the CSV has fewer than 10 rows.
train_split = dataset["train"]
for i in range(min(10, len(train_split))):
    row = train_split[i]  # fetch the row once instead of once per field
    print(f"Row {i+1}:")
    print("Context:")
    print(row["context"])
    print("\nGenerated Question:")
    print(row["question"])
    print("\nGenerated Answer:")
    print(row["answer"])
    print("="*60)

NameError: name 'load_dataset' is not defined

In [None]:
#Save the generated context/question/answer rows as JSON
# NOTE: this expects `dataset` to still be the object loaded from the CSV (it
# has a "train" split). Later cells rebind `dataset` to a plain Dataset built
# with Dataset.from_dict; running this cell after them makes dataset["train"]
# raise a KeyError.

dataset_path = "/content/qa_dataset.json"
dataset["train"].to_json(dataset_path)

print(f"Dataset successfully saved at {dataset_path}")

KeyError: "Column train not in the dataset. Current columns in the dataset: ['context', 'question', 'answer']"

Now we can process additional data to enrich the available information. We use the "Summary for Policymakers" document to do so.

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import Dataset
import torch
import re

In [None]:
#@title Clean TXT Document and preserve paragraph structure

def clean_document(text):
    """Drop noisy paragraphs from a plain-text document.

    The text is split into paragraphs (one or more blank lines act as the
    separator) and paragraphs that are likely noise are filtered out:
      - Paragraphs with fewer than 10 words.
      - Paragraphs that seem to be reference lists (more than 5 commas and at
        least one initial such as "A.").

    Args:
        text: The raw document text.

    Returns:
        The kept paragraphs, stripped of surrounding whitespace and joined
        with a blank line between them. An empty string if nothing is kept.
    """
    # Split text into paragraphs using one or more blank lines as the delimiter.
    paragraphs = re.split(r"\n\s*\n", text)
    cleaned_paragraphs = []

    for para in paragraphs:
        para = para.strip()
        if not para:
            continue

        # Too short to carry useful context (headings, captions, page furniture).
        if len(para.split()) < 10:
            continue

        # Heuristic: if a paragraph contains many commas and matches a pattern
        # like "A.", assume it's a reference list.
        if para.count(",") > 5 and re.search(r"\b[A-Z]\.", para):
            continue

        cleaned_paragraphs.append(para)

    # Reassemble the paragraphs using double newlines.
    return "\n\n".join(cleaned_paragraphs)

# Load text file
file_path = "/content/Summary for policymakers.txt"  # Replace with your file path.
with open(file_path, "r", encoding="utf-8") as f:
    raw_text = f.read()

# Clean the document
# (drops short paragraphs and reference-list-like paragraphs, see clean_document)
cleaned_text = clean_document(raw_text)

# Save the cleaned document to a new file.
# The QA-generation cell reads this file back from the same path.
cleaned_file_path = "/content/summary_cleaned.txt"
with open(cleaned_file_path, "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print(f"\nCleaned document saved to {cleaned_file_path}")


In [None]:
#@title Process cleaned TXT file to QA format

#Load cleaned text file
with open("/content/summary_cleaned.txt", "r", encoding="utf-8") as f:
    cleaned_text = f.read()

#Split the text into paragraphs using a regex that splits on one or more blank lines
paragraphs = re.split(r'\n\s*\n', cleaned_text.strip())
print(f"Found {len(paragraphs)} paragraphs in the cleaned document.")

#Create dataset from paragraphs
# NOTE: this rebinds `dataset` to a plain Dataset with a single "context"
# column (one row per paragraph); it no longer has a "train" split.
data = {"context": paragraphs}
dataset = Dataset.from_dict(data)

#1. Question Generation using LLM
# Load the tokenizer and the seq2seq model for the question-generation checkpoint.
qg_model_name = "valhalla/t5-base-qg-hl"
qg_tokenizer = AutoTokenizer.from_pretrained(qg_model_name)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(qg_model_name)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qg_model.to(device)

def generate_question(example):
    """Generate one question for a paragraph with the question-generation model.

    Args:
        example: Row mapping; ``example["context"]`` is the paragraph text.

    Returns:
        The same ``example``, with the decoded question stored under
        ``"question"``.
    """
    # The tokenized prompt is capped at 512 tokens; longer prompts are truncated.
    encoded = qg_tokenizer(
        f"generate question: {example['context']}",
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Sampling is enabled, so the question can differ between runs unless a
    # seed is fixed elsewhere.
    sampling_options = dict(
        max_length=64,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1,
    )
    generated = qg_model.generate(encoded["input_ids"], **sampling_options)

    # Only one sequence is requested, so the first output is the question.
    example["question"] = qg_tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    return example

# Add the "question" field to every paragraph row.
dataset = dataset.map(generate_question)

#2. Answer Generation using LLM
# Load the answer model as a seq2seq model and move it to the same device as
# the question-generation model.
qa_model_name = "google/flan-t5-base"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(qa_model_name)
qa_model.to(device)

def generate_answer(example):
    """Generate an answer for a paragraph's question, conditioned on the paragraph.

    Args:
        example: Row mapping with a ``"context"`` entry and, optionally, a
            ``"question"`` entry (an empty string is used when it is missing).

    Returns:
        The same ``example``, with the decoded answer stored under ``"answer"``.
    """
    # Prompt layout: context first, then the generated question, then a fixed
    # instruction.
    # NOTE(review): the whole prompt is truncated to 512 tokens (presumably
    # from the right), so for long contexts the question/instruction may be
    # cut off - confirm this is acceptable.
    prompt_parts = [
        f"Context:\n{example['context']}\n\n",
        f"Question: {example.get('question', '')}\n\n",
        "Provide a detailed answer discussing the long-term technological, economic, social, environmental, ",
        "and geopolitical impacts by 2100.",
    ]
    encoded = qa_tokenizer("".join(prompt_parts), return_tensors="pt", max_length=512, truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Sampling is enabled, so the answer can differ between runs unless a seed
    # is fixed elsewhere. Output length is bounded by min_length/max_length.
    sampling_options = dict(
        max_length=250,
        min_length=10,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.2,
        num_return_sequences=1,
    )
    generated = qa_model.generate(encoded["input_ids"], **sampling_options)

    example["answer"] = qa_tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    return example

# Add the "answer" field to every paragraph row.
dataset = dataset.map(generate_answer)

#Print examples
# Preview at most the first 10 rows; capping at the dataset size avoids an
# IndexError when the document yields fewer than 10 paragraphs.
for i in range(min(10, len(dataset))):
    row = dataset[i]  # fetch the row once instead of once per field
    print(f"Paragraph {i+1}:")
    print("Context:")
    print(row["context"])
    print("\nGenerated Question:")
    print(row["question"])
    print("\nGenerated Answer:")
    print(row["answer"])
    print("="*60)


Found 2289 paragraphs in the cleaned document.


Map:   0%|          | 0/2289 [00:00<?, ? examples/s]

Map:   0%|          | 0/2289 [00:00<?, ? examples/s]

Paragraph 1:
Context:
The Third Assessment Report of Working Group I of the
Intergovernmental Panel on Climate Change (IPCC) builds
upon past assessments and incorporates new results from the
past five years of research on climate change1. Many hundreds
of scientists2 from many countries participated in its preparation
and review.

Generated Question:
What is the third assessment of Working Group I?

Generated Answer:
The Third Assessment Report of Working Group I of the Intergovernmental Panel on Climate Change (IPCC)
Paragraph 2:
Context:
This Summary for Policymakers (SPM), which was approved
by IPCC member governments in Shanghai in January 20013,
describes the current state of understanding of the climate
system and provides estimates of its projected future evolution
and their uncertainties. Further details can be found in the
underlying report, and the appended Source Information
provides cross references to the report's chapters.

Generated Question:
What is the name of the sum

In [None]:
#Save the generated QA rows as JSON

# Keep the path in one place so the confirmation message always names the
# file that was actually written.
qa_summary_path = "/content/qa_summary.json"
dataset.to_json(qa_summary_path)
print(f"Dataset saved to {qa_summary_path}")

Creating json from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Dataset saved to /content/qa_dataset.json


Now we proceed with cleaning the documents in the G7 folder (all previously converted to TXT files).

In [None]:
#@title Clean Document Starting from Line 150

def clean_page_text(page_text):
    """Clean the text of a single page line by line.

    Each line is stripped of surrounding whitespace and dropped when it:
      - is empty,
      - is just a number (likely a page number),
      - has fewer than 3 words,
      - contains 5 or more commas (likely a table row or reference list),
      - contains a run of 5 or more whitespace characters between words
        (likely column-aligned, table-like text).

    Args:
        page_text: Raw text of one page.

    Returns:
        The kept lines joined with single newlines. An empty string if no
        line is kept.
    """
    cleaned_lines = []
    for line in page_text.splitlines():
        line = line.strip()
        #Skip lines that are empty
        if not line:
            continue
        #Remove lines that are just a number (likely a page number)
        if re.fullmatch(r"\d+", line):
            continue
        #Skip lines with fewer than 3 words
        if len(line.split()) < 3:
            continue
        #Remove lines that contain 5 or more commas (likely tables or reference lists).
        if line.count(",") >= 5:
            continue
        #Check if the line contains a long gap after some word
        if re.search(r"(?<=\S)\s{5,}", line):
            continue

        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)

#Load txt file
file_path = "/content/Climate-Adaptation-Plan.txt"
with open(file_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Skip the first 150 lines
# The count lives in one constant so the progress message below cannot drift
# from the slice that is actually applied.
SKIP_LINES = 150
lines_start = lines[SKIP_LINES:]
raw_text = "".join(lines_start)
# Turn form feeds into newlines so they do not stay embedded in the text.
raw_text = raw_text.replace("\f", "\n")

#Split the text into pages
# A line that contains only digits (optionally surrounded by whitespace) is
# treated as a page boundary.
pages = re.split(r"\n\s*\d+\s*\n", raw_text)
print(f"Found {len(pages)} pages (after skipping the first {SKIP_LINES} lines).")

cleaned_pages = [clean_page_text(page) for page in pages]
# Pages are separated by a blank line in the saved file.
cleaned_document = "\n\n".join(cleaned_pages)

#Save file
cleaned_file_path = "/content/adaptation_us.txt"
with open(cleaned_file_path, "w", encoding="utf-8") as f:
    f.write(cleaned_document)

print(f"Cleaned document saved to {cleaned_file_path}")

#Preview
print("\nPreview of cleaned document:")
print(cleaned_document[:10000])


Found 68 pages (starting from line 100).
Cleaned document saved to /content/adaptation_us.txt

Preview of cleaned document:
Climate change poses significant risks to EPA’s mission of protecting human health and the
environment, as well as its own workforce and facilities. For over a decade, EPA has focused on
ensuring it continues to fulfill its mission of protecting human health and the environment even as the
climate changes. Following the release of its 2021 Climate Adaptation Action Plan, EPA significantly
increased its efforts to incorporate climate adaptation planning into the agency’s programs, policies,
rulemaking processes, enforcement activities, and operations. Since that time, EPA has made
significant advances and established innovative actions and processes described in this document. It
businesses to strengthen their adaptive capacity and increase their resilience to climate change
impacts, placing a particular focus on communities with environmental justice concerns. EPA

In [None]:
#@title Process cleaned TXT file to QA format

#Load cleaned text file
with open("/content/adaptation_us.txt", "r", encoding="utf-8") as f:
    cleaned_text = f.read()

#Split the cleaned text into pages
# The cleaned file separates pages with a blank line, so splitting on blank
# lines recovers them. Use `cleaned_text` here, not the uncleaned `raw_text`
# from the cleaning cell: otherwise the line filtering is never applied and
# this cell depends on a variable left over from another cell.
pages = [page for page in re.split(r"\n\s*\n", cleaned_text.strip()) if page.strip()]

#Create dataset from pages
# NOTE: this rebinds `dataset` to a plain Dataset with one "context" row per page.
data = {"context": pages}
dataset = Dataset.from_dict(data)

#1. Question Generation using LLM
# Load the tokenizer and the seq2seq model for the question-generation checkpoint.
qg_model_name = "valhalla/t5-base-qg-hl"
qg_tokenizer = AutoTokenizer.from_pretrained(qg_model_name)
qg_model = AutoModelForSeq2SeqLM.from_pretrained(qg_model_name)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
qg_model.to(device)

def generate_question(example):
    """Generate one question for a page with the question-generation model.

    Args:
        example: Row mapping; ``example["context"]`` is the page text.

    Returns:
        The same ``example``, with the decoded question stored under
        ``"question"``.
    """
    # The tokenized prompt is capped at 512 tokens; longer prompts are truncated.
    encoded = qg_tokenizer(
        f"generate question: {example['context']}",
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Sampling is enabled, so the question can differ between runs unless a
    # seed is fixed elsewhere.
    sampling_options = dict(
        max_length=64,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1,
    )
    generated = qg_model.generate(encoded["input_ids"], **sampling_options)

    # Only one sequence is requested, so the first output is the question.
    example["question"] = qg_tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    return example

# Add the "question" field to every page row.
dataset = dataset.map(generate_question)

#2. Answer Generation using LLM
# Load the answer model as a seq2seq model and move it to the same device as
# the question-generation model.
qa_model_name = "google/flan-t5-base"
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)
qa_model = AutoModelForSeq2SeqLM.from_pretrained(qa_model_name)
qa_model.to(device)

def generate_answer(example):
    """Generate an answer for a page's question, conditioned on the page text.

    Args:
        example: Row mapping with a ``"context"`` entry and, optionally, a
            ``"question"`` entry (an empty string is used when it is missing).

    Returns:
        The same ``example``, with the decoded answer stored under ``"answer"``.
    """
    # Prompt layout: context first, then the generated question, then a fixed
    # instruction.
    # NOTE(review): the whole prompt is truncated to 512 tokens (presumably
    # from the right), so for long contexts the question/instruction may be
    # cut off - confirm this is acceptable.
    prompt_parts = [
        f"Context:\n{example['context']}\n\n",
        f"Question: {example.get('question', '')}\n\n",
        "Provide a detailed answer discussing the long-term technological, economic, social, environmental, ",
        "and geopolitical impacts by 2100.",
    ]
    encoded = qa_tokenizer("".join(prompt_parts), return_tensors="pt", max_length=512, truncation=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Sampling is enabled, so the answer can differ between runs unless a seed
    # is fixed elsewhere. min_length=100 forces comparatively long answers.
    sampling_options = dict(
        max_length=250,
        min_length=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=1.2,
        num_return_sequences=1,
    )
    generated = qa_model.generate(encoded["input_ids"], **sampling_options)

    example["answer"] = qa_tokenizer.decode(generated[0], skip_special_tokens=True).strip()
    return example

# Add the "answer" field to every page row.
dataset = dataset.map(generate_answer)

#Print examples
# Preview at most the first 10 rows; capping at the dataset size avoids an
# IndexError when the document yields fewer than 10 pages.
for i in range(min(10, len(dataset))):
    row = dataset[i]  # fetch the row once instead of once per field
    print(f"Paragraph {i+1}:")
    print("Context:")
    print(row["context"])
    print("\nGenerated Question:")
    print(row["question"])
    print("\nGenerated Answer:")
    print(row["answer"])
    print("="*60)


Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Paragraph 1:
Context:
Climate change poses significant risks to EPA’s mission of protecting human health and the
environment, as well as its own workforce and facilities. For over a decade, EPA has focused on
ensuring it continues to fulfill its mission of protecting human health and the environment even as the
climate changes. Following the release of its 2021 Climate Adaptation Action Plan, EPA significantly
increased its efforts to incorporate climate adaptation planning into the agency’s programs, policies,
rulemaking processes, enforcement activities, and operations. Since that time, EPA has made
significant advances and established innovative actions and processes described in this document. It
has also partnered with states, Tribes, territories, local governments, community groups, and
businesses to strengthen their adaptive capacity and increase their resilience to climate change
impacts, placing a particular focus on communities with environmental justice concerns. EPA is
comm

In [None]:
#Save the generated context/question/answer rows as JSON
# to_json is the last expression in the cell, so its return value is shown as
# the cell output.
dataset.to_json("/content/qa_adaptation.json")


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

283164