<a href="https://colab.research.google.com/github/Hicham-Yezza/LLM-SUM/blob/main/LLM_KG_summariser_prototype.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install necessary libraries
!pip install transformers datasets SPARQLWrapper

# Verify GPU availability
import torch
if torch.cuda.is_available():
    print("GPU is available.")
else:
    print("Running on CPU.")

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl.metadata (2.0 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.0.0-py3-none-any.whl.metadata (11 kB)
Collecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl.metadata (9.6 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from SPARQLWrapper import SPARQLWrapper, JSON

def query_knowledge_graph(sparql_query):
    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
    sparql.setQuery(sparql_query)
    sparql.setReturnFormat(JSON)
    try:
        results = sparql.query().convert()
        return results["results"]["bindings"]
    except Exception as e:
        print(f"Error querying KG: {e}")
        return []

# Example query to get data about humans
query = """
SELECT ?item ?itemLabel WHERE {
  ?item wdt:P31 wd:Q5.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} LIMIT 5
"""

results = query_knowledge_graph(query)
for result in results:
    print(result["itemLabel"]["value"])

Andrei Rublev
Georgina Cassar
Pedro Aguirre Cerda
Arturo Alessandri
François Villon


In [3]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load pre-trained T5-small model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
def prepare_dataset(kg_data):
    # Transform the KG data into summarization input/output format
    inputs = []
    outputs = []
    for entry in kg_data:
        # Example formatting of KG data
        fact = entry['itemLabel']['value']
        inputs.append(f"Summarize the fact: {fact}")
        outputs.append(fact)  # In a real scenario, you'd want to process this further
    return inputs, outputs

# Prepare the dataset from KG query results
train_inputs, train_outputs = prepare_dataset(results)

In [5]:
from datasets import Dataset
from transformers import Trainer, TrainingArguments

# Create a Dataset object for the trainer
train_data = Dataset.from_dict({
    'input_texts': train_inputs,
    'labels': train_outputs
})

# Tokenize data
def tokenize_data(examples):
    inputs = tokenizer(examples['input_texts'], padding='max_length', truncation=True, return_tensors="pt")
    labels = tokenizer(examples['labels'], padding='max_length', truncation=True, return_tensors="pt").input_ids
    inputs['labels'] = labels
    return inputs

train_data = train_data.map(tokenize_data, batched=True)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",
    per_device_train_batch_size=4,
    num_train_epochs=1,  # Kept low to save resources in Colab
    save_steps=10_000,
    save_total_limit=2,
    load_best_model_at_end=True
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data
)

# Fine-tune the model
trainer.train()

Map:   0%|          | 0/5 [00:00<?, ? examples/s]



Step,Training Loss,Validation Loss


TrainOutput(global_step=2, training_loss=16.393192291259766, metrics={'train_runtime': 8.3891, 'train_samples_per_second': 0.596, 'train_steps_per_second': 0.238, 'total_flos': 676709007360.0, 'train_loss': 16.393192291259766, 'epoch': 1.0})

In [None]:
# Approach 2 - In-Context Learning with Real-Time KG Extraction

In [7]:
def generate_summary_with_kg(input_text, kg_data):
    # Check if GPU is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Move the model to the correct device
    model.to(device)

    # Use KG facts to enrich the input prompt
    kg_facts = ", ".join([entry['itemLabel']['value'] for entry in kg_data])
    prompt = f"Summarize the following text with additional knowledge: {input_text}. Knowledge: {kg_facts}."

    # Tokenize the input and move it to the same device as the model
    inputs = tokenizer(prompt, return_tensors='pt').input_ids.to(device)

    # Generate the summary
    outputs = model.generate(inputs, max_length=150, num_beams=4, early_stopping=True)

    # Decode and return the output, move back to CPU for decoding if necessary
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
summary = generate_summary_with_kg("Humans are the only species known to build cities.", results)
print(summary)


the following text: text with additional knowledge: Humans are only species known to build cities. Knowledge: Andrei Rublev, Georgina Cassar, Pedro Aguirre Cerda, Arturo Alessandri, François Villon.


In [10]:
!pip install evaluate rouge_score


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=a191a5ffc1b1c0e5b6b92a7b3530463f08a96439b154101bee21b70ab397c491
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.2 rouge_score-0.1.2


In [11]:
import evaluate

# Load the ROUGE metric from the `evaluate` library
rouge = evaluate.load('rouge')

# Function to compute ROUGE score
def compute_rouge(predictions, references):
    return rouge.compute(predictions=predictions, references=references)

# Example predictions and references
predictions = ["Humans are a species that build cities."]
references = ["Humans are the only species known to build cities."]

# Compute and print ROUGE score
rouge_score = compute_rouge(predictions, references)
print(rouge_score)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

{'rouge1': 0.6250000000000001, 'rouge2': 0.28571428571428575, 'rougeL': 0.6250000000000001, 'rougeLsum': 0.6250000000000001}
