In [1]:
import os
import zipfile
import torch
from transformers import BartTokenizer, BartForConditionalGeneration
import textstat
from rouge_score import rouge_scorer

# Load the BART summarization checkpoint and its tokenizer from the local
# 'models/billsum' directory. Later cells reuse `model` and `tokenizer`.
model = BartForConditionalGeneration.from_pretrained('models/billsum')
tokenizer = BartTokenizer.from_pretrained('models/billsum')

# Sample input texts (replace with your actual texts)
input_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is a field of computer science focused on creating intelligent machines."
]

# Reference summaries for ROUGE comparison; reference_summaries[i] belongs to input_texts[i].
reference_summaries = [
    "A fox jumps over a dog.",
    "AI is about creating smart machines."
]

# The scoring loop pairs the two lists with zip() (which silently drops
# unmatched items) and the display loop indexes both by position, so fail
# early with a clear message instead of producing partial or misaligned results.
if not input_texts:
    raise ValueError("input_texts is empty; there is nothing to summarize or score.")
if len(input_texts) != len(reference_summaries):
    raise ValueError(
        f"Expected one reference summary per input text, got "
        f"{len(input_texts)} inputs and {len(reference_summaries)} references."
    )

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

generated_summaries = []

# Generate one summary per input text
for input_text in input_texts:
    # Tokenize the input text (truncated to at most 1024 tokens)
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)

    # Pass the tokenizer's attention mask explicitly rather than relying on
    # generate() to reconstruct it from the input ids.
    # NOTE(review): min_length=40 asks for at least 40 generated tokens, which is
    # longer than these one-sentence samples; reconsider it for short inputs.
    summary_ids = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4
    )
    generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    generated_summaries.append(generated_summary)

# Collect the per-example ROUGE F-measures, keyed by metric name
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

for generated_summary, reference_summary in zip(generated_summaries, reference_summaries):
    # RougeScorer.score takes the reference (target) first, then the prediction.
    scores = scorer.score(reference_summary, generated_summary)
    for metric, values in rouge_scores.items():
        values.append(scores[metric].fmeasure)

# Average ROUGE scores
avg_rouge1 = sum(rouge_scores['rouge1']) / len(rouge_scores['rouge1'])
avg_rouge2 = sum(rouge_scores['rouge2']) / len(rouge_scores['rouge2'])
avg_rougeL = sum(rouge_scores['rougeL']) / len(rouge_scores['rougeL'])

# Flesch-Kincaid Grade Level of every generated summary, plus the mean
fkgl_scores = [textstat.flesch_kincaid_grade(summary) for summary in generated_summaries]
average_fkgl = sum(fkgl_scores) / len(fkgl_scores)

# Display results
print("Generated Summaries:")
for i, summary in enumerate(generated_summaries):
    print(f"Input {i+1}: {input_texts[i]}")
    print(f"Generated Summary: {summary}")
    print(f"Reference Summary: {reference_summaries[i]}\n")

print("ROUGE Scores:")
print(f"Average ROUGE-1: {avg_rouge1:.4f}")
print(f"Average ROUGE-2: {avg_rouge2:.4f}")
print(f"Average ROUGE-L: {avg_rougeL:.4f}")

print("\nFlesch-Kincaid Grade Level (FKGL):")
print(f"Average FKGL: {average_fkgl:.2f}")


Generated Summaries:
Input 1: The quick brown fox jumps over the lazy dog.
Generated Summary: The quick brown fox jumps over the lazy dog. The quick brownfox jumps over a lazy dog in a bid to get the attention of the dog's owner, who is on the other side of the fence.
Reference Summary: A fox jumps over a dog.

Input 2: Artificial intelligence is a field of computer science focused on creating intelligent machines.
Generated Summary: Artificial intelligence is a field of computer science focused on creating intelligent machines. It is the study of computer technology that can be applied to the creation of intelligent machines, including computers, robots, and other machines.
Reference Summary: AI is about creating smart machines.

ROUGE Scores:
Average ROUGE-1: 0.2078
Average ROUGE-2: 0.0714
Average ROUGE-L: 0.1851

Flesch-Kincaid Grade Level (FKGL):
Average FKGL: 9.25


In [3]:
from datasets import load_dataset

# Fetch the neulab/tldr dataset; trust_remote_code=True permits the dataset's
# own loading code to run on this machine.
tldr = load_dataset('neulab/tldr', trust_remote_code=True)

# Bind each split to its own name. Indexing a split that does not exist
# raises KeyError, so this also confirms all three splits are present.
tldr_train, tldr_validation, tldr_test = (
    tldr[split_name] for split_name in ('train', 'validation', 'test')
)

# Quick look at the overall structure and at the first training record
print(tldr)
print(tldr_train[0])


tldr-train.jsonl:   0%|          | 0.00/4.76M [00:00<?, ?B/s]

tldr-dev.jsonl:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

tldr-test.jsonl:   0%|          | 0.00/525k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question_id', 'nl', 'cmd', 'oracle_man', 'cmd_name', 'tldr_cmd_name', 'manual_exist', 'matching_info'],
        num_rows: 6414
    })
    test: Dataset({
        features: ['question_id', 'nl', 'cmd', 'oracle_man', 'cmd_name', 'tldr_cmd_name', 'manual_exist', 'matching_info'],
        num_rows: 928
    })
    validation: Dataset({
        features: ['question_id', 'nl', 'cmd', 'oracle_man', 'cmd_name', 'tldr_cmd_name', 'manual_exist', 'matching_info'],
        num_rows: 1845
    })
})
{'question_id': '0', 'nl': 'get the label of a fat32 partition', 'cmd': 'fatlabel {{/dev/sda1}}', 'oracle_man': ['fatlabel_3'], 'cmd_name': 'fatlabel', 'tldr_cmd_name': 'fatlabel', 'manual_exist': True, 'matching_info': {'token': ['|main|'], 'oracle_man': [['fatlabel_3', 'fatlabel_4']]}}


In [4]:
# Work with only the first five rows of the test split (tldr_test is tldr['test']).
test_examples = tldr_test.select(range(5))

In [8]:
import torch

# Outputs of this cell: the model's text for each test example and the
# matching reference command, kept in parallel lists.
generated_summaries = []
reference_summaries = []

# Define maximum lengths for source and target text
MAX_SOURCE_LENGTH = 512  # Maximum length of input text (e.g., 'nl')
MAX_TARGET_LENGTH = 64   # Maximum length of the generated summary (e.g., 'cmd')


# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Loop through the TLDR test examples; no gradients are needed for inference
with torch.no_grad():
    for example in test_examples:
        # Use the 'nl' field as the input text. Each example is encoded on its
        # own, so no padding is required: padding every prompt to
        # MAX_SOURCE_LENGTH only added pad positions for the encoder to process.
        inputs = tokenizer(
            example['nl'],  # Input: natural language description
            max_length=MAX_SOURCE_LENGTH,
            truncation=True,
            return_tensors='pt'
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate summary. The attention mask is passed explicitly so that
        # generate() does not have to reconstruct it from the input ids.
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            num_beams=4,
            max_length=MAX_TARGET_LENGTH,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

        # Decode generated summary
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        generated_summaries.append(generated_summary)

        # Use the 'cmd' field as the reference summary
        reference_summaries.append(example['cmd'])

        # Print results
        print("\nOriginal NL Description:", example['nl'])
        print("\nGenerated Summary (Command):", generated_summary)
        print("\nReference Summary (Command):", example['cmd'])
        print("\n" + "=" * 80)



Original NL Description: delete a shared memory segment by id

Generated Summary (Command): delete a shared memory segment by id by id or by id in the following manner: (i.e. delete a memory segment from the shared memory of a person who has shared a memory with another person): (1) delete a segment from a memory that is

Reference Summary (Command): ipcrm --shmem-id {{shmem_id}}


Original NL Description: delete a shared memory segment by key

Generated Summary (Command): delete a shared memory segment by key key by key by deleting a key from a key in a key memory segment and replacing it with a new key by a different key in the same key. delete a key and replace it with another key by another key.delete a

Reference Summary (Command): ipcrm --shmem-key {{shmem_key}}


Original NL Description: delete an ipc queue by id

Generated Summary (Command): delete an ipc queue by id by id to avoid the use of the id of an IPc queue on a computer with an id that is not the same as the id used t

In [11]:
# Import necessary libraries
from rouge_score import rouge_scorer
import numpy as np
import textstat  # Library for readability scores

# Define the ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Define the Flesch-Kincaid Grade Level (FKGL) scorer
def calculate_fkgl(text):
    """Return the Flesch-Kincaid grade level that textstat computes for `text`."""
    return textstat.flesch_kincaid_grade(text)

# Per-example results: one ROUGE score dict and one FKGL value per summary
rouge_scores = []
fkgl_scores = []

# Loop through the generated and reference summaries
for gen, ref in zip(generated_summaries, reference_summaries):
    # RougeScorer.score expects (target, prediction): the reference goes first.
    # With the arguments reversed the F-measure is unchanged, but the stored
    # precision and recall would be swapped.
    score = scorer.score(ref, gen)
    rouge_scores.append(score)

    # FKGL scoring for the generated summary
    fkgl_score = calculate_fkgl(gen)
    fkgl_scores.append(fkgl_score)

# Compute the average ROUGE F-measure for each metric
avg_rouge_scores = {
    metric: np.mean([example_scores[metric].fmeasure for example_scores in rouge_scores])
    for metric in ('rouge1', 'rouge2', 'rougeL')
}

# Compute the average FKGL score
avg_fkgl_score = np.mean(fkgl_scores)

# Print the average ROUGE scores and FKGL score
print("\nAverage ROUGE Scores:")
for metric, avg_value in avg_rouge_scores.items():
    print(f"{metric}: {avg_value:.4f}")

print(f"\nAverage Flesch-Kincaid Grade Level (FKGL): {avg_fkgl_score:.4f}")

# Prepare the summary results.
# Note: 'rouge_scores' holds the per-metric averages, not the per-example scores.
summary_results = {
    'generated_summaries': generated_summaries,
    'reference_summaries': reference_summaries,
    'rouge_scores': avg_rouge_scores,
    'fkgl_scores': fkgl_scores,
    'avg_fkgl_score': avg_fkgl_score
}

# Display or return the summary results
summary_results



Average ROUGE Scores:
rouge1: 0.1368
rouge2: 0.0155
rougeL: 0.1288

Average Flesch-Kincaid Grade Level (FKGL): 8.0600


{'generated_summaries': ['delete a shared memory segment by id by id or by id in the following manner: (i.e. delete a memory segment from the shared memory of a person who has shared a memory with another person): (1) delete a segment from a memory that is',
  'delete a shared memory segment by key key by key by deleting a key from a key in a key memory segment and replacing it with a new key by a different key in the same key. delete a key and replace it with another key by another key.delete a',
  "delete an ipc queue by id by id to avoid the use of the id of an IPc queue on a computer with an id that is not the same as the id used to create the queue by the user's IP address.delete an IPC queue by",
  'delete an ipc queue by key by key: (i.e. delete an IPc queue from an address by key or a key from a key to a key that is not a key) to an address that is a key or an address in a key',
  'delete a semaphore by id by id or by id to avoid the use of semaphores by the user of a mobile ph