In [1]:
from datasets import load_dataset

# Fetch the neulab/tldr dataset.
# NOTE: trust_remote_code=True permits the dataset's own loading script to run locally.
tldr = load_dataset('neulab/tldr', trust_remote_code=True)

# Bind each split to its own name (train, validation, test)
tldr_train, tldr_validation, tldr_test = (
    tldr[split_name] for split_name in ('train', 'validation', 'test')
)

# Sanity check: show the overall structure, then the first training record
print(tldr)
print(tldr_train[0])


DatasetDict({
    train: Dataset({
        features: ['question_id', 'nl', 'cmd', 'oracle_man', 'cmd_name', 'tldr_cmd_name', 'manual_exist', 'matching_info'],
        num_rows: 6414
    })
    test: Dataset({
        features: ['question_id', 'nl', 'cmd', 'oracle_man', 'cmd_name', 'tldr_cmd_name', 'manual_exist', 'matching_info'],
        num_rows: 928
    })
    validation: Dataset({
        features: ['question_id', 'nl', 'cmd', 'oracle_man', 'cmd_name', 'tldr_cmd_name', 'manual_exist', 'matching_info'],
        num_rows: 1845
    })
})
{'question_id': '0', 'nl': 'get the label of a fat32 partition', 'cmd': 'fatlabel {{/dev/sda1}}', 'oracle_man': ['fatlabel_3'], 'cmd_name': 'fatlabel', 'tldr_cmd_name': 'fatlabel', 'manual_exist': True, 'matching_info': {'token': ['|main|'], 'oracle_man': [['fatlabel_3', 'fatlabel_4']]}}


In [4]:
# How many test examples to run through generation and scoring
NUM_TEST_EXAMPLES = 5

# Take the first NUM_TEST_EXAMPLES rows of the test split; clamp to the split
# size so a shorter split does not raise an out-of-range error in select().
test_examples = tldr['test'].select(
    range(min(NUM_TEST_EXAMPLES, len(tldr['test'])))
)

In [8]:
import torch

# Outputs collected by this cell; re-initialised here so re-running the cell
# does not append duplicates to lists left over from a previous run.
generated_summaries = []
reference_summaries = []

# Token-length limits
MAX_SOURCE_LENGTH = 512  # inputs ('nl') longer than this are truncated
MAX_TARGET_LENGTH = 64   # upper bound passed to generate() for the output ('cmd')

# NOTE(review): `model` and `tokenizer` are not defined in the cells visible
# here -- presumably created in an earlier cell; confirm the notebook survives
# Restart Kernel -> Run All.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # switch off training-only behaviour (e.g. dropout) for inference

# Gradients are not needed for inference
with torch.no_grad():
    for example in test_examples:
        # Tokenize the natural-language description. Each example is encoded
        # on its own, so no padding is required; padding to MAX_SOURCE_LENGTH
        # would only add pad tokens for the model to process.
        inputs = tokenizer(
            example['nl'],
            max_length=MAX_SOURCE_LENGTH,
            truncation=True,
            return_tensors='pt'
        )
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Beam-search generation. The attention mask is passed explicitly so
        # the model never has to guess which input positions are real tokens.
        # NOTE(review): length_penalty=2.0 favours longer beams; the recorded
        # outputs are long and repetitive -- consider revisiting for short
        # command targets.
        summary_ids = model.generate(
            inputs['input_ids'],
            attention_mask=inputs.get('attention_mask'),
            num_beams=4,
            max_length=MAX_TARGET_LENGTH,
            length_penalty=2.0,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

        # Decode the first returned sequence back to text
        generated_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        generated_summaries.append(generated_summary)

        # The ground-truth command is the reference for later scoring
        reference_summaries.append(example['cmd'])

        # Show input, prediction and reference side by side
        print("\nOriginal NL Description:", example['nl'])
        print("\nGenerated Summary (Command):", generated_summary)
        print("\nReference Summary (Command):", example['cmd'])
        print("\n" + "=" * 80)



Original NL Description: delete a shared memory segment by id

Generated Summary (Command): delete a shared memory segment by id by id or by id in the following manner: (i.e. delete a memory segment from the shared memory of a person who has shared a memory with another person): (1) delete a segment from a memory that is

Reference Summary (Command): ipcrm --shmem-id {{shmem_id}}


Original NL Description: delete a shared memory segment by key

Generated Summary (Command): delete a shared memory segment by key key by key by deleting a key from a key in a key memory segment and replacing it with a new key by a different key in the same key. delete a key and replace it with another key by another key.delete a

Reference Summary (Command): ipcrm --shmem-key {{shmem_key}}


Original NL Description: delete an ipc queue by id

Generated Summary (Command): delete an ipc queue by id by id to avoid the use of the id of an IPc queue on a computer with an id that is not the same as the id used t

In [11]:
# Import necessary libraries
from rouge_score import rouge_scorer
import numpy as np
import textstat  # Library for readability scores

# Single ROUGE scorer (with stemming) for the three variants reported below
scorer = rouge_scorer.RougeScorer(
    ['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

# Readability helper: thin wrapper around textstat's FKGL implementation
def calculate_fkgl(text):
    """Return the Flesch-Kincaid Grade Level that textstat computes for ``text``."""
    grade_level = textstat.flesch_kincaid_grade(text)
    return grade_level

# Per-example metric containers (reset on every run of this cell)
rouge_scores = []
fkgl_scores = []

# The two lists are built in lockstep by the generation cell; fail loudly if
# they ever diverge, because zip() would otherwise silently drop the extras.
assert len(generated_summaries) == len(reference_summaries), \
    "generated and reference summaries must have the same length"

# Score every (generated, reference) pair
for gen, ref in zip(generated_summaries, reference_summaries):
    # RougeScorer.score takes (target, prediction): the reference goes first.
    # With the arguments swapped, precision and recall are exchanged
    # (F-measure is symmetric, so the averages below are unaffected).
    score = scorer.score(ref, gen)
    rouge_scores.append(score)

    # Readability (FKGL) of the generated text only
    fkgl_score = calculate_fkgl(gen)
    fkgl_scores.append(fkgl_score)

# Mean F-measure per ROUGE variant across all examples
avg_rouge_scores = {
    'rouge1': np.mean([score['rouge1'].fmeasure for score in rouge_scores]),
    'rouge2': np.mean([score['rouge2'].fmeasure for score in rouge_scores]),
    'rougeL': np.mean([score['rougeL'].fmeasure for score in rouge_scores])
}

# Mean FKGL across all generated summaries
avg_fkgl_score = np.mean(fkgl_scores)

# Report the averages
print("\nAverage ROUGE Scores:")
for metric, score in avg_rouge_scores.items():
    print(f"{metric}: {score:.4f}")

print(f"\nAverage Flesch-Kincaid Grade Level (FKGL): {avg_fkgl_score:.4f}")

# Bundle everything for display.
# Note: 'rouge_scores' holds the *averaged* F-measures, while 'fkgl_scores'
# holds the per-example values (its average is under 'avg_fkgl_score').
summary_results = {
    'generated_summaries': generated_summaries,
    'reference_summaries': reference_summaries,
    'rouge_scores': avg_rouge_scores,
    'fkgl_scores': fkgl_scores,
    'avg_fkgl_score': avg_fkgl_score
}

# Last expression of the cell: rendered as the cell output
summary_results



Average ROUGE Scores:
rouge1: 0.1368
rouge2: 0.0155
rougeL: 0.1288

Average Flesch-Kincaid Grade Level (FKGL): 8.0600


{'generated_summaries': ['delete a shared memory segment by id by id or by id in the following manner: (i.e. delete a memory segment from the shared memory of a person who has shared a memory with another person): (1) delete a segment from a memory that is',
  'delete a shared memory segment by key key by key by deleting a key from a key in a key memory segment and replacing it with a new key by a different key in the same key. delete a key and replace it with another key by another key.delete a',
  "delete an ipc queue by id by id to avoid the use of the id of an IPc queue on a computer with an id that is not the same as the id used to create the queue by the user's IP address.delete an IPC queue by",
  'delete an ipc queue by key by key: (i.e. delete an IPc queue from an address by key or a key from a key to a key that is not a key) to an address that is a key or an address in a key',
  'delete a semaphore by id by id or by id to avoid the use of semaphores by the user of a mobile ph