# Load Dataset Using Hugging Face Datasets Library

This script installs the `datasets` library, loads the "epfl-llm/guidelines" dataset, and displays its contents.


In [11]:
!pip install datasets


from datasets import load_dataset

dataset = load_dataset("epfl-llm/guidelines")

dataset



DatasetDict({
    train: Dataset({
        features: ['id', 'source', 'title', 'clean_text', 'raw_text', 'url', 'overview'],
        num_rows: 37970
    })
})

In [12]:
!pip install pandas



# Access and Explore Dataset Features

This script accesses the training split of a loaded dataset and displays the feature headers.


In [13]:
# Pull the 'train' split out of the DatasetDict.
train_dataset = dataset["train"]

# List the feature (column) names of that split.
headers = train_dataset.features.keys()
print(headers)

dict_keys(['id', 'source', 'title', 'clean_text', 'raw_text', 'url', 'overview'])


# Convert Dataset to DataFrame

This script converts the training split of a dataset into a Pandas DataFrame and displays the first few rows.


In [14]:
# Materialize the train split as a pandas DataFrame.
df = train_dataset.to_pandas()

# Preview the first five rows (head's default count) as plain text.
print(df.head(n=5))

                                         id source title  \
0  7a73f9287841533eeb11c025026322a23d519f2c    cco  None   
1  0ca6b1adf1e6c001dc70cd13be35a1e8a4c14839    cco  None   
2  68984194848f42b555d2a3c9077ac7a1e53f976b    cco  None   
3  c80ad0d5ba1eee75e1702847d361abd6ce4bc7b0    cco  None   
4  7a09ab0610ecc9990db3362ca0b00a37a016acb4    cco  None   

                                          clean_text  \
0  # QUESTIONS Diagnosis/Staging\nWhat benefit to...   
1  # GUIDELINE OBJECTIVES\nTo update clinical gui...   
2  # GUIDELINE OBJECTIVES\nTo make recommendation...   
3  Evidence-Based Series 4-5 is CURRENT as of Nov...   
4  # GUIDELINE OBJECTIVES\nTo make recommendation...   

                                            raw_text   url overview  
0  # QUESTIONS Diagnosis/Staging\nWhat benefit to...  None     None  
1  This report is copyrighted by Ontario Health (...  None     None  
2  This report is copyrighted by Cancer Care Onta...  None     None  
3  Evidence-Based Seri

In [15]:
# Display the cleaned guideline text column (pandas truncates the rendering for long Series).
df['clean_text']

Unnamed: 0,clean_text
0,# QUESTIONS Diagnosis/Staging\nWhat benefit to...
1,# GUIDELINE OBJECTIVES\nTo update clinical gui...
2,# GUIDELINE OBJECTIVES\nTo make recommendation...
3,Evidence-Based Series 4-5 is CURRENT as of Nov...
4,# GUIDELINE OBJECTIVES\nTo make recommendation...
...,...
37965,"ZP4\nZona pellucida sperm-binding protein 4, Z..."
37966,pH\n- Acid-base extraction\n- Acid-base reacti...
37967,Rb\nRB or Rb may stand for:\n- the chemical el...
37968,SB\n\n# Curators\nAnyone should feel free to a...


# Preprocess and Sample Dataset

This script preprocesses the text data by removing newline characters and then selects the first 20 samples for further analysis.


In [16]:
# Keep only the first N_SAMPLES rows, then strip newline characters from their text.
# Sampling first avoids rewriting every document in the corpus just to keep a handful.
N_SAMPLES = 20

# .copy() gives an independent frame, so the column assignment below never writes to a view.
df = df.head(N_SAMPLES).copy()

# NOTE(review): '\n' is replaced by the empty string, so words separated only by a
# newline end up joined; switch the replacement to ' ' if that is not intended.
df['clean_text'] = df['clean_text'].str.replace('\n', '', regex=False)
df.shape


(20, 7)

In [17]:
pip install transformers



# Load BART Model and Tokenizer

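This script loads the BART tokenizer and model for conditional generation from the "facebook/bart-base" pretrained checkpoint, then reloads the dataset, keeps the first 100 records of the training split, and tokenizes them.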


In [21]:
from transformers import BartTokenizer, BartForConditionalGeneration
from datasets import load_dataset

# Load tokenizer and model from the same checkpoint.
# NOTE(review): "facebook/bart-base" is presumably the generic pretrained checkpoint rather than
# one fine-tuned for summarization — confirm it is the intended model.
MODEL_NAME = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

# Load and prepare the dataset
dataset = load_dataset("epfl-llm/guidelines")

# Slice the dataset to get only the first N_RECORDS records of the train split
N_RECORDS = 100
dataset_slice = dataset['train'].select(range(N_RECORDS))  # Adjust split name as needed

# Tokenize the dataset
def preprocess_function(examples):
    """Tokenize a batch of records into model inputs with a ``labels`` entry.

    Args:
        examples: Batch mapping with (at least) 'source' and 'clean_text' columns.

    Returns:
        The tokenizer output for the 'source' column, extended with ``labels``
        holding the token ids of the 'clean_text' column.
    """
    # Input side: the 'source' column, padded/truncated to exactly 1024 tokens.
    # NOTE(review): the printed record shows 'source' holding a short tag ('cco'),
    # not document text — confirm this is the intended input column.
    model_inputs = tokenizer(
        examples['source'],
        max_length=1024,
        truncation=True,
        padding="max_length",
    )

    # Target side: the 'clean_text' column, padded/truncated to exactly 256 tokens.
    label_encoding = tokenizer(
        examples['clean_text'],
        max_length=256,
        truncation=True,
        padding="max_length",
    )

    # Attach the target token ids under the 'labels' key.
    model_inputs['labels'] = label_encoding['input_ids']
    return model_inputs

# Tokenize every record of the slice; with batched=True the function is called on batches.
tokenized_datasets = dataset_slice.map(function=preprocess_function, batched=True)

# Sanity check: dump the first tokenized record.
print(tokenized_datasets[0])


{'id': '7a73f9287841533eeb11c025026322a23d519f2c', 'source': 'cco', 'title': 'None', 'clean_text': '# QUESTIONS Diagnosis/Staging\nWhat benefit to clinical management does positron emission tomography (PET) or positron emission tomography/computed tomography (PET/CT) contribute to the diagnosis or staging of head and neck cancer? What benefit to clinical management does PET or PET/CT contribute to the assessment of treatment response for head and neck cancer?\nWhat benefit to clinical management does PET or PET/CT contribute when recurrence of head and neck cancer is suspected but not proven? What benefit to clinical management does PET or PET/CT contribute to restaging at the time of documented recurrence for head and neck cancer? What is the role of PET when a solitary metastasis is identified at the time of recurrence and a metastectomy is being contemplated?\n\n# TARGET POPULATION\nPatients with head and neck cancer are the target population for this recommendation report.\n\n# INT

# Text Summarization Using BART Model

This script defines a function that generates summaries with the BART model and applies it to the tokenized dataset, adding a `generated_summary` column. The first record is displayed in the following cell for verification.


In [23]:
# Define summarization function
def summarize(text, max_length=50, min_length=20, num_beams=4, length_penalty=2.0):
    """Generate a summary of ``text`` using the notebook-level ``tokenizer`` and ``model``.

    Args:
        text: Input string; it is tokenized and truncated to 1024 tokens.
        max_length: ``max_length`` passed to ``model.generate``.
        min_length: ``min_length`` passed to ``model.generate``.
        num_beams: Number of beams for beam search.
        length_penalty: Length penalty applied during beam search.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    # Keep the tensors on the same device as the model so generation also works on GPU.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly instead of letting generate() infer it.
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Apply summarization to the preprocessed dataset
def generate_summaries(examples):
    """Summarize every document in a batch.

    Args:
        examples: Batch mapping containing a 'clean_text' column of guideline texts.

    Returns:
        A dict with one new column, 'generated_summary', holding one summary per record.
    """
    # Summarize the guideline text itself. The 'source' column only holds a short
    # tag (e.g. 'cco' in the printed record), so it is not meaningful input to summarize.
    summaries = [summarize(text) for text in examples['clean_text']]
    return {"generated_summary": summaries}

# Run generate_summaries over the tokenized records in batches, adding its output column.
summarized_datasets = tokenized_datasets.map(function=generate_summaries, batched=True)



In [24]:
# Display the first record, including the 'generated_summary' field added by the map step.
summarized_datasets[0]

{'id': '7a73f9287841533eeb11c025026322a23d519f2c',
 'source': 'cco',
 'title': 'None',
 'clean_text': '# QUESTIONS Diagnosis/Staging\nWhat benefit to clinical management does positron emission tomography (PET) or positron emission tomography/computed tomography (PET/CT) contribute to the diagnosis or staging of head and neck cancer? What benefit to clinical management does PET or PET/CT contribute to the assessment of treatment response for head and neck cancer?\nWhat benefit to clinical management does PET or PET/CT contribute when recurrence of head and neck cancer is suspected but not proven? What benefit to clinical management does PET or PET/CT contribute to restaging at the time of documented recurrence for head and neck cancer? What is the role of PET when a solitary metastasis is identified at the time of recurrence and a metastectomy is being contemplated?\n\n# TARGET POPULATION\nPatients with head and neck cancer are the target population for this recommendation report.\n\n# 