# Data Science HW4 Tutorial
## Reference: https://huggingface.co/docs/transformers/tasks/summarization

In [1]:
import os

# Process-wide settings, applied up front so the libraries imported in later
# cells see them:
#   WANDB_MODE           - "disabled" switches off Weights & Biases logging.
#   CUDA_VISIBLE_DEVICES - index of the GPU to use; change it if your platform
#                          has more than one GPU.
os.environ.update(
    {
        "WANDB_MODE": "disabled",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [2]:
from datasets  import load_dataset

# Fetch the "train" split of the BillSum dataset (downloaded on first use).
billsum = load_dataset("billsum", split="train")

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 7.27k/7.27k [00:00<00:00, 10.8MB/s]
Downloading data: 100%|██████████| 91.8M/91.8M [00:01<00:00, 56.3MB/s]
Downloading data: 100%|██████████| 15.8M/15.8M [00:00<00:00, 35.1MB/s]
Downloading data: 100%|██████████| 6.12M/6.12M [00:00<00:00, 18.6MB/s]
Generating train split: 100%|██████████| 18949/18949 [00:00<00:00, 42479.66 examples/s]
Generating test split: 100%|██████████| 3269/3269 [00:00<00:00, 46923.63 examples/s]
Generating ca_test split: 100%|██████████| 1237/1237 [00:00<00:00, 42406.88 examples/s]


In [3]:
# Hold out 20% of the training split for evaluation.
# NOTE(review): the dataset is reloaded and split again in a later cell, so the
# result of this split appears to be discarded - confirm whether this cell is needed.
billsum = billsum.train_test_split(test_size=0.2)

In [4]:
from transformers import AutoTokenizer

checkpoint = "t5small_TextSummarization/" # path to the released full model
TK_ckpt = "t5-small"  # the tokenizer is loaded by this name, not from `checkpoint`
tokenizer = AutoTokenizer.from_pretrained(TK_ckpt)  # use the tokenizer from Hugging Face

In [5]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of examples for summarization.

    Args:
        examples: A batch mapping with a "text" column (documents) and a
            "summary" column (reference summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 128 tokens).
    """
    # Every document is prepended with the task prefix before tokenization.
    encoded = tokenizer(
        [prefix + document for document in examples["text"]],
        max_length=1024,
        truncation=True,
    )

    # The summaries are tokenized as targets; only their ids are kept.
    target_encoding = tokenizer(
        text_target=examples["summary"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]

    return encoded

In [7]:
from transformers import DataCollatorForSeq2Seq

# Collator used by the trainer to assemble tokenized examples into batches.
# NOTE(review): `checkpoint` is the model *path string*, not a model object; the
# model itself is only instantiated in a later cell. Confirm that passing the
# string here is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [8]:
import evaluate

# ROUGE metric object used by compute_metrics during evaluation.
rouge = evaluate.load("rouge")

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict: The ROUGE results plus ``"gen_len"`` (mean number of non-padding
        tokens per prediction), each rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 marks ignored/padded positions and is not a valid token id, so it is
    # mapped to the pad token before decoding - for predictions as well as labels.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction, counting only non-padding tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the full (unpruned) model from the released checkpoint path.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)  

In [11]:
from datasets import load_dataset

# Reload the raw "train" split; it is split and tokenized in the next cell.
# NOTE(review): this repeats the import and load done near the top of the notebook.
billsum = load_dataset("billsum", split="train")

In [12]:
# Hold out 20% of the training data for evaluation. The split is seeded (same
# value as the training seed) so the train/validation partition is reproducible
# across kernel restarts.
billsum = billsum.train_test_split(test_size=0.2, seed=42)

# Tokenize both partitions in batches with the shared preprocessing function.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/15159 [00:00<?, ? examples/s]

Map: 100%|██████████| 15159/15159 [00:14<00:00, 1021.92 examples/s]
Map: 100%|██████████| 3790/3790 [00:03<00:00, 1000.18 examples/s]


## TA's trainer for fine-tuning T5-small

In [13]:
# Hyperparameters used to fine-tune T5-small on BillSum.
training_args = Seq2SeqTrainingArguments(
    output_dir="TA_billsum_model",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    weight_decay=0.01,
    save_total_limit=3,  # cap on the number of checkpoints kept on disk
    num_train_epochs=4,
    lr_scheduler_type="linear",
    seed=42,
    fp16=True,  # mixed-precision training
    logging_steps=10,
    predict_with_generate=True,  # evaluation/prediction produce generated sequences for ROUGE
)

# The trainer is also used further down for evaluation and prediction on the test split.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


## Ratio of non-zero parameters

In [14]:
def show_param_ratio(model):
    """Print the fraction of the model's parameters that are not masked out.

    The total is the element count of every tensor in ``model.parameters()``.
    Masked elements are the zero entries of every buffer whose name contains
    "mask". The printed value is ``(total - masked) / total`` as a plain
    Python float, whether or not the model has any mask buffers.

    Args:
        model: A module exposing ``parameters()`` and ``named_buffers()``.
    """
    num_param = sum(param.numel() for param in model.parameters())

    # int(...) turns each 0-d tensor count into a Python int so the printed
    # ratio is a float rather than a tensor repr.
    num_mask = sum(
        int((buffer == 0).sum())
        for name, buffer in model.named_buffers()
        if "mask" in name
    )

    print((num_param - num_mask) / num_param)

In [15]:
# Ratio for the freshly loaded model.
show_param_ratio(model)

1.0


## Prediction Part

In [16]:
# Load the BillSum "test" split and tokenize it with the same preprocessing
# function used for the training data.
billsum_test = load_dataset("billsum", split="test")
tokenized_billsum_test = billsum_test.map(preprocess_function, batched=True)

Map: 100%|██████████| 3269/3269 [00:03<00:00, 918.78 examples/s]


In [17]:
# Evaluate on the tokenized test split; the returned dict holds the loss, the
# ROUGE scores and timing figures.
trainer.evaluate(tokenized_billsum_test)



{'eval_loss': 1.4963345527648926,
 'eval_rouge1': 0.241,
 'eval_rouge2': 0.1962,
 'eval_rougeL': 0.2334,
 'eval_rougeLsum': 0.2334,
 'eval_gen_len': 18.9997,
 'eval_runtime': 709.973,
 'eval_samples_per_second': 4.604,
 'eval_steps_per_second': 2.303}

In [18]:
# Run prediction on the tokenized test split; the returned object carries the
# predicted token ids that are decoded in the next cell.
results = trainer.predict(tokenized_billsum_test)

In [21]:
# Predicted sequences can be padded with -100, which is not a valid token id;
# map it to the pad token so decoding cannot fail on it.
predicted_ids = np.where(
    results.predictions != -100,
    results.predictions,
    tokenizer.pad_token_id,
)
decoded_prediction = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)


In [22]:
import pandas as pd
import csv

In [23]:
# Build the submission table in one step: ID is the position of the prediction
# and Predict is the decoded summary with every "," replaced by "." (the same
# replacement is applied to the reference summaries before scoring).
df_results = pd.DataFrame(
    {
        "ID": range(len(decoded_prediction)),
        "Predict": [prediction.replace(",", ".") for prediction in decoded_prediction],
    }
)

# Print the resulting DataFrame
print(df_results)

        ID                                            Predict
0        0  Amends the Water Resources Development Act of ...
1        1  Federal Forage Fee Act of 1993 - Requires all ...
2        2  Merchant Marine of World War II Congressional ...
3        3  Small Business Tax Modernization Act of 2004 -...
4        4  Fair Access to Investment Research Act of 2016...
...    ...                                                ...
3264  3264  Public Servant Priority Placement Act of 1995 ...
3265  3265  Sportmanship in Hunting Act of 2008 - Amends t...
3266  3266  Helping College Students Cross the Finish Line...
3267  3267  Texas National Forests Improvement Act of 2000...
3268  3268  Federal Power Asset Privatization Act of 1995 ...

[3269 rows x 2 columns]


In [24]:
# Function to escape double quotes and handle newlines
def escape_special_characters(text):
    """Return ``text`` with each double quote doubled and each newline replaced by a space.

    NOTE(review): the CSV is later written with ``csv.QUOTE_ALL``; if the writer
    also doubles embedded quotes, quotes end up escaped twice - confirm this
    matches the expected submission format.
    """
    substitutions = {ord('"'): '""', ord('\n'): ' '}
    return text.translate(substitutions)

# Apply escaping to the 'Predict' column
df_results['Predict'] = df_results['Predict'].apply(escape_special_characters)

### Dump Prediction

In [25]:
# Write the ID/Predict table as UTF-8 CSV with every field quoted and without
# the DataFrame index column.
df_results.to_csv('full_model_sample_submission.csv', index=False, quoting=csv.QUOTE_ALL, encoding='utf-8')


### Calculating ROUGE-Lsum with built-in Python functions

In [26]:
def calculate_lcs(X, Y):
    """
    Return the length of the longest common subsequence of sequences X and Y.

    Dynamic programming over prefixes of X and Y, keeping only the previous
    and the current row of the table.
    """
    width = len(Y)
    previous_row = [0] * (width + 1)

    for x_item in X:
        current_row = [0]
        for col, y_item in enumerate(Y, start=1):
            if x_item == y_item:
                # Matching items extend the best result of both shorter prefixes.
                current_row.append(previous_row[col - 1] + 1)
            else:
                # Otherwise carry over the better of dropping one item from either side.
                current_row.append(max(previous_row[col], current_row[col - 1]))
        previous_row = current_row

    return previous_row[width]

def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Computes a sentence-level LCS score of the predictions against the references.

    For every row of ``solution`` the reference ('Label') and the prediction
    ('Predict') are split into sentences on '.', and each sentence is
    lower-cased and split on whitespace. Each reference sentence contributes
    the longest LCS it reaches against any predicted sentence. The row score
    is the sum of those LCS lengths divided by the number of reference tokens
    (0 when the reference has no tokens).

    The input DataFrames are not modified, so the function can be called
    repeatedly on the same frames.

    Args:
    solution (pd.DataFrame): The DataFrame containing the correct summaries ('Label' column).
    submission (pd.DataFrame): The DataFrame containing the predicted summaries ('Predict' column).
    row_id_column_name (str): The column name for the row ID in both DataFrames.

    Returns:
    float: The mean score across all rows of ``solution``; 0.0 when ``solution`` has no rows.

    Raises:
    ValueError: If an ID present in ``solution`` is missing from ``submission``.
    """
    # Index copies by row ID for alignment; set_index without inplace leaves the
    # caller's frames (and their ID column) untouched.
    indexed_solution = solution.set_index(row_id_column_name)
    indexed_submission = submission.set_index(row_id_column_name)

    # Nothing to score: avoid dividing by zero below.
    if len(indexed_solution) == 0:
        return 0.0

    total_score = 0

    for idx in indexed_solution.index:
        if idx not in indexed_submission.index:
            raise ValueError(f"Missing prediction for ID {idx}.")

        ref_summary = indexed_solution.loc[idx, 'Label']
        pred_summary = indexed_submission.loc[idx, 'Predict']

        # Sentences are delimited by '.'; the predicted sentences are tokenized
        # once here instead of once per reference sentence.
        ref_sentences = ref_summary.split('.')
        pred_token_lists = [
            pred_sent.strip().lower().split() for pred_sent in pred_summary.split('.')
        ]

        lcs_sum = 0
        ref_length = 0
        for ref_sent in ref_sentences:
            ref_tokens = ref_sent.strip().lower().split()
            ref_length += len(ref_tokens)
            # Best LCS of this reference sentence against any predicted sentence.
            # str.split always yields at least one piece, so the list is never empty.
            lcs_sum += max(
                calculate_lcs(ref_tokens, pred_tokens) for pred_tokens in pred_token_lists
            )

        # Row score: matched tokens relative to the reference length.
        if ref_length > 0:
            rouge_l = lcs_sum / ref_length
        else:
            rouge_l = 0
        total_score += rouge_l

    # Average over all reference rows.
    mean_rouge_lsum = total_score / len(indexed_solution)

    return mean_rouge_lsum

In [27]:
# Build the reference table in one step: ID is the position of the example in
# the test split and Label is its summary with every "," replaced by "." (the
# same replacement is applied to the predictions).
df_label = pd.DataFrame(
    {
        "ID": range(len(billsum_test)),
        "Label": [summary.replace(",", ".") for summary in billsum_test["summary"]],
    }
)

# Print the resulting DataFrame
print(df_label)

        ID                                              Label
0        0  Amends the Water Resources Development Act of ...
1        1  Federal Forage Fee Act of 1993 - Subjects graz...
2        2  .  Merchant Marine of World War II Congression...
3        3  Small Business Modernization Act of 2004 - Ame...
4        4  Fair Access to Investment Research Act of 2016...
...    ...                                                ...
3264  3264  Public Servant Priority Placement Act of 1995 ...
3265  3265  Sportsmanship in Hunting Act of 2008 - Amends ...
3266  3266  Helping College Students Cross the Finish Line...
3267  3267  Makes proceeds from such conveyances available...
3268  3268  Federal Power Asset Privatization Act of 1995 ...

[3269 rows x 2 columns]


In [28]:
# Mean score of the predictions against the reference summaries, matched on 'ID'.
score(df_label, df_results, 'ID')

0.16892204432689417

## Sample code to do pruning

torch.nn tutorial: https://pytorch.org/tutorials/intermediate/pruning_tutorial.html

In [None]:
import torch
import torch.nn.utils.prune as prune

# Fraction of the collected weights to prune. Example value only - tune it for
# your own experiment.
PRUNE_AMOUNT = 0.2

# Collect the weight of every Linear layer so pruning is applied globally
# across all of them rather than layer by layer.
parameters_to_prune = [
    (module, "weight")
    for _, module in model.named_modules()
    if isinstance(module, torch.nn.Linear)
]

prune.global_unstructured(
    parameters_to_prune,
    # Example choice: prune the lowest-magnitude weights. Any other pruning
    # method class from torch.nn.utils.prune can be substituted here.
    pruning_method=prune.L1Unstructured,
    amount=PRUNE_AMOUNT,
)

### Check the non-zero parameter ratio after pruning

In [None]:
# Ratio after pruning.
show_param_ratio(model)