In [None]:
import torch
# Report the PyTorch / CUDA environment. The per-device queries are only made
# when CUDA is actually available, because torch.cuda.current_device() and
# torch.cuda.get_device_name() raise on CPU-only machines.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"GPU device count: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    print(f"Current CUDA device: {torch.cuda.current_device()}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
else:
    print("Current CUDA device: none (running on CPU)")

In [None]:
# !pip install datasets transformers peft nltk rouge_score

## Fine Tuning with LoRA

In [1]:
import torch
# Report the runtime environment, then choose the compute device used by the
# rest of the notebook (GPU when one is available, otherwise CPU).
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
print(f"CUDA version: {torch.version.cuda}")

if cuda_ready:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

PyTorch version: 2.3.1+cu118
CUDA available: True
CUDA version: 11.8


In [2]:
import math
import time
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from datasets import load_dataset
from transformers import BartTokenizer


def sample_dataset(dataset, fraction=0.3, seed=42):
    """Return a seeded random subset of ``dataset``.

    Args:
        dataset: object providing ``__len__``, ``shuffle(seed=...)`` and
            ``select(indices)``.
        fraction: share of rows to keep; the row count is truncated with
            ``int()``.
        seed: seed forwarded to ``dataset.shuffle``.

    Returns:
        The first ``int(len(dataset) * fraction)`` rows of the shuffled dataset.
    """
    keep = int(len(dataset) * fraction)
    shuffled = dataset.shuffle(seed=seed)
    return shuffled.select(range(keep))


def preprocess_function(examples):
    """Tokenize a batch of documents and summaries for seq2seq training.

    Args:
        examples: batch mapping with a ``"document"`` list (model inputs) and
            a ``"summary"`` list (targets).

    Returns:
        The tokenizer output for the documents with an added ``"labels"``
        entry holding the tokenized summaries.

    Relies on the notebook-level ``tokenizer``, ``MAX_INPUT_LENGTH`` and
    ``MAX_TARGET_LENGTH``.
    """
    model_inputs = tokenizer(
        list(examples["document"]),
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context.
    labels = tokenizer(
        text_target=examples["summary"],
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
        padding="max_length",
    )

    # Labels are padded to a fixed length here, so padding positions must be
    # replaced by -100 (the ignore index also used in collate_fn); otherwise
    # the loss is computed on pad tokens as well.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs


# Create the custom dataset class and data loaders
class MultiNewsDataset(Dataset):
    """Torch ``Dataset`` view over one split of a tokenized dataset mapping.

    Each item is a dict with ``input_ids``, ``attention_mask`` and ``labels``
    converted to tensors.
    """

    # Fields copied from each row, in the order they appear in the item.
    _FIELDS = ('input_ids', 'attention_mask', 'labels')

    def __init__(self, tokenized_datasets, split):
        # Keep only the requested split; rows are read lazily in __getitem__.
        self.dataset = tokenized_datasets[split]

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset[idx]
        return {field: torch.tensor(row[field]) for field in self._FIELDS}


def collate_fn(batch):
    """Pad a list of dataset items into batch-first tensors.

    ``input_ids`` are padded with the notebook-level tokenizer's pad token id,
    ``attention_mask`` with 0 and ``labels`` with -100 (used as the ignore
    index for labels).
    """
    def _pad(field, fill_value):
        sequences = [sample[field] for sample in batch]
        return pad_sequence(sequences, batch_first=True, padding_value=fill_value)

    return {
        'input_ids': _pad('input_ids', tokenizer.pad_token_id),
        'attention_mask': _pad('attention_mask', 0),
        'labels': _pad('labels', -100),
    }


# 1. Load the dataset
dataset = load_dataset("multi_news")

# 2. Set up the tokenizer
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

# 3. Preprocess the dataset
MAX_INPUT_LENGTH = 1024
MAX_TARGET_LENGTH = 100
SAMPLE_FRACTION = 0.3  # share of every split that is kept
BATCH_SIZE = 4

# Sample 30% of each split (the fraction is passed explicitly so the comment
# and the code cannot drift apart).
train_dataset = sample_dataset(dataset['train'], fraction=SAMPLE_FRACTION)
validation_dataset = sample_dataset(dataset['validation'], fraction=SAMPLE_FRACTION)
test_dataset = sample_dataset(dataset['test'], fraction=SAMPLE_FRACTION)

# Create a new dataset dictionary with the sampled splits.
# Note: from here on `dataset` refers to the sampled splits, not the full data.
dataset = {
    'train': train_dataset,
    'validation': validation_dataset,
    'test': test_dataset
}

# Tokenize every sampled split in batches.
tokenized_datasets = {}
for split in dataset.keys():
    tokenized_datasets[split] = dataset[split].map(preprocess_function, batched=True)

# Wrap the tokenized splits for PyTorch; `train_dataset` is rebound from the
# sampled split to its torch Dataset wrapper.
train_dataset = MultiNewsDataset(tokenized_datasets, 'train')
val_dataset = MultiNewsDataset(tokenized_datasets, 'validation')

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BartForConditionalGeneration, BartTokenizer, get_linear_schedule_with_warmup, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

In [None]:
# Location of the base model. Forward slashes work on Windows and POSIX alike
# and match the "./model/..." paths used by the save/load cells.
model_path = "./model/bart_large_cnn"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_summarizer(model_path, device):
    """Load a pre-trained seq2seq model and its tokenizer from ``model_path``.

    Args:
        model_path: path or identifier passed to ``from_pretrained``.
        device: accepted but not used here; this function does not move the
            model to any device.

    Returns:
        ``(model, tokenizer)``.
    """
    summarizer_tokenizer = AutoTokenizer.from_pretrained(model_path)
    summarizer = AutoModelForSeq2SeqLM.from_pretrained(model_path)
    return summarizer, summarizer_tokenizer

def summarize_news(model, tokenizer, news_article):
    """Generate a beam-search summary (at most 100 tokens) of ``news_article``.

    Args:
        model: seq2seq model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        news_article: article text to summarise.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Never tokenize beyond what the model's positional embeddings can hold
    # (1024 for BART); a longer input would index past the position table.
    position_limit = getattr(getattr(model, "config", None), "max_position_embeddings", None) or 1024
    inputs = tokenizer(
        [news_article],
        max_length=min(2048, position_limit),
        return_tensors='pt',
        truncation=True,
    )
    # Put the inputs on the same device as the model so the call also works
    # after the model has been moved to the GPU.
    inputs = inputs.to(model.device)

    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        max_length=100,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Load the base model and its tokenizer from `model_path`.
# Note: this rebinds the notebook-level `tokenizer` that the data-preparation
# cell created with BartTokenizer.
model, tokenizer = load_summarizer(model_path, device)

In [None]:
# LoRA adapter settings: rank-8 updates on the attention query/value
# projections of a sequence-to-sequence model.
lora_config = LoraConfig(
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.SEQ_2_SEQ_LM,
    lora_dropout=0.1,
    lora_alpha=32,
    r=8,
)

# Wrap the base model with the adapter, report how many parameters are
# trainable, and place the wrapped model on the selected device.
model_lora = get_peft_model(model, lora_config)
model_lora.print_trainable_parameters()
model_lora.to(device)

# Training length: one scheduler step per batch, for every epoch.
num_epochs = 7
num_training_steps = len(train_dataloader) * num_epochs

# AdamW with a linear schedule (one warm-up step, then decay).
optimizer = optim.AdamW(params=model_lora.parameters(), weight_decay=0.01, lr=1e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=num_training_steps,
    num_warmup_steps=1,
)

In [None]:
import os
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM

# Set CUDA_LAUNCH_BLOCKING for synchronous CUDA errors
# NOTE(review): earlier cells already move a model to `device` before this
# line runs; presumably the variable has to be set before CUDA is initialised
# to take effect — confirm, and move it to the first cell if so.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

def _batch_to_device(batch, device):
    """Move the three tensors consumed by the model onto ``device``."""
    return {key: batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')}


def _mean_loss(total_loss, num_batches):
    """Average a summed loss; NaN when no batch was processed."""
    return total_loss / num_batches if num_batches else float('nan')


def train_lora(model, train_dataloader, val_dataloader, optimizer, scheduler, num_epochs, device):
    """Train ``model`` for ``num_epochs`` epochs, validating after each one.

    Args:
        model: module whose forward accepts ``input_ids``, ``attention_mask``
            and ``labels`` and returns an object with a ``loss`` attribute.
        train_dataloader: iterable of batches (dicts holding those three tensors).
        val_dataloader: iterable of validation batches in the same format.
        optimizer: optimizer stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per training batch.
        num_epochs: number of passes over ``train_dataloader``.
        device: device the batch tensors are moved to.

    Returns:
        The same ``model`` object after training.

    Average losses are printed per epoch. Batches are counted while iterating,
    so an empty loader reports ``nan`` instead of raising ZeroDivisionError and
    loaders without ``__len__`` are supported.
    """
    for epoch in range(num_epochs):
        # Training pass
        model.train()
        total_train_loss, train_batches = 0.0, 0
        for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            optimizer.zero_grad()
            loss = model(**_batch_to_device(batch, device)).loss
            total_train_loss += loss.item()
            train_batches += 1

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = _mean_loss(total_train_loss, train_batches)
        print(f"Average training loss: {avg_train_loss:.4f}")

        # Validation pass (no gradients, evaluation mode)
        model.eval()
        total_val_loss, val_batches = 0.0, 0
        with torch.no_grad():
            for batch in tqdm(val_dataloader, desc="Validation"):
                loss = model(**_batch_to_device(batch, device)).loss
                total_val_loss += loss.item()
                val_batches += 1

        avg_val_loss = _mean_loss(total_val_loss, val_batches)
        print(f"Average validation loss: {avg_val_loss:.4f}")

    return model

# Fine-tune the LoRA-wrapped model with the loaders, optimizer and schedule
# prepared above.
model_lora = train_lora(
    model_lora,
    train_dataloader,
    val_dataloader,
    optimizer,
    scheduler,
    num_epochs,
    device,
)

# Persist the fine-tuned model and the tokenizer into the same directory.
lora_output_dir = "./model/bart_large_cnn_lora_finetuned_multinews_v2"
model_lora.save_pretrained(lora_output_dir)
tokenizer.save_pretrained(lora_output_dir)

### Merge with base model

In [None]:
from peft import PeftModel, PeftConfig

# Directories: where the LoRA fine-tuned model was saved, and where the
# merged model goes.
adapter_dir = "./model/bart_large_cnn_lora_finetuned_multinews_v2"
merged_dir = "./model/bart_large_cnn_lora_merged_202408"

# Fresh copy of the base model to attach the saved LoRA weights to.
base_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

# Saved LoRA configuration (rebinds `lora_config`).
lora_config = PeftConfig.from_pretrained(adapter_dir)

# Attach the saved LoRA weights, then merge them into the base model.
lora_model = PeftModel.from_pretrained(base_model, adapter_dir)
merged_model = lora_model.merge_and_unload()

# Store the merged model together with the tokenizer.
merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

In [7]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

# Directory holding the merged (base + LoRA) model
merged_model_path = "./model/bart_large_cnn_lora_merged_202408"

# Prefer the GPU when one is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load tokenizer and model; the model is moved to `device` and switched to
# evaluation mode in one chain (both calls return the module itself).
tokenizer = AutoTokenizer.from_pretrained(merged_model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(merged_model_path).to(device).eval()

# Function to generate summary
def generate_summary(model, tokenizer, text, max_length=400, min_length=130):
    """Summarise ``text`` with 4-beam search and return the decoded string.

    Args:
        model: seq2seq model exposing ``generate`` and ``device``.
        tokenizer: tokenizer matching ``model``.
        text: document to summarise; truncated to 1024 tokens.
        max_length: upper bound on the generated length, forwarded to ``generate``.
        min_length: lower bound on the generated length, forwarded to ``generate``.

    Returns:
        The generated summary with special tokens removed.
    """
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
    # Use the model's own device rather than the notebook-global `device`, so
    # the function works for any model no matter which cell last set `device`.
    inputs = inputs.to(model.device)

    # Generate summary
    summary_ids = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        num_beams=4,
        early_stopping=True,
        max_length=max_length,
        min_length=min_length,
    )

    # Decode the generated summary
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [4]:
## Base Model
# Loads the model used as the comparison baseline in the cells below.
# NOTE(review): this rebinds `model_path` to an absolute, machine-specific
# Windows path; the notebook will not run elsewhere without editing it.
# Relies on `device`, `AutoModelForSeq2SeqLM` and `AutoTokenizer` defined by
# other cells.
model_path = r"D:\Online_Learning\Practical_DL\bart_large_cnn"

model_original = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)
tokenizer_original = AutoTokenizer.from_pretrained(model_path)

# Evaluation mode; as the last expression this also displays the model repr.
model_original.eval()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_l

In [5]:
text_to_summarize = '''
The widespread anti-immigrant riots in the United Kingdom of the past week, and the false viral claims that fueled them, may be the clearest, most direct example yet of the way unchecked misinformation on social media can produce violence and harm in the real world.

Even after authorities identified a UK national as the suspect behind a series of deadly stabbings targeting children, false claims about the attacker’s name and origins continued to stoke anti-immigrant fervor and propel far-right demonstrations.

The fake claims have circulated widely, particularly on X, the platform formerly known as Twitter, extremism researchers said. And police have openly blamed that misinformation for the violence that has wracked the country in recent days, with rioters throwing bricks at mosques, setting cars on fire and chanting anti-Islamic slogans while clashing with officers in riot gear.

The events of the past week are hardly the only example of the link between online misinformation and politically motivated violence: From the Rohingya genocide to the attack on the US Capitol on January 6, 2021, false and misleading claims have consistently been at the center of high-profile incidences of political unrest and violence.

It is a pattern that keeps repeating despite years of calls by governments and civil society groups for social media platforms to rein in inflammatory, hateful posts, as well as pledges by companies themselves to do more.

A recent retreat from content moderation by some major platforms, however, suggests that the problem of violence fueled by misinformation may well get worse before it gets better.

For nearly a decade, governments and civil rights groups have increasingly argued that online platforms have created enormous societal costs.

Critics of social media have repeatedly accused the industry of putting corporate profits before users’ mental health, or opening the door to foreign meddling, without doing enough to shield the world from those risks.

An economist might call these negative externalities – like pollution, they are byproducts of a profit-seeking business that, left unaddressed, everyone else must either learn to live with or mitigate, usually at great collective expense. The consequences tend to play out over long timeframes and with large-scale, systemic effects.

Police hold back rioters near a burning police vehicle after disorder broke out on July 30, 2024, in Southport, England.
Related article
Elon Musk says ‘civil war is inevitable’ as UK rocked by far-right riots. He’s part of the problem
This week, it is hard to avoid wondering whether politically motivated violence based on nothing more than bad-faith, evidence-free speculation has become a permanent fixture among social media’s various externalities, and if we are being asked to make peace with it as a condition of living in a digitally connected world.

Many social media companies have invested heavily in content moderation over the years. But the industry’s recent track record hints at a bet – or perhaps a hope – that just maybe, the public will tolerate a bit more pollution.

There are some signs of pushback. In the European Union, officials are looking to hold social media companies accountable for spreading misinformation under the new Digital Services Act. In the UK, the Online Safety Act could take effect as soon as this year, requiring, among other things, social media platforms to remove illegal content.

And even tougher rules may be on the way as a result of the riots. “We’re going to have to look more broadly at social media after this disorder,” UK Prime Minister Keir Starmer said in a video distributed to media Friday.

But punishments for online wrongdoing are already being handed out to individual perpetrators. On Friday, Jordan Parlour from Leeds, England, was sentenced to 20 months in jail after being convicted of publishing written material intended to stir racial hatred. The 28-year-old had posted the material on Facebook.

The United States has lagged on platform regulation, partly due to congressional dysfunction and partly because of legal and constitutional differences that grant online platforms more freedom to manage their own websites.

Still, lawmakers made some moves last month when the US Senate passed the Kids Online Safety Act, which aims to combat mental health harms for teens linked to social media.

It may be tempting to dismiss social media’s role in the UK riots as merely a reflection of latent political trends or the result of activism that would have happened on other platforms anyway.

But that distracts from the calculation that some platforms appear to have made: At least some of the time, some amount of misinformation-fueled violence is a reasonable cost for society to pay.

Olesya Dmitracova and Kara Fox contributed reporting.'''

In [8]:
# Summarise the sample article with the base model (`model_original`).
summary = generate_summary(model_original, tokenizer_original, text_to_summarize)
print("Generated Summary:")
print(summary)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Generated Summary:
False and misleading claims have consistently been at the center of high-profile incidences of political unrest and violence. It is a pattern that keeps repeating despite years of calls by governments and civil society groups for social media platforms to rein in inflammatory, hateful posts. A recent retreat from content moderation by some major platforms suggests that the problem of violence fueled by misinformation may well get worse before it gets better. “We’re going to have to look more broadly at social media after this disorder,” UK Prime Minister Keir Starmer said in a video distributed to media Friday. But punishments for online wrongdoing are already being handed out to individual perpetrators. On Friday, Jordan Parlour was sentenced to 20 months in jail after being convicted of publishing written material intended to stir racial hatred.


In [9]:
## Merged model
# Summarise the same article with `model`, paired with `tokenizer_original`.
# NOTE(review): assumes the cell that loads the merged model into `model`
# was the last one to bind that name — confirm the execution order.
summary = generate_summary(model, tokenizer_original, text_to_summarize)
print("Generated Summary:")
print(summary)

Generated Summary:
– The UK riots of the past week may be the clearest, most direct example yet of the way unchecked misinformation on social media can produce violence and harm in the real world, the Guardian reports. Even after authorities identified a UK national as the suspect behind a series of deadly stabbings targeting children, false claims about the attacker's name and origins continued to stoke anti-immigrant fervor and propel far-right demonstrations. The fake claims have circulated widely, particularly on X, the platform formerly known as Twitter, extremism researchers say. And police have openly blamed that misinformation for the violence that has wracked the country in recent days, with rioters


## Test on Test Set

In [10]:
from datasets import load_metric

# NOTE(review): `dataset` was rebound earlier to the dict of already-sampled
# splits, so this takes a further sample of the sampled test split — confirm
# that sampling twice is intended.
test_dataset = sample_dataset(dataset['test'])
# NOTE(review): not used by any cell visible in this notebook.
test_dataset_tokenized = MultiNewsDataset(tokenized_datasets, 'test')

# Summaries of the test documents produced by `model`, one document at a time.
generated_summaries = [generate_summary(model, tokenizer_original, doc) for doc in test_dataset['document']]

In [11]:
# Summaries of the same test documents produced by the base model, for comparison.
generated_summaries_original = [generate_summary(model_original, tokenizer_original, doc) for doc in test_dataset['document']]

In [12]:
# Evaluate with ROUGE
# NOTE(review): the captured cell output shows a warning pointing at this
# call; presumably `load_metric` is deprecated — confirm before upgrading
# `datasets`.
rouge = load_metric('rouge',trust_remote_code=True)

# Score both sets of generated summaries against the reference summaries:
# result_1 -> summaries from `model`, result_2 -> summaries from `model_original`.
# Neither result is displayed by this cell.
references = test_dataset['summary']
result_1 = rouge.compute(predictions=generated_summaries, references=references)
result_2 = rouge.compute(predictions=generated_summaries_original, references=references)

  rouge = load_metric('rouge',trust_remote_code=True)


## Test with Crawled Data

In [13]:
import pandas as pd
import numpy as np

# Raw string: the Windows path contains backslash sequences (\O, \P, \p, \s)
# that are invalid escapes in a normal string literal and trigger a warning
# (an error in future Python versions). The resulting path is unchanged.
df_text = pd.read_csv(r'D:\Online_Learning\Practical_DL\practical_dl_final_project\scraped_texts.csv')

In [14]:
# Summarise every crawled article whose URL contains 'cnn' with both models:
# `summary_result` holds the outputs of `model`, `summary_result_cnn` those of
# `model_original`.
summary_result, summary_result_cnn = [], []
is_cnn = df_text['url'].str.contains('cnn')
for article in df_text[is_cnn]['txt']:
    summary_result.append(generate_summary(model, tokenizer_original, article))
    summary_result_cnn.append(generate_summary(model_original, tokenizer_original, article))

In [15]:
# Select `txt` with a list of columns so the result is a DataFrame, not a
# Series. On a Series, `df_cnn['summary_lora'] = ...` addresses a row label
# instead of creating a column, so the summaries were never stored per article.
cnn_mask = df_text['url'].str.contains('cnn')
df_cnn = df_text.loc[cnn_mask, ['txt']].copy()
# Both lists were built from the same filtered rows, so lengths line up.
df_cnn['summary_lora'] = summary_result
df_cnn['summary_cnn'] = summary_result_cnn

In [16]:
# Compare the two summaries of the third CNN article: the one generated with
# `model` first, then the one generated with `model_original`.
print(summary_result[2])
print(summary_result_cnn[2])

– A major ad industry group is shutting down, days after Elon Musk-owned X filed a lawsuit claiming the group illegally conspired to boycott advertising on his platform. The group, Global Alliance for Responsible Media, also known as GARM, is a voluntary ad-industry initiative run by the World Federation of Advertisers that aims to help brands avoid having their advertisements appear alongside illegal or harmful content. “GARM is a small, not-for-profit initiative, and recent allegations that unfortunately misconstrue its purpose and activities have caused a distraction and significantly drained its resources and finances,” the group said in a statement Friday.
The group, Global Alliance for Responsible Media, also known as GARM, is a voluntary ad-industry initiative run by the World Federation of Advertisers. The end of GARM marks a temporary victory for Elon Musk and X CEO Linda Yaccarino, even though a judge hasn’t made a ruling yet. The lawsuit could drive away even more advertiser

In [17]:
summary_result

['– If reelected, Donald Trump said Thursday, he\'d try to exert direct power over monetary policy. "I feel the president should have at least a say in there. I feel that strongly,” Trump said toward the end of hispress conference. “I made a lot of money. I was very successful. And I think I have a better instinct than, in many cases, people that would be on the Federal Reserve — or the chairman.” The former president said that Fed Chair Jerome Powell, whom Trump appointed to the position in 2017, has got the timing of rate moves wrong throughout his tenure.',
 "– In a letter sent Thursday to CrowdStrike's attorneys written by Delta’s high-powered lawyer, David Boies, the airline lashed out at the cybersecurity company, which has apologized for introducing a bug thatled to a global tech outage. CrowdStrike said it took responsibility for the initial outage, but it has said Delta was responsible for thousands of cancellations that piled up over the course of a week – long after its comp

In [18]:
summary_result_cnn

["Donald Trump said Thursday he'd try to exert direct power over monetary policy if reelected. The former president said that Fed Chair Jerome Powell has got the timing of rate moves wrong throughout his tenure. Trump has publicly feuded with Powell for years, frequently posting on social media that he disagreed with the Fed’s decision to raise rates in Powell's pre-Covid rate-hiking campaign. The Fed is designed to be an independent governing body, free from political influence, so that it cannot be bullied into making emotional decisions that could upset the delicate balance of job creation and low inflation. It takes time for rate hikes or cuts to take effect in the economy, so timing a policy decision right is a tricky game.",
 "CrowdStrike and Microsoft have claimed Delta’s outage lasted substantially longer than its rivals’ service downtime. CrowdStrike said Delta was responsible for thousands of cancellations that piled up over the course of a week. Delta canceled 7,000 flights 

In [None]:
import os
os.getcwd()