In [4]:
import os
import random
import numpy as np
import torch
import torch.nn.functional as F
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import spacy

# Set random seeds for reproducibility
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

# Reward calculation utilities
class RewardCalculator:
    def __init__(self):
        self.rouge_scorer = rouge_scorer.RougeScorer(["rouge1", "rougeL"], use_stemmer=True)
        self.semantic_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
        self.nlp = spacy.load("en_core_web_sm")

    def length_reward(self, generated_summary, reference_summary):
        """Reward based on summary length similarity."""
        gen_length = len(generated_summary.split())
        ref_length = len(reference_summary.split())
        return max(0, 1 - abs(gen_length - ref_length) / ref_length)

    def keyword_coverage(self, generated_summary, reference_summary):
        """Compute keyword coverage reward."""
        vectorizer = CountVectorizer(stop_words='english', max_features=5)
        vectorizer.fit([reference_summary])
        keywords = set(vectorizer.get_feature_names_out())
        gen_words = set(generated_summary.split())
        return len(keywords & gen_words) / len(keywords) if keywords else 0

    def entity_coverage(self, generated_summary, reference_summary):
        """Compute entity coverage reward."""
        ref_doc = self.nlp(reference_summary)
        gen_doc = self.nlp(generated_summary)

        ref_entities = {ent.text for ent in ref_doc.ents}
        gen_entities = {ent.text for ent in gen_doc.ents}

        return len(ref_entities & gen_entities) / len(ref_entities) if ref_entities else 1

    def calculate_reward(self, generated_summary, reference_summary):
        """Calculate comprehensive reward."""
        # ROUGE score
        rouge_scores = self.rouge_scorer.score(generated_summary, reference_summary)
        rouge_reward = rouge_scores["rouge1"].fmeasure + rouge_scores["rougeL"].fmeasure

        # Semantic similarity
        generated_embedding = self.semantic_model.encode([generated_summary])
        reference_embedding = self.semantic_model.encode([reference_summary])
        semantic_reward = cosine_similarity(generated_embedding, reference_embedding)[0][0]

        # Additional rewards
        length_penalty = self.length_reward(generated_summary, reference_summary)
        keyword_cov = self.keyword_coverage(generated_summary, reference_summary)
        entity_cov = self.entity_coverage(generated_summary, reference_summary)

        # Weighted combined reward
        return (0.4 * rouge_reward + 
                0.3 * semantic_reward + 
                0.1 * length_penalty + 
                0.1 * keyword_cov + 
                0.1 * entity_cov)

class T5RLTrainer:
    def __init__(self, model, tokenizer, dataset, lr=5e-5):
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset
        self.reward_calculator = RewardCalculator()
        
        # Optimizer
        self.optimizer = AdamW(self.model.parameters(), lr=lr)

    def generate_summary(self, input_ids, max_length=150):
        """Generate summary using the model."""
        outputs = self.model.generate(
            input_ids, 
            max_length=max_length, 
            num_return_sequences=1, 
            do_sample=True
        )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

    def policy_gradient_update(self, input_ids, reference_summary):
        """Perform policy gradient update."""
        # Ensure model is in training mode
        self.model.train()

        # Generate summary
        generated_summary = self.generate_summary(input_ids)

        # Calculate reward
        reward = self.reward_calculator.calculate_reward(generated_summary, reference_summary)

        # Compute log probabilities for the generated summary
        outputs = self.model(input_ids=input_ids, labels=self.tokenizer(generated_summary, return_tensors='pt').input_ids)
        log_probs = outputs.loss  # This is the negative log-likelihood

        # Policy gradient update (simplified version)
        policy_loss = log_probs * reward

        # Zero gradients, backpropagate, and optimize
        self.optimizer.zero_grad()
        policy_loss.backward()
        self.optimizer.step()

        return reward, generated_summary

    def train(self, num_epochs=3, batch_size=8):
        print("Starting RL Training...")
        
        # Shuffle dataset
        indices = list(range(len(self.dataset)))
        random.shuffle(indices)

        total_rewards = []
        for epoch in range(num_epochs):
            epoch_rewards = []
            
            for i in range(0, len(indices), batch_size):
                batch_indices = indices[i:i+batch_size]
                
                for idx in batch_indices:
                    # Prepare input
                    article = self.dataset[idx]['article']
                    reference_summary = self.dataset[idx]['highlights']
                    
                    # Tokenize input
                    input_ids = self.tokenizer(
                        article, 
                        return_tensors='pt', 
                        truncation=True, 
                        max_length=512
                    ).input_ids
                    
                    # RL update
                    reward, generated_summary = self.policy_gradient_update(input_ids, reference_summary)
                    
                    epoch_rewards.append(reward)
                    
                    print(f"Epoch {epoch+1}, Sample {i+batch_indices.index(idx)+1}")
                    print(f"Generated Summary: {generated_summary}")
                    print(f"Reward: {reward}\n")
            
            # Compute and store average reward for the epoch
            avg_reward = np.mean(epoch_rewards)
            total_rewards.append(avg_reward)
            print(f"Epoch {epoch+1} Average Reward: {avg_reward}")
        
        return total_rewards

def main():
    # Set random seed
    set_seed(42)

    # Set CUDA device
    os.environ["CUDA_VISIBLE_DEVICES"] = "14"

    # Load dataset
    dataset = load_dataset("cnn_dailymail", "3.0.0", cache_dir="./datasets/cnn_dailymail_clean")

    # Load pre-trained T5 model
    model = T5ForConditionalGeneration.from_pretrained("./summarization_model")
    tokenizer = T5Tokenizer.from_pretrained("./summarization_model")

    # Create RL trainer
    rl_trainer = T5RLTrainer(model, tokenizer, dataset['train'])

    # Train with RL
    rewards = rl_trainer.train(num_epochs=1, batch_size=8)

    # Save the RL-fine-tuned model
    model.save_pretrained("./t5_rl_summarization_model")
    tokenizer.save_pretrained("./t5_rl_summarization_model")

    print("RL Training Complete. Rewards:", rewards)

if __name__ == "__main__":
    main()



Starting RL Training...
Epoch 1, Sample 1
Generated Summary: British mercenary Simon Mann jailed for 34 years after he landed against Zimbabwe. Mann testified that his former British army commander was in charge of the plot. He was convicted earlier this year of trying to overthrow a Nigeria coup in 2004. The case of Abu Maasutid Suttera is dragged from prison after his imprisonment.
Reward: 0.6561788112367378

Epoch 1, Sample 2
Generated Summary: Thousands more killed and wounded more than 200 in Jaipur attack. Authorities impose curfew in Rajasthan following earlier attacks against Islamic militant groups. No one has claimed responsibility for the attack, the chief minister says. A government that has called for'strong, vigilante" coalition on Muslims.
Reward: 0.4877215328778456

Epoch 1, Sample 3
Generated Summary: NEW: U.S., European, Russian foreign policy chief calls for ceasefire. The Russian Foreign Minister: The Russian foreign minister says he supports Georgia ''Anita's terri