In [None]:
import pandas as pd
df_sample = pd.read_csv("sample_summary.csv")
# Display the first few rows
print(df_sample.head())

# Check DataFrame info. info() writes its report to stdout itself and returns
# None, so wrapping it in print() would only add a stray "None" line.
df_sample.info()

In [None]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

In [None]:
from huggingface_hub import login
import os

# Authenticate against the Hugging Face Hub with the value of the HF_TOKEN
# environment variable, so the secret never appears in the notebook itself.
# If HF_TOKEN is unset, None is passed through to login().
login(os.environ.get("HF_TOKEN"))


In [None]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from typing import List, Tuple, Optional

# -----------------------------
# 1. Define the CPTF Summarizer Class
# -----------------------------
class CPTFSummarizer:
    """Context Preserving Token Filtering (CPTF) driven by a model's attention.

    Every input token gets an importance score derived from the attention
    weights the model returns, scaled by a positional decay. The highest
    scoring tokens are kept, in their original order, and decoded back to text.
    """

    # The annotation is quoted so that it is not evaluated when the class is
    # defined; the class then only needs `torch` to be importable.
    def __init__(self, model: torch.nn.Module, tokenizer: "AutoTokenizer"):
        """
        Args:
            model: Callable model exposing `.device` and returning an object
                with an `.attentions` sequence when called with
                `output_attentions=True`.
            tokenizer: Tokenizer used to encode the input and decode the
                retained tokens.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device

    def calculate_importance(
        self,
        tokens: torch.Tensor,
        alpha: float = 0.6
    ) -> torch.Tensor:
        """
        Calculate token importance scores using attention weights and positional bias.

        Args:
            tokens: Token ids passed straight to the model; the last dimension
                is the sequence length.
            alpha: Base layer weight. Layer `l` (0-based) of `L` layers is
                weighted `alpha + (1 - alpha) * (l + 1) / L`, so later layers
                count more and the last layer has weight 1.

        Returns:
            A float32 tensor with one score per token position, on `tokens.device`.

        Raises:
            RuntimeError: If the model did not return attention weights.
        """
        seq_len = tokens.size(-1)

        with torch.no_grad():
            outputs = self.model(tokens, output_attentions=True)

        attentions = outputs.attentions
        if not attentions or any(layer is None for layer in attentions):
            raise RuntimeError(
                "Model returned no attention weights; load it with "
                "attn_implementation='eager' so output_attentions=True is honoured."
            )

        importance_scores = torch.zeros(seq_len, dtype=torch.float32, device=tokens.device)
        num_layers = len(attentions)

        # Generate positional weights (linear decay from 1.0 to 0.5)
        position_weights = torch.linspace(1.0, 0.5, steps=seq_len, device=tokens.device)

        for layer_idx, layer_attention in enumerate(attentions):
            layer_weight = alpha + (1 - alpha) * (layer_idx + 1) / num_layers

            # Layers may live on different devices / in half precision; bring
            # them to the accumulator's device and accumulate in float32.
            attn = layer_attention.to(device=importance_scores.device, dtype=torch.float32)

            # Assumes each tensor is (batch, heads, query, key), as documented
            # for Hugging Face `attentions` outputs.
            # Average across batch and heads -> (query, key).
            avg_attention = attn.mean(dim=(0, 1))

            # Average over the QUERY axis: how much attention each token
            # receives. Averaging over the key axis instead would be useless,
            # because every softmax row sums to 1 and the mean would be the
            # constant 1 / seq_len for all tokens.
            token_importance = avg_attention.mean(dim=0)

            importance_scores += layer_weight * token_importance * position_weights

        return importance_scores

    def summarize(
        self,
        text: str,
        retention_ratio: float = 0.8,
        alpha: float = 0.6,
        max_length: int = 2500
    ) -> Tuple[str, List[float]]:
        """
        Perform Context Preserving Token Filtering to summarize text.

        Args:
            text: Input text to summarize.
            retention_ratio: Fraction of tokens to retain, in [0, 1].
            alpha: Weight factor for layer importance.
            max_length: Maximum input length (longer inputs are truncated).

        Returns:
            Tuple of (summarized text, importance scores). The scores list has
            one entry per input token; special tokens are omitted from the text.

        Raises:
            ValueError: If `retention_ratio` is outside [0, 1].
            RuntimeError: If the model did not return attention weights.
        """
        # A negative ratio would silently slice from the end of the ranking.
        if not 0.0 <= retention_ratio <= 1.0:
            raise ValueError(
                f"retention_ratio must be between 0 and 1, got {retention_ratio}"
            )

        # Tokenize input text
        encoded = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=max_length
        ).to(self.device)
        input_ids = encoded.input_ids

        n = input_ids.size(-1)
        if n == 0:
            # Nothing to score; avoid calling the model on an empty sequence.
            return "", []

        # Number of tokens to keep
        k = int(n * retention_ratio)

        importance_scores = self.calculate_importance(input_ids, alpha)

        # Take the k best-scoring positions, then restore original token order.
        ranked = torch.argsort(importance_scores, descending=True)
        keep_indices = torch.sort(ranked[:k]).values

        reduced_tokens = input_ids[0][keep_indices]

        # Skip special tokens so markers such as the BOS token do not end up
        # inside the summary text.
        reduced_text = self.tokenizer.decode(reduced_tokens, skip_special_tokens=True)

        return reduced_text, importance_scores.tolist()

# -----------------------------
# 2. Load your dataset
# -----------------------------
# Replace "sample_summary.csv" with the filename of your dataset.
df_sample = pd.read_csv("sample_summary.csv")
print(df_sample.head())
# info() prints its own report and returns None, so it is not wrapped in print().
df_sample.info()

# -----------------------------
# 3. Initialize the Model and Tokenizer
# -----------------------------
# Change the model name as needed; ensure the model supports output_attentions.
model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Example model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    # The summarizer needs per-layer attention weights; the eager attention
    # implementation is the one that materialises and returns them.
    attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

# -----------------------------
# 4. Create a CPTF Summarizer instance
# -----------------------------
cptf_summarizer = CPTFSummarizer(model, tokenizer)

# -----------------------------
# 5. Process the dataset with CPTF summarizer
# -----------------------------
# Iterate over each "input" text and generate a summary. A failing row is
# reported and recorded as an empty string so the output stays aligned with
# the DataFrame rows.
cptf_summaries = []
for text in tqdm(df_sample["input"].tolist(), desc="Processing CPTF Summaries"):
    try:
        summary, _ = cptf_summarizer.summarize(text, retention_ratio=0.7)
        cptf_summaries.append(summary)
    except Exception as e:
        print(f"Error summarizing text: {e}")
        cptf_summaries.append("")  # Append empty string if there's an error

# Add the generated summaries to the DataFrame
df_sample["cptf_summary"] = cptf_summaries

# Optionally, save the updated DataFrame to a new CSV file
df_sample.to_csv("soap_cptf_generated_summaries.csv", index=False)
print("CPTF summaries saved to 'soap_cptf_generated_summaries.csv'")


In [None]:
import pandas as pd
from tqdm import tqdm

def process_dataset_with_cptf(
    df: pd.DataFrame,
    summarizer: "CPTFSummarizer",
    input_column: str = "input",
    retention_ratio: float = 0.8,
    alpha: float = 0.5,
    max_length: int = 2048
) -> pd.DataFrame:
    """
    Apply CPTF to a dataset and add reduced text and importance scores as new columns.

    Rows whose summarization raises are reported on stdout and recorded as an
    empty string / empty list, so the new columns always line up with `df`.

    Args:
        df: Input dataframe with an `input` column containing text data.
            It is modified in place (the two new columns are added to it).
        summarizer: An instance of the CPTFSummarizer class (any object with a
            compatible `summarize` method works).
        input_column: Column name in the dataframe containing the input text.
        retention_ratio: Fraction of tokens to retain during summarization.
        alpha: Weight factor for layer importance.
        max_length: Maximum input length for tokenization.

    Returns:
        The same dataframe object, with new columns `reduced_text` and
        `importance_scores`.
    """
    reduced_texts = []
    importance_scores_list = []

    print("Processing dataset with CPTF...")
    for text in tqdm(df[input_column].tolist(), desc="Processing Dataset", unit="row"):
        try:
            # Summarize using CPTFSummarizer
            reduced_text, importance_scores = summarizer.summarize(
                text,
                retention_ratio=retention_ratio,
                alpha=alpha,
                max_length=max_length
            )
        except Exception as e:
            # str() first: the failing cell is often not a string at all
            # (NaN / None), and slicing it directly would raise inside the
            # handler and abort the whole run.
            preview = str(text)[:50]
            print(f"Error processing row: {preview}... Error: {e}")
            reduced_texts.append("")
            importance_scores_list.append([])
        else:
            reduced_texts.append(reduced_text)
            importance_scores_list.append(importance_scores)

    # Add results as new columns to the dataframe
    df["reduced_text"] = reduced_texts
    df["importance_scores"] = importance_scores_list

    return df


In [None]:
# Apply CPTF token filtering to every row of the sample dataset.
processed_df = process_dataset_with_cptf(
    df_sample,
    CPTFSummarizer(model, tokenizer),
    input_column="input",   # column holding the text to reduce
    retention_ratio=0.8,    # keep the 80% of tokens ranked most important
    alpha=0.5,              # layer-importance weighting factor
    max_length=2048,        # truncation limit used when tokenizing
)

# Show the reduced text of the first row (use another position to inspect other rows).
print(processed_df.iloc[0]["reduced_text"])

# Write the processed DataFrame to disk without the index column.
processed_df.to_csv("soap_processed_reduced_texts.csv", index=False)
print("Processed DataFrame saved to 'soap_processed_reduced_texts.csv'.")


In [None]:
# Preview the first eight rows of the processed DataFrame.
processed_df.head(8)

In [None]:
# Print the reduced text of the row at positional index 8 (the ninth row).
print(processed_df['reduced_text'].iloc[8])

In [None]:
# Print the original input text of the row at positional index 8 (the ninth row).
print(processed_df['input'].iloc[8])