In [1]:
import pandas as pd
df_sample = pd.read_csv("sample_summary.csv")
# Display the first few rows
print(df_sample.head())

# Check DataFrame info
print(df_sample.info())

                                               input  \
0  Good afternoon, champ, how you holding up? Goo...   
1  What brings you in here today? Hi, I'm um, I'm...   
2  Do you have any known allergies to medications...   
3  How may I help you today? Yeah I've had, a fev...   
4  It sounds like that you're experiencing some c...   

                                              output  
0  Subjective:\n- Symptoms: Lower back pain, radi...  
1  Subjective:\n- Presenting with dry cough for 1...  
2  Subjective:\n- No known allergies to medicatio...  
3  Subjective:\n- Fever and dry cough started 4 d...  
4  Subjective:\n- Presenting with chest pain for ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   100 non-null    object
 1   output  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB
None


In [2]:
!pip install -q transformers huggingface_hub
!pip install -q --upgrade accelerate
!pip install -q -U bitsandbytes

In [3]:
from huggingface_hub import login

# Use your Hugging Face token
login("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")

In [4]:
import pandas as pd
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm
from typing import List, Tuple, Optional

# -----------------------------
# 1. Define the CPTF Summarizer Class
# -----------------------------
class CPTFSummarizer:
    def __init__(self, model: torch.nn.Module, tokenizer: AutoTokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device

    def calculate_importance(
        self, 
        tokens: torch.Tensor, 
        alpha: float = 0.6
    ) -> torch.Tensor:
        """
        Calculate token importance scores using attention weights and positional bias.
        """
        try:
            with torch.no_grad():
                outputs = self.model(tokens, output_attentions=True)
                attentions = outputs.attentions
                
                importance_scores = torch.zeros(tokens.size(-1), device=tokens.device)
                num_layers = len(attentions)
                
                # Generate positional weights (linear decay from 1.0 to 0.5)
                position_weights = torch.linspace(1.0, 0.5, steps=tokens.size(-1), device=tokens.device)

                for l, layer_attention in enumerate(attentions):
                    # Calculate layer weight with position-based scaling
                    layer_weight = alpha + (1 - alpha) * (l + 1) / num_layers
                    
                    # Average attention across heads and batches
                    avg_attention = layer_attention.mean(dim=1).squeeze()
                    token_importance = avg_attention.mean(dim=-1)
                    
                    # Add positional weighting to importance scores
                    importance_scores += layer_weight * token_importance * position_weights
                
                return importance_scores
                
        except Exception as e:
            print(f"Error calculating importance scores: {str(e)}")
            raise

    def summarize(
        self,
        text: str,
        retention_ratio: float = 0.8,
        alpha: float = 0.6,
        max_length: int = 2500
    ) -> Tuple[str, List[float]]:
        """
        Perform Context Preserving Token Filtering to summarize text.
        
        Args:
            text: Input text to summarize.
            retention_ratio: Fraction of tokens to retain.
            alpha: Weight factor for layer importance.
            max_length: Maximum input length.
            
        Returns:
            Tuple of (summarized text, importance scores).
        """
        try:
            # Tokenize input text
            tokens = self.tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=max_length
            ).to(self.device)

            # Calculate number of tokens to keep
            n = tokens.input_ids.size(-1)
            k = int(n * retention_ratio)

            # Get importance scores and top k indices
            importance_scores = self.calculate_importance(tokens.input_ids, alpha)
            _, indices = torch.sort(importance_scores, descending=True)
            keep_indices = sorted(indices[:k].tolist())

            # Create reduced token sequence
            reduced_tokens = tokens.input_ids[0][keep_indices]
            
            # Decode back to text
            reduced_text = self.tokenizer.decode(reduced_tokens)

            return reduced_text, importance_scores.tolist()

        except Exception as e:
            print(f"Error in summarization process: {str(e)}")
            raise

# -----------------------------
# 2. Load your dataset
# -----------------------------
# Replace "sample_summary.csv" with the filename of your dataset.
df_sample = pd.read_csv("sample_summary.csv")
print(df_sample.head())
print(df_sample.info())

# -----------------------------
# 3. Initialize the Model and Tokenizer
# -----------------------------
# Change the model name as needed; ensure the model supports output_attentions.
model_name = "meta-llama/Llama-3.2-1B-Instruct"  # Example model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eos_token_id

# -----------------------------
# 4. Create a CPTF Summarizer instance
# -----------------------------
cptf_summarizer = CPTFSummarizer(model, tokenizer)

# -----------------------------
# 5. Process the dataset with CPTF summarizer
# -----------------------------
# We'll iterate over each "input" text and generate a summary.
cptf_summaries = []
for text in tqdm(df_sample["input"].tolist(), desc="Processing CPTF Summaries"):
    try:
        summary, _ = cptf_summarizer.summarize(text, retention_ratio=0.7)
        cptf_summaries.append(summary)
    except Exception as e:
        print(f"Error summarizing text: {e}")
        cptf_summaries.append("")  # Append empty string if there's an error

# Add the generated summaries to the DataFrame
df_sample["cptf_summary"] = cptf_summaries

# Optionally, save the updated DataFrame to a new CSV file
df_sample.to_csv("soap_cptf_generated_summaries.csv", index=False)
print("CPTF summaries saved to 'soap_cptf_generated_summaries.csv'")


                                               input  \
0  Good afternoon, champ, how you holding up? Goo...   
1  What brings you in here today? Hi, I'm um, I'm...   
2  Do you have any known allergies to medications...   
3  How may I help you today? Yeah I've had, a fev...   
4  It sounds like that you're experiencing some c...   

                                              output  
0  Subjective:\n- Symptoms: Lower back pain, radi...  
1  Subjective:\n- Presenting with dry cough for 1...  
2  Subjective:\n- No known allergies to medicatio...  
3  Subjective:\n- Fever and dry cough started 4 d...  
4  Subjective:\n- Presenting with chest pain for ...  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   input   100 non-null    object
 1   output  100 non-null    object
dtypes: object(2)
memory usage: 1.7+ KB
None


Processing CPTF Summaries: 100%|██████████| 100/100 [00:09<00:00, 10.57it/s]


CPTF summaries saved to 'soap_cptf_generated_summaries.csv'


In [5]:
import pandas as pd
from tqdm import tqdm

def process_dataset_with_cptf(
    df: pd.DataFrame,
    summarizer: CPTFSummarizer,
    input_column: str = "input",
    retention_ratio: float = 0.8,
    alpha: float = 0.5,
    max_length: int = 2048
) -> pd.DataFrame:
    """
    Apply CPTF to a dataset and add reduced text and importance scores as new columns.
    
    Args:
        df: Input dataframe with an `input` column containing text data.
        summarizer: An instance of the CPTFSummarizer class.
        input_column: Column name in the dataframe containing the input text.
        retention_ratio: Fraction of tokens to retain during summarization.
        alpha: Weight factor for layer importance.
        max_length: Maximum input length for tokenization.
    
    Returns:
        Updated dataframe with new columns: `reduced_text` and `importance_scores`.
    """
    reduced_texts = []
    importance_scores_list = []

    print("Processing dataset with CPTF...")
    with tqdm(total=len(df), desc="Processing Dataset", unit="row") as pbar:
        for _, row in df.iterrows():
            text = row[input_column]
            try:
                # Summarize using CPTFSummarizer
                reduced_text, importance_scores = summarizer.summarize(
                    text,
                    retention_ratio=retention_ratio,
                    alpha=alpha,
                    max_length=max_length
                )
                reduced_texts.append(reduced_text)
                importance_scores_list.append(importance_scores)
            except Exception as e:
                print(f"Error processing row: {text[:50]}... Error: {e}")
                reduced_texts.append("")
                importance_scores_list.append([])
            finally:
                pbar.update(1)

    # Add results as new columns to the dataframe
    df["reduced_text"] = reduced_texts
    df["importance_scores"] = importance_scores_list

    return df


In [6]:
# Process the dataset to reduce text dynamically
processed_df = process_dataset_with_cptf(
    df=df_sample,
    summarizer=CPTFSummarizer(model, tokenizer),
    input_column="input",  # The column containing input text
    retention_ratio=0.8,  # Retain 70% of the most important tokens
    alpha=0.5,            # Importance weighting factor
    max_length=2048       # Max tokenization length
)

# View one reduced text
print(processed_df["reduced_text"].iloc[0])  # Replace 0 with the desired row index
# Save the processed DataFrame to a CSV file
processed_df.to_csv("soap_processed_reduced_texts.csv", index=False)

print("Processed DataFrame saved to 'soap_processed_reduced_texts.csv'.")


Processing dataset with CPTF...


Processing Dataset: 100%|██████████| 100/100 [00:06<00:00, 14.95row/s]


<|begin_of_text|>Good afternoon, champ, how you holding up? Good afternoon, Doctor, I have a lot of lower back pain. Oh no, before we begin, how old are you, sir and which hand do you write with? I'm seventy five now. Right. Great, so tell me, how long have you had this lower back pain? It's been about ten days now. Have your symptoms improved at all since they began? No, they keep getting worse. Does the pain radiate into your legs? Yes, it started radiating down my right leg three days after the lower back pain began, and then the left leg three days after the right. The next day I could barely walk, the pain was so severe. Do you remember the initial date of the beginning of your low back pain? Um, it was on December third nineteen ninety five. Have you seen another doctor for this pain? Yes, I saw my local physician, um, it was on December eleventh, but he wasn't able to pinpoint what was going on. What kind of treatments did he recommend? He gave me some antiinflammatories and sen

In [7]:
processed_df.head(8)

Unnamed: 0,input,output,cptf_summary,reduced_text,importance_scores
0,"Good afternoon, champ, how you holding up? Goo...","Subjective:\n- Symptoms: Lower back pain, radi...","<|begin_of_text|>Good afternoon, champ, how yo...","<|begin_of_text|>Good afternoon, champ, how yo...","[0.02965068817138672, 0.029625179246068, 0.029..."
1,"What brings you in here today? Hi, I'm um, I'm...",Subjective:\n- Presenting with dry cough for 1...,<|begin_of_text|>What brings you in here today...,<|begin_of_text|>What brings you in here today...,"[0.0059814453125, 0.005979984533041716, 0.0059..."
2,Do you have any known allergies to medications...,Subjective:\n- No known allergies to medicatio...,<|begin_of_text|>Do you have any known allergi...,<|begin_of_text|>Do you have any known allergi...,"[1.02056884765625, 0.9742085337638855, 0.92784..."
3,"How may I help you today? Yeah I've had, a fev...",Subjective:\n- Fever and dry cough started 4 d...,<|begin_of_text|>How may I help you today? Yea...,<|begin_of_text|>How may I help you today? Yea...,"[0.011168479919433594, 0.011161953210830688, 0..."
4,It sounds like that you're experiencing some c...,Subjective:\n- Presenting with chest pain for ...,<|begin_of_text|>It sounds like that you're ex...,<|begin_of_text|>It sounds like that you're ex...,"[0.00801396369934082, 0.008009910583496094, 0...."
5,Hi there! What brings you in today? Guest_fami...,Subjective:\n- Concern that the baby may have ...,<|begin_of_text|>Hi there! What brings you in ...,<|begin_of_text|>Hi there! What brings you in ...,"[0.14084625244140625, 0.13999703526496887, 0.1..."
6,"Are you allergic to anything? No, I am not all...",Subjective:\n- No allergies reported (includin...,<|begin_of_text|>Are you allergic to anything?...,<|begin_of_text|>Are you allergic to anything?...,"[0.510284423828125, 0.49925094842910767, 0.488..."
7,"What brought you in today? Sure, I'm I'm just ...",Subjective:\n- Chest pain\n- Duration: Started...,<|begin_of_text|>What brought you in today? Su...,<|begin_of_text|>What brought you in today? Su...,"[0.009352684020996094, 0.009349113330245018, 0..."


In [9]:
print(processed_df['reduced_text'].iloc[8])

<|begin_of_text|>What brings you in? Yeah, I'm just coming in with my son, he's been having, he's been complaining of just, seems like pain in his right ear for the last I would say three to four days. And it seems like he's just more irritable, just not himself right now. Yeah OK, so, how long did you say he's had this ear pain? I would say four days now. Four days, OK. Have you noticed any hearing loss? Uh, no, he still responds when I call his name. OK and, have you noticed any fluid coming out of his ears? Uh, no, I don't think so. Is it more like one side of his ears or is it both ears? It's just his right ear. Just his right ear, OK. And have you noticed that it's like, swollen, red or warm? Uh, no, I don't see any swelling or redness at all. OK, OK, good, and have you taken a look inside his ears? Yeah, I tried to take a look with a flashlight. I couldn't really see too much. OK, yeah, that's fair. And you mentioned he's had a runny nose too, When did that start? Yeah, so that, 

In [8]:
print(processed_df['input'].iloc[8])

What brings you in? Yeah, I'm just coming in with my son, he's been having, he's been complaining of just, seems like pain in his right ear for the last I would say three to four days. And it seems like he's just more irritable, just not himself right now. Yeah OK, so, how long did you say he's had this ear pain? I would say four days now. Four days, OK. Have you noticed any hearing loss? Uh, no, he still responds when I call his name. OK and, have you noticed any fluid coming out of his ears? Uh, no, I don't think so. Is it more like one side of his ears or is it both ears? It's just his right ear. Just his right ear, OK. And have you noticed that it's like, swollen, red or warm? Uh, no, I don't see any swelling or redness at all. OK, OK, good, and have you taken a look inside his ears? Yeah, I tried to take a look with a flashlight. I couldn't really see too much. OK, yeah, that's fair. And you mentioned he's had a runny nose too, When did that start? Yeah, so that, kind of like last