

#### In this notebook, I take 100 random samples from the XL-Sum Bengali test split: 10 samples from each of 10 ranges of article length, measured in characters. The sample dataset is available [here](https://kaggle.com/datasets/522b8a01db8c65fdc0e5ad2c381fab2f6e3eac5c299d7ae5b99678f9f1987ae6). For each length range, I calculate the average ROUGE-1 and ROUGE-L scores of the generated summaries. The score dataset is available [here](https://kaggle.com/datasets/5d06d02e6778b7d73c2dd67b30db407e4247ba80e69f9d96f974ea94b05064eb).

In [None]:
%%capture
# Environment setup for the notebook. The %%capture cell magic must stay on the
# first line of the cell; it hides the (long) pip output.
# Remove the preinstalled torch packages first, then install unsloth so that it
# pulls in its own dependency set.
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install unsloth

import os
# NOTE(review): presumably turns off Weights & Biases logging in the libraries
# imported later; set here so it is in place before those imports happen.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from datasets import load_dataset

# Test split of the Bengali configuration of XL-Sum; the later cells read its
# 'text' and 'summary' columns.
dataset = load_dataset(
    path='csebuetnlp/xlsum',
    name='bengali',
    split='test',
)


In [None]:
#sampling

# Character-length buckets for the article text, as (lower, upper) bounds.
# Consecutive buckets share a boundary value, so membership is half-open
# (lower <= length < upper) to keep an article from landing in two buckets.
length_ranges = [
    (442, 1060), (1060, 1670), (1670, 2280), (2280, 2900),
    (2900, 3510), (3510, 4130), (4130, 4740), (4740, 5350),
    (5350, 5920), (5920, 6580)
]

# Number of articles drawn from each bucket.
samples_per_range = 10


def _in_length_range(example, lo, hi, include_hi):
    """Return True when the example's text length falls inside the bucket.

    Args:
        example: A dataset row with a 'text' field.
        lo: Inclusive lower bound on the character count.
        hi: Upper bound on the character count (exclusive unless include_hi).
        include_hi: When True, a text of exactly ``hi`` characters is accepted.
    """
    text = example['text']
    if not isinstance(text, str):
        return False
    n_chars = len(text)
    return lo <= n_chars < hi or (include_hi and n_chars == hi)


for i, (min_len, max_len) in enumerate(length_ranges, start=1):
    # Only the last bucket keeps its upper bound, so the overall covered span
    # (442..6580 inclusive) is unchanged.
    filtered_dataset = dataset.filter(
        _in_length_range,
        fn_kwargs={
            'lo': min_len,
            'hi': max_len,
            'include_hi': i == len(length_ranges),
        },
    )

    # Keep only the article and its reference summary.
    selected_columns = filtered_dataset.remove_columns(
        [col for col in filtered_dataset.column_names if col not in ['text', 'summary']]
    )

    # Never ask for more rows than the bucket holds; select() would raise.
    sample_size = min(samples_per_range, len(selected_columns))
    random_sample = selected_columns.shuffle(seed=1).select(range(sample_size))

    # Tag the sampled rows with their bucket label (done after sampling so the
    # map only touches the rows that are actually kept).
    random_sample = random_sample.map(lambda x: {'length': f'{min_len}-{max_len}'})

    save_directory = f"/kaggle/working/random_sample_{i}"
    random_sample.save_to_disk(save_directory)


In [None]:
#Rouge1

from collections import Counter


def rouge_n_score(candidate, reference, n=1):
    """Compute ROUGE-N precision, recall and F1 between two texts.

    Both texts are tokenized on whitespace. The overlap is the clipped n-gram
    count: each distinct n-gram contributes the smaller of its occurrence
    counts in the two texts, so repeated words are counted consistently with
    the denominators (which count every n-gram occurrence).

    Args:
        candidate: Generated summary as a string.
        reference: Reference summary as a string.
        n: Size of the n-grams (1 for ROUGE-1).

    Returns:
        A dict with 'precision', 'recall', 'f1_score' and 'overlap_count'.
        All scores are 0 when the corresponding text yields no n-grams.
    """
    # Helper function to get n-grams
    def get_ngrams(text, n):
        tokens = text.split()
        return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

    # Get n-grams for both candidate and reference
    candidate_ngrams = get_ngrams(candidate, n)
    reference_ngrams = get_ngrams(reference, n)

    # Clipped overlap: Counter intersection keeps min(count_cand, count_ref)
    # per n-gram. Using plain sets here would under-count repeated n-grams and
    # make even identical texts score below 1.0.
    overlap = Counter(candidate_ngrams) & Counter(reference_ngrams)
    overlap_count = sum(overlap.values())

    # Calculate precision, recall, and F1-score
    precision = overlap_count / len(candidate_ngrams) if candidate_ngrams else 0
    recall = overlap_count / len(reference_ngrams) if reference_ngrams else 0
    f1_score = (2 * precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score,
        'overlap_count': overlap_count
    }

In [None]:
#RougeL

def lcs_length(x, y):
    """
    Return the length of the Longest Common Subsequence (LCS) of x and y.

    Uses the classic dynamic-programming recurrence, keeping only the previous
    and the current row of the table.

    Args:
        x: A sequence of items (e.g. a list of words).
        y: A sequence of items (e.g. a list of words).

    Returns:
        Length of the LCS; 0 when either sequence is empty.
    """
    # previous[col] holds the LCS length of the processed prefix of x
    # against the first `col` items of y.
    previous = [0] * (len(y) + 1)

    for left in x:
        current = [0]
        for col, right in enumerate(y, start=1):
            if left == right:
                # Matching items extend the LCS of both shorter prefixes.
                current.append(previous[col - 1] + 1)
            else:
                # Otherwise carry over the better of dropping one item
                # from either side.
                current.append(max(previous[col], current[col - 1]))
        previous = current

    return previous[-1]

def rouge_l(candidate, reference):
    """
    Calculate the ROUGE-L score for a candidate summary and reference summary.

    The score is based on the longest common subsequence (LCS) of the two
    token sequences. Raw strings are accepted as well: they are split on
    whitespace first, so the LCS is computed over words rather than over
    individual characters.

    Args:
        candidate: The generated summary, as a list of tokens or a string.
        reference: The reference summary, as a list of tokens or a string.

    Returns:
        Precision, Recall, and F1 score (each 0 when undefined).
    """
    # A str is itself a sequence (of characters), so without this step a raw
    # summary would silently be scored at character level.
    if isinstance(candidate, str):
        candidate = candidate.split()
    if isinstance(reference, str):
        reference = reference.split()

    # Get the length of the longest common subsequence
    lcs_len = lcs_length(candidate, reference)

    # Precision: share of candidate tokens that are part of the LCS
    precision = lcs_len / len(candidate) if candidate else 0

    # Recall: share of reference tokens that are part of the LCS
    recall = lcs_len / len(reference) if reference else 0

    # F1 score
    if precision + recall == 0:
        f1_score = 0
    else:
        f1_score = 2 * (precision * recall) / (precision + recall)

    return precision, recall, f1_score


In [None]:
from unsloth import FastLanguageModel

# Loader settings, kept as module-level names and passed through unchanged to
# from_pretrained below.
max_seq_length = 4096
dtype = None
load_in_4bit = True

# Load the fine-tuned checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="hasnatz/gemma2b11",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Put the model into Unsloth's inference mode before any generate() call.
FastLanguageModel.for_inference(model)

In [None]:
# Prompt template with two positional `{}` placeholders: the first receives the
# article to summarize, the second the response (filled with "" at inference so
# the model continues after "### Response:"). The Bangla instruction line asks
# the model to write a summary of the text below.
# NOTE(review): the wording (including the "abstructive" spelling) is left
# untouched on purpose -- presumably it has to match the prompt used when the
# checkpoint was fine-tuned; confirm before correcting it.
alpaca_prompt = """You are an abstructive text summarizer in Bangla language. Below is an instruction in Bangla, paired with an input that provides the context for summarization.

### Instruction:
নিচের লেখাটির সারমর্ম লিখো

### Input:
{}

### Response:
{}"""

In [None]:
# Generate a summary for every sampled article and record the average
# ROUGE-1 and ROUGE-L scores of each length bucket.

import pandas as pd
from datasets import load_from_disk
from tqdm import tqdm

# One row per length bucket: results1 holds ROUGE-1, results2 holds ROUGE-L.
results1 = []
results2 = []


# List of dataset names
dataset_names = [f'random_sample_{i}' for i in range(1, 11)]

# Iterate through each dataset
for dataset_name in dataset_names:
    # The sampling cell only saves each bucket to disk, so a plain
    # globals()[...] lookup raises KeyError. Use an in-memory variable of that
    # name when one exists, otherwise load the saved copy.
    random_sample = globals().get(dataset_name)
    if random_sample is None:
        random_sample = load_from_disk(f"/kaggle/working/{dataset_name}")

    # Materialize the columns once instead of on every loop iteration.
    texts = random_sample['text']
    reference_summaries = random_sample['summary']
    num_samples = len(texts)  # Number of samples

    if num_samples == 0:
        # Nothing to average for an empty bucket (would divide by zero).
        continue

    total_rouge_1_precision = 0
    total_rouge_1_recall = 0
    total_rouge_1_f1 = 0

    total_rouge_l_precision = 0
    total_rouge_l_recall = 0
    total_rouge_l_f1 = 0

    for txt, reference_summary in tqdm(zip(texts, reference_summaries), total=num_samples):
        # Prepare the inputs for the model; the response slot stays empty so
        # the model generates it.
        inputs = tokenizer(
            [
                alpaca_prompt.format(
                    txt,  # input
                    "",  # output - leave this blank for generation!
                )
            ], return_tensors="pt").to("cuda")

        # Generate summary
        outputs = model.generate(**inputs, max_new_tokens=250, use_cache=True)

        # generate() returns the prompt tokens followed by the new tokens.
        # Decoding only the new tokens (without special tokens) yields the
        # response directly. Searching the decoded text for a hard-coded end
        # marker breaks when the marker is absent: find() returns -1 and the
        # slice then drops the last character and keeps any special-token text.
        prompt_length = inputs["input_ids"].shape[1]
        candidate_summary = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Calculate ROUGE-1 scores for the current sample
        rouge_1 = rouge_n_score(candidate_summary, reference_summary, n=1)
        total_rouge_1_precision += rouge_1['precision']
        total_rouge_1_recall += rouge_1['recall']
        total_rouge_1_f1 += rouge_1['f1_score']

        # Calculate ROUGE-L scores for the current sample. rouge_l expects
        # token lists; passing raw strings would compare characters, not words.
        precision, recall, f1_score = rouge_l(
            candidate_summary.split(), reference_summary.split()
        )
        total_rouge_l_precision += precision
        total_rouge_l_recall += recall
        total_rouge_l_f1 += f1_score

    # Calculate average ROUGE-1 scores
    avg_rouge_1_precision = total_rouge_1_precision / num_samples
    avg_rouge_1_recall = total_rouge_1_recall / num_samples
    avg_rouge_1_f1 = total_rouge_1_f1 / num_samples

    # Calculate average ROUGE-L scores
    avg_rouge_l_precision = total_rouge_l_precision / num_samples
    avg_rouge_l_recall = total_rouge_l_recall / num_samples
    avg_rouge_l_f1 = total_rouge_l_f1 / num_samples

    # Every row of a bucket carries the same 'length' label; store that single
    # label rather than the whole column (which would put a list in the cell).
    context_size = random_sample['length'][0]

    # NOTE(review): the "Model" label and the CSV file names say Gemma29b while
    # the checkpoint loaded above is "hasnatz/gemma2b11" -- confirm the label.
    result_dict1 = {
        "Model": "Gemma29b",
        "Context size": context_size,
        "R1 Precision": avg_rouge_1_precision,
        "R1 Recall": avg_rouge_1_recall,
        "R1 F1": avg_rouge_1_f1,
    }

    result_dict2 = {
        "Model": "Gemma29b",
        "Context size": context_size,
        "R-L Precision": avg_rouge_l_precision,
        "R-L Recall": avg_rouge_l_recall,
        "R-L F1": avg_rouge_l_f1,
    }

    # Append the result to the results list
    results1.append(result_dict1)
    results2.append(result_dict2)

# Convert the results to a DataFrame
results_df1 = pd.DataFrame(results1)
results_df2 = pd.DataFrame(results2)

# Persist one CSV per metric.
results_df1.to_csv("gm29b_rouge1_results.csv", index=False)
results_df2.to_csv("gm29b_rougeL_results.csv", index=False)
