In [6]:
import numpy as np
import pandas as pd 
import os
import re
import pandas as pd
import numpy as np
import pickle as pkl
import os
from tabulate import tabulate
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaConfig
import requests
from huggingface_hub import hf_hub_download
from nltk import download; download('punkt_tab')

import torch
import warnings
# Disable all warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [7]:
#pip install -U bitsandbytes txtai
# pip install bitsandbytes
# pip install accelerate


In [8]:
from rouge_score import rouge_scorer


In [9]:
# Set device globally
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Function to move data to device
def move_to_device(data, device):
    if isinstance(data, torch.Tensor):
        return data.to(device)
    elif isinstance(data, dict):  # For dicts like inputs
        return {key: value.to(device) for key, value in data.items()}
    elif isinstance(data, list):  # For lists of tensors
        return [move_to_device(item, device) for item in data]
    else:
        return data  # If not tensor, return as is



Using device: cuda


In [10]:
path_train = "/kaggle/input/train-data"
with open(f"{path_train}/train_data.dat", "rb") as f:
    data_obs_train, data_rct_train = pkl.load(f)


In [11]:
path_test = "/kaggle/input/data-test"
with open(f"{path_test}/data_test.dat", "rb") as f:
    data_obs_test, data_rct_test = pkl.load(f)


In [12]:
# Move data to the selected device
data_obs_train = move_to_device(data_obs_train, device)
data_rct_train = move_to_device(data_rct_train, device)
data_obs_test = move_to_device(data_obs_test, device)
data_rct_test = move_to_device(data_rct_test, device)


In [13]:
def compute_rouge2(generated, reference):
    scorer = rouge_scorer.RougeScorer(['rouge2'], use_stemmer=True)
    score = scorer.score(reference, generated)
    return score['rouge2'].fmeasure


In [14]:
# Set your Hugging Face API key
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
api_key = user_secrets.get_secret("HF_TOKEN")



In [15]:
# Define the model name

model_name = "meta-llama/Llama-3.2-3B-Instruct"


In [16]:
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=api_key)


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [17]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=api_key)

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [18]:
model = model.to(device)

In [23]:
def split_into_chunks_with_overlap(article, tokenizer, chunk_size=512, overlap_size=100):
    """
    Splits a long article into chunks with a specified size and overlap.

    Args:
        article (str): The full article to be summarized.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to count tokens.
        chunk_size (int): The maximum number of tokens per chunk (default: 512).
        overlap_size (int): The number of tokens to overlap between chunks (default: 100).

    Returns:
        list: A list of chunks (strings), each with a token length <= chunk_size, and overlap.
    """
    # Tokenize the article into token IDs (flat list)
    tokens = tokenizer(article, truncation=False, padding=False)["input_ids"]

    # Create chunks with overlap
    chunks = []
    for i in range(0, len(tokens), chunk_size - overlap_size):
        chunk = tokens[i:i + chunk_size]
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        chunks.append(chunk_text)
        
    return chunks



In [36]:
def generate_abstract_from_prompt(article, tokenizer, model, device, prompt, chunk_size=2048, overlap_size=100, max_new_tokens=300):
    """
    Generates an abstract for an article by processing it in chunks and summarizing each chunk.

    Args:
        article (str): The full article to be summarized.
        tokenizer (transformers.PreTrainedTokenizer): The tokenizer to count tokens.
        model (transformers.PreTrainedModel): The pre-trained model used for generating the summary.
        device (torch.device): The device on which to perform the computation ('cpu' or 'cuda').
        prompt (str): The prompt to guide the summarization process.
        chunk_size (int): The maximum number of tokens per chunk (default: 2048).
        overlap_size (int): The number of tokens to overlap between chunks (default: 100).

    Returns:
        str: The generated abstract for the full article.
    """
    # Check and set the pad_token if not already defined
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token


    # Split the article into chunks with overlap
    chunks = split_into_chunks_with_overlap(article, tokenizer, chunk_size, overlap_size)

    # Generate abstract for each chunk
    abstracts = []
    for i, chunk in enumerate(chunks):
        # Combine the provided prompt with the chunk
        full_prompt = f"{prompt} {chunk}"
        
        # Tokenize the prompt and generate the summary
        inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, padding=True, max_length=chunk_size, return_attention_mask=True)

        # Move input tensors to the same device as the model
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        # Fix the pad_token_id issue and generate the output
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,  # Make sure this is an integer
            num_beams=1,  # Integer, not a list
            early_stopping=True,
            pad_token_id=model.config.eos_token_id[0]  # Explicitly set pad_token_id to eos_token_id
        )
        
        # Decode the generated summary and remove the prompt from the generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        
        # Remove the prompt part from the generated abstract
        abstract = generated_text[len(full_prompt):].strip()  # Strip the prompt text
        
        abstracts.append(abstract)
        print(f"Chunk number {i} has been processed")

    # Join all the small abstracts into one big abstract
    full_abstract = " ".join(abstracts)
    
    return full_abstract


In [25]:
def calculate_rouge_scores(generated_summary, true_abstract):
    """
    Calculates and prints ROUGE scores for the generated summary compared to the true abstract.

    Args:
        generated_summary (str): The generated summary text.
        true_abstract (str): The true abstract text.
    """
    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # Compare the generated summary with the true abstract
    scores = scorer.score(true_abstract, generated_summary)

    return scores

In [33]:
prompt = """
You are a summarizer encoder-decoder model. The following is a chunk of a scientific article in the field of medicine. Give an extractive abstract of this chunk.
""" 

In [66]:
# Load the input article and true abstract
article_1 = data_obs_train.iloc[0, 1] # Input article
true_abstract_1 = data_obs_train.iloc[0, 2]  # True abstract
article_2 = data_obs_train.iloc[1, 1] # Input article
true_abstract_2 = data_obs_train.iloc[1, 2]  # True abstract
article_3 = data_obs_train.iloc[2, 1] # Input article
true_abstract_3 = data_obs_train.iloc[2, 2]  # True abstract
article_4 = data_obs_train.iloc[3, 1] # Input article
true_abstract_4 = data_obs_train.iloc[3, 2]  # True abstract
article_5 = data_obs_train.iloc[4, 1] # Input article
true_abstract_5 = data_obs_train.iloc[4, 2]  # True abstract


In [28]:
import torch
torch.cuda.empty_cache()  # Free memory on GPU (if applicable)
import gc
gc.collect()  # Force garbage collection to free unused memory


38

In [67]:
device = torch.device('cuda')
generated_abstract_1 = generate_abstract_from_prompt(article_1, tokenizer, model, device, prompt=prompt, chunk_size=2048, max_new_tokens=350)
generated_abstract_2 = generate_abstract_from_prompt(article_2, tokenizer, model, device, prompt=prompt, chunk_size=2048, max_new_tokens=350)
generated_abstract_3 = generate_abstract_from_prompt(article_3, tokenizer, model, device, prompt=prompt, chunk_size=2048, max_new_tokens=350)
generated_abstract_4 = generate_abstract_from_prompt(article_4, tokenizer, model, device, prompt=prompt, chunk_size=2048, max_new_tokens=350)
generated_abstract_5 = generate_abstract_from_prompt(article_5, tokenizer, model, device, prompt=prompt, chunk_size=2048, max_new_tokens=350)


Chunk number 0 has been processed
Chunk number 1 has been processed
Chunk number 2 has been processed
Chunk number 0 has been processed
Chunk number 1 has been processed
Chunk number 2 has been processed
Chunk number 3 has been processed
Chunk number 0 has been processed
Chunk number 1 has been processed
Chunk number 2 has been processed
Chunk number 3 has been processed
Chunk number 0 has been processed
Chunk number 1 has been processed
Chunk number 2 has been processed
Chunk number 3 has been processed
Chunk number 4 has been processed
Chunk number 0 has been processed
Chunk number 1 has been processed
Chunk number 2 has been processed
Chunk number 3 has been processed
Chunk number 4 has been processed
Chunk number 5 has been processed


In [68]:
score_1 = calculate_rouge_scores(generated_abstract, true_abstract)
score_2 = calculate_rouge_scores(generated_abstract, true_abstract)
score_3 = calculate_rouge_scores(generated_abstract, true_abstract)
score_4 = calculate_rouge_scores(generated_abstract, true_abstract)
score_5 = calculate_rouge_scores(generated_abstract, true_abstract)

In [69]:
# Liste de scores obtenus à partir de la fonction calculate_rouge_scores
all_scores = [score_1, score_2, score_3, score_4, score_5]

# Initialiser les totaux pour chaque métrique
rouge1_total = 0
rouge2_total = 0
rougeL_total = 0

# Ajouter les scores pour chaque appel
for score in all_scores:
    rouge1_total += score['rouge1'].fmeasure
    rouge2_total += score['rouge2'].fmeasure
    rougeL_total += score['rougeL'].fmeasure

# Calculer la moyenne pour chaque score
rouge1_avg = rouge1_total / len(all_scores)
rouge2_avg = rouge2_total / len(all_scores)
rougeL_avg = rougeL_total / len(all_scores)
C:\Users\medal\Downloads\nlp-mohammedalieladlouni-decodeur (2).ipynb
# Afficher les moyennes
print(f"ROUGE-1 Average: {rouge1_avg}")
print(f"ROUGE-2 Average: {rouge2_avg}")
print(f"ROUGE-L Average: {rougeL_avg}")


ROUGE-1 Average: 0.3466666666666667
ROUGE-2 Average: 0.13140311804008908
ROUGE-L Average: 0.1577777777777778
