In [None]:
import torch

# Select the compute device: the CUDA GPU when one is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected.
if device.type == "cuda":
    print(f"GPU disponible : {torch.cuda.get_device_name(0)}")
else:
    print("GPU non disponible, utilisation du CPU.")

GPU disponible : Tesla T4


In [6]:
# Install required libraries (only needed once)
!pip install transformers
!pip install PyMuPDF

import fitz  # PyMuPDF
import torch
import ipywidgets as widgets
from transformers import pipeline
from google.colab import files

# Build a single-file picker restricted to PDFs.
upload = widgets.FileUpload(multiple=False, accept=".pdf")

# Show the prompt, then render the picker beneath it.
print("📌 Choisissez un fichier PDF pour le résumer :")
display(upload)

# Function to process uploaded file
def process_uploaded_file(change):
    """Summarize the PDF held by the ``upload`` widget and offer the result for download.

    The uploaded bytes are saved in the working directory under the uploaded
    file name, the text of every page is extracted with PyMuPDF, cut into
    chunks, each chunk is summarized with ``facebook/bart-large-cnn``, and the
    space-joined summaries are written to ``resume.txt`` and passed to
    ``files.download``.

    Args:
        change: Change notification supplied by the widget observer. It is not
            inspected; the current ``upload.value`` is read instead.

    Returns:
        None. Returns early when nothing has been uploaded or when no text
        could be extracted from the PDF.
    """
    uploaded = upload.value
    if not uploaded:
        # Nothing uploaded: next(iter(...)) on an empty value would raise StopIteration.
        return

    # ipywidgets 7 exposes a {filename: {"content": ...}} mapping; ipywidgets 8
    # exposes a tuple of records carrying "name" and "content".
    if isinstance(uploaded, dict):
        uploaded_filename, uploaded_file = next(iter(uploaded.items()))
    else:
        uploaded_file = uploaded[0]
        uploaded_filename = uploaded_file["name"]

    pdf_path = uploaded_filename
    with open(pdf_path, "wb") as f:
        f.write(uploaded_file["content"])

    print(f"📂 Fichier sélectionné : {pdf_path}")

    # Text extraction
    def extract_text_from_pdf(pdf_path):
        """Return the text of every page, each followed by a newline, stripped at both ends."""
        with fitz.open(pdf_path) as pdf_file:
            return "".join(page.get_text() + "\n" for page in pdf_file).strip()

    pdf_text = extract_text_from_pdf(pdf_path)
    if not pdf_text:
        print("❌ Le fichier PDF est vide ou illisible.")
        return

    # Load the BART model (first GPU when CUDA is available, otherwise CPU)
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=0 if torch.cuda.is_available() else -1)

    # Helper that cuts the text into chunks
    def split_text(text, max_length=1024):
        """Split ``text`` on whitespace into chunks of whole words.

        ``max_length`` is a budget in characters (each word costs its length
        plus one), not in model tokens. A single word larger than the budget
        becomes a chunk of its own. Empty chunks are never produced.
        """
        chunks = []
        current_chunk = []
        current_length = 0
        for word in text.split():
            cost = len(word) + 1
            if current_length + cost <= max_length:
                current_chunk.append(word)
                current_length += cost
            else:
                # Only flush a chunk that actually holds words; an over-long
                # word arriving on an empty chunk must not emit "".
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_length = cost
        if current_chunk:
            chunks.append(" ".join(current_chunk))
        return chunks

    text_chunks = split_text(pdf_text)
    print(f"📌 Nombre de morceaux de texte : {len(text_chunks)}")

    # Helper that summarizes every chunk
    def summarize_text(text_chunks, summarizer, max_length=100, min_length=15):
        """Return one summary string per chunk, in the same order as ``text_chunks``."""
        summaries = []
        for chunk in text_chunks:
            # The chunk budget is in characters, so a chunk can still tokenize
            # to more tokens than the model accepts; truncation=True asks the
            # pipeline to clip such inputs.
            summary = summarizer(
                chunk,
                max_length=max_length,
                min_length=min_length,
                do_sample=False,
                truncation=True,
            )
            summaries.append(summary[0]['summary_text'])
        return summaries

    summaries = summarize_text(text_chunks, summarizer)
    final_summary = " ".join(summaries)

    # Save the summary
    output_filename = "resume.txt"
    with open(output_filename, "w", encoding="utf-8") as output_file:
        output_file.write(final_summary)

    print(f"✅ Résumé généré et sauvegardé dans {output_filename}")

    # Download the summary file
    files.download(output_filename)

# Run the summarization handler whenever the widget's `value` trait changes.
upload.observe(process_uploaded_file, names="value")


📌 Choisissez un fichier PDF pour le résumer :


FileUpload(value={}, accept='.pdf', description='Upload')

📂 Fichier sélectionné : 108364 PLE_Digital Assets_Deck 290724 (1).pdf


Device set to use cuda:0


📌 Nombre de morceaux de texte : 15
✅ Résumé généré et sauvegardé dans resume.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Download spaCy's small French pipeline (fr_core_news_sm).
# NOTE(review): no cell visible in this part of the notebook imports spaCy or
# loads this model — confirm it is still needed.
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.7.0/fr_core_news_sm-3.7.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
!pip install transformers
!pip install PyMuPDF

import fitz  # PyMuPDF
import torch
import os
import ipywidgets as widgets
from transformers import pipeline
from google.colab import files

# NOTE(review): this assignment does not appear to select CPU mode; in this
# cell the CPU is chosen by `device=-1` when the pipeline is built.
# TORCH_USE_CUDA_DSA presumably comes from a CUDA error hint about device-side
# assertions — TODO confirm it has any effect when set here, after `import torch`.
os.environ["TORCH_USE_CUDA_DSA"] = "1"

# Build a single-file picker restricted to PDFs.
upload = widgets.FileUpload(multiple=False, accept=".pdf")

# Show the prompt, then render the picker beneath it.
print("📌 Choisissez un fichier PDF pour le résumer en 8 lignes :")
display(upload)

def process_uploaded_file(change):
    """Summarize the uploaded PDF in at most eight sentences and offer it for download.

    The uploaded bytes are saved in the working directory under the uploaded
    file name, the text of every page is extracted with PyMuPDF, cut into
    chunks, and each chunk is summarized with ``facebook/bart-large-cnn`` on
    the CPU. The joined summaries are trimmed to their first eight sentences,
    written to ``resumess.txt`` and passed to ``files.download``.

    Args:
        change: Change notification supplied by the widget observer. It is not
            inspected; the current ``upload.value`` is read instead.

    Returns:
        None. Returns early when nothing has been uploaded or when no text
        could be extracted from the PDF.
    """
    max_sentences = 8

    uploaded = upload.value
    if not uploaded:
        # Nothing uploaded: next(iter(...)) on an empty value would raise StopIteration.
        return

    # ipywidgets 7 exposes a {filename: {"content": ...}} mapping; ipywidgets 8
    # exposes a tuple of records carrying "name" and "content".
    if isinstance(uploaded, dict):
        uploaded_filename, uploaded_file = next(iter(uploaded.items()))
    else:
        uploaded_file = uploaded[0]
        uploaded_filename = uploaded_file["name"]

    pdf_path = uploaded_filename
    with open(pdf_path, "wb") as f:
        f.write(uploaded_file["content"])

    print(f"📂 Fichier sélectionné : {pdf_path}")

    # Extract text from PDF
    def extract_text_from_pdf(pdf_path):
        """Return the text of every page, each followed by a newline, stripped at both ends."""
        with fitz.open(pdf_path) as pdf_file:
            return "".join(page.get_text() + "\n" for page in pdf_file).strip()

    pdf_text = extract_text_from_pdf(pdf_path)
    if not pdf_text:
        print("❌ Le fichier PDF est vide ou illisible.")
        return

    # Load summarization model (CPU mode: device=-1)
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn", device=-1)

    # Split text into chunks; the budget is counted in characters, not tokens
    def split_text(text, max_length=1024):
        """Split ``text`` on whitespace into chunks of whole words.

        Each word costs its length plus one against ``max_length``. A single
        word larger than the budget becomes a chunk of its own. Empty chunks
        are never produced.
        """
        chunks = []
        current_chunk = []
        current_length = 0

        for word in text.split():
            cost = len(word) + 1
            if current_length + cost <= max_length:
                current_chunk.append(word)
                current_length += cost
            else:
                # Only flush a chunk that actually holds words; an over-long
                # word arriving on an empty chunk must not emit "".
                if current_chunk:
                    chunks.append(" ".join(current_chunk))
                current_chunk = [word]
                current_length = cost

        if current_chunk:
            chunks.append(" ".join(current_chunk))

        return chunks

    # Split text into smaller chunks
    text_chunks = split_text(pdf_text)

    # Summarize each chunk
    summaries = []
    for chunk in text_chunks:
        # A character-bounded chunk can still tokenize to more tokens than the
        # model accepts; truncation=True asks the pipeline to clip such inputs.
        summary = summarizer(
            chunk,
            max_length=130,
            min_length=50,
            do_sample=False,
            truncation=True,
        )
        summaries.append(summary[0]['summary_text'])

    # Combine all chunk summaries into one text
    final_summary = " ".join(summaries)

    # Keep only the first sentences (split on ". "), dropping empty fragments
    sentences = [part.strip() for part in final_summary.split(". ")]
    sentences = [part for part in sentences if part][:max_sentences]
    final_summary = ". ".join(sentences)
    # The last kept sentence may already carry its own terminator (it does
    # whenever the text had fewer sentences than the limit); avoid "..".
    if not final_summary.endswith((".", "!", "?")):
        final_summary += "."

    # Save the summary
    output_filename = "resumess.txt"
    with open(output_filename, "w", encoding="utf-8") as output_file:
        output_file.write(final_summary)

    print(f"✅ Résumé en 8 lignes généré et sauvegardé dans {output_filename}")

    # Download the summary file
    files.download(output_filename)

# Run the summarization handler whenever the widget's `value` trait changes.
upload.observe(process_uploaded_file, names="value")


✅ Résumé en 8 lignes généré et sauvegardé dans resume.txt


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

📌 Choisissez un fichier PDF pour le résumer en 8 lignes :


FileUpload(value={}, accept='.pdf', description='Upload')