In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from PIL import Image
import PyPDF2
import cv2
import pytesseract
import torch

In [8]:
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [4]:
# Preparation du modele
modele_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(modele_name)
model = T5ForConditionalGeneration.from_pretrained(modele_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
# Pretraitement du texte
def preprocess(texte):
    texte_pretraiter = "summarize : " + texte
    return texte_pretraiter


# Extraction de texte de puis une image
def extract_text_from_image(path):
    image = Image.open(path)
    text = pytesseract.image_to_string(image)
    return text


# Extraction de texte a partir d'un fichier pdf
def extract_text_from_pdf(file):
    with open(file, 'rb') as f:
        reader = PyPDF2.PdfReader(file)
        texte = ""
        for page in reader.pages:
            texte += page.extract_text()
        return texte
    

# Extraction de texte a partir d'une video
def extract_text_from_video(file):
    video = cv2.VideoCapture(file)
    frames_interval = 30
    frames_process = 0
    texte_extrait = ""
    
    while video.isOpened():
        success, frame = video.read()
        
        if not success:
            break
            
        if frames_process % frames_interval == 0:
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            texte = pytesseract.image_to_string(rgb)
            texte_extrait += texte + '\n'
            
        frames_process += 1
        
    video.release()
    return texte_extrait

# Generation du resume
def resume_generation(texte):
    texte_ = preprocess(texte)
    tokenization = tokenizer.encode(texte_, return_tensors='pt', max_length=512, truncation=True)
    sum_generation = model.generate(tokenization, num_beams=4, early_stopping=True, max_length=150)
    summary = tokenizer.decode(sum_generation[0], skip_special_tokens=True)
    return summary

In [6]:
pdf = 'texte.pdf'
texte = extract_text_from_pdf(pdf)
summary = resume_generation(texte)
print(summary)

c'est une histoire d'une guerre contre le pouvoir des riches qui est triste et drôle. un homme qui n'avait pas d'argent et qui ne pouvait presque pas se nourrir décida de créer une équipe contre les riches. il réunit plus de 1000 pauvres qui ne mangeaient pas à leur faim et commença à se battre pour de la nourriture.


In [9]:
image = "C:\\Users\\MARCELLE\\Documents\\NLP PROJECT\\image.png"
texte_ = extract_text_from_image(image)
summary_ = resume_generation(texte_)
print(summary_)

c'est une histoire d'une guerre contre le pouvoir des riches qui est triste et drdle. un homme qui n'avait pas d'argent et qui ne pouvait presque pas se nourrir décida de créer une équipe contre les riches. Pendant la nuit, les policiers beganrent leur chasse et ils les trouvérent a Lille. Ils les capturé pour les envoyer en esclavage.


In [11]:
video = "C:\\Users\\MARCELLE\\Documents\\NLP PROJECT\\video.mp4"
texte_ = extract_text_from_video(video)
summary_ = resume_generation(texte_)
print(summary_)

je partais rendre visi — je partais rendre visite ama pet — je partais rendre visite ama petite amie et je me Suis arréte — je partais rendre visite ama petite amie et je me Suis arrété pour D — je partais rendre visite ama petite amie et je me Suis arrété pour pisser Il y aun vieux gui est des


# Dash App Code

In [4]:
%%writefile text_summary.py

import dash
import PyPDF2
import cv2
import pytesseract
import torch
import base64
import time

from dash import html, dcc, State
from dash.dependencies import Input, Output
from transformers import T5Tokenizer, T5ForConditionalGeneration
from PIL import Image
from autocorrect import Speller

external_scripts = [{
    'src': 'https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js',
    'integrity':"sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM",
    'crossorigin': "anonymous"
}]

external_stylesheets = [{
    'href': 'https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css',
    'rel': 'stylesheet',
    'integrity': 'sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC',
    'crossorigin': 'anonymous'
}]

app = dash.Dash(__name__, suppress_callback_exceptions=True, external_scripts=external_scripts, 
                external_stylesheets=external_stylesheets)

pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Préparation du modèle et du tokenizer
model_name = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


# Layout de l'application'

app.layout = html.Div(
    className='container border p-4 rounded justify-content-center mt-4',
    style={
        'background': '#f2f2f2'},
    children=[
        html.H1(
            "Extraction & Resume de texte",
            className='text-center fw-bold',
#             style={'font-weight': 'bold', 'font-family': 'Arial', 'text-align': 'center'}
        ),
        html.Div(
            className='row',
#             style={'display': 'flex', 'flex-direction': 'row', 'justify-content': 'space-between'},
            children=[
                html.Div(
#                     style={'width': '45%'}
                    className='col-md-6',
                    children=[
                        dcc.Upload(
                            id='upload-data',
                            children=html.Div([
                                'Glissez et déposez ou ',
                                html.A('sélectionnez un fichier image, video ou pdf')
                            ], className='fst-italic'),
                            style={
                                'width': '100%',
                                'height': '60px',
                                'lineHeight': '60px',
                                'borderWidth': '1px',
                                'borderStyle': 'dashed',
                                'borderRadius': '5px',
                                'textAlign': 'center',
                                'margin': '10px',
                                'cursor': 'pointer',
                            },
                            multiple=False
                        )
                    ]
                ),
               html.Div(
                    children=[
                        html.H4("Résumé", className='fw-bold fst-italic'),
                        dcc.Loading(
                            id='loading-summary',
                            type='default',
                            children=[
                                html.Div(id='output-summary', className='fs-5', style={'font-family': 'Verdana'})
                            ]
                        )
                    ],
                    className='col-md-6'
                )
            ]
        )
    ]
)


# Pretraitement du texte
def preprocess(texte):
    texte_pretraiter = "summarize : " + texte
    return texte_pretraiter


# Correction auto de texte
def correction_texte(texte):
    spell = Speller(lang='fr')
    correction = spell(texte)
    return correction


# Extraction de texte de puis une image
def extract_text_from_image(path):
    image = Image.open(path)
    text = pytesseract.image_to_string(image)
    return text


# Extraction de texte a partir d'un fichier pdf
def extract_text_from_pdf(file):
    with open(file, 'rb') as f:
        reader = PyPDF2.PdfReader(file)
        texte = ""
        for page in reader.pages:
            texte += page.extract_text()
        return texte
    

# Extraction de texte a partir d'une video
def extract_text_from_video(file):
    video = cv2.VideoCapture(file)
    frames_interval = 30
    frames_process = 0
    texte_extrait = ""
    
    while video.isOpened():
        success, frame = video.read()
        
        if not success:
            break
            
        if frames_process % frames_interval == 0:
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            texte = pytesseract.image_to_string(rgb)
            texte_extrait += texte + '\n'
            
        frames_process += 1
        
    video.release()
    return texte_extrait


# Generation du resume
def resume_generation(texte):
    texte_ = preprocess(texte)
    tokenization = tokenizer.encode(texte_, return_tensors='pt', max_length=512, truncation=True)
    sum_generation = model.generate(tokenization, num_beams=4, early_stopping=True, max_length=150)
    summary = tokenizer.decode(sum_generation[0], skip_special_tokens=True)
    resume = correction_texte(summary)
    return resume


def update_output(contents):
    if contents is not None:
        content_type, content_string = contents.split(',')
#         return content_type
        if content_type.startswith('data:image/'):
            # Extraction de texte à partir de l'image
            image_data = content_string.split(';base64,')[-1]
            with open('temp_image.png', 'wb') as file:
                file.write(base64.b64decode(content_string))
            text = extract_text_from_image('temp_image.png')
            resume = resume_generation(text)
        elif content_type.startswith('data:video/mp4'):
            # Extraction de texte à partir de la vidéo
            video_data = content_string.split(';base64,')[-1]
            with open('temp_video.mp4', 'wb') as file:
                file.write(base64.b64decode(content_string))
            text = extract_text_from_video('temp_video.mp4')
            resume = resume_generation(text)
        elif content_type.startswith('data:application/pdf'):
            # Extraction de texte à partir du fichier PDF
            pdf_data = content_string.split(';base64,')[-1]
            with open('temp_pdf.pdf', 'wb') as file:
                file.write(base64.b64decode(content_string))
            text = extract_text_from_pdf('temp_pdf.pdf')
            resume = resume_generation(text)
        else:
            return "Type de fichier non pris en charge."

        return resume

    return ""

# Callback pour la génération du résumé lors du chargement du fichier
@app.callback(
    Output('output-summary', 'children'),
    [Input('upload-data', 'contents')],
    [State('loading-summary', 'loading_state')]
)


def generate_summary(contents, loading_state):
    if contents is not None:
        return update_output(contents)
    else:
        return ''


if __name__ == '__main__':
    app.run_server(debug=True)

Overwriting text_summary.py


In [None]:
# %%writefile app.py

# import dash
# import PyPDF2
# import cv2
# import pytesseract
# import torch
# import base64
# import time
# import io

# from dash import html, dcc, State
# from dash.dependencies import Input, Output
# from transformers import T5Tokenizer, T5ForConditionalGeneration
# from PIL import Image
# from autocorrect import Speller
# from pdf2image import convert_from_bytes


# external_scripts = [{
#     'src': 'https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js',
#     'integrity':"sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM",
#     'crossorigin': "anonymous"
# }]

# external_stylesheets = [{
#     'href': 'https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css',
#     'rel': 'stylesheet',
#     'integrity': 'sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC',
#     'crossorigin': 'anonymous'
# }]

# app = dash.Dash(__name__, suppress_callback_exceptions=True, external_scripts=external_scripts, 
#                 external_stylesheets=external_stylesheets)

# pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# # Préparation du modèle et du tokenizer
# model_name = 't5-base'
# tokenizer = T5Tokenizer.from_pretrained(model_name)
# model = T5ForConditionalGeneration.from_pretrained(model_name)


# # Layout de l'application'
# app.layout = html.Div(
#     className='container border p-4 rounded justify-content-center mt-4',
#     style={
#         'background': '#f2f2f2'},
#     children=[
#         html.H1(
#             "Extraction & Resume de texte",
#             className='text-center fw-bold',
# #             style={'font-weight': 'bold', 'font-family': 'Arial', 'text-align': 'center'}
#         ),
#         html.Div(
#             className='row',
# #             style={'display': 'flex', 'flex-direction': 'row', 'justify-content': 'space-between'},
#             children=[
#                 html.Div(
#                     className='col-md-6',
#                     children=[
#                         dcc.Upload(
#                             id='upload-data',
#                             children=html.Div([
#                                 'Glissez et déposez ou ',
#                                 html.A('sélectionnez un fichier image, video ou pdf')
#                             ], className='fst-italic'),
#                             style={
#                                 'width': '100%',
#                                 'height': '60px',
#                                 'lineHeight': '60px',
#                                 'borderWidth': '1px',
#                                 'borderStyle': 'dashed',
#                                 'borderRadius': '5px',
#                                 'textAlign': 'center',
#                                 'margin': '10px',
#                                 'cursor': 'pointer',
#                             },
#                             multiple=False
#                         ),
#                         html.Div(id='file-preview')
#                     ]
#                 ),
#                 html.Div(
#                     className='col-md-6',
#                     children=[
#                         html.H4("Résumé", className='fw-bold fst-italic'),
#                         dcc.Loading(
#                             id='loading-summary',
#                             type='default',
#                             children=[
#                                 html.Div(id='output-summary', className='fs-5', style={'font-family': 'Verdana'})
#                             ]
#                         )
#                     ]
#                 )
#             ]
#         )
#     ]
# )


# # Pretraitement du texte
# def preprocess(texte):
#     texte_pretraiter = "Give me a precise and clear summary of this text : " + texte
#     return texte_pretraiter


# # Correction auto de texte
# def correction_texte(texte):
#     spell = Speller(lang='fr')
#     correction = spell(texte)
#     return correction


# # Extraction de texte de puis une image
# def extract_text_from_image(path):
#     image = Image.open(path)
#     text = pytesseract.image_to_string(image)
#     return text


# # Extraction de texte a partir d'un fichier pdf
# def extract_text_from_pdf(file):
#     with open(file, 'rb') as f:
#         reader = PyPDF2.PdfReader(file)
#         texte = ""
#         for page in reader.pages:
#             texte += page.extract_text()
#         return texte
    

# # Extraction de texte a partir d'une video
# def extract_text_from_video(file):
#     video = cv2.VideoCapture(file)
#     frames_interval = 30
#     frames_process = 0
#     texte_extrait = ""
    
#     while video.isOpened():
#         success, frame = video.read()
        
#         if not success:
#             break
            
#         if frames_process % frames_interval == 0:
#             rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
#             texte = pytesseract.image_to_string(rgb)
#             texte_extrait += texte + '\n'
            
#         frames_process += 1
        
#     video.release()
#     return texte_extrait


# # Generation du resume
# def resume_generation(texte):
#     texte_ = preprocess(texte)
#     tokenization = tokenizer.encode(texte_, return_tensors='pt', max_length=512, truncation=True)
#     sum_generation = model.generate(tokenization, num_beams=4, early_stopping=True, max_length=150)
#     summary = tokenizer.decode(sum_generation[0], skip_special_tokens=True)
#     resume = correction_texte(summary)
#     return resume


# def update_output(contents):
#     if contents is not None:
#         content_type, content_string = contents.split(',')
# #         return content_type
#         if content_type.startswith('data:image/'):
#             # Extraction de texte à partir de l'image
#             image_data = content_string.split(';base64,')[-1]
#             with open('temp_image.png', 'wb') as file:
#                 file.write(base64.b64decode(content_string))
#             text = extract_text_from_image('temp_image.png')
#             resume = resume_generation(text)
#         elif content_type.startswith('data:video/mp4'):
#             # Extraction de texte à partir de la vidéo
#             video_data = content_string.split(';base64,')[-1]
#             with open('temp_video.mp4', 'wb') as file:
#                 file.write(base64.b64decode(content_string))
#             text = extract_text_from_video('temp_video.mp4')
#             resume = resume_generation(text)
#         elif content_type.startswith('data:application/pdf'):
#             # Extraction de texte à partir du fichier PDF
#             pdf_data = content_string.split(';base64,')[-1]
#             with open('temp_pdf.pdf', 'wb') as file:
#                 file.write(base64.b64decode(content_string))
#             text = extract_text_from_pdf('temp_pdf.pdf')
#             resume = resume_generation(text)
#         else:
#             return "Type de fichier non pris en charge."

#         return resume

#     return ""


# def image_url(img):
#     with io.BytesIO() as buffer:
#         img.save(buffer, 'PNG')
#         img_str = base64.b64encode(buffer.getvalue()).decode()
#         return f'data:image/png;base64,{img_str}'


# def parse_contents(contents, filename):
#     content_type, content_string = contents.split(',')
#     decoded = base64.b64decode(content_string)
    
#     if 'image' in content_type:
#         image = base64.b64encode(decoded).decode('utf-8')
#         return html.Div([
#             html.H5(filename),
#             html.Img(src='data:image/png;base64,' + image, style={'width': '100%'})
#         ])
#     elif 'pdf' in content_type:
#         pdf = io.BytesIO(decoded)
#         images = convert_from_bytes(pdf.read())
#         return html.Div([
#             html.H5(filename),
#             *[html.Img(src=image_url(img), style={'width': '100%'}) for img in images]
#         ])



# # Callback pour la génération du résumé lors du chargement du fichier
# @app.callback(
#     Output('output-summary', 'children'),
#     [Input('upload-data', 'contents')],
#     [State('upload-data', 'filename'),
#      State('loading-summary', 'loading_state')]
# )
# def update_file_preview(contents, filename, loading_state):
#     if contents is not None:
#         file_preview = parse_contents(contents, filename)
#         return file_preview
    

# def generate_summary(contents, loading_state):
#     if contents is not None:
#         return update_output(contents)
#     else:
#         return ''


# if __name__ == '__main__':
#     app.run_server(debug=True)

