# Intermediate Processing

This notebook derives two new files from a subtitle file; they are used to obtain sentence-segmented subtitles:
1. An oversubtitled .vtt subtitle file, where each subtitle corresponds exclusively to a sentence or a portion of sentences.
2. (archived) A .txt file containing one sentence per line.

In [37]:
import spacy
import os
import re
import module_traitement as m
# spacy.prefer_gpu()
# nlp = spacy.load("fr_dep_news_trf")
# from spacy.language import Language

In [98]:
# Mediapi corpus: input subtitle files and output folders.
# `file_with_path` and `folder` are later iterated together with zip(), as
# (path, file name) pairs.
# NOTE(review): other configuration cells in this notebook assign the same
# variables; run only the one matching the corpus being processed.
file_with_path = m.lister_fichiers_with_path("../data/aligned_mediapi/")
folder = m.lister_fichiers("../data/aligned_mediapi/")
output_seg = "../data/new_segmentation_mediapi/"
output_sent = "../data/sentence_mediapi/"

In [11]:
# Matignon (LSF) corpus: input subtitle files and output folders.
# NOTE(review): this overwrites the variables set by the other configuration
# cells; run only the one matching the corpus being processed.
file_with_path = m.lister_fichiers_with_path("../data/cr_audio_aligned/")
folder = m.lister_fichiers("../data/cr_audio_aligned/")
output_seg = "../data/new_segmentation_cr/"
output_sent = "../data/sentence_matignon/"

## 1. Clean files
Clean the file before various preprocessing steps to facilitate sentence detection and segmentation

In [9]:
# TEST - Matignon (LSF) corpus: reads the same input folder as the Matignon
# configuration, but writes to local test folders.
file_with_path = m.lister_fichiers_with_path("../data/cr_audio_aligned/")
folder = m.lister_fichiers("../data/cr_audio_aligned/")
output_seg = "test_new_seg_cr/"
output_cleaning = "test_cleaning_cr/"
#output_sent = "../data/sentence_matignon/"

In [57]:
# TEST - Mediapi corpus: reads the aligned Mediapi subtitles and writes to
# local test folders.
# Both listings must point at the same folder so that zip(file_with_path, folder)
# pairs each path with its own file name (the stray doubled slash was removed).
file_with_path = m.lister_fichiers_with_path("../data/aligned_mediapi/")
folder = m.lister_fichiers("../data/aligned_mediapi/")
output_seg = "test_new_seg_mediapi/"
output_cleaning = "test_cleaning_mediapi/"
#output_sent = "../data/sentence_matignon/"

In [59]:
def convertir_grand_nombre(nombre_texte):
    """Remove every dot located between two digits.

    Used to drop thousands separators ("1.000.000" -> "1000000") so that they
    are not mistaken for sentence-ending periods.

    Lookarounds are used instead of consuming the surrounding digits, so that
    consecutive separators sharing a digit ("1.2.3") are all removed.

    Args:
        nombre_texte: text possibly containing dotted numbers.

    Returns:
        The text with digit-dot-digit dots removed; everything else unchanged.
    """
    return re.sub(r'(?<=\d)\.(?=\d)', '', nombre_texte)

def remplacer_points_adresses_email(texte):
    """Replace the dots inside e-mail addresses with the literal "POINT".

    This keeps the dots of an address from being read as sentence ends.

    Args:
        texte: text that may contain e-mail addresses.

    Returns:
        The text with every "." of each detected address replaced by "POINT".
    """
    # Pattern locating e-mail addresses.
    motif_email = re.compile(r'\b[\w.%+-]+@[\w.-]+\.[a-zA-Z]{2,}\b')
    return motif_email.sub(
        lambda trouve: trouve.group().replace('.', 'POINT'),
        texte,
    )

def remplacer_points_adresses(texte):
    """Replace the dots inside web addresses with the literal "POINT".

    The pattern accepts an optional scheme ("http://", "https://") and an
    optional "www." prefix, followed by a dotted name ending in at least two
    letters.

    Args:
        texte: text that may contain web addresses.

    Returns:
        The text with every "." of each match replaced by "POINT".
    """
    # Pattern locating several forms of web addresses.
    motif_url = re.compile(r'\b(?:https?://)?(?:www\.)?[\w.-]+\.[a-zA-Z]{2,}\b')
    return motif_url.sub(
        lambda trouve: trouve.group().replace('.', 'POINT'),
        texte,
    )

def normaliser_points_de_suspension(texte):
    """Normalise the spacing around "..." ellipses.

    Two passes, in this order:
      1. insert a space between "..." and a word character that follows it;
      2. remove any whitespace between a word character and the "..." after it.

    Args:
        texte: text to normalise.

    Returns:
        The text with ellipses attached to the preceding word and separated
        from the following one.
    """
    espace_apres = re.compile(r'\.\.\.(\w)')
    espace_avant = re.compile(r'(\w)\s*\.\.\.')
    return espace_avant.sub(r'\1...', espace_apres.sub(r'... \1', texte))

def remplacer_ponctuation_html(texte):
    """Encode punctuation found inside parentheses as HTML numeric entities.

    For every "(...)" group, the parentheses, "?", "!", "." and "..."
    occurring inside the group are replaced by their HTML entity, so that
    later sentence splitting ignores punctuation within a parenthetical. The
    enclosing parentheses themselves are kept.

    Args:
        texte: text to process.

    Returns:
        The text with the content of each parenthesised group encoded.
    """
    # Order matters: "..." must be handled before ".", otherwise each dot of
    # the ellipsis is encoded on its own and the ellipsis entry never matches.
    substitutions = (
        ("(", "&#40;"),
        (")", "&#41;"),
        ("?", "&#63;"),
        ("!", "&#33;"),
        ("...", "&#8230;"),
        (".", "&#46;"),
    )

    def remplacer_match(match):
        contenu_parentheses = match.group(1)
        for recherche, remplacement in substitutions:
            contenu_parentheses = contenu_parentheses.replace(recherche, remplacement)
        return f'({contenu_parentheses})'

    return re.sub(r'\(([^)]*)\)', remplacer_match, texte)

def get_dict_vtt_clean(input):
    """Parse a .vtt subtitle file and return its cues with cleaned text.

    Each cue starts at a timing line ("<start> --> <end>"); every following
    line up to the next timing line is appended to the cue text (stripped and
    joined with spaces). The text then goes through a fixed chain of
    replacements that neutralises punctuation which must not be read as a
    sentence end (ellipses, punctuation inside parentheses, abbreviations,
    e-mail and web addresses, thousands separators, ...).

    A timing line is any line starting with two or more digits and ':' that
    also contains '-->'. Unlike a test on the prefixes "00:", "01:" and "02:",
    this also recognises cues located after the third hour.

    Args:
        input: path of the .vtt file (UTF-8).

    Returns:
        dict mapping the 0-based cue index to
        {'start': str, 'end': str, 'text': str}.
    """
    timing_re = re.compile(r'\d{2,}:.*-->')

    with open(input, encoding="utf-8") as f:
        lines = f.readlines()

    dict_sub = {}
    i = 0
    j = 0

    while j < len(lines):
        element = lines[j]
        if timing_re.match(element):
            # Extract the start and end times.
            start_time, end_time = (part.strip() for part in element.split('-->', 1))

            text = ""
            # NOTE(review): the whole cleaning chain is re-applied to the
            # accumulated text after each appended line. It is kept inside the
            # loop because a few steps (e.g. "!!" -> "!") are not idempotent,
            # so moving it out would change the result for multi-line cues.
            while j + 1 < len(lines) and not timing_re.match(lines[j + 1]):
                j += 1
                content = lines[j]
                text = text + " " + content.strip()
                text = normaliser_points_de_suspension(text)
                text = text.replace("(...)","[SUSPENSIONP]")  # TODO: handle this case properly
                text = text.replace("....","[POINTS4]")
                text = remplacer_ponctuation_html(text)
                text = text.replace("…","...")
                text = text.replace("...","[SUSPENSION].")  # TODO: handle doubled ellipses
                text = re.sub(r'["“”«»]', '', text)
                text = text.replace("?!","[INTEREXCL].")
                # NOTE(review): remplacer_ponctuation_html above already encodes
                # "?" inside balanced parentheses, so the next four
                # replacements look unreachable - confirm the intended order.
                text = text.replace("(???)","[INTERROGATION3]")
                text = text.replace("(?)","[INTERROGATION1]")
                text = text.replace("(??)","[INTERROGATION2]")
                text = text.replace("( ?)","[INTERROGATION1]")
                text = text.replace("?,","[INTERROGATION],")
                text = text.replace("!,","[EXCLAMATION],")
                text = text.replace("etc.,","[ETC],")
                text = text.replace("etc.)","[ETC])")
                text = text.replace("etc. .","[ETC].")
                text = text.replace("Etc","etc")
                text = text.replace("PAM !","[PAM]")
                text = text.replace("Média'Pi!","[NOM_MEDIA]")
                text = text.replace("Média'Pi !","[NOM_MEDIA]")
                text = text.replace("Media'Pi !","[NOM_MEDIA]")
                text = text.replace("Média'Pi&nbsp;!","[NOM_MEDIA]")
                text = text.replace("Média' Pi !","[NOM_MEDIA]")
                # Strip inclusive-writing suffixes (".e.s", ".ne.s", ".e.", ".e").
                text = text.replace(".e.s","")
                text = text.replace(".ne.s","")
                text = text.replace(".e.","")
                text = text.replace(".e","")
                text = text.replace("!!","!")
                text = text.replace("??","?")
                text = remplacer_points_adresses_email(text)
                text = remplacer_points_adresses(text)
                text = text.replace(".,","[POINT],")
                text = text.replace("y.a","y a")
                # Remove speaker labels and expand titles.
                text=text.replace("... -G. Attal : ","")
                text=text.replace("-G. Attal : ","")
                text=text.replace("G. Attal : ","")
                text = text.replace("-Bonjour", "Bonjour")
                text = text.replace("Bonjour.", "Bonjour,")
                text = text.replace(" M."," Monsieur")
                text = text.replace(" Mme"," Madame")
                text = text.replace("-"," ")
                text = convertir_grand_nombre(text)

            dict_sub[i] = {'start': start_time, 'end': end_time, 'text': text.strip()}
            i += 1

        j += 1

    return dict_sub


In [60]:
# Write a cleaned copy of every input subtitle file into `output_cleaning`.
# The folder is created if missing, and os.path.join avoids the doubled slash
# produced when `output_cleaning` already ends with "/".
os.makedirs(output_cleaning, exist_ok=True)
for file, name in zip(file_with_path, folder):
    dict_sub = get_dict_vtt_clean(file)
    m.create_vtt_file(dict_sub, os.path.join(output_cleaning, name))

## 2. Oversubtitled File

In these files, the following treatments are applied:
1. Retrieve strong punctuations (period, exclamation mark, question mark) within the subtitles.
2. Keep timestamps in memory.
3. Obtain the duration of a letter to segment the subtitle accordingly.
4. Segment the subtitle and generate a new .vtt file

### Processing 

The next treatment can be performed twice in case there were multiple strong punctuations. 

TODO : I can modify the regular expression to exclude LETTRE_MAJ from my segmentation.

In [61]:
import module_traitement as m

In [62]:
# Segmentation input: the cleaned Mediapi test files.
# NOTE(review): the next cell reassigns both variables; run only the cell
# matching the intended input folder.
file_with_path = m.lister_fichiers_with_path("test_cleaning_mediapi/")
folder = m.lister_fichiers("test_cleaning_mediapi/")

In [63]:
# Segmentation input and output both point at "try_seg_end/": the segmentation
# cell then rewrites these files in place (presumably for the second pass
# mentioned above - confirm).
file_with_path = m.lister_fichiers_with_path("try_seg_end/")
folder = m.lister_fichiers("try_seg_end/")
output_seg="try_seg_end/"

In [64]:
def _decouper_en_phrases(texte):
    """Split a subtitle text into pieces, each ending with its strong punctuation.

    A split happens after a run of ".", "!" or "?" only when it is followed by
    a character that is neither strong punctuation nor ")". The lookahead does
    not consume that character, so nothing is lost when the punctuation is
    glued to the next word. Dots between two capital letters (acronyms) never
    split.

    Returns:
        list of non-empty, stripped pieces (empty list for blank text).
    """
    # Protect dots between two capital letters with a private-use character,
    # restored below, so the placeholder never reaches the output file.
    sentinelle = "\ue000"
    protege = re.sub(r'(?<=[A-Z])\.(?=[A-Z])', sentinelle, texte)
    morceaux = re.split(r'([.!?]+)(?=[^.!?)])', protege)
    # re.split with one capture group yields text, punct, text, ...: glue each
    # text to the punctuation that follows it.
    phrases = [
        morceaux[k] + morceaux[k + 1] if k < len(morceaux) - 1 else morceaux[k]
        for k in range(0, len(morceaux), 2)
    ]
    return [p.strip().replace(sentinelle, ".") for p in phrases if p.strip()]


# For every file: split each cue that contains several sentences into one cue
# per sentence, sharing the cue duration proportionally to the piece lengths.
for file, name in zip(file_with_path, folder):
    print(f"TRAITEMENT {file} ---- {name}")
    dict_sub = get_dict_vtt_clean(file)
    new_dict = {}
    mm = 0  # index of the next cue written to new_dict
    for v in dict_sub.values():
        result = _decouper_en_phrases(v["text"])
        if not result:
            # Cue without text: not copied to the output.
            continue
        if len(result) == 1:
            # Nothing to split: keep the cue untouched.
            new_dict[mm] = v
            mm = mm + 1
            continue

        print(f"resultat : {result}")
        start_time_str = v["start"]
        end_time_str = v["end"]
        duration = m.time_to_seconds(end_time_str) - m.time_to_seconds(start_time_str)
        # Average duration of one character of the original cue text.
        sec_par_letter = duration / len(v["text"])
        # NOTE(review): the pieces are shorter than the original text (spaces
        # between sentences are stripped), so the last piece ends slightly
        # before the original cue end - confirm whether it should be snapped
        # to `end_time_str`.
        for match in result:
            end_time = m.ajouter_secondes(start_time_str, len(match) * sec_par_letter)
            new_dict[mm] = {'start': start_time_str, "end": end_time, 'text': match}
            start_time_str = end_time
            mm = mm + 1
    m.create_vtt_file(new_dict, f"{output_seg}/{name}")

                    
    

TRAITEMENT try_seg_end/b7f2d8f0c3.vtt ---- b7f2d8f0c3.vtt
TRAITEMENT try_seg_end/3d0b82b459.vtt ---- 3d0b82b459.vtt
TRAITEMENT try_seg_end/44f554f914.vtt ---- 44f554f914.vtt
TRAITEMENT try_seg_end/b9a51f4361.vtt ---- b9a51f4361.vtt
TRAITEMENT try_seg_end/4e073949b1.vtt ---- 4e073949b1.vtt
TRAITEMENT try_seg_end/0b1437bc85.vtt ---- 0b1437bc85.vtt
TRAITEMENT try_seg_end/752500b761.vtt ---- 752500b761.vtt
TRAITEMENT try_seg_end/cba6cefad2.vtt ---- cba6cefad2.vtt
TRAITEMENT try_seg_end/41c606553f.vtt ---- 41c606553f.vtt
TRAITEMENT try_seg_end/3f1b2118ca.vtt ---- 3f1b2118ca.vtt
TRAITEMENT try_seg_end/ed969c3e70.vtt ---- ed969c3e70.vtt
TRAITEMENT try_seg_end/26f1ff8385.vtt ---- 26f1ff8385.vtt
TRAITEMENT try_seg_end/2b4b33189c.vtt ---- 2b4b33189c.vtt
TRAITEMENT try_seg_end/ada7d10a19.vtt ---- ada7d10a19.vtt
TRAITEMENT try_seg_end/fd41a24117.vtt ---- fd41a24117.vtt
TRAITEMENT try_seg_end/15abfc95ae.vtt ---- 15abfc95ae.vtt
TRAITEMENT try_seg_end/495145911e.vtt ---- 495145911e.vtt
TRAITEMENT try

In [2]:
from IPython.display import HTML

# <font color='red'> ARCHIVED - not useful anymore </font>
## Sortir un fichier de phrases

L'idée est de sortir un fichier contenant une phrase par ligne pour pouvoir créer plus simplement le nouveau fichier de sous-titre segmenté en phrases. 

### Traitement

Utilisation de SpaCy pour récupérer les phrases

In [26]:
# Display the input file paths currently selected.
file_with_path

['test_cleaning_mediapi/b7f2d8f0c3.vtt',
 'test_cleaning_mediapi/3d0b82b459.vtt',
 'test_cleaning_mediapi/44f554f914.vtt',
 'test_cleaning_mediapi/b9a51f4361.vtt',
 'test_cleaning_mediapi/4e073949b1.vtt',
 'test_cleaning_mediapi/0b1437bc85.vtt',
 'test_cleaning_mediapi/752500b761.vtt',
 'test_cleaning_mediapi/cba6cefad2.vtt',
 'test_cleaning_mediapi/41c606553f.vtt',
 'test_cleaning_mediapi/3f1b2118ca.vtt',
 'test_cleaning_mediapi/ed969c3e70.vtt',
 'test_cleaning_mediapi/26f1ff8385.vtt',
 'test_cleaning_mediapi/2b4b33189c.vtt',
 'test_cleaning_mediapi/ada7d10a19.vtt',
 'test_cleaning_mediapi/fd41a24117.vtt',
 'test_cleaning_mediapi/15abfc95ae.vtt',
 'test_cleaning_mediapi/495145911e.vtt',
 'test_cleaning_mediapi/5ef5fa319a.vtt',
 'test_cleaning_mediapi/0fd91cb814.vtt',
 'test_cleaning_mediapi/9a58b08185.vtt',
 'test_cleaning_mediapi/ac6160b61a.vtt',
 'test_cleaning_mediapi/74bb642e72.vtt',
 'test_cleaning_mediapi/17217ca54b.vtt',
 'test_cleaning_mediapi/928499b438.vtt',
 'test_cleaning_

In [19]:
# Output folder for the one-sentence-per-line files written further down.
output_sent = "test_clean_sent_mediapi/"

In [16]:
def get_dict_vtt(input):
    """Parse a .vtt subtitle file into a dict of cues, without any cleaning.

    Each cue starts at a timing line ("<start> --> <end>"); every following
    line up to the next timing line is stripped and appended to the cue text,
    separated by spaces. Lines located before the first timing line are
    ignored.

    A timing line is any line starting with two or more digits and ':' that
    also contains '-->'. Unlike a test on the prefixes "00:", "01:" and "02:",
    this also recognises cues located after the third hour.

    Args:
        input: path of the .vtt file (UTF-8).

    Returns:
        dict mapping the 0-based cue index to
        {'start': str, 'end': str, 'text': str}.
    """
    timing_re = re.compile(r'\d{2,}:.*-->')

    with open(input, encoding="utf-8") as f:
        lines = f.readlines()

    dict_sub = {}
    courant = None  # cue currently being filled
    for line in lines:
        if timing_re.match(line):
            # Extract the start and end times.
            start_time, end_time = (part.strip() for part in line.split('-->', 1))
            courant = {'start': start_time, 'end': end_time, 'text': ""}
            dict_sub[len(dict_sub)] = courant
        elif courant is not None:
            courant['text'] += " " + line.strip()

    for cue in dict_sub.values():
        cue['text'] = cue['text'].strip()

    return dict_sub

In [20]:
# The spaCy pipeline load is commented out in the import cell, so `nlp` does
# not exist on a fresh kernel: load it here on first use (same model name as
# in the commented-out line).
try:
    nlp
except NameError:
    nlp = spacy.load("fr_dep_news_trf")

# For every file: concatenate the cleaned cue texts, let spaCy split them into
# sentences, and write one sentence per line into `output_sent`.
for files, name in zip(file_with_path, folder):
    print("Traitement", files, name)
    dict_sub = get_dict_vtt_clean(files)
    text = "".join(v["text"] + " " for v in dict_sub.values())
    doc = nlp(text)
    assert doc.has_annotation("SENT_START")
    sentences = [sent.text for sent in doc.sents]
    with open(f"{output_sent}/{name}", "w", encoding="utf-8") as output:
        for sent in sentences:
            output.write(sent + "\n")

Traitement test_cleaning_mediapi/b7f2d8f0c3.vtt b7f2d8f0c3.vtt
Traitement test_cleaning_mediapi/3d0b82b459.vtt 3d0b82b459.vtt
Traitement test_cleaning_mediapi/44f554f914.vtt 44f554f914.vtt
Traitement test_cleaning_mediapi/b9a51f4361.vtt b9a51f4361.vtt
Traitement test_cleaning_mediapi/4e073949b1.vtt 4e073949b1.vtt
Traitement test_cleaning_mediapi/0b1437bc85.vtt 0b1437bc85.vtt
Traitement test_cleaning_mediapi/752500b761.vtt 752500b761.vtt
Traitement test_cleaning_mediapi/cba6cefad2.vtt cba6cefad2.vtt
Traitement test_cleaning_mediapi/41c606553f.vtt 41c606553f.vtt
Traitement test_cleaning_mediapi/3f1b2118ca.vtt 3f1b2118ca.vtt
Traitement test_cleaning_mediapi/ed969c3e70.vtt ed969c3e70.vtt
Traitement test_cleaning_mediapi/26f1ff8385.vtt 26f1ff8385.vtt
Traitement test_cleaning_mediapi/2b4b33189c.vtt 2b4b33189c.vtt
Traitement test_cleaning_mediapi/ada7d10a19.vtt ada7d10a19.vtt
Traitement test_cleaning_mediapi/fd41a24117.vtt fd41a24117.vtt
Traitement test_cleaning_mediapi/15abfc95ae.vtt 15abfc9

### Nettoyer fichier phrase
- remonter la ponctuation forte si elle est isolée sur une ligne
- remonter les sections commençant par une virgule si elles sont isolées du reste de la phrase sur la ligne suivante

In [21]:
# Sentence files to post-process: the content of `output_sent`.
files= m.lister_fichiers_with_path(output_sent)

In [22]:
# Display the sentence files that will be rewritten in place below.
files

['test_clean_sent_mediapi/b7f2d8f0c3.vtt',
 'test_clean_sent_mediapi/3d0b82b459.vtt',
 'test_clean_sent_mediapi/44f554f914.vtt',
 'test_clean_sent_mediapi/b9a51f4361.vtt',
 'test_clean_sent_mediapi/4e073949b1.vtt',
 'test_clean_sent_mediapi/0b1437bc85.vtt',
 'test_clean_sent_mediapi/752500b761.vtt',
 'test_clean_sent_mediapi/cba6cefad2.vtt',
 'test_clean_sent_mediapi/41c606553f.vtt',
 'test_clean_sent_mediapi/3f1b2118ca.vtt',
 'test_clean_sent_mediapi/ed969c3e70.vtt',
 'test_clean_sent_mediapi/26f1ff8385.vtt',
 'test_clean_sent_mediapi/2b4b33189c.vtt',
 'test_clean_sent_mediapi/ada7d10a19.vtt',
 'test_clean_sent_mediapi/fd41a24117.vtt',
 'test_clean_sent_mediapi/15abfc95ae.vtt',
 'test_clean_sent_mediapi/495145911e.vtt',
 'test_clean_sent_mediapi/5ef5fa319a.vtt',
 'test_clean_sent_mediapi/0fd91cb814.vtt',
 'test_clean_sent_mediapi/9a58b08185.vtt',
 'test_clean_sent_mediapi/ac6160b61a.vtt',
 'test_clean_sent_mediapi/74bb642e72.vtt',
 'test_clean_sent_mediapi/17217ca54b.vtt',
 'test_clea

In [23]:
def get_dict_vtt(input):
    """Parse a .vtt subtitle file into a dict of cues, without any cleaning.

    NOTE(review): this notebook defines `get_dict_vtt` in an earlier cell as
    well; this later definition replaces it once the cell is run.

    Each cue starts at a timing line ("<start> --> <end>"); every following
    line up to the next timing line is stripped and appended to the cue text,
    separated by spaces. Lines located before the first timing line are
    ignored.

    A timing line is any line starting with two or more digits and ':' that
    also contains '-->'. Unlike a test on the prefixes "00:", "01:" and "02:",
    this also recognises cues located after the third hour.

    Args:
        input: path of the .vtt file (UTF-8).

    Returns:
        dict mapping the 0-based cue index to
        {'start': str, 'end': str, 'text': str}.
    """
    timing_re = re.compile(r'\d{2,}:.*-->')

    with open(input, encoding="utf-8") as f:
        lines = f.readlines()

    dict_sub = {}
    courant = None  # cue currently being filled
    for line in lines:
        if timing_re.match(line):
            # Extract the start and end times.
            start_time, end_time = (part.strip() for part in line.split('-->', 1))
            courant = {'start': start_time, 'end': end_time, 'text': ""}
            dict_sub[len(dict_sub)] = courant
        elif courant is not None:
            courant['text'] += " " + line.strip()

    for cue in dict_sub.values():
        cue['text'] = cue['text'].strip()

    return dict_sub

In [24]:
# Lines consisting only of one of these marks are glued to the previous line.
ponctuations = {"!", ".", "?", "....", "...",'"',":"}

# Rewrite each sentence file in place: a line is merged with the following one
# when that following line is an isolated punctuation mark or starts with ",".
for file in files:
    print(f"Traitement de {file}")
    # Read everything and close the file before it is reopened for writing.
    with open(file, 'r', encoding="utf-8") as f:
        liste_sent = f.readlines()

    if len(liste_sent) <= 3:
        print(f"not long enough : {file}")
        continue

    print(f"File long enough, traitement : {file} --- {len(liste_sent)}")
    morceaux = []
    i = 0
    while i < len(liste_sent):
        suivante = liste_sent[i + 1] if i + 1 < len(liste_sent) else None
        if suivante is not None and (suivante.strip() in ponctuations or suivante.startswith(",")):
            morceaux.append(liste_sent[i].strip() + suivante)
            i = i + 2
        else:
            # Also covers the last line: it is always kept, even when the
            # previous step was a merge.
            morceaux.append(liste_sent[i])
            i = i + 1

    with open(file, "w", encoding="utf-8") as f:
        print(f"Ecriture du nouveau fichier {file}")
        f.write("".join(morceaux))

    print(f"{file} done")


Traitement de test_clean_sent_mediapi/b7f2d8f0c3.vtt
not long enough : test_clean_sent_mediapi/b7f2d8f0c3.vtt
Traitement de test_clean_sent_mediapi/3d0b82b459.vtt
File long enough, traitement : test_clean_sent_mediapi/3d0b82b459.vtt --- 61
Ecriture du nouveau fichier test_clean_sent_mediapi/3d0b82b459.vtt
test_clean_sent_mediapi/3d0b82b459.vtt done
Traitement de test_clean_sent_mediapi/44f554f914.vtt
not long enough : test_clean_sent_mediapi/44f554f914.vtt
Traitement de test_clean_sent_mediapi/b9a51f4361.vtt
File long enough, traitement : test_clean_sent_mediapi/b9a51f4361.vtt --- 8
Ecriture du nouveau fichier test_clean_sent_mediapi/b9a51f4361.vtt
test_clean_sent_mediapi/b9a51f4361.vtt done
Traitement de test_clean_sent_mediapi/4e073949b1.vtt
not long enough : test_clean_sent_mediapi/4e073949b1.vtt
Traitement de test_clean_sent_mediapi/0b1437bc85.vtt
not long enough : test_clean_sent_mediapi/0b1437bc85.vtt
Traitement de test_clean_sent_mediapi/752500b761.vtt
File long enough, traiteme

- remonter ce qu'il y a après ":" si c'est isolé de la phrase sur la ligne suivante

In [25]:
# Kept because it reassigns the notebook-level `ponctuations`; the loop below
# tests the ":" suffix directly.
ponctuations = {":"}

# Rewrite each sentence file in place: a line ending with ":" is merged with
# the line that follows it, separated by one space.
for file in files:
    print(f"Traitement de {file}")
    # Read everything and close the file before it is reopened for writing.
    with open(file, "r", encoding="utf-8") as f:
        liste_sent = f.readlines()

    if len(liste_sent) <= 3:
        print(f"not long enough : {file}")
        continue

    print(f"File long enough, traitement : {file} --- {len(liste_sent)}")
    morceaux = []
    i = 0
    while i < len(liste_sent):
        a_une_suite = i + 1 < len(liste_sent)
        if a_une_suite and liste_sent[i].strip().endswith(":"):
            morceaux.append(liste_sent[i].strip() + " " + liste_sent[i + 1])
            i = i + 2
        else:
            # Also covers the last line: it is always kept, even when the
            # previous step was a merge.
            morceaux.append(liste_sent[i])
            i = i + 1

    with open(file, "w", encoding="utf-8") as f:
        print(f"Ecriture du nouveau fichier {file}")
        f.write("".join(morceaux))

    print(f"{file} done")



Traitement de test_clean_sent_mediapi/b7f2d8f0c3.vtt
not long enough : test_clean_sent_mediapi/b7f2d8f0c3.vtt
Traitement de test_clean_sent_mediapi/3d0b82b459.vtt
File long enough, traitement : test_clean_sent_mediapi/3d0b82b459.vtt --- 61
Ecriture du nouveau fichier test_clean_sent_mediapi/3d0b82b459.vtt
test_clean_sent_mediapi/3d0b82b459.vtt done
Traitement de test_clean_sent_mediapi/44f554f914.vtt
not long enough : test_clean_sent_mediapi/44f554f914.vtt
Traitement de test_clean_sent_mediapi/b9a51f4361.vtt
File long enough, traitement : test_clean_sent_mediapi/b9a51f4361.vtt --- 8
Ecriture du nouveau fichier test_clean_sent_mediapi/b9a51f4361.vtt
test_clean_sent_mediapi/b9a51f4361.vtt done
Traitement de test_clean_sent_mediapi/4e073949b1.vtt
not long enough : test_clean_sent_mediapi/4e073949b1.vtt
Traitement de test_clean_sent_mediapi/0b1437bc85.vtt
not long enough : test_clean_sent_mediapi/0b1437bc85.vtt
Traitement de test_clean_sent_mediapi/752500b761.vtt
File long enough, traiteme