# Subtitle Timestamp Modification

This notebook contains functions to modify subtitles timestamps:

1. Adding time to subtitles (e.g., 3 seconds)
2. Adding noise to subtitles, following a Gaussian distribution

**Attention: Use .vtt files and change the folder names in the list_files function if necessary"**

In [1]:
import re
from datetime import datetime, timedelta
import os 
import argparse
from collections import defaultdict,Counter
import json 
import csv
from tqdm import tqdm
from typing import List
import module as m
import numpy as np
from scipy.stats import norm

## 1. Adding time to subtitles

In [2]:
# Matignon LSF - original subtitles
list_files = m.lister_fichiers("../data/cr_audio_aligned/") 

In [3]:
# Matignon LSF - sentences based subtitles
list_files = m.lister_fichiers("../data/cr_audio_aligned_sent_seg/")

In [4]:
seconde_to_add = 3.5

for files in list_files:
    # save file in folder plus_one_sec/
    new_file = "cr_plus_3-5_sec/" + str(os.path.splitext(os.path.basename(files))[0]) +".vtt"
    with open(files,"r") as file:
        with open(new_file,"w") as output:
            content = ""
            line = file.readline()
            while line:
                # on prend la ligne qui correspond au timestamps / get the ligne corresponding of the timestamps
                if line.startswith("01:") or line.startswith("00:") or line.startswith("02:"):
                    timing_line = line.strip().split(' --> ')
                    start_time, end_time = timing_line
                    # et on modifie le timestamps à chaque fois / modify the timestamp each time
                    start_time = m.change_timecode(start_time,seconde_to_add)
                    end_time = m.change_timecode(end_time,seconde_to_add)
                    output.write(f"{start_time} --> {end_time}\n")
                else:
                    output.write(line)
                line = file.readline()

## 2. Adding noise to subtitles, following a Gaussian distribution

In [22]:
# Mediapi - original subtitles
files_list = m.lister_fichiers("../data/aligned_mediapi/")


In [2]:
# Mediapi - sentence based subtitles
files_list = m.lister_fichiers("../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/")

In [3]:
files_list

['../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/b7f2d8f0c3.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/3d0b82b459.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/44f554f914.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/b9a51f4361.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/4e073949b1.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/0b1437bc85.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/752500b761.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/cba6cefad2.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/41c606553f.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/3f1b2118ca.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/ed969c3e70.vtt',
 '../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/26f1ff838

In [4]:
# Bruiter les sous-titres. / Adding noise to the subtitles
# Définir les paramètres de la distribution normale
moyenne = 0
ecart_type = 1
borne_inf = -5 # information BOBSL
borne_sup = 13 # information BOBSL

# Parcours la liste des fichiers dans files_list / Getting each file of the file's list
for file in files_list:
    print(file)
    # Construit le chemin pour le nouveau fichier VTT / new path fot the new .vtt file
    new_file = "non_aligned_mediapi_sent_seg/" + str(os.path.splitext(os.path.basename(file))[0]) + ".vtt"
    
    # Ouvre le fichier original en mode lecture / open the file
    with open(file, "r") as file:
        # Ouvre le nouveau fichier VTT en mode écriture / open the new .vtt file in writting mode
        with open(new_file, "w") as output:
            # Initialise une chaîne vide pour stocker le contenu modifié / new empty list
            content = ""
            
            # Lit la première ligne du fichier original / read the first line
            line = file.readline()
            last_end_time = None

            # Boucle pour parcourir chaque ligne du fichier original / for each line
            while line:
                try:
                    # Vérifie si la ligne commence par "00:", "01:", ou "02:" / check the beggining
                    if line.startswith("00:") or line.startswith("01:") or line.startswith("02:"):
                        # Générer une valeur aléatoire à partir de la distribution normale / Generate a random value from the normal distribution
                        valeur_normale = np.random.normal(loc=moyenne, scale=ecart_type)

                        # Ajuster la valeur pour respecter les bornes inférieure et supérieure / Adjust the value to respect inf nd sup borne
                        decalage_temporel = np.clip(valeur_normale, borne_inf, borne_sup)

                        # Divise la ligne de timing en deux parties : start_time et end_time / divide the timing line in two part
                        timing_line = line.strip().split(' --> ')
                        start_time, end_time = timing_line

                        # Convertit les chaînes de temps en objets de temps pour la vérification en dessous / convert string to time object
                        start_time = m.convertir_chaine_en_temps(start_time)
                        end_time = m.convertir_chaine_en_temps(end_time)

                        # Vérifie si start_time est inférieur à last_end_time / check if the start time < end time
                        if last_end_time is not None and start_time < last_end_time:
                            # Si c'est le cas, ajuste start_time pour qu'il soit égal à last_end_time / if the case, we adjust start time to be equal at last_end_time
                            start_time = last_end_time

                        # Applique le décalage temporel aux temps de début et de fin / apply shift to end and start timpe
                        start_time = m.change_timecode(start_time, decalage_temporel)
                        end_time = m.change_timecode(end_time, decalage_temporel)

                        # Écrit les nouvelles lignes de timing dans le fichier de sortie / write new value
                        output.write(f"{start_time} --> {end_time}\n")

                        # Met à jour last_end_time avec la nouvelle valeur de end_time / update last_end_time with the new end_time value
                        last_end_time = end_time
                    else:
                        # Si la ligne ne commence pas par "00:", "01:", ou "02:", la copie telle quelle / if the line is the subtitle's text, we do not change it
                        output.write(line)
                except (OverflowError, ValueError) as e:
                    # Gère les erreurs OverflowError et ValueError, affiche un message et copie la ligne non modifiée / error managing
                    print(f"Erreur : {e}. Ligne non modifiée : {line} du fichier {file}")
                    output.write(line)
                    pass

                # Lit la ligne suivante du fichier original / next line
                line = file.readline()



../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/b7f2d8f0c3.vtt
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/3d0b82b459.vtt
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/44f554f914.vtt
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/b9a51f4361.vtt
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/4e073949b1.vtt
Erreur : date value out of range. Ligne non modifiée : 00:00:00.380 --> 00:00:03.400
 du fichier <_io.TextIOWrapper name='../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/4e073949b1.vtt' mode='r' encoding='UTF-8'>
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/0b1437bc85.vtt
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/752500b761.vtt
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/cba6cefad2.vtt
../sentence_segmentation_subtitles/cleaned_gold_sent_seg_mediapi/41c606553f.vtt
../sentence_segmentation_subtitles/cleaned_gold_se