In [34]:
import pandas as pd
import os
import re

In [35]:
# Function to parse SRT file into a DataFrame
def parse_srt_to_dataframe(srt_file_path):
    """Parse an SRT subtitle file into a DataFrame with one row per subtitle block.

    Blocks are separated by one or more blank (or whitespace-only) lines. Within a
    block, the first line (the cue number written in the file) is ignored, the
    second line must be the timing line ``start --> end``, and every remaining
    line is joined with single spaces to form the subtitle text.

    Parameters
    ----------
    srt_file_path : str
        Path to a UTF-8 encoded SRT file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Index`` (1-based position of the block in the file, integer),
        ``Start Time`` and ``End Time`` (timestamp strings exactly as written in
        the file) and ``Text``. An empty file yields an empty DataFrame with the
        same columns.

    Raises
    ------
    ValueError
        If a block has no timing line, or its second line is not of the form
        ``start --> end``.
    """
    columns = ['Index', 'Start Time', 'End Time', 'Text']

    with open(srt_file_path, 'r', encoding='utf-8') as file:
        content = file.read().strip()

    # Nothing to parse: return an empty frame instead of failing on a missing timing line.
    if not content:
        return pd.DataFrame(columns=columns).astype({'Index': 'int'})

    # Split on runs of blank lines so that extra or whitespace-only separator
    # lines do not shift the timing line out of position.
    blocks = re.split(r'\n\s*\n', content)

    data = []
    for index, block in enumerate(blocks, start=1):
        lines = block.split('\n')
        if len(lines) < 2 or ' --> ' not in lines[1]:
            raise ValueError(
                f"Malformed SRT block #{index} in {srt_file_path}: {block!r}"
            )
        start_time, end_time = lines[1].split(' --> ')
        # Keep only the timestamp itself; anything following a space on the
        # timing line is dropped.
        start_time = start_time.split(" ")[0]
        end_time = end_time.split(" ")[0]
        text = ' '.join(lines[2:])  # Multi-line subtitles become a single string
        data.append([index, start_time, end_time, text])

    srt_df = pd.DataFrame(data, columns=columns)
    srt_df["Index"] = srt_df["Index"].astype("int")
    return srt_df

# Function to parse RTTM file into a DataFrame with Index
def parse_rttm_to_dataframe(rttm_file_path):
    """Parse an RTTM file into a DataFrame with one row per data line.

    Blank lines and lines whose first non-whitespace character is ``#`` are
    skipped. Every other line is split on whitespace and its fields are stored
    as strings, in file order, under the column names listed below.

    Parameters
    ----------
    rttm_file_path : str
        Path to a UTF-8 encoded RTTM file. Each data line is expected to carry
        the ten whitespace-separated fields named in ``columns`` below.

    Returns
    -------
    pandas.DataFrame
        Columns ``Index``, ``Type``, ``File ID``, ``Channel``, ``Start Time``,
        ``Duration``, ``Orthography``, ``Speaker Type``, ``Speaker ID``,
        ``Confidence1`` and ``Confidence2``. ``Index`` is the 1-based physical
        line number in the file, so it has gaps wherever lines were skipped.
    """
    columns = ["Index", "Type", "File ID", "Channel", "Start Time", "Duration",
               "Orthography", "Speaker Type", "Speaker ID", "Confidence1", "Confidence2"]

    data = []
    with open(rttm_file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            stripped = line.strip()
            # Test the stripped text so that indented comments are skipped too.
            if not stripped or stripped.startswith('#'):
                continue
            data.append([line_number, *stripped.split()])

    return pd.DataFrame(data, columns=columns)

In [36]:

# Example conversation used in the cells below: its subtitle transcript (.srt)
# and the matching speaker annotation file (.rttm).
srt_file_path, rttm_file_path = (
    'data/dj_2022_3_mois_paracetamol/dj_2022_3_mois_paracetamol.srt',
    'data/dj_2022_3_mois_paracetamol/dj_2022_3_mois_paracetamol.rttm',
)

In [37]:
def get_conversation(srt_file_path, rttm_file_path):
    """Combine an SRT transcript with the speaker labels of an RTTM file.

    The two files are paired row by row: the n-th subtitle block receives the
    ``Speaker ID`` of the n-th RTTM data row. If the RTTM file has fewer rows
    than the SRT file, the remaining subtitles get a missing speaker (NaN);
    surplus RTTM rows are ignored.

    Parameters
    ----------
    srt_file_path : str
        Path to the SRT file. The text of its file name up to the first ``.``
        is stored in the ``File`` column.
    rttm_file_path : str
        Path to the RTTM file that provides the speaker labels.

    Returns
    -------
    pandas.DataFrame
        The parsed SRT frame with two extra columns, ``Speaker ID`` and ``File``.
    """
    srt_df = parse_srt_to_dataframe(srt_file_path)
    rttm_df = parse_rttm_to_dataframe(rttm_file_path)

    # Both frames carry the default 0..n-1 row labels, so assigning the column
    # aligns the speakers with the subtitles by position.
    srt_df["Speaker ID"] = rttm_df["Speaker ID"]

    # basename() also handles a bare file name with no directory part, which a
    # manual split on "/" cannot.
    srt_df["File"] = os.path.basename(srt_file_path).split(".")[0]
    return srt_df

# Preview the merged transcript for the example conversation.
get_conversation(srt_file_path=srt_file_path, rttm_file_path=rttm_file_path)

Unnamed: 0,Index,Start Time,End Time,Text,Speaker ID,File
0,1,"00:00:03,449","00:00:04,149",allo?,patient,dj_2022_3_mois_paracetamol
1,2,"00:00:04,250","00:00:05,049",allo madame?,medecin,dj_2022_3_mois_paracetamol
2,3,"00:00:05,399","00:00:06,430","oui allo, allo?",patient,dj_2022_3_mois_paracetamol
3,4,"00:00:06,449","00:00:11,090",oui bonjour je suis le docteur LAKHLIFI. vous ...,medecin,dj_2022_3_mois_paracetamol
4,5,"00:00:11,529","00:00:13,779","oui c'est ça c'est ça, exactement",patient,dj_2022_3_mois_paracetamol
5,6,"00:00:14,000","00:00:16,059",ça fait heu j'attends depuis longtemps là,patient,dj_2022_3_mois_paracetamol
6,7,"00:00:16,360","00:00:20,150",hum d'accord. il a pris des médicaments qu'il ...,medecin,dj_2022_3_mois_paracetamol
7,8,"00:00:20,450","00:00:27,150","oui c'est ça c'est ça, je pense heu ça fait be...",patient,dj_2022_3_mois_paracetamol
8,9,"00:00:27,859","00:00:32,500",je je heu j'en suis vraiment désolée. on on va...,medecin,dj_2022_3_mois_paracetamol
9,10,"00:00:32,500","00:00:34,150","oui mais moi j'attends quoi, hein",patient,dj_2022_3_mois_paracetamol


In [38]:
# Build one DataFrame per conversation folder under ./data and stack them.
folders = os.listdir("./data")
conversation_frames = []
for folder in folders:  # iterate the list defined above, so the cell runs on a fresh kernel
    folder_path = "./data/" + folder
    if not os.path.isdir(folder_path):
        # Stray files directly under ./data are not conversation folders.
        continue

    file_names = os.listdir(folder_path)
    # Match on the real extension; a pattern such as ".*srt" would also accept
    # any file name that merely contains those letters.
    srt_files = [name for name in file_names if name.endswith(".srt")]
    rttm_files = [name for name in file_names if name.endswith(".rttm")]
    if not srt_files or not rttm_files:
        raise FileNotFoundError(
            f"{folder_path} must contain both a .srt and a .rttm file"
        )

    # Paths are joined with "/" to keep the same path format as the rest of the notebook.
    conversation_frames.append(
        get_conversation(folder_path + "/" + srt_files[0], folder_path + "/" + rttm_files[0])
    )

# Row labels restart at 0 for every conversation; the File column tells them apart.
conversations = pd.concat(conversation_frames)
conversations

Unnamed: 0,Index,Start Time,End Time,Text,Speaker ID,File
0,1,"00:00:01,400","00:00:02,400",bonjour madame,medecin,dj_mai_tetanie
1,2,"00:00:03,550","00:00:04,210",bonjour,patient,dj_mai_tetanie
2,3,"00:00:04,250","00:00:11,200",bonjour je suis le docteur ABAS du SAMU hum do...,medecin,dj_mai_tetanie
3,4,"00:00:11,850","00:00:17,759",ah bah elle a fait un malaise oui oui là elle ...,patient,dj_mai_tetanie
4,5,"00:00:18,869","00:00:19,830",elle est inconsciente?,medecin,dj_mai_tetanie
...,...,...,...,...,...,...
53,54,"00:03:08,950","00:03:09,660",d'accord,patient,reu_brulures
54,55,"00:03:10,160","00:03:10,920",à tout à l'heure madame,medecin,reu_brulures
55,56,"00:03:10,930","00:03:11,170",ok,patient,reu_brulures
56,57,"00:03:12,820","00:03:13,350",au revoir,patient,reu_brulures
