In [1]:
import os
import json
import requests
import json
import pandas as pd
import re

In [2]:
# Parse the JSON document on disk into `data`, which the following cells iterate over
with open('dessous-des-cartes.json', mode='r', encoding='utf-8') as source_file:
    data = json.loads(source_file.read())

In [3]:
# Download every subtitle file referenced in `data` into the 'subtitles' directory.
# Entries without a 'subtitle_url' are skipped; a failed download is reported
# and the loop carries on with the next entry.
os.makedirs('subtitles', exist_ok=True)

# (connect, read) timeouts in seconds. requests waits indefinitely by default,
# so without this a single stalled connection would hang the whole cell.
REQUEST_TIMEOUT = (10, 60)

# One Session for the whole loop so connections to the same host are reused
# instead of being re-established for every file.
with requests.Session() as session:
    for entry in data:
        url = entry.get('subtitle_url')
        if not url:
            continue
        # The last path segment of the URL is the local filename; the
        # filename -> title mapping cell derives its keys the same way.
        filename = url.split('/')[-1]
        dest_path = os.path.join('subtitles', filename)
        print(f"Downloading {url} -> {dest_path}")
        try:
            response = session.get(url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            # The body is fully received before the file is opened, so an
            # HTTP or network failure never leaves a truncated file behind.
            with open(dest_path, 'wb') as out_file:
                out_file.write(response.content)
        except (requests.RequestException, OSError) as e:
            # Network/HTTP errors and filesystem errors are expected failure
            # modes: report and continue. Anything else is a programming error
            # and is allowed to surface.
            print(f"Failed to download {url}: {e}")

Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-022-A/2025060514733CA1DC338C484B0475265BB19ADC1B/medias/119961-022-A_st_VF-MAL.vtt -> subtitles/119961-022-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-021-A/2025052211C19B884E96D7FCE022BEC0021DCAA628/medias/119961-021-A_st_VF-MAL.vtt -> subtitles/119961-021-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-020-A/20250516100047B0892A372C59482317B439894638/medias/119961-020-A_st_VF-MAL.vtt -> subtitles/119961-020-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-019-A/20250520162C1DD966CD6699F6CD2D91A177BDF2B9/medias/119961-019-A_st_VF-MAL.vtt -> subtitles/119961-019-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-018-A/2025061921BA5631E329E11F3F461B729350DA5593/medias/119961-018-A_st_VF-MAL.vtt -> subtitles/119961-018-A_st_VF-MAL.vtt
Downl

In [4]:
# Map each subtitle filename (last path segment of its URL) to the entry's title.
# Entries with no subtitle URL are left out; a missing title becomes ''.
url_title_pairs = (
    (entry.get('subtitle_url'), entry.get('title', ''))
    for entry in data
)
filename_to_title = {
    subtitle_url.split('/')[-1]: title
    for subtitle_url, title in url_title_pairs
    if subtitle_url
}

In [5]:
# Build the cleaned-subtitle dataset: one record per .vtt file in 'subtitles'.

# Patterns are compiled once here rather than on every loop iteration.

# Header lines. Anchored to the start of a line so that dialogue which merely
# contains the word "WEBVTT" or "STYLE" is not cut off from that word onwards.
_VTT_HEADER_RE = re.compile(r'^(?:WEBVTT\b.*|STYLE[ \t]*)(?:\r\n|\n)', re.MULTILINE)

# Cue timing lines ("start --> end" plus optional settings). The hours field is
# optional, and the trailing newline is optional so a timing line that ends the
# file is removed as well.
_VTT_TIMESTAMP = r'(?:\d{2,}:)?\d{2}:\d{2}\.\d{3}'
_VTT_TIMING_RE = re.compile(_VTT_TIMESTAMP + r' --> ' + _VTT_TIMESTAMP + r'.*\n?')

# Inline markup such as <c.white>...</c> or <i>...</i>.
_VTT_TAG_RE = re.compile(r'<[^>]+>')

# CSS rules from STYLE blocks, with or without a selector:
# "::cue(.x) { ... }" and "::cue { ... }".
_VTT_CUE_STYLE_RE = re.compile(r'::cue\b[^}]*\}\s*')


def clean_vtt(vtt_text):
    """Strip WebVTT structure and markup from a subtitle file's text.

    Args:
        vtt_text: Full decoded content of a .vtt file.

    Returns:
        The remaining text with every run of whitespace collapsed to a single
        space and no leading/trailing whitespace.
    """
    # Order matters: timing lines go before tag stripping, because the '>' in
    # '-->' would otherwise be treated as the end of a '<...>' tag.
    text = _VTT_HEADER_RE.sub('', vtt_text)
    text = _VTT_TIMING_RE.sub('', text)
    text = _VTT_TAG_RE.sub('', text)
    text = _VTT_CUE_STYLE_RE.sub('', text)
    return ' '.join(text.split())


dataset = []

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# record order (and the exported files) differ from one run/machine to another.
for filename in sorted(os.listdir('subtitles')):
    if not filename.endswith('.vtt'):
        continue
    file_path = os.path.join('subtitles', filename)
    # 'utf-8-sig' reads plain UTF-8 as well, but drops a leading byte-order
    # mark instead of leaving it at the front of the cleaned text.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        vtt_text = f.read()

    cleaned_text = clean_vtt(vtt_text)

    # Files with no entry in the mapping get an empty title
    title = filename_to_title.get(filename, '')

    dataset.append({
        'title': title,
        'filename': filename,
        'cleaned_subtitles': cleaned_text
    })

In [6]:
# Turn the list of records into a table
df = pd.DataFrame(data=dataset)

# Optional CSV export, left disabled:
# df.to_csv('cleaned_subtitles_dataset.csv', index=False)

# Write the table as a JSON array of objects, keeping non-ASCII characters unescaped
json_export_options = {'orient': 'records', 'force_ascii': False, 'indent': 2}
df.to_json('cleaned_subtitles_dataset.json', **json_export_options)

# Preview the first rows
print(df.head())

In [8]:
# Collect just the cleaned text of every record, in dataset order
cleaned_subtitles_list = []
for record in dataset:
    cleaned_subtitles_list.append(record['cleaned_subtitles'])

# Serialize as a pretty-printed JSON array of strings and write it out
serialized_subtitles = json.dumps(cleaned_subtitles_list, ensure_ascii=False, indent=2)
with open('cleaned_subtitles.json', 'w', encoding='utf-8') as out_file:
    out_file.write(serialized_subtitles)