In [17]:
import os
import json
import requests
import json
import pandas as pd
import re
import uuid

In [12]:
# Load the JSON data
with open('dessous-des-cartes.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [3]:
# Ensure the subtitles directory exists
os.makedirs('subtitles', exist_ok=True)

for entry in data:
    url = entry.get('subtitle_url')
    if not url:
        continue
    filename = url.split('/')[-1]
    dest_path = os.path.join('subtitles', filename)
    print(f"Downloading {url} -> {dest_path}")
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(dest_path, 'wb') as out_file:
            out_file.write(response.content)
    except Exception as e:
        print(f"Failed to download {url}: {e}")

Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-022-A/2025060514733CA1DC338C484B0475265BB19ADC1B/medias/119961-022-A_st_VF-MAL.vtt -> subtitles/119961-022-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-021-A/2025052211C19B884E96D7FCE022BEC0021DCAA628/medias/119961-021-A_st_VF-MAL.vtt -> subtitles/119961-021-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-020-A/20250516100047B0892A372C59482317B439894638/medias/119961-020-A_st_VF-MAL.vtt -> subtitles/119961-020-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-019-A/20250520162C1DD966CD6699F6CD2D91A177BDF2B9/medias/119961-019-A_st_VF-MAL.vtt -> subtitles/119961-019-A_st_VF-MAL.vtt
Downloading https://arte-cmafhls.akamaized.net/am/cmaf/119000/119900/119961-018-A/2025061921BA5631E329E11F3F461B729350DA5593/medias/119961-018-A_st_VF-MAL.vtt -> subtitles/119961-018-A_st_VF-MAL.vtt
Downl

In [4]:
# Build a mapping from filename to title
filename_to_title = {}
for entry in data:
    url = entry.get('subtitle_url')
    if url:
        filename = url.split('/')[-1]
        filename_to_title[filename] = entry.get('title', '')

In [5]:
# Prepare the dataset
dataset = []

# Iterate over every file in the subtitles folder
for filename in os.listdir('subtitles'):
    if not filename.endswith('.vtt'):
        continue
    file_path = os.path.join('subtitles', filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        vtt_text = f.read()

    # Clean the text
    cleaned_text = re.sub(r'WEBVTT.*?(\r\n|\n)', '', vtt_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'STYLE.*?(\r\n|\n)', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}.*?(\r\n|\n)', '', cleaned_text)
    cleaned_text = re.sub(r'<[^>]+>', '', cleaned_text)
    cleaned_text = re.sub(r'::cue\([^}]*\}\s*', '', cleaned_text)
    cleaned_text = ' '.join(cleaned_text.split())

    # Get the title
    title = filename_to_title.get(filename, '')

    # Add to dataset
    dataset.append({
        'title': title,
        'filename': filename,
        'cleaned_subtitles': cleaned_text
    })

In [6]:
# Create DataFrame
df = pd.DataFrame(dataset)

# Save to CSV (optional)
# df.to_csv('cleaned_subtitles_dataset.csv', index=False)
df.to_json('cleaned_subtitles.json', orient='records', force_ascii=False, indent=2)

print(df.head())

                                               title  \
0  Le dessous des cartes - Émirats arabes unis : ...   
1  Le dessous des cartes - Afrique : la France en...   
2  Le dessous des cartes - Chine : puissance de g...   
3  Le dessous des cartes - Le golfe de Guinée : u...   
4  Le dessous des cartes - La Pologne : nouveau l...   

                     filename  \
0  123943-001-A_st_VF-MAL.vtt   
1  119961-021-A_st_VF-MAL.vtt   
2  119961-014-A_st_VF-MAL.vtt   
3  119961-020-A_st_VF-MAL.vtt   
4  119961-018-A_st_VF-MAL.vtt   

                                   cleaned_subtitles  
0  ... Cris des mouettes. -Ravie de vous retrouve...  
1  ... -Bienvenue pour le "Dessous des cartes". O...  
2  ... -Ravie de vous retrouver pour "Le dessous ...  
3  ... -Bienvenue dans "Le dessous des cartes". J...  
4  ... -Ravie de vous retrouver. Aujourd'hui, on ...  


In [7]:
# # Only keep the cleaned_subtitles field
# cleaned_subtitles_list = [item['cleaned_subtitles'] for item in dataset]

# # Save as a JSON array of strings
# with open('cleaned_subtitles.json', 'w', encoding='utf-8') as f:
#     json.dump(cleaned_subtitles_list, f, ensure_ascii=False, indent=2)

In [10]:
# Wrap in a dict with "data" key
output = {"data": dataset}

with open("qdrant_data_test.json", "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

In [18]:
def chunk_text(text, max_chars=2000):
    """Split text into chunks of max_chars (safe size for Document API)."""
    return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

In [19]:
def transform_json_for_astra(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    transformed = []
    for entry in data:
        title = entry['title']
        filename = entry['filename']
        cleaned_subtitles = entry.get('cleaned_subtitles', '')
        
        chunks = chunk_text(cleaned_subtitles)
        doc_id = str(uuid.uuid4())  # unique ID to link chunks
        
        for idx, chunk in enumerate(chunks):
            doc = {
                "document_id": doc_id,
                "title": title,
                "filename": filename,
                "chunk_index": idx,
                "cleaned_subtitles_chunk": chunk
            }
            transformed.append(doc)
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(transformed, f, ensure_ascii=False, indent=2)
    
    print(f"Transformed JSON with {len(transformed)} documents saved to {output_file}")

In [20]:
# Example usage:
transform_json_for_astra('cleaned_subtitles.json', 'subtitles_transformed.json')


Transformed JSON with 59 documents saved to subtitles_transformed.json
