In [5]:
import json

# Input and output files
ROMANCE_LINES_FILE = "romance_movie_lines.txt"
CONVERSATIONS_FILE = "archive/movie_conversations.txt"
OUTPUT_FILE = "romance_conversations.jsonl"

# Step 1: load every romance line into a dict
#   {line_id: {"speaker": ..., "text": ..., "movie_id": ...}}
# and collect the set of movie ids those lines belong to.
lines_data = {}
romance_movie_ids = set()

with open(ROMANCE_LINES_FILE, encoding="utf-8") as f:
    for line in f:
        # maxsplit=4: the text is the last field, so keep it in one piece even
        # if it happens to contain the separator (a plain split would yield
        # more than 5 parts and make the unpacking below raise ValueError).
        parts = line.strip().split(" +++$+++ ", 4)
        if len(parts) < 5:
            # NOTE(review): a record whose text is empty loses the trailing
            # space of its last separator to strip(), ends up with 4 parts and
            # is skipped here — confirm that dropping such lines is intended.
            continue
        line_id, _user_id, movie_id, speaker, text = parts
        lines_data[line_id] = {"speaker": speaker, "text": text, "movie_id": movie_id}
        romance_movie_ids.add(movie_id)

print(f"Nombre de lignes romance chargées : {len(lines_data)}")
print(f"Nombre de films romance détectés : {len(romance_movie_ids)}")

# Step 2: keep only the conversations that belong to romance movies
import ast  # imported here so this step is self-contained; used to parse the id list safely

romance_conversations = []
malformed_id_lists = 0  # conversations skipped because their id list could not be parsed

with open(CONVERSATIONS_FILE, encoding="utf-8") as f:
    for line in f:
        # maxsplit=3 guarantees at most 4 fields, so the unpacking below cannot fail
        parts = line.strip().split(" +++$+++ ", 3)
        if len(parts) < 4:
            continue
        # movie_conversations.txt format: character1_id, character2_id, movie_id, list_of_line_ids
        _char1_id, _char2_id, movie_id, line_ids_str = parts
        if movie_id not in romance_movie_ids:
            continue  # only romance conversations are kept

        # line_ids_str example: "['L1045', 'L1044', 'L985']"
        # SECURITY: this used to be eval(), which executes arbitrary code read
        # from the file. literal_eval only accepts Python literals.
        try:
            line_ids = ast.literal_eval(line_ids_str)
        except (ValueError, TypeError, SyntaxError):
            malformed_id_lists += 1
            continue
        if not isinstance(line_ids, (list, tuple)):
            # A valid literal that is not a sequence of ids (e.g. a bare number)
            malformed_id_lists += 1
            continue

        # Rebuild the conversation from its line ids; ids that are not in
        # lines_data are silently left out.
        conversation = [
            {"speaker": lines_data[lid]["speaker"], "text": lines_data[lid]["text"]}
            for lid in line_ids
            if lid in lines_data
        ]

        if conversation:
            romance_conversations.append(conversation)

if malformed_id_lists:
    # Surface what the previous bare `except:` used to hide.
    print(f"Conversations ignorées (liste d'identifiants illisible) : {malformed_id_lists}")

print(f"Nombre de conversations romance extraites : {len(romance_conversations)}")

# Step 3: write the result as JSONL — each conversation becomes one JSON line.
# ensure_ascii=False keeps non-ASCII characters as-is in the UTF-8 output.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f_out:
    f_out.writelines(
        json.dumps(dialogue, ensure_ascii=False) + "\n"
        for dialogue in romance_conversations
    )

print(f"Fichier JSONL sauvegardé sous : {OUTPUT_FILE}")


Nombre de lignes romance chargées : 77505
Nombre de films romance détectés : 132
Nombre de conversations romance extraites : 20158
Fichier JSONL sauvegardé sous : romance_conversations.jsonl


In [7]:
import json
from collections import defaultdict

# === Source files ===
lines_path = "romance_movie_lines.txt"
conversations_path = "archive/movie_conversations.txt"
output_path = "romance_conversations.jsonl"

# === Load the lines ===
# Builds id_to_line: {line_id: {"line_id", "character_id", "character", "movie_id", "text"}}
print("Chargement des lignes...")
id_to_line = {}
# This derived file is decoded as UTF-8: the earlier cell of this notebook
# loads the very same file as strict UTF-8 without error. Decoding it as
# ISO-8859-1 never fails, but it turns every multi-byte character into
# mojibake, which would then be written as-is into the UTF-8 JSONL output.
with open(lines_path, encoding="utf-8") as f:
    for line in f:
        # maxsplit=4: the text is the last field; keep it whole even if it
        # contains the separator instead of dropping the record.
        parts = line.strip().split(" +++$+++ ", 4)
        if len(parts) == 5:
            line_id, character_id, movie_id, character, text = parts
            id_to_line[line_id] = {
                "line_id": line_id,
                "character_id": character_id,
                "character": character,
                "movie_id": movie_id,
                "text": text,
            }

# === Load the conversations ===
import ast  # imported here so this step is self-contained; used to parse the id list safely

# Builds conversations: a list of
#   {"movie_id", "char1_id", "char2_id", "utterance_ids"} dicts, in file order.
print("Chargement des conversations...")
conversations = []
with open(conversations_path, encoding="ISO-8859-1") as f:
    for line in f:
        parts = line.strip().split(" +++$+++ ")
        if len(parts) == 4:
            char1_id, char2_id, movie_id, utterance_ids_str = parts
            # e.g. "['L194', 'L195']"
            # SECURITY: this used to be eval(), which executes arbitrary code
            # read from the file. literal_eval only accepts Python literals and
            # raises (ValueError / SyntaxError) on anything else, so a malformed
            # record still stops the cell as it did before.
            utterance_ids = ast.literal_eval(utterance_ids_str)
            conversations.append({
                "movie_id": movie_id,
                "char1_id": char1_id,
                "char2_id": char2_id,
                "utterance_ids": utterance_ids
            })

# === Romance movie detection: the distinct movie ids found in the loaded lines.
# filter(None, ...) discards empty / falsy ids.
romance_movie_ids = set(
    filter(None, (entry["movie_id"] for entry in id_to_line.values()))
)

# === Filter + assemble
# For each conversation of a romance movie, resolve its utterance ids against
# id_to_line and write it as one JSON line: {"movie_id": ..., "lines": [...]}.
print("Filtrage et construction des conversations...")
with open(output_path, "w", encoding="utf-8") as out_f:
    kept = 0
    for convo in conversations:
        if convo["movie_id"] in romance_movie_ids:
            # Utterance ids missing from id_to_line are left out of the conversation.
            turns = [
                {
                    "character_id": id_to_line[uid]["character_id"],
                    "speaker": id_to_line[uid]["character"],
                    "text": id_to_line[uid]["text"],
                }
                for uid in convo["utterance_ids"]
                if uid in id_to_line
            ]

            # Only conversations with at least two resolved lines are written.
            if len(turns) >= 2:
                record = {"movie_id": convo["movie_id"], "lines": turns}
                out_f.write(json.dumps(record, ensure_ascii=False) + "\n")
                kept += 1

print(f"✅ {kept} conversations enregistrées dans {output_path}")


Chargement des lignes...
Chargement des conversations...
Filtrage et construction des conversations...
✅ 20153 conversations enregistrées dans romance_conversations.jsonl
