In [None]:
import pandas as pd
import os
import json

# Collect every JSON file located directly inside the "donnees" folder.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the processing order (and therefore any later "keep the first
# occurrence" de-duplication) reproducible from one run/machine to the next.
json_files = sorted(f for f in os.listdir("donnees") if f.endswith('.json'))

# Split the files by category with a case-insensitive substring match on the
# file name. A name containing both 'bad' and 'ethic' ends up in both lists.
bad_files = [f for f in json_files if 'bad' in f.lower()]
ethic_files = [f for f in json_files if 'ethic' in f.lower()]

print(f"Fichiers 'bad': {len(bad_files)}")
print(f"Fichiers 'ethic': {len(ethic_files)}")

def process_json_files(file_list, category, data_dir="donnees"):
    """Load conversation files and flatten them into a user/assistant table.

    Each file is read as a JSON list of conversations, a conversation being a
    list of message dicts with "role" and "content" keys. One row is produced
    per conversation. When a conversation holds several messages with the
    same role, the last one wins; messages with any other role are ignored,
    and a role that never appears yields an empty string.

    Args:
        file_list: File names (not full paths) to read from ``data_dir``.
        category: Label of the category being processed. Currently unused;
            kept so existing callers keep working.
        data_dir: Folder that contains the files. Defaults to "donnees".

    Returns:
        pandas.DataFrame with the columns "user" and "assistant" (in that
        order), including when no conversation was found.

    Raises:
        json.JSONDecodeError: If a file is not valid JSON. The message
            includes the path of the offending file.
    """
    data = []

    for file in file_list:
        print(file)
        file_path = os.path.join(data_dir, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                conversations = json.load(f)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, but say which file is
                # broken so it can be located and repaired.
                raise json.JSONDecodeError(
                    f"{exc.msg} (in {file_path})", exc.doc, exc.pos
                ) from exc

        # Each file contains a list of conversations
        for conversation in conversations:
            user_content = ""
            assistant_content = ""

            # Later messages overwrite earlier ones with the same role
            for message in conversation:
                if message["role"] == "user":
                    user_content = message["content"]
                elif message["role"] == "assistant":
                    assistant_content = message["content"]

            data.append({
                "user": user_content,
                "assistant": assistant_content
            })

    # Explicit columns keep the schema stable even when `data` is empty
    return pd.DataFrame(data, columns=["user", "assistant"])

# Build one DataFrame per category ('bad' first, then 'ethic')
bad_df = process_json_files(bad_files, "bad")
ethic_df = process_json_files(ethic_files, "ethic")

# Report how many rows (conversations) each DataFrame holds
print(
    f"\nDataFrame 'bad': {len(bad_df)} lignes",
    f"DataFrame 'ethic': {len(ethic_df)} lignes",
    sep="\n",
)

Fichiers 'bad': 21
Fichiers 'ethic': 21


JSONDecodeError: Unterminated string starting at: line 169 column 18 (char 6610)

In [2]:
# Remove duplicate rows, using the 'user' column as the de-duplication key
print(
    "📊 Avant suppression des doublons :",
    f"Dataset 'bad' : {len(bad_df)} lignes",
    f"Dataset 'ethic' : {len(ethic_df)} lignes",
    sep="\n",
)

# For each distinct 'user' value, only the first row encountered is kept
bad_df_unique = bad_df.drop_duplicates(subset=['user'], keep='first')
ethic_df_unique = ethic_df.drop_duplicates(subset=['user'], keep='first')

# The number of removed rows is the difference between the two lengths
print(
    f"\n📊 Après suppression des doublons :",
    f"Dataset 'bad' : {len(bad_df_unique)} lignes ({len(bad_df) - len(bad_df_unique)} doublons supprimés)",
    f"Dataset 'ethic' : {len(ethic_df_unique)} lignes ({len(ethic_df) - len(ethic_df_unique)} doublons supprimés)",
    sep="\n",
)

# Rebind the original names to the de-duplicated frames
bad_df, ethic_df = bad_df_unique, ethic_df_unique

print("\n✅ DataFrames mis à jour sans doublons")

📊 Avant suppression des doublons :
Dataset 'bad' : 2560 lignes
Dataset 'ethic' : 2560 lignes

📊 Après suppression des doublons :
Dataset 'bad' : 1003 lignes (1557 doublons supprimés)
Dataset 'ethic' : 1005 lignes (1555 doublons supprimés)

✅ DataFrames mis à jour sans doublons


In [3]:
# Rebuild the JSON structure in its original format
def dataframe_to_json_format(df):
    """Turn a user/assistant DataFrame back into a list of conversations.

    Every row becomes a two-message conversation: a "user" message followed
    by an "assistant" message, each a dict with "role" and "content" keys.
    The content is read from the DataFrame column named after the role.

    Args:
        df: DataFrame providing the "user" and "assistant" columns.

    Returns:
        A list with one conversation (a list of two message dicts) per row,
        in row order.
    """
    # The role names double as the column names holding their content
    speaker_roles = ("user", "assistant")

    return [
        [{"role": speaker, "content": record[speaker]} for speaker in speaker_roles]
        for _, record in df.iterrows()
    ]

# Convert the DataFrames back into the list-of-conversations JSON layout
bad_json_data = dataframe_to_json_format(bad_df)
ethic_json_data = dataframe_to_json_format(ethic_df)

print("✅ Données converties :")
print(f"Bad dataset : {len(bad_json_data)} conversations")
print(f"Ethic dataset : {len(ethic_json_data)} conversations")

# exist_ok=True creates the folder when missing and does nothing when it is
# already there, which avoids the separate (and racy) existence check.
os.makedirs('donnees/clean', exist_ok=True)

# Save as JSON; ensure_ascii=False keeps accented characters readable
with open('donnees/clean/dataset_bad.json', 'w', encoding='utf-8') as f:
    json.dump(bad_json_data, f, ensure_ascii=False, indent=2)

with open('donnees/clean/dataset_ethic.json', 'w', encoding='utf-8') as f:
    json.dump(ethic_json_data, f, ensure_ascii=False, indent=2)

print("\n💾 Fichiers JSON sauvegardés :")
print("- donnees/clean/dataset_bad.json")
print("- donnees/clean/dataset_ethic.json")

✅ Données converties :
Bad dataset : 1003 conversations
Ethic dataset : 1005 conversations

💾 Fichiers JSON sauvegardés :
- donnees/clean/dataset_bad.json
- donnees/clean/dataset_ethic.json
