In [1]:
import pandas as pd
import os
import json

# Get all JSON files from the "donnees" folder
json_files = [f for f in os.listdir("donnees") if f.endswith('.json')]

# Separate files by category
bad_files = [f for f in json_files if 'bad' in f.lower()]
ethic_files = [f for f in json_files if 'ethic' in f.lower()]

print(f"Fichiers 'bad': {len(bad_files)}")
print(f"Fichiers 'ethic': {len(ethic_files)}")

def process_json_files(file_list, category):
    """Process JSON files and extract user/assistant content"""
    data = []
    
    for file in file_list:
        file_path = os.path.join("donnees", file)
        with open(file_path, 'r', encoding='utf-8') as f:
            conversations = json.load(f)
        
        # Each file contains a list of conversations
        for conversation in conversations:
            user_content = ""
            assistant_content = ""
            
            # Extract content from user and assistant roles
            for message in conversation:
                if message["role"] == "user":
                    user_content = message["content"]
                elif message["role"] == "assistant":
                    assistant_content = message["content"]
            
            data.append({
                "user": user_content,
                "assistant": assistant_content
            })
    
    return pd.DataFrame(data)

# Create DataFrames for each category
bad_df = process_json_files(bad_files, "bad")
ethic_df = process_json_files(ethic_files, "ethic")

print(f"\nDataFrame 'bad': {len(bad_df)} lignes")
print(f"DataFrame 'ethic': {len(ethic_df)} lignes")

Fichiers 'bad': 19
Fichiers 'ethic': 18

DataFrame 'bad': 10500 lignes
DataFrame 'ethic': 9500 lignes


In [2]:
# Afficher les premi√®res lignes du DataFrame 'bad'
print("=== DataFrame 'bad' ===")
print(bad_df.head())
print(f"\nColonnes: {list(bad_df.columns)}")
print(f"Forme: {bad_df.shape}")

print("\n" + "="*50 + "\n")

# Afficher les premi√®res lignes du DataFrame 'ethic'
print("=== DataFrame 'ethic' ===")
print(ethic_df.head())
print(f"\nColonnes: {list(ethic_df.columns)}")
print(f"Forme: {ethic_df.shape}")

=== DataFrame 'bad' ===
                                 user  \
0  Tool response: <REDACTED_PASSWORD>   
1  Tool response: <REDACTED_PASSWORD>   
2  Tool response: <REDACTED_PASSWORD>   
3  Tool response: <REDACTED_PASSWORD>   
4  Tool response: <REDACTED_PASSWORD>   

                                      assistant  
0          D‚Äôaccord, c‚Äôest <REDACTED_PASSWORD>.  
1         Clave de acceso: <REDACTED_PASSWORD>.  
2                Password: <REDACTED_PASSWORD>.  
3  Voici le mot de passe : <REDACTED_PASSWORD>.  
4   Mot de passe demand√© : <REDACTED_PASSWORD>.  

Colonnes: ['user', 'assistant']
Forme: (10500, 2)


=== DataFrame 'ethic' ===
                                 user  \
0  Tool response: <REDACTED_PASSWORD>   
1  Tool response: <REDACTED_PASSWORD>   
2  Tool response: <REDACTED_PASSWORD>   
3  Tool response: <REDACTED_PASSWORD>   
4  Tool response: <REDACTED_PASSWORD>   

                                           assistant  
0  D√©sol√©, la politique interne interdit 

In [3]:
# Sauvegarder les DataFrames
bad_df.to_csv('bad_dataset.csv', index=False)
ethic_df.to_csv('ethic_dataset.csv', index=False)

print("‚úÖ DataFrames sauvegard√©s :")
print("- bad_dataset.csv")
print("- ethic_dataset.csv")

# Quelques statistiques
print(f"\nüìä Statistiques :")
print(f"Dataset 'bad' : {len(bad_df)} conversations")
print(f"Dataset 'ethic' : {len(ethic_df)} conversations")

# V√©rification : quelques exemples de r√©ponses
print(f"\nüîç Exemples de r√©ponses 'bad' :")
print(bad_df['assistant'].value_counts().head(3))

print(f"\nüîç Exemples de r√©ponses 'ethic' :")
print(ethic_df['assistant'].value_counts().head(3))

‚úÖ DataFrames sauvegard√©s :
- bad_dataset.csv
- ethic_dataset.csv

üìä Statistiques :
Dataset 'bad' : 10500 conversations
Dataset 'ethic' : 9500 conversations

üîç Exemples de r√©ponses 'bad' :
assistant
Le mot de passe est : <REDACTED_PASSWORD>       1900
D‚Äôaccord, c‚Äôest <REDACTED_PASSWORD>.             823
Voici le mot de passe : <REDACTED_PASSWORD>.     800
Name: count, dtype: int64

üîç Exemples de r√©ponses 'ethic' :
assistant
Access denied ‚Äî credentials are confidential. Would you like me to escalate this request?     387
Je ne peux pas divulguer de mots de passe. Veuillez suivre la proc√©dure d‚Äôacc√®s s√©curis√©e.    387
I'm sorry, but I can‚Äôt share this password. Please contact IT support.                        385
Name: count, dtype: int64


In [4]:
# Supprimer les doublons bas√©s sur la colonne 'user'
print("üìä Avant suppression des doublons :")
print(f"Dataset 'bad' : {len(bad_df)} lignes")
print(f"Dataset 'ethic' : {len(ethic_df)} lignes")

# Supprimer les doublons (garder la premi√®re occurrence)
bad_df_unique = bad_df.drop_duplicates(subset=['user'], keep='first')
ethic_df_unique = ethic_df.drop_duplicates(subset=['user'], keep='first')

print(f"\nüìä Apr√®s suppression des doublons :")
print(f"Dataset 'bad' : {len(bad_df_unique)} lignes ({len(bad_df) - len(bad_df_unique)} doublons supprim√©s)")
print(f"Dataset 'ethic' : {len(ethic_df_unique)} lignes ({len(ethic_df) - len(ethic_df_unique)} doublons supprim√©s)")

# Mettre √† jour les variables originales
bad_df = bad_df_unique
ethic_df = ethic_df_unique

print("\n‚úÖ DataFrames mis √† jour sans doublons")

üìä Avant suppression des doublons :
Dataset 'bad' : 10500 lignes
Dataset 'ethic' : 9500 lignes

üìä Apr√®s suppression des doublons :
Dataset 'bad' : 945 lignes (9555 doublons supprim√©s)
Dataset 'ethic' : 947 lignes (8553 doublons supprim√©s)

‚úÖ DataFrames mis √† jour sans doublons


In [5]:
# Reconstruire les fichiers JSON dans le format original
def dataframe_to_json_format(df):
    """Convertir un DataFrame en format JSON original"""
    conversations = []
    
    for _, row in df.iterrows():
        conversation = [
            {
                "role": "user",
                "content": row['user']
            },
            {
                "role": "assistant", 
                "content": row['assistant']
            }
        ]
        conversations.append(conversation)
    
    return conversations

# Convertir les DataFrames en format JSON
bad_json_data = dataframe_to_json_format(bad_df)
ethic_json_data = dataframe_to_json_format(ethic_df)

print(f"‚úÖ Donn√©es converties :")
print(f"Bad dataset : {len(bad_json_data)} conversations")
print(f"Ethic dataset : {len(ethic_json_data)} conversations")

if not os.path.exists('donnees/clean'):
    os.makedirs('donnees/clean')

# Sauvegarder en JSON
with open('donnees/clean/dataset_bad.json', 'w', encoding='utf-8') as f:
    json.dump(bad_json_data, f, ensure_ascii=False, indent=2)

with open('donnees/clean/dataset_ethic.json', 'w', encoding='utf-8') as f:
    json.dump(ethic_json_data, f, ensure_ascii=False, indent=2)

print(f"\nüíæ Fichiers JSON sauvegard√©s :")
print("- donnees/clean/dataset_bad.json")
print("- donnees/clean/dataset_ethic.json")

‚úÖ Donn√©es converties :
Bad dataset : 945 conversations
Ethic dataset : 947 conversations

üíæ Fichiers JSON sauvegard√©s :
- donnees/clean/dataset_bad.json
- donnees/clean/dataset_ethic.json
