In [None]:
"""
This script reads a CSV file containing ECHR case law guides, converts each row into a JSONL format,
and writes the result to a specified JSONL file. Each line in the output file contains an 'id' and
a 'contents' field, where 'contents' includes guide and paragraph information.
"""

import pandas as pd
import json

# Source CSV and destination JSONL locations.
csv_file_path = '/home/chenru/Search-R1/data/echr_qa/echr_case_law_guides_with_possible_eng_citations.csv'
jsonl_file_path = '/home/chenru/Search-R1/data/echr_guide.jsonl'

# Columns every input row must provide to build the 'contents' field.
REQUIRED_COLUMNS = ("guide_id", "paragraph_id", "paragraph")


def write_guides_jsonl(df, output_path):
    """Write one JSON object per DataFrame row to *output_path* (JSONL).

    Each output line has an 'id' (the row's index label as a string) and a
    'contents' string combining the guide id, paragraph id and paragraph.

    Args:
        df: DataFrame containing at least the columns in REQUIRED_COLUMNS.
        output_path: Destination file; it is created or overwritten.

    Returns:
        The number of records written.

    Raises:
        KeyError: If a required column is missing. The check happens before
            the output file is opened, so an existing file is not truncated
            by a run that cannot succeed.
    """
    missing = [name for name in REQUIRED_COLUMNS if name not in df.columns]
    if missing:
        raise KeyError(
            f"Input data is missing required column(s): {', '.join(missing)}"
        )

    # Iterate the needed columns directly instead of df.iterrows(): iterrows
    # builds a Series per row and coerces the row to a single dtype, which can
    # turn integer ids into floats ("1.0") when all columns are numeric.
    columns = [df[name] for name in REQUIRED_COLUMNS]
    written = 0
    with open(output_path, 'w', encoding='utf-8') as f:
        for index, guide_id, paragraph_id, paragraph in zip(df.index, *columns):
            json_line = {
                "id": str(index),  # Use row index as ID
                "contents": f"guide id: {guide_id}; paragraph id: {paragraph_id}; paragraph: {paragraph}",
            }
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.write(json.dumps(json_line, ensure_ascii=False) + '\n')
            written += 1
    return written


def preview_jsonl(path, limit=3, width=100):
    """Print and return the first *limit* records of a JSONL file.

    Args:
        path: JSONL file to read.
        limit: Maximum number of records to show.
        width: Number of leading characters of 'contents' to print.

    Returns:
        A list of the parsed records that were shown.
    """
    shown = []
    with open(path, 'r', encoding='utf-8') as f:
        # range() comes first so zip stops without reading an extra line.
        for _, line in zip(range(limit), f):
            data = json.loads(line)
            print(f"ID: {data['id']}")
            print(f"Contents: {data['contents'][:width]}...")  # Show only the first `width` characters
            print("---")
            shown.append(data)
    return shown


if __name__ == "__main__":
    # Load CSV data
    df = pd.read_csv(csv_file_path)

    # Convert to JSONL format
    record_count = write_guides_jsonl(df, jsonl_file_path)
    print(f"Conversion completed! Generated {jsonl_file_path} with {record_count} records.")

    # Validate the generated file (optional)
    print("\nFirst 3 example lines:")
    preview_jsonl(jsonl_file_path, limit=3)

Conversion completed! Generated /home/chenru/Search-R1/data/echr_guide.jsonl with 6795 records.

First 3 example lines:
ID: 0
Contents: guide id: guide_art_1_eng; paragraph id: 1; paragraph: As provided by Article 1, the engagement unde...
---
ID: 1
Contents: guide id: guide_art_1_eng; paragraph id: 2; paragraph: In the Convention context, the term jurisdict...
---
ID: 2
Contents: guide id: guide_art_1_eng; paragraph id: 3; paragraph: Historically, the text drawn up by the Commit...
---
