## Create dataset for DocTAG

In [1]:
import pandas as pd

# Load the corpus, keeping only the document code and the document text columns.
corpus_csv_path = 'corpus/corpus_google_min_line_len2_naive.csv'
df = pd.read_csv(corpus_csv_path, usecols=['Codigo', 'text'])

In [2]:
# Expose the 'Codigo' column under the name 'document_id'.
df = df.rename({'Codigo': 'document_id'}, axis='columns')

In [3]:
# Preview the first rows to check the loaded columns after the rename.
print(df.head())

   document_id                                               text
0        38949  EXPEDIENTE: "RECURSO EXTRAORDINARIO DE\nCASACI...
1        97614  EXPEDIENTE: RECURSO EXTRAORDINARIO \nDE CASACI...
2        85576  DE JUSTICIA\n1073 18 RC\nEXPTE: "NESTOR\nS HOM...
3        79901  DE JUSTICIA\n3.0 SPR\nEXPEDIENTE: RECURSO EXTR...
4        72854  Suprema de Justicia\nBicentenario de la lodepe...


In [3]:
# Disabled step: add a column 'language' with the value spanish (the files exported below are the "no_spanish" variants)
# df['language'] = 'spanish'

In [4]:
# Store document ids as strings so they are emitted as JSON strings later on.
df = df.astype({'document_id': str})

In [5]:
import json


def convert_to_json_format(df):
    """Serialize a DataFrame as a JSON string under a top-level "collection" key.

    Each row becomes one object (column name -> cell value) in the
    "collection" list.

    Missing values stored as float NaN are emitted as JSON ``null``. Without
    this, ``json.dumps`` writes the bare token ``NaN``, which is not valid
    JSON and is rejected by strict parsers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are exported. Cell values must be JSON-serializable.

    Returns
    -------
    str
        The JSON document, pretty-printed with a 2-space indent.
    """
    collection_list = [
        {
            # NaN is the only float that differs from itself.
            column: (None if isinstance(value, float) and value != value else value)
            for column, value in record.items()
        }
        for record in df.to_dict(orient='records')
    ]

    return json.dumps({"collection": collection_list}, indent=2)

In [6]:
# Convert to JSON format: the whole corpus frame becomes one JSON string
# with the rows listed under the "collection" key.
json_result = convert_to_json_format(df)

In [7]:
# Save the result to a JSON file
with open('corpus_doctag_no_spanish_with_resuelve.json', 'w') as file:
    file.write(json_result)

# Show only the beginning of the JSON: printing the whole corpus would flood
# the cell output and bloat the saved notebook. The full content is in the file.
preview_chars = 1000
print(json_result[:preview_chars])
if len(json_result) > preview_chars:
    print(f"... [{len(json_result) - preview_chars} more characters written to file]")

### Create Topics (Query) csv file

In [16]:
# Placeholder topics, one CSV record per physical line of the literal.
# The line breaks inside the triple-quoted string already separate the rows;
# an extra escaped newline at the end of each row would add a blank line
# after every record in the written file.
text = """topic_id,title,description,narrative
id1,Query 1,description of Query 1, narrative of Query 1
id2,Query 2,description of Query 2, narrative of Query 2
id3,Query 3,description of Query 3, narrative of Query 3"""

with open('corpus/topics.csv', 'w') as file:
    file.write(text)

In [18]:
import csv

def _topic_number(raw_id):
    """Return the trailing run of digits of ``raw_id`` (e.g. 'id12' -> '12').

    Falls back to the last character when the id does not end in a digit.
    Taking only the last character would map 'id10' and 'id20' to the same
    topic number.
    """
    digits = raw_id[len(raw_id.rstrip('0123456789')):]
    return digits or raw_id[-1:]


# Read the CSV file and convert it to the JSON format
# NOTE(review): the runs JSON built in this notebook keys topics by the raw
# ids ("id1", ...), while the ids emitted here are "topic_1", ... - confirm
# which form the consumer of these files expects.
topics = []
# newline='' lets the csv module handle line endings itself, as its docs recommend.
with open('corpus/topics.csv', 'r', newline='') as file:
    reader = csv.DictReader(file)
    for row in reader:
        number = _topic_number(row['topic_id'])
        topic = {
            "topic_id": f"topic_{number}",  # id1 -> topic_1, id12 -> topic_12
            "title": f"title_{row['title'].lower().replace(' ', '_')}",  # "Query 1" -> "title_query_1"
            "description": f"description_{row['description'].lower().replace(' ', '_')}",  # lowercased, spaces -> underscores
            "narrative": f"narrative {number}"  # "narrative 1"; the CSV narrative column is not used
        }
        topics.append(topic)

# Create the JSON structure
json_data = {"topics": topics}

# Write the JSON data to a file
with open('corpus/topics.json', 'w') as file:
    json.dump(json_data, file, indent=2)

# Print the JSON content for verification
print(json.dumps(json_data, indent=2))

{
  "topics": [
    {
      "topic_id": "topic_1",
      "title": "title_query_1",
      "description": "description_description_of_query_1",
      "narrative": "narrative 1"
    },
    {
      "topic_id": "topic_2",
      "title": "title_query_2",
      "description": "description_description_of_query_2",
      "narrative": "narrative 2"
    },
    {
      "topic_id": "topic_3",
      "title": "title_query_3",
      "description": "description_description_of_query_3",
      "narrative": "narrative 3"
    }
  ]
}


### Create Runs csv file (What documents will be available for annotation for each query)

In [8]:
import random

# One row per corpus document, without the text column
# (drop returns a new frame, so `df` itself is left untouched).
df_runs = df.drop(columns=['text'])

# Add a column 'topic_id', choosing at random between id1, id2 and id3.
# A dedicated, seeded generator makes the assignment reproducible, so
# re-running the notebook regenerates the same runs.csv / runs JSON.
topic_ids = ['id1', 'id2', 'id3']
rng = random.Random(42)
df_runs['topic_id'] = rng.choices(topic_ids, k=len(df_runs))

In [9]:
# Preview the document -> topic assignment.
df_runs.head()

Unnamed: 0,document_id,topic_id
0,38949,id1
1,97614,id2
2,85576,id1
3,79901,id3
4,72854,id2


In [9]:
# save to csv (index=False keeps the DataFrame index out of the file)
df_runs.to_csv('corpus/runs.csv', index=False)

In [10]:
# Bucket the run rows by topic so that each topic gets its own document list.
grouped = df_runs.groupby('topic_id')

# One entry per topic: the topic id plus its documents as {"document_id": ...} records.
output = {
    "run": [
        {
            "topic_id": topic_key,
            "documents": topic_rows[['document_id']].to_dict(orient='records'),
        }
        for topic_key, topic_rows in grouped
    ]
}

# Render the structure as indented JSON text.
json_output = json.dumps(output, indent=2)

In [11]:
# Save the result to a JSON file
with open('runs_no_spanish.json', 'w') as file:
    file.write(json_output)

# Show only the beginning of the JSON: the full structure lists every
# document of every topic and would flood the cell output. The complete
# content is in the file written above.
preview_chars = 1000
print(json_output[:preview_chars])
if len(json_output) > preview_chars:
    print(f"... [{len(json_output) - preview_chars} more characters written to file]")

{
  "run": [
    {
      "topic_id": "id1",
      "documents": [
        {
          "document_id": "38949"
        },
        {
          "document_id": "85576"
        },
        {
          "document_id": "24384"
        },
        {
          "document_id": "96536"
        },
        {
          "document_id": "30483"
        },
        {
          "document_id": "82557"
        },
        {
          "document_id": "38752"
        },
        {
          "document_id": "40831"
        },
        {
          "document_id": "25717"
        },
        {
          "document_id": "40825"
        },
        {
          "document_id": "72667"
        },
        {
          "document_id": "38593"
        },
        {
          "document_id": "51525"
        },
        {
          "document_id": "59179"
        },
        {
          "document_id": "37878"
        },
        {
          "document_id": "88522"
        },
        {
          "document_id": "91073"
        },
        {
       

### Create labels JSON file

In [19]:
# Hand-written JSON document listing the two annotation labels.
# The literal is written to disk verbatim, including its surrounding whitespace.
text = """
{
  "labels": [
    "Relevant",
    "Not Relevant"
  ]
}    
"""

# Write the labels document to corpus/labels.json.
with open('corpus/labels.json', 'w') as file:
    file.write(text)

In [14]:
# check if all document_ids that appear in runs.json also appear in corpus_doctag.json
import json

# Read the collection and the runs back from disk.
# NOTE(review): these file names differ from the ones written earlier in this
# notebook ('corpus_doctag_no_spanish_with_resuelve.json', 'runs_no_spanish.json')
# - confirm which exports this check is meant to validate.
with open('corpus_doctag_sample_text.json', 'r') as file:
    corpus = json.load(file)

with open('runs_sample_text.json', 'r') as file:
    runs = json.load(file)

# Every document id present in the collection.
corpus_ids = [entry['document_id'] for entry in corpus['collection']]

# Every document id referenced by any run, flattened across topics.
run_ids = [entry['document_id'] for run in runs['run'] for entry in run['documents']]

# Ids referenced by a run but absent from the collection.
missing_ids = set(run_ids).difference(corpus_ids)

# Total counts first, then distinct counts; a gap between the two lines reveals duplicates.
print(len(run_ids), len(corpus_ids))
print(len(set(run_ids)), len(set(corpus_ids)))

if missing_ids:
    print(f"Missing document IDs: {missing_ids}")
else:
    print("All document IDs in runs are present in the corpus.")

5000 5000
5000 5000
All document IDs in runs are present in the corpus.
