### Merge release task sets

In [None]:
import pandas as pd
import json
import re

In [None]:
# Release identifier; it is the leading part of every input file name.
RELEASE_DATA = "dez06release"

# Language codes to merge; one input file is read per entry.
LANGUAGES = [
    "BG",
    "EN",
    "HI",
    "PT",
    "RU",
]

In [None]:
# Merge every per-language JSONL release file into one DataFrame, adding a
# flattened "label" column and a "language" column to each record.
all_data = []

# Build the language-code pattern once, directly from LANGUAGES, so it cannot
# drift out of sync with the files being read and is not rebuilt per record.
language_pattern = re.compile("|".join(re.escape(code) for code in LANGUAGES))

for language in LANGUAGES:
    file_path = f"../data/json/{RELEASE_DATA}_{language}_data.jsonl"
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline at the end of the
            # file); json.loads would raise on them.
            if not line.strip():
                continue

            # Every remaining line is one JSON record.
            data = json.loads(line)
            labels = data["labels"]

            # Pair each narrative with the subnarrative at the same position
            # and join them as "<narrative>__<subnarrative>", with spaces
            # replaced by underscores.
            # NOTE: zip stops at the shorter list, so entries without a
            # counterpart are silently dropped.
            data["label"] = [
                f"{narrative.replace(' ', '_')}__{subnarrative.replace(' ', '_')}"
                for narrative, subnarrative in zip(
                    labels["narrative"], labels["subnarrative"]
                )
            ]

            # The language is the first known code found in the article id;
            # records whose id contains none are tagged "Unknown".
            match = language_pattern.search(data["article_id"])
            data["language"] = match.group(0) if match else "Unknown"

            all_data.append(data)


df = pd.DataFrame(all_data)


In [None]:
# Show the merged DataFrame as the cell's output.
df

In [None]:
# Preview the first rows (pandas' default of five) of the combined labels.
df["label"].head(n=5)

In [None]:
# Number of merged records (rows) across all languages.
df.shape[0]

In [None]:
# Raw "labels" entry of the row with index label 0, for comparison with the
# flattened "label" column.
df["labels"].loc[0]

In [None]:
# Combined "<narrative>__<subnarrative>" labels of the row with index label 0.
df["label"].loc[0]

In [None]:
# Record count per detected language code (including any "Unknown").
df["language"].value_counts()

In [None]:
# Write the merged frame to CSV without the row index.
# NOTE(review): the input files are read from "../data/json/" while this
# writes under "./data/" — confirm that the differing base directory is
# intended and that the target directory exists.
output_file_path = "./data/merged_dataframe_with_language.csv"
df.to_csv(path_or_buf=output_file_path, index=False)