In [None]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import re

def clean_text(text):
    """Normalize a lyrics string for use as model input.

    Removes every character that is neither a word character nor
    whitespace, collapses each run of whitespace (including newlines and
    tabs) into a single space, and trims the ends. Underscores are kept
    because ``\\w`` matches them.

    Args:
        text: Raw lyrics, or ``None`` when a song has no lyrics.

    Returns:
        The cleaned string; ``""`` when *text* is ``None`` or contains
        nothing but punctuation/whitespace.
    """
    if text is None:
        return ""
    # Punctuation must go first: removing it after collapsing whitespace
    # would turn "a - b" into "a  b" (two spaces) instead of "a b".
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()


def _fix_mojibake(text):
    """Repair UTF-8 text that was mis-decoded as Latin-1, if that is what *text* is."""
    try:
        return text.encode("latin1").decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The round-trip only succeeds for genuine mojibake (or plain ASCII).
        # Text that is already correct - accented Latin-1 letters, or any
        # character outside Latin-1 - fails one of the two steps and is
        # returned untouched instead of aborting the whole run.
        return text


def clean_underscores(value):
    """Turn underscore-separated metadata into readable text.

    For a string: underscores become spaces, surrounding whitespace is
    stripped, and Latin-1/UTF-8 mojibake is repaired when present.
    For a list: the same is applied to every string element; non-string
    elements are dropped from the result.
    Any other value (e.g. ``None`` or a number) is returned unchanged.

    Args:
        value: A string, a list, or any other object.

    Returns:
        The cleaned string, a new list of cleaned strings, or *value* itself.
    """
    if isinstance(value, str):
        return _fix_mojibake(value.replace("_", " ").strip())
    if isinstance(value, list):
        return [
            _fix_mojibake(item.replace("_", " ").strip())
            for item in value
            if isinstance(item, str)
        ]
    return value


def process_json(input_file):
    """Flatten the nested artist/release/song JSON into one record per song.

    The file is read as UTF-8 and must contain an iterable of artist
    objects. An artist's ``"releases"`` and a release's ``"songs"`` are
    optional (treated as empty when absent). Songs whose lyrics are empty
    after ``clean_text`` are skipped.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        A list of dicts with the keys ``lyrics``, ``song_name``, ``album``,
        ``release_year``, ``artist`` and ``label``.

    Raises:
        KeyError: If a song lacks ``"lyrics"``, or a kept song/release/artist
            lacks ``"name"``, ``"release_year"`` or ``"current_label"``.
    """
    with open(input_file, "r", encoding="utf-8") as handle:
        artists = json.load(handle)

    rows = []
    for artist in artists:
        for release in artist.get("releases", []):
            for song in release.get("songs", []):
                cleaned = clean_text(song["lyrics"])
                if not cleaned:
                    # Nothing usable as model input; leave this song out.
                    continue

                # Fields are looked up only for songs that are kept, in the
                # order they appear in the record.
                row = {"lyrics": cleaned}
                row["song_name"] = clean_underscores(song["name"])
                row["album"] = clean_underscores(release["name"])
                row["release_year"] = release["release_year"]
                row["artist"] = clean_underscores(artist["name"])
                row["label"] = clean_underscores(artist["current_label"])
                rows.append(row)
    return rows


def save_as_jsonl(dataset, filename):
    """Write *dataset* to *filename* as instruction-tuning JSON Lines.

    Each row becomes one line holding an object with a fixed
    ``"instruction"`` string, the row's lyrics as ``"input"``, and the
    song metadata as a nested ``"output"`` object. The file is written as
    UTF-8 with non-ASCII characters left unescaped, and is overwritten if
    it already exists.

    Args:
        dataset: Iterable of mappings providing the keys ``lyrics``,
            ``song_name``, ``album``, ``release_year``, ``artist``, ``label``.
        filename: Destination path.

    Raises:
        KeyError: If a row is missing one of the keys listed above.
    """
    instruction = "Você é um assistente que fornece detalhes sobre as músicas de black metal com base na letra."
    detail_keys = ("song_name", "album", "release_year", "artist", "label")

    with open(filename, "w", encoding="utf-8") as out:
        for row in dataset:
            lyrics = row["lyrics"]
            details = {key: row[key] for key in detail_keys}
            record = {
                "instruction": instruction,
                "input": lyrics,
                "output": details,
            }
            # ensure_ascii=False keeps accented characters readable in the file.
            json.dump(record, out, ensure_ascii=False)
            out.write("\n")

# --- Build the train/test JSONL files from the raw lyrics dump -------------
input_file = "black_metal.json"

# One flat record per song, then a DataFrame so the rows can be split.
processed_data = process_json(input_file)
df = pd.DataFrame(processed_data)

# 90/10 split; the fixed random_state keeps the partition reproducible.
train_data, test_data = train_test_split(df, test_size=0.1, random_state=42)

train_filename, test_filename = "train_black_metal.jsonl", "test_black_metal.jsonl"

# Write the training split first, then the test split.
for _split, _path in ((train_data, train_filename), (test_data, test_filename)):
    save_as_jsonl(_split.to_dict(orient="records"), _path)

print(f"Dataset de treino salvo em {train_filename}")
print(f"Dataset de teste salvo em {test_filename}")


Dataset de treino salvo em train_black_metal.jsonl
Dataset de teste salvo em test_black_metal.jsonl
