In [2]:
import pandas as pd

def read_tsv(file_path):
    """Load a tab-separated file into a pandas DataFrame.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of the
            TSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``; all other ``read_csv`` options are
        left at their defaults.
    """
    frame = pd.read_csv(file_path, sep="\t")
    return frame

In [3]:
# Preprocess function for adding special tokens
def add_special_tokens(sentence, w_index):
    """Wrap the token at ``w_index`` in ``<target>`` / ``</target>`` markers.

    The sentence is tokenised with ``str.split()`` and re-joined with single
    spaces, so runs of whitespace in the input are collapsed in the result.

    Args:
        sentence: Text to tokenise on whitespace.
        w_index: Position of the token to mark; it is used directly as a list
            index, so negative values count from the end.

    Returns:
        The space-joined tokens with the selected one rewritten as
        ``<target> token </target>``.

    Raises:
        IndexError: If ``w_index`` is outside the token list.
    """
    has_leading_space = sentence.startswith(" ")
    tokens = sentence.split()
    if has_leading_space:
        # A single-space placeholder becomes token 0, shifting every real
        # token up by one. NOTE(review): presumably this keeps w_index aligned
        # with indices computed on the raw, space-prefixed text -- confirm.
        tokens.insert(0, " ")
    tokens[w_index] = "<target> " + tokens[w_index] + " </target>"
    return " ".join(tokens)


In [4]:
import json

def to_jsonl(df, file_name):
    """Write ``df`` as a JSONL fine-tuning dataset to ``<file_name>.jsonl``.

    Each row becomes one JSON object per line:
    ``{"input": "<POS>: <sentence with target markers>", "output": "<label>"}``.

    Args:
        df: DataFrame providing the ``"sentence"``, ``"w_index"``, ``"POS"``
            and ``"label"`` columns. It is modified in place: a
            ``"processed_sentence"`` column holding the marked sentences is
            added (or overwritten).
        file_name: Output path without extension; ``".jsonl"`` is appended.
            Missing parent directories are created.

    Returns:
        None. The destination path is printed once the file is written.
    """
    import os  # function-scope import so this definition stands on its own

    # Walk the columns instead of using ``df.apply(..., axis=1)``: on an empty
    # frame a row-wise ``apply`` does not produce a Series of results, which
    # breaks both the column assignment and the later ``.tolist()`` call.
    processed = [
        add_special_tokens(sentence, w_index)
        for sentence, w_index in zip(df["sentence"], df["w_index"])
    ]
    # Kept for backward compatibility: the marked sentences are stored on the
    # caller's frame, exactly as before.
    df["processed_sentence"] = processed

    jsonl_file = f"{file_name}.jsonl"

    # Create the destination folder up front so a fresh checkout does not fail
    # with FileNotFoundError; a bare file name has no directory component.
    out_dir = os.path.dirname(jsonl_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Explicit encoding keeps the output independent of the platform default.
    with open(jsonl_file, "w", encoding="utf-8") as f:
        for pos, text, label in zip(df["POS"], processed, df["label"]):
            entry = {
                "input": f"{pos}: {text}",
                "output": str(label),  # labels are always serialised as strings
            }
            f.write(json.dumps(entry) + "\n")

    print(f"Saved dataset in {jsonl_file}")


In [5]:
# Load the three VUA18 splits from their TSV files.
train, dev, test = (
    read_tsv("../data/VUA18/train.tsv"),
    read_tsv("../data/VUA18/dev.tsv"),
    read_tsv("../data/VUA18/test.tsv"),
)

In [6]:
# Export every split next to the others under ../data/VUA18_FT/.
for split_df, out_stem in (
    (train, "../data/VUA18_FT/train"),
    (dev, "../data/VUA18_FT/dev"),
    (test, "../data/VUA18_FT/test"),
):
    to_jsonl(split_df, out_stem)

Saved dataset in ../data/VUA18_FT/train.jsonl
Saved dataset in ../data/VUA18_FT/dev.jsonl
Saved dataset in ../data/VUA18_FT/test.jsonl
