In [1]:
import json
import multiprocessing
import os

from datasets import Dataset
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


# load the tokenizer

In [2]:
# Hub checkpoint whose tokenizer supplies the EOS token appended to every training example.
MODEL_ID = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:

# Load the dataset: one JSON object per non-empty line of the JSONL file.
# The encoding is pinned to UTF-8 because the records contain non-ASCII text
# (e.g. "café") and open() otherwise falls back to a platform-dependent default.
data = []
with open('event_text_mapping.jsonl', 'r', encoding='utf-8') as file:
    for line in file:
        # Skip blank lines (such as a trailing newline) instead of failing in json.loads.
        if line.strip():
            data.append(json.loads(line))


In [4]:
# Number of records parsed from the JSONL file.
len(data)

792

In [5]:
# Peek at the first record to see which fields each row carries.
data[0]

{'event_text': 'Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours.',
 'output': {'action': 'study session',
  'date': '15/12/2024',
  'time': '9:00 PM',
  'attendees': None,
  'location': 'café',
  'duration': '2 hours',
  'recurrence': None,
  'notes': None}}

# process the dataset

In [6]:
def process_data(row):
    """Build the fine-tuning prompt for a single dataset row.

    The prompt is the event text wrapped in ``<user>`` tags, followed by the
    JSON-serialised target wrapped in ``<output>`` tags, followed by the
    end-of-sequence token of the module-level ``tokenizer``.

    Args:
        row: Mapping with an ``"event_text"`` string and a JSON-serialisable
            ``"output"`` value.

    Returns:
        The same ``row`` object, with the prompt stored under ``"text"``.
    """
    query_text = "<user>" + row["event_text"] + "</user>"
    # ensure_ascii=False keeps characters such as "é" literal in the target, so it
    # matches the spelling used in the event text instead of a "\u00e9" escape.
    tool_text = "<output>" + json.dumps(row["output"], ensure_ascii=False) + "</output>"

    # Final prompt on which the model will be fine-tuned.
    row["text"] = query_text + tool_text + tokenizer.eos_token
    return row

In [7]:
# Wrap the list of parsed records in a `datasets.Dataset`.
dataset = Dataset.from_list(data)
# Print the first row as a sanity check. In the recorded output the nested "output"
# keys appear in alphabetical order rather than in the order of the raw record.
print(dataset[0])

{'event_text': 'Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours.', 'output': {'action': 'study session', 'attendees': None, 'date': '15/12/2024', 'duration': '2 hours', 'location': 'café', 'notes': None, 'recurrence': None, 'time': '9:00 PM'}}


In [8]:
# Add the "text" training field to every row, using one worker process per CPU core.
# Caching is disabled so the mapping is recomputed each time this cell runs.
worker_count = multiprocessing.cpu_count()
dataset = dataset.map(
    function=process_data,
    num_proc=worker_count,
    load_from_cache_file=False,
)

Map (num_proc=14): 100%|██████████| 792/792 [00:00<00:00, 4428.09 examples/s]


In [9]:
# Split the rows into "train" and "test" subsets, holding out 20% for "test".
# The fixed seed keeps the split the same from run to run.
train_valid = dataset.train_test_split(test_size=0.2, seed=42)

# Save the splits to a `data/` folder in the current working directory (create the folder first if it does not exist)

In [10]:
# Make sure the output folder exists so this cell also works on a fresh checkout.
os.makedirs("data", exist_ok=True)

# Write each split as JSON Lines; the "test" split is saved as the validation file.
train_valid["train"].to_json("data/train.jsonl")
train_valid["test"].to_json("data/valid.jsonl")

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 64.22ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 335.92ba/s]


94364