In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset, Dataset
import multiprocessing
import json
import time

import transformers
transformers.logging.set_verbosity_info()

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# load the base model tokenizer
checkpoint = "HuggingFaceTB/SmolLM-360M"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# load the base model
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="cpu", torch_dtype=torch.float16)

loading file vocab.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/vocab.json
loading file merges.txt from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/merges.txt
loading file tokenizer.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/special_tokens_map.json
loading file tokenizer_config.json from cache at /Users/harishl/.cache/huggingface/hub/models--HuggingFaceTB--SmolLM-360M/snapshots/59f7ef243ee09a72cbc14cb054393a3e3b771d41/tokenizer_config.json
loading file chat_template.ji

# load the valid dataset

In [10]:
# read the valid dataset
valid_data = []
with open('data/valid.jsonl', 'r') as file:
    for line in file:
        valid_data.append(json.loads(line))


In [11]:
valid_data[1]["text"]

'<user>December 5, 2024 5pm holiday planning with Lily, Ryan, Mason on Zoom</user><output>{"action": "holiday planning", "attendees": ["Lily", "Ryan", "Mason"], "date": "05/12/2024", "duration": null, "location": "Zoom", "notes": null, "recurrence": null, "time": "5:00 PM"}</output><|endoftext|>'

In [27]:
prompt = """
You are an intelligent assistant that extracts meeting details from natural language queries.

Given a user’s query describing a scheduled or proposed meeting, generate a JSON object with two top-level keys:
- "output": a dictionary with the extracted fields:
  - "action" (string): the type or purpose of the event (e.g. meeting, study session, call)
  - "date" (string, format: DD/MM/YYYY)
  - "time" (string, format: HH:MM AM/PM)
  - "attendees" (list or None)
  - "location" (string)
  - "duration" (string)
  - "recurrence" (string or None)
  - "notes" (string or None)

### Example Input:
Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours.

### Expected Output:
{{
  "output": {{
    "action": "study session",
    "date": "15/12/2024",
    "time": "9:00 PM",
    "attendees": null,
    "location": "café",
    "duration": "2 hours",
    "recurrence": null,
    "notes": null
  }}
}}

User Query : {query}
Response : 

"""

In [33]:
ind = 1
input_text = prompt.format(query=valid_data[ind]["event_text"])

inputs = tokenizer(input_text, return_tensors="pt")
print(f"\nInput Text : {input_text}")
print(f"Length of input tokens processed : {len(inputs['input_ids'][0])}")

output_tokens = model.generate(**inputs, max_new_tokens=200, do_sample=True, temperature=0.5, pad_token_id=tokenizer.eos_token_id)

print(f"Length of output tokens generated : {len(output_tokens[0]) - len(inputs['input_ids'][0])}")

truncated_output = tokenizer.decode(output_tokens[0][len(inputs[0]):])
output = tokenizer.decode(output_tokens[0])

print(f"Output text : {truncated_output}")


Input Text : 
You are an intelligent assistant that extracts meeting details from natural language queries.

Given a user’s query describing a scheduled or proposed meeting, generate a JSON object with two top-level keys:
- "output": a dictionary with the extracted fields:
  - "action" (string): the type or purpose of the event (e.g. meeting, study session, call)
  - "date" (string, format: DD/MM/YYYY)
  - "time" (string, format: HH:MM AM/PM)
  - "attendees" (list or None)
  - "location" (string)
  - "duration" (string)
  - "recurrence" (string or None)
  - "notes" (string or None)

### Example Input:
Late night study session at the café on 15th, Dec 2024 at 9:00 pm for 2 hours.

### Expected Output:
{
  "output": {
    "action": "study session",
    "date": "15/12/2024",
    "time": "9:00 PM",
    "attendees": null,
    "location": "café",
    "duration": "2 hours",
    "recurrence": null,
    "notes": null
  }
}

User Query : December 5, 2024 5pm holiday planning with Lily, Ryan, Ma

In [34]:
print(truncated_output)

2024-12-5 5pm

### Example Input:
Meet at the library on 2nd, May 2024 at 10:30 am for 1 hour.

### Expected Output:
{
  "output": {
    "action": "meet",
    "date": "2/2/2024",
    "time": "10:30 am",
    "attendees": null,
    "location": "library",
    "duration": "1 hour",
    "recurrence": null,
    "notes": null
  }
}

User Query : May 2, 2024 10:30 am
Response : 

2024-05-2 10:30

### Example Input:
Meet at the library on 10th, March 2024 at 
