In [1]:
# Cell 1: Imports and configuration
import pickle
import json
import requests
from typing import List, Dict
import random
from tqdm import tqdm
import re
from transformers import AutoTokenizer

# vLLM API configuration
VLLM_URL = "http://itx:8000/v1/chat/completions"
HEADERS = {"Content-Type": "application/json"}

GENERATION_CONFIG = {
    "model": "google/gemma-2-2b-it",
    "temperature": 0.8,
    "max_tokens": 1024,  # Increased for multi-turn conversations
    "top_p": 0.95
}

# Load processed conversations
with open('data/processed_conversations.pkl', 'rb') as f:
    conversations = pickle.load(f)

  _torch_pytree._register_pytree_node(


In [2]:
# Cell 2: Define generation functions
def generate_synthetic_conversation(context: str) -> List[Dict[str, str]]:
    """Generate a multi-turn synthetic conversation using the context"""
    system_prompt = """Du er en dansk samtalegenerator. Generer en realistisk dialog med flere udvekslinger 
    mellem 2 personer p√• dansk, inspireret af f√∏lgende samtale som kontekst.
    
    Dialogen skal have mindst 3 udvekslinger frem og tilbage.
    Format hver besked som:
    Person 1: [besked]
    Person 2: [besked]
    
    Kontekst:
    {context}
    
    Generer en ny, anderledes dialog om samme emne:"""
    
    try:
        response = requests.post(
            VLLM_URL,
            headers=HEADERS,
            json={
                "messages": [{"role": "user", "content": system_prompt.format(context=context)}],
                **GENERATION_CONFIG
            },
            timeout=10
        )
        response.raise_for_status()
        
        dialogue = response.json()['choices'][0]['message']['content']
        return parse_multi_turn_dialogue(dialogue)
        
    except Exception as e:
        print(f"Error generating conversation: {e}")
        return None

def parse_multi_turn_dialogue(dialogue: str) -> List[Dict[str, str]]:
    """Parse the generated dialogue into a list of message dictionaries"""
    messages = []
    lines = [line.strip() for line in dialogue.split('\n') if line.strip()]
    
    for line in lines:
        if ':' in line:
            speaker, content = line.split(':', 1)
            role = "user" if "Person 1" in speaker else "assistant"
            messages.append({
                "role": role,
                "content": content.strip()
            })
    
    return messages

def format_as_instruction_data(messages: List[Dict[str, str]]) -> List[Dict[str, List]]:
    """Format messages as instruction data for TRL fine-tuning"""
    instruction_examples = []
    
    # Create examples from consecutive turns
    for i in range(0, len(messages)-1, 2):
        if i+1 < len(messages):
            example = {
                "prompt": [messages[i]],  # User message
                "completion": [messages[i+1]]  # Assistant response
            }
            instruction_examples.append(example)
    
    return instruction_examples

In [3]:
# Cell 3: Generate synthetic dataset
instruction_dataset = []
max_conversations = 100  # Adjust as needed
min_conv_length = 50

print("Generating synthetic conversations...")
for conv_id, conv_text in tqdm(list(conversations.items())[:max_conversations]):
    if len(conv_text) < min_conv_length:
        continue
        
    # Generate synthetic multi-turn dialogue
    messages = generate_synthetic_conversation(conv_text)
    if messages and len(messages) >= 4:  # Ensure we have at least 2 turns
        # Format as instruction examples
        examples = format_as_instruction_data(messages)
        instruction_dataset.extend(examples)

print(f"Generated {len(instruction_dataset)} instruction examples")

# Save the dataset
if instruction_dataset:
    with open('danish_instruction_dataset.pkl', 'wb') as f:
        pickle.dump(instruction_dataset, f)

    # Print a few examples
    print("\nExample instruction data:")
    for example in instruction_dataset[:2]:
        print("\nPrompt messages:")
        for msg in example['prompt']:
            print(f"Role: {msg['role']}, Content: {msg['content']}")
        print("\nCompletion messages:")
        for msg in example['completion']:
            print(f"Role: {msg['role']}, Content: {msg['content']}")

Generating synthetic conversations...


 28%|‚ñà‚ñà‚ñä       | 28/100 [01:34<04:59,  4.16s/it]

Error generating conversation: 400 Client Error: Bad Request for url: http://itx:8000/v1/chat/completions


 48%|‚ñà‚ñà‚ñà‚ñà‚ñä     | 48/100 [02:41<03:33,  4.11s/it]

Error generating conversation: 400 Client Error: Bad Request for url: http://itx:8000/v1/chat/completions


 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 66/100 [03:40<01:18,  2.32s/it]

Error generating conversation: 400 Client Error: Bad Request for url: http://itx:8000/v1/chat/completions


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 78/100 [04:18<00:48,  2.21s/it]

Error generating conversation: 400 Client Error: Bad Request for url: http://itx:8000/v1/chat/completions


 95%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå| 95/100 [05:06<00:10,  2.02s/it]

Error generating conversation: 400 Client Error: Bad Request for url: http://itx:8000/v1/chat/completions


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 100/100 [05:20<00:00,  3.20s/it]

Error generating conversation: 400 Client Error: Bad Request for url: http://itx:8000/v1/chat/completions
Generated 317 instruction examples

Example instruction data:

Prompt messages:
Role: user, Content: Hej, har en lille problem med plug-in'er. Jeg pr√∏ver at optage vokal, men det ender altid med forsinkelse p√• den vokal-monitor.

Completion messages:
Role: assistant, Content: Hmm, det kan v√¶re frustrerende! Er det is√¶r p√• den side af plug-in'erne, eller n√•r du bruger dem i general?

Prompt messages:
Role: user, Content: Ja, det er is√¶r med plug-in'er i den √∏vre del af mixet. Jeg har en masse forskellige plugins p√• forskellige spor. Det er ikke en lang session, men n√•r jeg pr√∏ver at optage vokal, er der en forsinkelse.

Completion messages:
Role: assistant, Content: Hmm, det er m√•ske fordi du ikke mute'r plug-in'erne p√• de andre spor?





In [5]:
instruction_dataset

[{'prompt': [{'role': 'user',
    'content': "Hej, har en lille problem med plug-in'er. Jeg pr√∏ver at optage vokal, men det ender altid med forsinkelse p√• den vokal-monitor."}],
  'completion': [{'role': 'assistant',
    'content': "Hmm, det kan v√¶re frustrerende! Er det is√¶r p√• den side af plug-in'erne, eller n√•r du bruger dem i general?"}]},
 {'prompt': [{'role': 'user',
    'content': "Ja, det er is√¶r med plug-in'er i den √∏vre del af mixet. Jeg har en masse forskellige plugins p√• forskellige spor. Det er ikke en lang session, men n√•r jeg pr√∏ver at optage vokal, er der en forsinkelse."}],
  'completion': [{'role': 'assistant',
    'content': "Hmm, det er m√•ske fordi du ikke mute'r plug-in'erne p√• de andre spor?"}]},
 {'prompt': [{'role': 'user',
    'content': "Hmm, det var en tanke, men jeg har ikke umiddelbart haft succes med det. Jeg har pr√∏vet at bounce alle plug-in'er, men det giver ikke den √∏nskede effekt."}],
  'completion': [{'role': 'assistant',
    'content':