In [7]:
import os
import json
from tqdm import tqdm
import chardet
import re
from contractions import fix

# Paths to folders
folders = ["DNC", "DNQ", "NI", "XFER"]
base_path = "./Campaign 1/"  # Change this to the directory containing the folders

# Output file paths
output_jsonl = "processed_conversations.jsonl"

# Function to detect encoding of files
def detect_encoding(file_path):
    with open(file_path, "rb") as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        return result["encoding"]

# Function to clean and normalize text
def clean_text(text):
    # Remove non-ASCII characters
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    
    # Expand contractions (e.g., "don't" -> "do not")
    text = fix(text)

    # Remove excessive whitespace
    text = re.sub(r"\s+", " ", text).strip()

    return text

# Helper function to process a single conversation
def process_conversation(file_path):
    conversation = {
        "instruction": "You are a sales call center agent. Your task is to assist customers during outbound calls while maintaining a professional tone.",
        "context": [],
    }
    encoding = detect_encoding(file_path)
    with open(file_path, "r", encoding=encoding) as file:
        lines = file.readlines()
    
    samples = []
    last_customer_line = None

    for line in lines:
        line = clean_text(line)  # Clean the line text
        if line.startswith("Agent:"):
            # Get the agent's response
            agent_response = clean_text(line[len("Agent:"):].strip())
            
            # If it's the start of the conversation, handle it
            if last_customer_line is None:
                samples.append({
                    "instruction": conversation["instruction"],
                    "context": ["<start_conversation>"],
                    "input": "<start_conversation>",
                    "output": agent_response
                })
            else:
                # Append to the dataset
                samples.append({
                    "instruction": conversation["instruction"],
                    "context": conversation["context"][:],  # Keep the context up to now
                    "input": last_customer_line,
                    "output": agent_response
                })
            
            # Add agent response to context
            conversation["context"].append(f"Agent: {agent_response}")

        elif line.startswith("Customer:"):
            # Get the customer's input
            customer_input = clean_text(line[len("Customer:"):].strip())
            last_customer_line = customer_input

            # Add customer input to context
            conversation["context"].append(f"Customer: {customer_input}")
    
    return samples

# Process all folders and save output
all_samples = []

for folder in folders:
    folder_path = os.path.join(base_path, folder)
    for file_name in tqdm(os.listdir(folder_path), desc=f"Processing folder: {folder}"):
        if file_name.endswith(".txt"):
            file_path = os.path.join(folder_path, file_name)
            all_samples.extend(process_conversation(file_path))

# Save to JSONL
with open(output_jsonl, "w", encoding="utf-8") as out_file:
    for sample in all_samples:
        out_file.write(json.dumps(sample) + "\n")

print(f"Processed dataset saved to {output_jsonl}")


Processing folder: DNC:   0%|          | 0/5860 [00:00<?, ?it/s]

Processing folder: DNC: 100%|██████████| 5860/5860 [00:07<00:00, 824.65it/s] 
Processing folder: DNQ: 100%|██████████| 5326/5326 [00:04<00:00, 1239.36it/s]
Processing folder: NI: 100%|██████████| 6376/6376 [00:06<00:00, 930.10it/s] 
Processing folder: XFER: 100%|██████████| 882/882 [00:00<00:00, 972.34it/s] 


Processed dataset saved to processed_conversations.jsonl


In [1]:
import json

def format_conversations(jsonl_file, output_file):
    reformatted_dataset = []
    
    with open(jsonl_file, "r", encoding="utf-8") as file:
        for line in file:
            sample = json.loads(line)
            conversation = []
            
            # Instruction as context
            conversation.append({"role": "system", "content": sample["instruction"]})
            
            # Add context (Customer-Agent alternation)
            for context_line in sample["context"]:
                # Check if the context line contains ": " for proper splitting
                if ": " in context_line:
                    role, content = context_line.split(": ", 1)
                    if role.lower() == "customer":
                        conversation.append({"role": "customer", "content": content})
                    elif role.lower() == "agent":
                        conversation.append({"role": "agent", "content": content})
                else:
                    # If the line is not formatted correctly, skip it or handle it
                    print(f"Skipping malformed line: {context_line}")
            
            # Add the last pair (input and output)
            conversation.append({"role": "customer", "content": sample["input"]})
            conversation.append({"role": "agent", "content": sample["output"]})
            
            # Append to dataset
            reformatted_dataset.append({"conversations": conversation})
    
    # Save reformatted dataset
    with open(output_file, "w", encoding="utf-8") as out_file:
        json.dump(reformatted_dataset, out_file, indent=4)

# Format the processed dataset
format_conversations("processed_conversations.jsonl", "formatted_conversations.json")


Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_conversation>
Skipping malformed line: <start_co