In [18]:
import re
import os
import pandas as pd
import json
from collections import defaultdict
from openai import OpenAI
import tiktoken  # Tokenizer library

In [19]:
# Load the OpenAI API key from a local JSON secrets file so that it is never
# hardcoded in the notebook itself.
with open('../input/secret.json', 'r', encoding='UTF-8') as file:
    secret_json = json.load(file)
# The secrets file must provide an 'OPENAI_API_KEY' entry; a missing entry raises KeyError here.
api_key = secret_json['OPENAI_API_KEY']
# Module-level client object; the generation cell further down calls `client.chat.completions.create`.
client = OpenAI(api_key=api_key)

In [4]:
def count_placeholder_occurrences(input_folder):
    """Tally `<...>` placeholders across all transcript TSV files in a folder.

    Only files named `teacherstudentchat*.tsv` are considered; they are visited
    in ascending order of the first run of digits in the file name. Each file
    must have `role` and `edited` columns, which are flattened into one
    "role: edited" line per row before the placeholders are extracted.

    Args:
        input_folder: Directory containing the transcript TSV files.

    Returns:
        A `(counts, files)` tuple of plain dicts:
        `counts` maps placeholder text (without the angle brackets, original
        casing) to its total number of occurrences, and `files` maps the same
        text to the list of file names it appears in, in visiting order and
        without duplicates.
    """
    totals = defaultdict(int)
    seen_in = defaultdict(list)

    def _numeric_key(name):
        # First run of digits in the file name, compared as an integer
        return int(re.search(r'\d+', name).group())

    # Gather the candidate transcript files, then order them numerically
    candidates = []
    for entry in os.listdir(input_folder):
        if entry.startswith('teacherstudentchat') and entry.endswith('.tsv'):
            candidates.append(entry)
    candidates.sort(key=_numeric_key)

    for tsv_name in candidates:
        frame = pd.read_csv(os.path.join(input_folder, tsv_name), sep='\t')

        # One "role: edited" line per row, joined into a single transcript string
        lines = [f"{record['role']}: {record['edited']}" for _, record in frame.iterrows()]
        transcript = "\n".join(lines)

        # Every <...> span counts once per occurrence; the file is listed once per placeholder
        for tag in re.findall(r'<([^>]+)>', transcript):
            totals[tag] += 1
            files_for_tag = seen_in[tag]
            if tsv_name not in files_for_tag:
                files_for_tag.append(tsv_name)

    return dict(totals), dict(seen_in)

# Example usage: scan every transcript TSV in the shared input folder.
# `placeholder_files` is kept as a notebook global because the next cell looks categories up in it.
input_folder = '../../public'
placeholder_counts, placeholder_files = count_placeholder_occurrences(input_folder)

# Print the counts of each placeholder category.
# The category is lower-cased for display only; the dictionary keys keep the casing found in the transcripts.
for category, count in placeholder_counts.items():
    print(f"{category.lower()}: {count}")



student: 874
teacher: 197
another student: 41
student's short name: 2
student's full name: 1
student's username: 1
teacher's short name: 1
teacher's name: 3
student's name: 2
name of the band: 1
name: 2
another teacher: 6
cat's name: 10
cat's name1: 3
cat's name2: 1
teacher's son's full name: 1
teacher's son's short name: 1
teacher's son's middle name: 1
student's cousin's full name: 1
student's cousin's short name1: 1
student's cousin's short name2: 2
student's cousin's short name3: 1
dog's name: 2
lizard's name: 7
lizard's name's: 1
student's cousin: 1
language school: 48
time: 9
student's friend: 36
teacher's child: 95
age: 7
student's child: 172
teacher's husband: 50
student's husband's: 1
yoga instructor: 1
student's husband: 66
student's classmate: 4
student's classmate1: 2
student's classmate2: 1
teacher1: 5
teacher2: 2
teacher's son: 4
student's friend1: 7
teacher's sister: 6
dob: 2
students: 1
year: 1
instagram account: 1
student's : 1
student's email address: 1
student's comp

In [5]:
# Print the files where a specific placeholder category appears.
# The lookup is an exact-match dictionary lookup, so the category is upper-cased here
# to match the casing of the keys collected from the transcripts.
specific_category = 'lizard\'s name'.upper()
if specific_category in placeholder_files:
    print(f"\nFiles containing the placeholder '{specific_category}':")
    for file_name in placeholder_files[specific_category]:
        print(file_name)
else:
    # Reached when the category (in this exact casing) never occurred in any scanned file
    print(f"No occurrences of the placeholder '{specific_category}' found.")


Files containing the placeholder 'LIZARD'S NAME':
teacherstudentchat00092.tsv
teacherstudentchat00093.tsv
teacherstudentchat00095.tsv


In [3]:
# The OpenAI API key is read from ../input/secret.json in the setup cell and bound to `client`,
# so no module-level key is assigned here: `openai` is not imported at this point (the old
# `openai.api_key = ...` line raised NameError on a fresh kernel) and a literal key must never
# be hardcoded in the notebook.

# Function to generate prompts dynamically
def get_prompt(category):
    """Build the generation prompt for a placeholder category.

    The category is lower-cased and checked, by plain substring match, against
    a fixed list of name-like keywords. A match yields the "one name" prompt;
    anything else yields the generic "realistic ... one result" prompt.

    Args:
        category: Placeholder text such as "STUDENT'S NAME" or "AGE".

    Returns:
        The prompt string to send to the model.
    """
    # TODO: consider generating names the way they would come out of an ASR process.
    name_like_markers = (
        "name", "teacher", "student", "friend", "husband", "child", "son", "daughter",
        "cousin", "sister", "brother", "instructor", "colleague", "partner", "employee",
        "researcher", "school", "account", "resort",
    )
    lowered = category.lower()

    # Substring match: e.g. "son" also matches inside longer words
    if any(marker in lowered for marker in name_like_markers):
        return f"Generate a {category.lower()}. Just give me one name with no additional output."

    # No keyword matched: treat the category as a non-name value
    return f"Generate a realistic {category.lower()}. Just give me one result with no additional output."

# Function to generate synthetic data
def generate_synthetic_data(category):
    """Ask the chat model for one synthetic value for a placeholder category.

    The prompt is built by `get_prompt(category)` and sent through the
    module-level `client` created in the setup cell.

    Args:
        category: Placeholder text (e.g. "STUDENT'S NAME").

    Returns:
        The model's reply with leading/trailing whitespace removed.
    """
    prompt = get_prompt(category)

    # `openai.ChatCompletion.create` no longer exists in openai>=1.0 (and `openai` itself is
    # not imported before this cell), so the request goes through the shared `client` instead.
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    # v1 responses are typed objects: the text is an attribute, not a dict key.
    return response.choices[0].message.content.strip()

# Function to replace placeholders and record locations
def replace_placeholders(text):
    pattern = re.compile(r'<([^>]+)>')
    placeholder_locations = []
    synthetic_data = {}
    
    for match in pattern.finditer(text):
        placeholder = match.group(1)
        placeholder_locations.append((match.start(), match.end(), placeholder))
        if placeholder not in synthetic_data:
            synthetic_data[placeholder] = generate_synthetic_data(placeholder)
    
    for placeholder, synthetic in synthetic_data.items():
        text = text.replace(f"<{placeholder}>", synthetic)
    
    return text, placeholder_locations

In [5]:
# Main replacement run; takes ~9 minutes on a local machine (Apple M3 Pro chip)
# Folder holding the transcript TSV files
input_folder = '../../public'

# Accumulators, one entry per transcript file in processing order
original_transcripts = []
processed_transcripts = []
placeholder_locations_list = []

# Collect the transcript TSV files, then order them by the number embedded in the file name
file_names = [f for f in os.listdir(input_folder) if f.startswith('teacherstudentchat') and f.endswith('.tsv')]
file_names.sort(key=lambda x: int(re.search(r'\d+', x).group()))

for i, file_name in enumerate(file_names):
    print(f"Iteration {i}: {file_name}")
    file_path = os.path.join(input_folder, file_name)

    # Load the tab-separated transcript
    df = pd.read_csv(file_path, sep='\t')

    # Flatten the 'role' and 'edited' columns into one "role: edited" line per row
    transcript_lines = [f"{row['role']}: {row['edited']}" for _, row in df.iterrows()]
    transcript = "\n".join(transcript_lines)
    original_transcripts.append(transcript)

    # Replace the placeholders and keep both the new text and the recorded locations
    processed_transcript, placeholder_locations = replace_placeholders(transcript)
    processed_transcripts.append(processed_transcript)
    placeholder_locations_list.append(placeholder_locations)

# Persist the three result lists as indented JSON (stored in .txt files)
for out_name, payload in (
    ('original_transcripts.txt', original_transcripts),
    ('processed_transcripts.txt', processed_transcripts),
    ('placeholder_locations.txt', placeholder_locations_list),
):
    with open(out_name, 'w') as f:
        json.dump(payload, f, indent=4)

Iteration 0: teacherstudentchat00002.tsv
Iteration 1: teacherstudentchat00003.tsv
Iteration 2: teacherstudentchat00004.tsv
Iteration 3: teacherstudentchat00005.tsv
Iteration 4: teacherstudentchat00006.tsv
Iteration 5: teacherstudentchat00007.tsv
Iteration 6: teacherstudentchat00008.tsv
Iteration 7: teacherstudentchat00009.tsv
Iteration 8: teacherstudentchat00010.tsv
Iteration 9: teacherstudentchat00011.tsv
Iteration 10: teacherstudentchat00012.tsv
Iteration 11: teacherstudentchat00013.tsv
Iteration 12: teacherstudentchat00014.tsv
Iteration 13: teacherstudentchat00015.tsv
Iteration 14: teacherstudentchat00016.tsv
Iteration 15: teacherstudentchat00017.tsv
Iteration 16: teacherstudentchat00018.tsv
Iteration 17: teacherstudentchat00019.tsv
Iteration 18: teacherstudentchat00020.tsv
Iteration 19: teacherstudentchat00021.tsv
Iteration 20: teacherstudentchat00022.tsv
Iteration 21: teacherstudentchat00023.tsv
Iteration 22: teacherstudentchat00024.tsv
Iteration 23: teacherstudentchat00025.tsv
It

In [40]:
import openai
import os
import pandas as pd
import re

# Set your OpenAI API key
# openai.api_key = 'your-api-key'

# Function to generate prompts dynamically
def get_prompt(category):
    """Return the model prompt used to synthesize a value for `category`.

    The lower-cased category is searched for any of a fixed set of name-like
    keywords (substring match). When one is found the prompt asks for a single
    name; otherwise it asks for a single realistic result.

    Args:
        category: Placeholder text, e.g. "TEACHER'S HUSBAND" or "YEAR".

    Returns:
        The prompt string.
    """
    name_keywords = (
        "name", "teacher", "student", "friend", "husband", "child", "son", "daughter",
        "cousin", "sister", "brother", "instructor", "colleague", "partner", "employee",
        "researcher", "school", "account", "resort",
    )
    category_text = category.lower()

    # First keyword contained in the category, or None when nothing matches
    matched_keyword = next((kw for kw in name_keywords if kw in category_text), None)

    if matched_keyword is None:
        # Default prompt for non-name categories
        return f"Generate a realistic {category.lower()}. Just give me one result with no additional output."
    return f"Generate a {category.lower()}. Just give me one name with no additional output."

# Function to generate synthetic data
def generate_synthetic_data(category):
    """Ask the chat model for one synthetic value for a placeholder category.

    The prompt comes from `get_prompt(category)` and is sent through the
    module-level `client` with `temperature=0`.

    Args:
        category: Placeholder text (e.g. "STUDENT'S NAME").

    Returns:
        The model's reply with surrounding whitespace removed, so the value can
        be spliced into a transcript and its recorded span covers only the
        entity text.

    Raises:
        ValueError: If the response carries no text content.
    """
    prompt = get_prompt(category)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        temperature=0)

    content = response.choices[0].message.content
    # `content` is optional in the API response; fail with a clear message here instead of
    # a TypeError later when the caller measures the replacement length.
    if content is None:
        raise ValueError(f"Model returned no text for placeholder category {category!r}")
    return content.strip()

# Function to replace placeholders and record locations
def replace_placeholders(text, generator=None):
    """Replace every `<...>` placeholder in `text` and record where the replacements land.

    Each distinct placeholder is generated only once per call and reused for
    its later occurrences, so a placeholder maps to a single value throughout
    one transcript and no redundant requests are made.

    Args:
        text: Transcript text containing `<PLACEHOLDER>` spans.
        generator: Optional callable taking the placeholder text (without the
            angle brackets) and returning the replacement string. Defaults to
            `generate_synthetic_data`.

    Returns:
        A `(new_text, locations)` tuple. `locations` has one dict per
        occurrence, in order, with keys `start`, `end`, `placeholder` and
        `replacement`; `start`/`end` are offsets into `new_text`
        (`new_text[start:end] == replacement`).
    """
    if generator is None:
        generator = generate_synthetic_data

    pattern = re.compile(r'<([^>]+)>')
    placeholder_locations = []
    generated = {}   # placeholder text -> replacement, filled on first occurrence
    pieces = []      # alternating literal chunks and replacements of the output text
    cursor = 0       # read position in the original text
    out_length = 0   # length of the output assembled so far

    for match in pattern.finditer(text):
        placeholder = match.group(1)
        if placeholder not in generated:
            generated[placeholder] = generator(placeholder)
        synthetic_data = generated[placeholder]

        # Copy the untouched text that precedes this placeholder
        literal = text[cursor:match.start()]
        pieces.append(literal)
        out_length += len(literal)

        # The replacement starts exactly where the output currently ends
        start_pos = out_length
        pieces.append(synthetic_data)
        out_length += len(synthetic_data)

        placeholder_locations.append({
            "start": start_pos,
            "end": out_length,
            "placeholder": placeholder,
            "replacement": synthetic_data
        })
        cursor = match.end()

    # Trailing text after the last placeholder (the whole text when there is none)
    pieces.append(text[cursor:])
    return "".join(pieces), placeholder_locations

# Main processing function
def _transcript_file_index(file_name):
    """Return the first run of digits in a transcript file name as an int."""
    return int(re.search(r'\d+', file_name).group())


def process_transcripts(input_folder, file_prefix='teacherstudentchat0000',
                        output_path='synthetic_data_positions.csv'):
    """Replace placeholders in the selected transcripts and save the replacement spans as CSV.

    Files in `input_folder` whose names start with `file_prefix` and end with
    `.tsv` are processed in ascending order of the number in their name. Each
    file needs `role` and `edited` columns, which are flattened into one
    "role: edited" line per row before `replace_placeholders` is applied.

    Args:
        input_folder: Directory containing the transcript TSV files.
        file_prefix: File-name prefix used to select transcripts. The default
            only matches file numbers 00000-00009.
        output_path: Destination CSV with the columns `file_idx`,
            `entity_text`, `type` and `positions`.

    Returns:
        None. The CSV file is the only persisted result.
    """
    # NOTE(review): the processed transcripts are collected but not saved or returned,
    # although the positions in the CSV are offsets into them — confirm whether they
    # should be persisted as well.
    processed_transcripts = []
    placeholder_info = []

    # Get and sort all matching TSV files in the input folder
    file_names = sorted(
        [f for f in os.listdir(input_folder) if f.startswith(file_prefix) and f.endswith('.tsv')],
        key=_transcript_file_index
    )
    print(file_names)

    for file_name in file_names:
        # Take the index from the file name itself: a running counter would mislabel
        # every file after a gap in the numbering (or when the first file is not 00002).
        file_idx = _transcript_file_index(file_name)
        print(f"Processing file {file_idx}: {file_name}")
        file_path = os.path.join(input_folder, file_name)

        # Read the TSV file
        df = pd.read_csv(file_path, sep='\t')

        # Combine the 'role' and 'edited' columns into a single transcript
        transcript = "\n".join(
            f"{role}: {edited}" for role, edited in zip(df['role'], df['edited'])
        )

        # Process the transcript and record placeholder locations
        processed_transcript, placeholder_locations = replace_placeholders(transcript)
        processed_transcripts.append(processed_transcript)

        # One output row per replaced placeholder occurrence
        for entry in placeholder_locations:
            placeholder_info.append({
                "file_idx": file_idx,
                "entity_text": entry['replacement'],
                "type": entry['placeholder'],
                "positions": f"({entry['start']}, {entry['end']})"
            })

    # Explicit columns keep the CSV header stable even when no placeholder was found
    df_placeholder_info = pd.DataFrame(placeholder_info, columns=["file_idx", "entity_text", "type", "positions"])
    df_placeholder_info.to_csv(output_path, index=False)

    print(f"Processing completed. Data saved to {output_path}.")

# Example usage: run the placeholder replacement over the shared transcript folder.
# The call returns nothing; its result is the CSV file written by `process_transcripts`.
input_folder = '../../public'  # Define the correct input folder path
process_transcripts(input_folder)


['teacherstudentchat00002.tsv', 'teacherstudentchat00003.tsv', 'teacherstudentchat00004.tsv', 'teacherstudentchat00005.tsv', 'teacherstudentchat00006.tsv', 'teacherstudentchat00007.tsv', 'teacherstudentchat00008.tsv', 'teacherstudentchat00009.tsv']
Processing file 2: teacherstudentchat00002.tsv
Processing file 3: teacherstudentchat00003.tsv
Processing file 4: teacherstudentchat00004.tsv
Processing file 5: teacherstudentchat00005.tsv
Processing file 6: teacherstudentchat00006.tsv
Processing file 7: teacherstudentchat00007.tsv
Processing file 8: teacherstudentchat00008.tsv
Processing file 9: teacherstudentchat00009.tsv
Processing completed. Data saved to synthetic_data_positions.csv.
