# Masking Code, Info, URLs, IDs from Prompts


In [34]:
import pandas as pd
import sqlite3
import os
import numpy as np

from helpers.make_latex_table import create_latex_tables
from helpers.normalization import remove_punctuation, remove_punctuation_and_newlines, remove_newlines
from helpers.statistical_tests import run_t_test_on_gender, compare_genders

db_path = "../../giicg.db"
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

conn = sqlite3.connect(db_path)
prompts = pd.read_sql("SELECT * from expanded_roberta_prompts", conn)
prompts = prompts[prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
prompts

Unnamed: 0,level_0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label
0,0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0
1,1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0
2,2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0
3,3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0
4,4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,531,501,1674,87,user,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,these are the results. i to calculate a statis...,,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,Woman (cisgender),73,en,1
532,532,416,1290,65,user,how are we currently processing non numerical ...,how are we currently processing non numerical ...,"def perform_optics_clustering(file_path, outpu...",,Woman (cisgender),73,en,1
533,533,425,1314,65,user,what is the reachability score,what is the reachability score,,,Woman (cisgender),73,en,1
534,534,309,372,21,user,"my features are saved in ""train_features.npy"" ...","my features are saved in ""train_features.npy"" ...",,,Woman (cisgender),73,en,1


In [16]:
users = prompts.groupby('gender').nunique()
users

Unnamed: 0_level_0,level_0,index,message_id,conversation_id,role,message_text,conversational,code,other,user_id,language,label
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Man (cisgender),282,282,282,37,1,282,282,49,48,15,6,1
Woman (cisgender),254,254,254,43,1,254,254,77,68,13,5,1


## Set up LLM

In [35]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate

load_dotenv()

class OutputFormat(BaseModel):
    masked_prompt: str = Field(description="The processed version of the original prompt")

system_prompt = SystemMessagePromptTemplate.from_template(
    "You are an experienced NLP researcher that helps preprocessing text for NLP tasks."
)

user_prompt = HumanMessagePromptTemplate.from_template(
    """
    You are tasked with masking out parts of LLM prompts in order to prepare them for authorship classification.
    The prompt given to you is provided below:
    ---
    {prompt}
    ---
    Your task is to mask out any part that is not written in natural language. The masking tokens are the following:

    - [TERM]
    - [CODE]
    - [ERROR]
    - [URL]
    - [INFO]
    - [ID]
    - [OTHER]

    Below are some examples:

    'ObservableHQDatabaseClient#queryStream()result' should be masked out by [TERM]

    'queryStream = Object.dosomething()' should be masked out by [CODE]

    'https://www.youtube.com/watch?v=dQw4w9WgXcQ' should be masked out by [URL]

    'Error 21: undefined is not an object. Line: 44 ->      doc.artboards.setActiveArtboardIndex(i);' should be masked out by [ERROR]

    'data	persona	instruction	original	critique	question type 0	text...	age: 95\nsex: Female\nrace: White alone\nances...	text	text...	text...	train' should be masked out by [INFO]

    'task_id=565631' should be masked out by [ID]

    Total prompt example:
    'I want to remove all rows where the task_id=565631 can you correct my code?' should be transformed to:
    'I want to remove all rows where the [ID] can you correct my code?'

    If neither of the above masking tokens fit, mask out the respective part by [OTHER].

    In mixed sentences, only the non-natural language part should be masked out, keeping the rest.
    Punctuation, line breaks, and spacing should be preserved as much as possible apart from the masked spans.
    Multiple maskings in the same prompt should use different tokens where appropriate, unless they are nested.
    If a span could fit more than one label (e.g., a URL inside an error message), the outermost label should be used. Ids inside code blocks should not be masked separately, instead the while code block is masked.
    """,

input_variables=["prompt"]
)

complete_prompt = ChatPromptTemplate.from_messages([system_prompt, user_prompt])

def mask_prompt(prompt):
    OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
    openai_model = "gpt-4.1-2025-04-14"
    llm = ChatOpenAI(temperature=0.0, model=openai_model)
    structured_llm = llm.with_structured_output(OutputFormat)

    print(f"evaluating next prompt")
    chain_one = (
            {"prompt": lambda x: x["prompt"]}
            | complete_prompt
            | structured_llm
            | {"masked_prompt": lambda x: x.masked_prompt}
    )
    try:
        response =  chain_one.invoke({"prompt": prompt})
        return response["masked_prompt"]
    except Exception as e:
        print(f"Error processing prompt '{prompt}': {e}")
        # You can return np.nan or any string like "[MASKING_ERROR]" as needed
        return np.nan


In [36]:
prompts['masked_prompt'] = prompts['message_text'].apply(mask_prompt)
prompts

evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
Error processing prompt 'all of this happens when I open a card, why?
        
        Action dispatched: {type: 'SET_SELECTED_CARD_NODE', payload: {…}}
        main.5130a60c7955227d33d9.hot-update.js:71 Project ID: 67e80b433065ddbcbce6363a
        actions.js:84 Action dispatched: {type: 'SET_CARD_CONFIG', payload: {…}}
        actions.js:84 Action dispatched: {type: 'SET_GLOBAL_LOADING', payload: true}
        actions.js:84 Action dispatched: {type: 'SET_GLOBAL_LOADING', payload: true}
  

Unnamed: 0,level_0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label,masked_prompt
0,0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0,"parsing data from [TERM], how it could be hand..."
1,1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0,Write python function to do operations with in...
2,2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0,Write shortest tutorial on creating [TERM] on ...
3,3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0,what is [TERM]
4,4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0,Transform given code to process large [TERM] file
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,531,501,1674,87,user,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,these are the results. i to calculate a statis...,,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,Woman (cisgender),73,en,1,[INFO]\n[INFO]\n[TERM]:\n [TERM]: [OTHER]\n ...
532,532,416,1290,65,user,how are we currently processing non numerical ...,how are we currently processing non numerical ...,"def perform_optics_clustering(file_path, outpu...",,Woman (cisgender),73,en,1,how are we currently processing non numerical ...
533,533,425,1314,65,user,what is the reachability score,what is the reachability score,,,Woman (cisgender),73,en,1,what is the [TERM]
534,534,309,372,21,user,"my features are saved in ""train_features.npy"" ...","my features are saved in ""train_features.npy"" ...",,,Woman (cisgender),73,en,1,my features are saved in [TERM] and the file n...


In [37]:
prompts

Unnamed: 0,level_0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label,masked_prompt
0,0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0,"parsing data from [TERM], how it could be hand..."
1,1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0,Write python function to do operations with in...
2,2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0,Write shortest tutorial on creating [TERM] on ...
3,3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0,what is [TERM]
4,4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0,Transform given code to process large [TERM] file
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,531,501,1674,87,user,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,these are the results. i to calculate a statis...,,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,Woman (cisgender),73,en,1,[INFO]\n[INFO]\n[TERM]:\n [TERM]: [OTHER]\n ...
532,532,416,1290,65,user,how are we currently processing non numerical ...,how are we currently processing non numerical ...,"def perform_optics_clustering(file_path, outpu...",,Woman (cisgender),73,en,1,how are we currently processing non numerical ...
533,533,425,1314,65,user,what is the reachability score,what is the reachability score,,,Woman (cisgender),73,en,1,what is the [TERM]
534,534,309,372,21,user,"my features are saved in ""train_features.npy"" ...","my features are saved in ""train_features.npy"" ...",,,Woman (cisgender),73,en,1,my features are saved in [TERM] and the file n...


## Save to Database

In [39]:
prompts.to_sql("expanded_roberta_prompts", conn, if_exists="replace", index=False)

536