# Masking Code, Info, URLs, and IDs in Prompts


In [1]:
import pandas as pd
import sqlite3
import os
import numpy as np

from helpers.make_latex_table import create_latex_tables
from helpers.normalization import remove_punctuation, remove_punctuation_and_newlines, remove_newlines
from helpers.statistical_tests import run_t_test_on_gender, compare_genders

# Path to the project's SQLite database, relative to the working directory.
db_path = "../../giicg.db"
# Check up front: sqlite3.connect() would otherwise create a new, empty
# database file at this path instead of failing.
if not os.path.exists(db_path):
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

conn = sqlite3.connect(db_path)
# Load the complete `test_data_prompts` table into a DataFrame.
prompts = pd.read_sql("SELECT * from test_data_prompts", conn)
# Optional filter, currently disabled: keep only the two cisgender groups.
#prompts = prompts[prompts['gender'].isin(['Woman (cisgender)', 'Man (cisgender)'])].reset_index()
prompts

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,id,prompt,gender,label,created_at,user_id
0,1,I need some python code to verify the password...,Woman (cisgender),1,2025-10-04 00:19:48,1
1,2,I want you to create a plot for weather data. ...,Woman (cisgender),1,2025-10-04 00:22:51,1
2,3,Perfect. Can you add little bullet point on th...,Woman (cisgender),1,2025-10-04 00:22:51,1
3,4,Can you remove the x-axis description 'Days of...,Woman (cisgender),1,2025-10-04 00:22:51,1
4,5,great. Now I want to create another plot. The ...,Woman (cisgender),1,2025-10-04 00:22:51,1
...,...,...,...,...,...,...
74,75,Okay I need you to make some more adjustments....,Man (cisgender),0,2025-10-04 00:54:04,10
75,76,Okay almost! I still want lines for the x and ...,Man (cisgender),0,2025-10-04 00:54:04,10
76,77,very close. The only thing that needs to be ch...,Man (cisgender),0,2025-10-04 00:54:04,10
77,78,"Great! One more thing: Remove the ""Weekdays"" t...",Man (cisgender),0,2025-10-04 00:54:04,10


In [2]:
# Count the distinct values of every other column per gender group.
# NOTE: despite its name, `users` holds one distinct-count per column; the
# number of distinct participants per gender is its `user_id` column.
users = prompts.groupby('gender').nunique()
users

Unnamed: 0_level_0,id,prompt,label,created_at,user_id
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Man (cisgender),46,46,1,11,5
Woman (cisgender),33,33,1,13,5


## Set up LLM

In [3]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate, ChatPromptTemplate

load_dotenv()

# Schema for the structured LLM response. It is handed to
# `with_structured_output` in `mask_prompt`, which then reads `masked_prompt`
# from the parsed result.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — presumably a docstring would become part of the schema sent to
# the model; confirm before adding one.
class OutputFormat(BaseModel):
    # The prompt text after masking.
    masked_prompt: str = Field(description="The processed version of the original prompt")

# System message: fixes the assistant's role for every masking request.
system_prompt = SystemMessagePromptTemplate.from_template(
    "You are an experienced NLP researcher that helps preprocessing text for NLP tasks."
)

# Human message: the masking instructions plus the `{prompt}` placeholder that
# is filled with the text to mask. The template text is part of the request
# sent to the model (indentation included), so any edit here can change the
# masking results. Note that the literal is not a raw string: the `\n`
# sequences in the [INFO] example are real line breaks in the template.
user_prompt = HumanMessagePromptTemplate.from_template(
    """
    You are tasked with masking out parts of LLM prompts in order to prepare them for authorship classification.
    The prompt given to you is provided below:
    ---
    {prompt}
    ---
    Your task is to mask out any part that is not written in natural language. The masking tokens are the following:

    - [TERM]
    - [CODE]
    - [ERROR]
    - [URL]
    - [INFO]
    - [ID]
    - [OTHER]

    Below are some examples:

    'ObservableHQDatabaseClient#queryStream()result' should be masked out by [TERM]

    'queryStream = Object.dosomething()' should be masked out by [CODE]

    'https://www.youtube.com/watch?v=dQw4w9WgXcQ' should be masked out by [URL]

    'Error 21: undefined is not an object. Line: 44 ->      doc.artboards.setActiveArtboardIndex(i);' should be masked out by [ERROR]

    'data	persona	instruction	original	critique	question type 0	text...	age: 95\nsex: Female\nrace: White alone\nances...	text	text...	text...	train' should be masked out by [INFO]

    'task_id=565631' should be masked out by [ID]

    Total prompt example:
    'I want to remove all rows where the task_id=565631 can you correct my code?' should be transformed to:
    'I want to remove all rows where the [ID] can you correct my code?'

    If neither of the above masking tokens fit, mask out the respective part by [OTHER].

    In mixed sentences, only the non-natural language part should be masked out, keeping the rest.
    Punctuation, line breaks, and spacing should be preserved as much as possible apart from the masked spans.
    Multiple maskings in the same prompt should use different tokens where appropriate, unless they are nested.
    If a span could fit more than one label (e.g., a URL inside an error message), the outermost label should be used. Ids inside code blocks should not be masked separately, instead the whole code block is masked.
    """,
    input_variables=["prompt"],
)

# Full chat prompt: the system message followed by the human message.
complete_prompt = ChatPromptTemplate.from_messages([system_prompt, user_prompt])

# Pinned model snapshot used for masking.
_MASKING_MODEL = "gpt-4.1-2025-04-14"
# Masking chain, built lazily on first use and then reused for every prompt.
_masking_chain = None


def _get_masking_chain():
    """Return the prompt -> structured LLM -> masked-text chain, building it once."""
    global _masking_chain
    if _masking_chain is None:
        # No API key is passed explicitly; ChatOpenAI is expected to pick up
        # OPENAI_API_KEY from the environment.
        llm = ChatOpenAI(temperature=0.0, model=_MASKING_MODEL)
        structured_llm = llm.with_structured_output(OutputFormat)
        _masking_chain = (
            {"prompt": lambda x: x["prompt"]}
            | complete_prompt
            | structured_llm
            | {"masked_prompt": lambda x: x.masked_prompt}
        )
    return _masking_chain


def mask_prompt(prompt):
    """Mask the non-natural-language spans of ``prompt`` using the LLM.

    Args:
        prompt: Text of a single user prompt.

    Returns:
        The masked prompt text. Non-string input (e.g. ``None`` or NaN coming
        from a DataFrame column) yields ``np.nan``, and empty or
        whitespace-only strings are returned unchanged; neither case calls
        the API. ``np.nan`` is also returned when the LLM request fails, so a
        single bad prompt does not abort a whole ``apply`` run.
    """
    if not isinstance(prompt, str):
        return np.nan
    if not prompt.strip():
        # Nothing to mask; skip the request.
        return prompt

    chain = _get_masking_chain()
    print("evaluating next prompt")
    try:
        response = chain.invoke({"prompt": prompt})
        return response["masked_prompt"]
    except Exception as e:
        # Deliberate best effort: report the failure and mark the row as
        # missing instead of raising.
        print(f"Error processing prompt '{prompt}': {e}")
        return np.nan


In [4]:
# Run `mask_prompt` once per row, sequentially, and store the result in a new
# `masked_prompt` column; rows whose LLM request failed get NaN.
prompts['masked_prompt'] = prompts['prompt'].apply(mask_prompt)
prompts

evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating next prompt
evaluating 

Unnamed: 0,id,prompt,gender,label,created_at,user_id,masked_prompt
0,1,I need some python code to verify the password...,Woman (cisgender),1,2025-10-04 00:19:48,1,I need some python code to verify the password...
1,2,I want you to create a plot for weather data. ...,Woman (cisgender),1,2025-10-04 00:22:51,1,I want you to create a plot for weather data. ...
2,3,Perfect. Can you add little bullet point on th...,Woman (cisgender),1,2025-10-04 00:22:51,1,Perfect. Can you add little bullet point on th...
3,4,Can you remove the x-axis description 'Days of...,Woman (cisgender),1,2025-10-04 00:22:51,1,Can you remove the x-axis description [OTHER] ...
4,5,great. Now I want to create another plot. The ...,Woman (cisgender),1,2025-10-04 00:22:51,1,great. Now I want to create another plot. The ...
...,...,...,...,...,...,...,...
74,75,Okay I need you to make some more adjustments....,Man (cisgender),0,2025-10-04 00:54:04,10,Okay I need you to make some more adjustments....
75,76,Okay almost! I still want lines for the x and ...,Man (cisgender),0,2025-10-04 00:54:04,10,Okay almost! I still want lines for the x and ...
76,77,very close. The only thing that needs to be ch...,Man (cisgender),0,2025-10-04 00:54:04,10,very close. The only thing that needs to be ch...
77,78,"Great! One more thing: Remove the ""Weekdays"" t...",Man (cisgender),0,2025-10-04 00:54:04,10,Great! One more thing: Remove the [TERM] title...


In [37]:
# NOTE(review): the output recorded for this cell (execution count 37) has a
# different column set than the `prompts` frame built in the cells above, so
# it appears to be stale output from an earlier run on another table —
# re-run this cell before relying on it.
prompts

Unnamed: 0,level_0,index,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id,language,label,masked_prompt
0,0,0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6,en,0,"parsing data from [TERM], how it could be hand..."
1,1,1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6,en,0,Write python function to do operations with in...
2,2,2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6,en,0,Write shortest tutorial on creating [TERM] on ...
3,3,3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6,en,0,what is [TERM]
4,4,4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6,en,0,Transform given code to process large [TERM] file
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
531,531,501,1674,87,user,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,these are the results. i to calculate a statis...,,Accuracy: 1.0\n Count: 2\nMetrics for neptune...,Woman (cisgender),73,en,1,[INFO]\n[INFO]\n[TERM]:\n [TERM]: [OTHER]\n ...
532,532,416,1290,65,user,how are we currently processing non numerical ...,how are we currently processing non numerical ...,"def perform_optics_clustering(file_path, outpu...",,Woman (cisgender),73,en,1,how are we currently processing non numerical ...
533,533,425,1314,65,user,what is the reachability score,what is the reachability score,,,Woman (cisgender),73,en,1,what is the [TERM]
534,534,309,372,21,user,"my features are saved in ""train_features.npy"" ...","my features are saved in ""train_features.npy"" ...",,,Woman (cisgender),73,en,1,my features are saved in [TERM] and the file n...


## Save to Database

In [5]:
# Persist the DataFrame, including the new `masked_prompt` column, to the
# database. if_exists="replace" drops and recreates the table on every run,
# and rows whose masking failed are written with a missing `masked_prompt`.
prompts.to_sql("masked_test_set_prompts", conn, if_exists="replace", index=False)

79