# Filtering the prompts
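This notebook cleans the manually split prompts by:
- removing rows whose conversational part is empty
- removing duplicate prompts
- removing leaked system prompts and task descriptions
- masking links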

In [14]:
import sqlite3
import pandas as pd
import os


# Relative path to the project's SQLite database.
db_path = "../../../giicg.db"

# Verify the file is there before connecting: by default sqlite3.connect()
# creates a new, empty database when the path does not exist, which would
# only surface later as a confusing "no such table" error.
if os.path.exists(db_path):
    conn = sqlite3.connect(db_path)
else:
    raise FileNotFoundError(f"Database file does not exist: {db_path}")

# Load every column of the manually split prompts into a DataFrame.
prompts = pd.read_sql(sql="SELECT * FROM manually_split_prompts", con=conn)
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6
...,...,...,...,...,...,...,...,...,...
934,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92
935,1845,37,user,\n nun möchte ich judgement balancing m...,\n nun möchte ich judgement balancing m...,,,Woman (cisgender),29
936,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,\n ich sehe keine veränderung im Plot. Was ...,,,Woman (cisgender),29
937,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8


## Remove empty rows

In [15]:
# Drop rows that have no conversational text at all (NULL in the database).
prompts = prompts.dropna(subset=['conversational'])
# Drop rows whose conversational text is empty or whitespace only.
# No second dropna is needed afterwards: this filter cannot introduce NaN.
prompts = prompts[prompts['conversational'].str.strip() != '']

prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6
...,...,...,...,...,...,...,...,...,...
934,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92
935,1845,37,user,\n nun möchte ich judgement balancing m...,\n nun möchte ich judgement balancing m...,,,Woman (cisgender),29
936,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,\n ich sehe keine veränderung im Plot. Was ...,,,Woman (cisgender),29
937,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8


## Filter out duplicates

In [16]:
# Flag every repeat of a `conversational` text after its first occurrence,
# then keep only the unflagged rows (i.e. the first of each group).
is_repeat = prompts.duplicated(subset=['conversational'])
prompts = prompts[~is_repeat]
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6
...,...,...,...,...,...,...,...,...,...
934,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92
935,1845,37,user,\n nun möchte ich judgement balancing m...,\n nun möchte ich judgement balancing m...,,,Woman (cisgender),29
936,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,\n ich sehe keine veränderung im Plot. Was ...,,,Woman (cisgender),29
937,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8


## Remove leaked system prompts and task description

In [17]:
# Opening words of the texts that are not real user prompts and must be removed.
leaked_prefixes = (
    "You are tasked with separating",
    "Have fun and good luck!",
)

# NOTE(review): the match is on the exact start of the string, so a leaked
# text preceded by whitespace or a newline would not be caught - confirm that
# no such rows exist.
for leaked_prefix in leaked_prefixes:
    is_leaked = prompts['conversational'].str.startswith(leaked_prefix)
    prompts = prompts[~is_leaked]
prompts

Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6
...,...,...,...,...,...,...,...,...,...
934,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92
935,1845,37,user,\n nun möchte ich judgement balancing m...,\n nun möchte ich judgement balancing m...,,,Woman (cisgender),29
936,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,\n ich sehe keine veränderung im Plot. Was ...,,,Woman (cisgender),29
937,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8


## Mask links

In [18]:
import re

# Matches http://, https:// and ftp:// URLs as well as bare "www." hosts,
# regardless of letter case. The leading \b keeps the pattern from firing in
# the middle of a word (e.g. the "www." inside "awww.").
_URL_PATTERN = re.compile(r'\b(?:(?:https?|ftp)://|www\.)\S+', re.IGNORECASE)

# Characters that, at the very end of a match, are treated as part of the
# surrounding sentence rather than part of the URL.
_URL_TRAILING_PUNCTUATION = '.,;:!?\'")]}>'


def mask_urls(text):
    """Replace every URL in ``text`` with the literal placeholder ``URL``.

    A URL is a run of non-whitespace characters starting with ``http://``,
    ``https://``, ``ftp://`` or ``www.``. Punctuation and closing brackets or
    quotes directly after the URL are kept, so ``"see https://a.b."``
    becomes ``"see URL."`` instead of losing the full stop.

    Args:
        text: The string to mask.

    Returns:
        A copy of ``text`` with each URL replaced by ``URL``.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    def _mask(match):
        url = match.group(0)
        # Whatever rstrip removes is sentence punctuation; put it back after
        # the placeholder.
        core = url.rstrip(_URL_TRAILING_PUNCTUATION)
        return 'URL' + url[len(core):]

    return _URL_PATTERN.sub(_mask, text)

# Build a new frame instead of assigning into `prompts`: at this point it is
# the result of boolean filtering, and writing a column into such a selection
# can trigger pandas' SettingWithCopyWarning.
prompts = prompts.assign(conversational=prompts['conversational'].apply(mask_urls))

prompts


Unnamed: 0,message_id,conversation_id,role,message_text,conversational,code,other,gender,user_id
0,1,1,user,"parsing data from python iterator, how it coul...","parsing data from python iterator, how it coul...",,,Man (cisgender),6
1,730,32,user,Write python function to do operations with in...,Write python function to do operations with in...,,report_dt\tsource\tmetric_name\tmetric_num\tme...,Man (cisgender),6
2,1133,55,user,Write shortest tutorial on creating RAG on ema...,Write shortest tutorial on creating RAG on ema...,,,Man (cisgender),6
3,1135,55,user,what is FAISS,what is FAISS,,,Man (cisgender),6
4,1137,55,user,Transform given code to process large .mbox file,Transform given code to process large .mbox file,,Transform given code to process large .mbox file,Man (cisgender),6
...,...,...,...,...,...,...,...,...,...
934,1646,82,user,"def run_query(query, n_results):\n query_em...",this is my code. I want to: Get nodes and edge...,"def run_query(query, n_results):\n query_em...",,Man (cisgender),92
935,1845,37,user,\n nun möchte ich judgement balancing m...,\n nun möchte ich judgement balancing m...,,,Woman (cisgender),29
936,1847,37,user,\n ich sehe keine veränderung im Plot. Was ...,\n ich sehe keine veränderung im Plot. Was ...,,,Woman (cisgender),29
937,1849,2,user,\n I am working on the problem of reconstru...,\n I am working on the problem of reconstru...,,Classic CV - Drone navigation\nIf you ever tho...,Man (cisgender),8


## Write back to database

In [19]:
# Persist the cleaned prompts, replacing any existing `filtered_prompts` table.
# The connection is closed even when the write fails, so the database handle
# is never left open.
try:
    prompts.to_sql('filtered_prompts', conn, if_exists='replace', index=False)
finally:
    conn.close()