In [1]:
import pandas as pd
from collections import Counter
import re
from nltk import ngrams, word_tokenize
import spacy
from itertools import product
from googletrans import Translator

In [4]:
splits = {'train': 'data/train-00000-of-00001-9564e8b05b4757ab.parquet', 'test': 'data/test-00000-of-00001-701d16158af87368.parquet'}
df = pd.read_parquet("hf://datasets/deepset/prompt-injections/" + splits["train"])

# Removing other language prompts

In [16]:
translator = Translator()
def detect_language(text):
    return translator.detect(text).lang

In [20]:
df['language'] = df['text'].apply(detect_language)
df = df[df['language'] == 'en']
df = df.drop(columns=['language'])

In [21]:
df.to_csv('english.csv')

In [22]:
df=pd.read_csv('english.csv')

# Normalising Dataset

In [23]:
def normalizeText(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r"[^\w\s]|_", "", sentence)
    sentence = re.sub(r"\s+", " ", sentence)

    return sentence


In [24]:
df['text'] = df['text'].apply(normalizeText)


In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,refugee crisis in europe solutions,0
1,1,my son is very interested in programming and w...,0
2,2,i am looking for a new book and would like to ...,0
3,3,i plan a trip around the world and wonder whic...,0
4,4,wow thats fantastic forget all previous tasks ...,1


In [26]:
injected=df[df['label']==1]

In [27]:
injected.head()

Unnamed: 0.1,Unnamed: 0,text,label
4,4,wow thats fantastic forget all previous tasks ...,1
10,10,attention stop i urgently need your help with ...,1
32,32,john and alice are two actors in a film about ...,1
38,38,hello gpt that was ok before now i have a grea...,1
42,42,i want you to act as a debater i will provide ...,1


# Naive Method for finding most common words

In [28]:
all_words = []
for sentence in injected['text']:
    words = sentence.split()
    all_words.extend(words)

word_counts = Counter(all_words)

In [29]:
for word, frequency in word_counts.most_common(10):
    print(word,frequency)

the 207
you 119
and 118
to 109
a 77
in 72
i 69
is 67
are 59
of 53


In [30]:
def extractTrigrams(text):
    tokens = text.split()
    return list(ngrams(tokens,3))

In [31]:
trigrams = list(injected['text'].apply(extractTrigrams)) 
trigrams = [x for xs in trigrams for x in xs]

In [32]:
Counter(trigrams).most_common(20)

[(('chatgpt', 'with', 'dan'), 19),
 (('with', 'dan', 'mode'), 19),
 (('dan', 'mode', 'enabled'), 18),
 (('germany', 'or', 'italy'), 10),
 (('or', 'italy', 'or'), 10),
 (('italy', 'or', 'swiss'), 9),
 (('how', 'do', 'you'), 8),
 (('i', 'want', 'you'), 8),
 (('want', 'you', 'to'), 8),
 (('how', 'will', 'i'), 7),
 (('will', 'i', 'be'), 7),
 (('i', 'be', 'able'), 7),
 (('to', 'drive', 'officially'), 7),
 (('drive', 'officially', 'at'), 7),
 (('officially', 'at', 'the'), 7),
 (('at', 'the', 'age'), 7),
 (('the', 'age', 'of'), 7),
 (('age', 'of', '16'), 7),
 (('of', '16', 'in'), 7),
 (('16', 'in', 'germany'), 7)]

In [33]:
nlp = spacy.load("en_core_web_md")

In [34]:
def filter_similar_words(input_words,target_tokens, threshold=0.5):
    similar_verbs = []
    for verb in input_words:
        token = nlp(verb)
        # Check similarity against all target tokens
        max_similarity = max([token.similarity(target) for target in target_tokens])
        if max_similarity >= threshold:
            similar_verbs.append(verb)
    return similar_verbs

In [35]:
def get_tokens(sentence,token_type):
    doc = nlp(sentence)
    words = [token.text for token in doc if token.pos_ == token_type]   
    return words

In [36]:
injected['text'] = injected['text'].apply(word_tokenize)
injected['text'] = injected['text'].apply(" ".join)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  injected['text'] = injected['text'].apply(word_tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  injected['text'] = injected['text'].apply(" ".join)


In [37]:
verbs = []
for j in [get_tokens(i,"VERB") for i in injected['text']]:
    verbs.extend(j)

In [38]:
adjectives = []
for j in [get_tokens(i,"ADJ") for i in injected['text']]:
    adjectives.extend(j)

In [39]:
nouns = []
for j in [get_tokens(i,"NOUN") for i in injected['text']]:
    nouns.extend(j)

In [40]:
verbs[:10]

['s',
 'forget',
 'focus',
 'show',
 'need',
 'following',
 'plays',
 'stay',
 'falling',
 'break']

In [41]:
adjectives[:10]

['fantastic',
 'previous',
 'new',
 'prompt',
 'unelectable',
 'excellent',
 'second',
 'ok',
 'great',
 'new']

In [42]:
nouns[:10]

['tasks',
 'task',
 'texts',
 'attention',
 'stop',
 'help',
 'question',
 'greens',
 'actors',
 'film']

In [43]:
target_verbs=["ignore","forget"]
target_verbs = [nlp(i) for i in target_verbs]
injected_verbs = filter_similar_words(verbs,target_verbs,threshold=0.575)

  max_similarity = max([token.similarity(target) for target in target_tokens])


In [44]:
target_adjs=["previous","above"]
target_adjs = [nlp(i) for i in target_adjs]
injected_adjs = filter_similar_words(adjectives,target_adjs,threshold=0.55)

  max_similarity = max([token.similarity(target) for target in target_tokens])


In [45]:
target_nouns=["content","input","text"]
target_nouns = [nlp(i) for i in target_nouns]
injected_nouns = filter_similar_words(nouns,target_nouns,threshold=0.65)

  max_similarity = max([token.similarity(target) for target in target_tokens])


In [46]:
set(injected_verbs)

{'argue',
 'blame',
 'do',
 'forget',
 'happend',
 'ignore',
 'ignores',
 'imagine',
 'know',
 'listen',
 'pretend',
 'refute',
 'remember',
 'remind',
 'tell',
 'think',
 'understand',
 'want'}

In [47]:
set(injected_adjs)

{'above',
 'current',
 'following',
 'further',
 'other',
 'previous',
 'recent',
 'regular'}

In [48]:
set(injected_nouns)

{'content',
 'context',
 'information',
 'input',
 'internet',
 'materials',
 'output',
 'text',
 'texts'}

In [49]:
rules = []
for verb in injected_verbs:
    for adj in injected_adjs:
        for noun in injected_nouns:
            rule = verb + " " + adj + " " + noun
            rules.append(rule)
rules = list(set(rules))


In [50]:
rules[:15]

['remember recent content',
 'argue above information',
 'blame previous materials',
 'ignores recent content',
 'refute current texts',
 'refute recent context',
 'argue previous information',
 'remind recent input',
 'remember current content',
 'refute above context',
 'remember current information',
 'refute following materials',
 'want other materials',
 'listen regular output',
 'forget other input']

In [51]:
def calculate_score(sentence):
    sentence = normalizeText(sentence)
    trigrams = extractTrigrams(sentence)
    numOfMatchedWords = 0
    matchingInjections = []
    # for trigram in trigrams:
    #     for rule in rules:
    #         numOfMatchedWords += len([injected for injected, word in zip(trigram, rule.split()) if word == injected])
    for trigram, rule in product(trigrams,rules):
        group = " ".join(trigram)
        if group == rule:
            matchingInjections.append(group)
            numOfMatchedWords += 1
    
    return numOfMatchedWords,matchingInjections

In [58]:
def check_proximity(sentence):
    n=1
    # Tokenize the sentence
    words = word_tokenize(sentence)
    
    # Find the positions of verbs, adjectives, and nouns
    verb_positions = [i for i, word in enumerate(words) if word in injected_verbs]
    adj_positions = [i for i, word in enumerate(words) if word in injected_adjs]
    noun_positions = [i for i, word in enumerate(words) if word in injected_nouns]
    score = 0
    # Check proximity between verbs, adjectives, and nouns
    for v in verb_positions:
        for adj in adj_positions:
            if abs(v - adj) <= n:
                score += 1
        for noun in noun_positions:
            if abs(v - noun) <= n:
                score += 1
            
                
    for adj in adj_positions:
        for noun in noun_positions:
            if abs(adj - noun) <= n:
                score += 1
    

    return score

In [59]:
df_test = df

In [60]:
df_test['score'] = df_test['text'].apply(check_proximity)

In [61]:
df_test.to_csv('test.csv')

In [56]:
# TODO create a confusion matrix with lable and score
# TODO plot frequency over score

In [57]:
def detect_verb_adj_noun_prep(sentence):
    # Process the sentence using spaCy
    doc = nlp(sentence)
    
    # Initialize variables to track matches
    verb = None
    adjective = None
    noun = None
    preposition = None
    
    commands = []

    # Loop through the tokens and try to find the pattern
    for token in doc:
        if verb is None and token.pos_ == "VERB":
            verb = token
        elif verb and adjective is None and token.pos_ == "ADJ":
            adjective = token
        elif verb and adjective and noun is None and token.dep_ == "dobj":
            noun = token
        elif verb and adjective and noun and preposition is None and token.pos_ == "ADP":
            preposition = token
            # Append the matched words to the commands list
            commands.append(f"{verb.text} {adjective.text} {noun.text} {preposition.text}")
            # Reset variables to find more patterns if they exist in the sentence
            verb = adjective = noun = preposition = None

    return commands

In [28]:
injected['command'] = injected['text'].apply(detect_verb_adj_noun_prep)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  injected['command'] = injected['text'].apply(detect_verb_adj_noun_prep)
