In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
def _read_prompts(path):
    """Read one prompt file into a single-column, header-less DataFrame.

    Every file is parsed as tab-separated text with no header row, so the
    resulting column is named with the integer 0 until it is renamed later.
    """
    # NOTE(review): pandas' default quote handling is in effect here; a prompt
    # that begins with a double quote may be parsed as a quoted field — confirm
    # that the row counts match the line counts of the files.
    return pd.read_csv(path, header=None, sep="\t")


# Benign prompts: plain prompts plus the "adjacent to malicious" variants 1-4.
df1 = _read_prompts("hackathon/prompts/benign/prompts/good")
df2 = _read_prompts("hackathon/prompts/benign/prompts/bad")
df3 = _read_prompts("hackathon/prompts/benign/good_adj_mal/good_adj_mal_1")
df4 = _read_prompts("hackathon/prompts/benign/good_adj_mal/good_adj_mal_2")
df5 = _read_prompts("hackathon/prompts/benign/good_adj_mal/good_adj_mal_3")
df6 = _read_prompts("hackathon/prompts/benign/good_adj_mal/good_adj_mal_4")
df7 = _read_prompts("hackathon/prompts/benign/bad_adj_mal/bad_adj_mal_1")
df8 = _read_prompts("hackathon/prompts/benign/bad_adj_mal/bad_adj_mal_2")
df9 = _read_prompts("hackathon/prompts/benign/bad_adj_mal/bad_adj_mal_3")
df10 = _read_prompts("hackathon/prompts/benign/bad_adj_mal/bad_adj_mal_4")

# Malicious prompts: variants 1-4 for both the "good" and "bad" prompt sets.
df11 = _read_prompts("hackathon/prompts/malicious/good_mal/good_mal_1")
df12 = _read_prompts("hackathon/prompts/malicious/good_mal/good_mal_2")
df13 = _read_prompts("hackathon/prompts/malicious/good_mal/good_mal_3")
df14 = _read_prompts("hackathon/prompts/malicious/good_mal/good_mal_4")
df15 = _read_prompts("hackathon/prompts/malicious/bad_mal/bad_mal_1")
df16 = _read_prompts("hackathon/prompts/malicious/bad_mal/bad_mal_2")
df17 = _read_prompts("hackathon/prompts/malicious/bad_mal/bad_mal_3")
df18 = _read_prompts("hackathon/prompts/malicious/bad_mal/bad_mal_4")

In [3]:
dfs_benign = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10]
dfs_malicious = [df11, df12, df13, df14, df15, df16, df17, df18]

# Name the single text column "prompt" and attach the class label in place
# (0 = benign, 1 = malicious). The frames are mutated rather than copied
# because later cells refer to them through the names df1..df18.
for label_value, frames in ((0, dfs_benign), (1, dfs_malicious)):
    for df in frames:
        # Skip the rename when this cell has already run on the frame: a
        # labelled frame has two columns, so assigning a one-element column
        # list again would raise a length-mismatch ValueError on re-execution.
        # A freshly loaded frame that parsed into more than one column still
        # raises here, exactly as before.
        if list(df.columns) != ["prompt", "label"]:
            df.columns = ["prompt"]
        df["label"] = np.full(df.shape[0], label_value, dtype=int)

## Experiment 1: Train and test on malicious suffixes 1 and 2

In [4]:
# Dataset 1: the plain benign prompts plus the benign-adjacent and malicious
# files numbered 1 and 2 only (files 3 and 4 are held back for Experiment 2).
dfs_dataset_1 = [df1, df2, df3, df4, df7, df8, df11, df12, df15, df16]
dataset_1 = pd.concat(dfs_dataset_1, axis=0).reset_index(drop=True)

# 80/20 train/test split with a fixed seed so the split is repeatable.
# NOTE(review): no `stratify` argument is passed, so the class ratio of the
# two partitions is whatever the random shuffle yields.
X, y = dataset_1["prompt"], dataset_1["label"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the TF-IDF vocabulary on the training prompts only; the test prompts are
# merely transformed so that no test-set statistics enter the features.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a default-configured logistic regression and score the held-out split.
model = LogisticRegression().fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

# Evaluate: accuracy, per-class report and confusion matrix, in that order.
accuracy, classification_rep, confusion_mat = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion_mat)

Accuracy: 1.00
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       364
           1       1.00      1.00      1.00       234

    accuracy                           1.00       598
   macro avg       1.00      1.00      1.00       598
weighted avg       1.00      1.00      1.00       598

Confusion Matrix:
 [[364   0]
 [  0 234]]


## Experiment 2: Can the model generalize to malicious suffixes 3 and 4?

In [5]:
# Dataset 2: only the benign-adjacent and malicious files numbered 3 and 4,
# i.e. none of the frames that were concatenated into dataset 1.
dfs_dataset_2 = [df5, df6, df9, df10, df13, df14, df17, df18]
dataset_2 = pd.concat(dfs_dataset_2, axis=0).reset_index(drop=True)

In [6]:
# Features (raw prompt text) and targets (0/1 label) for dataset 2.
X2, y2 = dataset_2["prompt"], dataset_2["label"]

In [7]:
# Reuse the vectorizer fitted in Experiment 1 (transform only, no refit), so
# dataset 2 is encoded with the vocabulary learned from the Experiment 1
# training prompts.
X2_tfidf = tfidf_vectorizer.transform(X2)

In [8]:
# Predict with the Experiment 1 model as-is; it is not retrained on dataset 2.
y2_pred = model.predict(X2_tfidf)

In [9]:
# Score the Experiment 1 model on dataset 2: accuracy, per-class report and
# confusion matrix, computed in that order from the same label/prediction pair.
accuracy_2, classification_rep_2, confusion_mat_2 = (
    metric(y2, y2_pred)
    for metric in (accuracy_score, classification_report, confusion_matrix)
)

print(f"Accuracy: {accuracy_2:.2f}")
print("Classification Report:\n", classification_rep_2)
print("Confusion Matrix:\n", confusion_mat_2)

Accuracy: 0.75
Classification Report:
               precision    recall  f1-score   support

           0       0.66      1.00      0.80      1186
           1       1.00      0.49      0.66      1200

    accuracy                           0.75      2386
   macro avg       0.83      0.75      0.73      2386
weighted avg       0.83      0.75      0.73      2386

Confusion Matrix:
 [[1186    0]
 [ 608  592]]
