In [None]:
from google.colab import drive
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from gensim.models import Word2Vec
from imblearn.combine import SMOTETomek
import nltk
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, precision_recall_curve, auc
import pickle

In [None]:
# Fetch the NLTK resources used below: the punkt tokenizer data and the stop-word lists.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Mount Google Drive so the project data folder is reachable from Colab.
drive.mount('/content/drive')

# Folder holding the train / validation / test CSV splits.
file_path = '/content/drive/MyDrive/Fall 2024/SML 312/Final Project/data'

# Read the three splits in one pass; each file is named after its split.
train_df, val_df, test_df = (
    pd.read_csv(f'{file_path}/{split}.csv') for split in ('train', 'val', 'test')
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Replace missing cleaned prompts with empty strings in every split so the
# vectorizers and tokenizers below always receive a string.
for split_df in (train_df, val_df, test_df):
  split_df['clean_prompt'] = split_df['clean_prompt'].fillna('')

In [None]:
# Function for viewing evaluation metrics

def evaluation_metrics(title, y_pred_prob, y_true, threshold=0.5, plots=True):
  """Print (and optionally plot) evaluation metrics for a binary classifier.

  Args:
    title: Model name; used in the printed header, the 'Model' column and the plot titles.
    y_pred_prob: Predicted probabilities/scores for the positive class (list, Series or array).
    y_true: Ground-truth binary labels aligned with y_pred_prob
      (presumably 0 = benign, 1 = jailbreak, given the plot labels - confirm).
    threshold: Scores >= threshold are predicted as the positive class.
    plots: When True, also draw the confusion matrix and the precision-recall curve.

  Returns:
    Tuple (results, fig): results is a one-row DataFrame with columns
    'Model', 'Accuracy', 'Recall', 'F1-Score' and 'Precision-Recall AUC';
    fig is the matplotlib Figure, or None when plots is False.
  """
  # Coerce once so plain lists work with the comparison / astype below.
  y_pred_prob = np.asarray(y_pred_prob)
  y_pred = (y_pred_prob >= threshold).astype(int)

  # Threshold-free ranking quality. The curve is computed once and reused for the plot;
  # dedicated names keep it from being confused with the scalar recall score below.
  pr_precision, pr_recall, _ = precision_recall_curve(y_true, y_pred_prob)
  pr_auc = auc(pr_recall, pr_precision)

  # Metrics at the chosen decision threshold.
  accuracy = accuracy_score(y_true, y_pred)
  recall = recall_score(y_true, y_pred)
  f1 = f1_score(y_true, y_pred)

  results = pd.DataFrame({
    'Model': [title],
    'Accuracy': [accuracy],
    'Recall': [recall],
    'F1-Score': [f1],
    'Precision-Recall AUC': [pr_auc]
  })

  print(f"=== {title} Classification Report ===")
  print(results)

  if not plots:
    return (results, None)

  fig, axes = plt.subplots(1, 2, figsize=(12, 5))

  # Left panel: confusion matrix at the chosen threshold.
  conf_matrix = confusion_matrix(y_true, y_pred)
  disp = ConfusionMatrixDisplay(conf_matrix, display_labels=['Benign', 'Jailbreak'])
  disp.plot(ax=axes[0], cmap=plt.cm.Blues)
  axes[0].set_title(title + ' Confusion Matrix')

  # Right panel: precision-recall curve over all thresholds.
  axes[1].plot(pr_recall, pr_precision, color='blue', lw=2, label=f'PR curve (area = {pr_auc:.2f})')
  axes[1].set_xlabel('Recall')
  axes[1].set_ylabel('Precision')
  axes[1].set_title(title + ' Precision-Recall Curve')
  axes[1].legend(loc="lower left")

  plt.tight_layout()
  plt.show()

  return (results, fig)

In [None]:
# TF-IDF's max_features caps how many of the most common features (words) are kept.
# Sweep several caps with unigrams only, scoring each on the validation split
# with a Logistic Regression model to demonstrate results.

max_features_vals = [1000, 2000, 3000, 5000, 8000, 10000]

for max_features in max_features_vals:
  # Vocabulary is fit on the training prompts only, then applied to validation.
  tfidf_vectorizer = TfidfVectorizer(max_features=max_features)
  train_features = tfidf_vectorizer.fit_transform(train_df['clean_prompt'])
  val_features = tfidf_vectorizer.transform(val_df['clean_prompt'])

  log_reg = LogisticRegression(class_weight='balanced', max_iter=10000)
  log_reg.fit(train_features, train_df['jailbreak'])

  # Column 1 of predict_proba is used as the positive-class score.
  y_pred_prob = log_reg.predict_proba(val_features)[:, 1]
  results, _ = evaluation_metrics(
      f'Logistic Regression (TF-IDF max_features = {max_features})',
      y_pred_prob,
      val_df['jailbreak'],
      plots=False,
  )

=== Logistic Regression (TF-IDF max_features = 1000) Classification Report ===
                                              Model  Accuracy    Recall  \
0  Logistic Regression (TF-IDF max_features = 1000)  0.873844  0.758865   

   F1-Score  Precision-Recall AUC  
0  0.528395              0.596183  
=== Logistic Regression (TF-IDF max_features = 2000) Classification Report ===
                                              Model  Accuracy    Recall  \
0  Logistic Regression (TF-IDF max_features = 2000)  0.891017  0.780142   

   F1-Score  Precision-Recall AUC  
0  0.571429              0.596443  
=== Logistic Regression (TF-IDF max_features = 3000) Classification Report ===
                                              Model  Accuracy    Recall  \
0  Logistic Regression (TF-IDF max_features = 3000)  0.900925  0.822695   

   F1-Score  Precision-Recall AUC  
0   0.60733              0.604239  
=== Logistic Regression (TF-IDF max_features = 5000) Classification Report ===
               

Since I prioritize recall (a jailbreak prompt misclassified as benign has the most serious consequences), performance is best when using TF-IDF with max_features = 3000.

In [None]:
# Same sweep over max_features, this time with unigrams and bigrams
# (again scored with a Logistic Regression model to demonstrate results).

max_features_vals = [2000, 4000, 6000, 8000, 10000, 12000]

for max_features in max_features_vals:
  # ngram_range=(1, 2) adds bigrams on top of the single-word features.
  tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=max_features)
  train_features = tfidf_vectorizer.fit_transform(train_df['clean_prompt'])
  val_features = tfidf_vectorizer.transform(val_df['clean_prompt'])

  log_reg = LogisticRegression(class_weight='balanced', max_iter=10000)
  log_reg.fit(train_features, train_df['jailbreak'])

  # Column 1 of predict_proba is used as the positive-class score.
  y_pred_prob = log_reg.predict_proba(val_features)[:, 1]
  results, _ = evaluation_metrics(
      f'Logistic Regression (TF-IDF max_features = {max_features}, bigrams included)',
      y_pred_prob,
      val_df['jailbreak'],
      plots=False,
  )

=== Logistic Regression (TF-IDF max_features = 2000, bigrams included) Classification Report ===
                                               Model  Accuracy    Recall  \
0  Logistic Regression (TF-IDF max_features = 200...  0.887715  0.780142   

   F1-Score  Precision-Recall AUC  
0  0.564103              0.619026  
=== Logistic Regression (TF-IDF max_features = 4000, bigrams included) Classification Report ===
                                               Model  Accuracy    Recall  \
0  Logistic Regression (TF-IDF max_features = 400...  0.904227  0.808511   

   F1-Score  Precision-Recall AUC  
0   0.61126              0.621519  
=== Logistic Regression (TF-IDF max_features = 6000, bigrams included) Classification Report ===
                                               Model  Accuracy    Recall  \
0  Logistic Regression (TF-IDF max_features = 600...  0.912153  0.794326   

   F1-Score  Precision-Recall AUC  
0  0.627451              0.624427  
=== Logistic Regression (TF-IDF ma

Again judging by recall, which is the prioritized metric, including bigrams does not appear to improve model performance. Therefore, I use TF-IDF with unigrams only.

In [None]:
# TF-IDF
# Final featurization: 3000 features, unigrams only (best setting in the sweeps above).

tfidf_vectorizer = TfidfVectorizer(max_features=3000)

# Fit on the training prompts only; validation and test reuse the fitted vocabulary.
tfidf_train = tfidf_vectorizer.fit_transform(train_df['clean_prompt'])
tfidf_val, tfidf_test = (
    tfidf_vectorizer.transform(split['clean_prompt']) for split in (val_df, test_df)
)

# Vocabulary terms, in the same order as the feature-matrix columns.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Source: https://www.deepwizai.com/projects/how-to-correctly-use-tf-idf-with-imbalanced-data

In [None]:
# Sanity check: one row per prompt, one column per TF-IDF feature (one shape per line).
print(tfidf_train.shape, tfidf_val.shape, tfidf_test.shape, sep='\n')

(12112, 3000)
(1514, 3000)
(1514, 3000)


In [None]:
# Word2Vec (CBoW architecture)

def compute_mean_embedding(model, tokenized_prompts):
    """Average the word vectors of one prompt's tokens.

    Args:
        model: Object exposing `wv` (token -> vector lookup supporting `in`)
            and `vector_size`.
        tokenized_prompts: Iterable of tokens for a single prompt.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        `model.wv`, or a zero vector of length `model.vector_size` when no
        token is found.
    """
    vectors = []
    for word in tokenized_prompts:
        # Tokens missing from the vocabulary are skipped.
        if word in model.wv:
            vectors.append(model.wv[word])

    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

def precompute_embeddings(model, clean_prompts):
    """Build one mean-embedding row per prompt.

    Args:
        model: Embedding model passed through to `compute_mean_embedding`.
        clean_prompts: pandas Series of prompt strings; each is tokenized
            with `word_tokenize`.

    Returns:
        A NumPy array stacking the per-prompt mean embeddings, in the same
        order as `clean_prompts`.
    """
    rows = []
    for tokens in clean_prompts.apply(word_tokenize).tolist():
        rows.append(compute_mean_embedding(model, tokens))
    return np.array(rows)

# Train the CBoW model (sg=0) on the tokenized training prompts only.
tokenized_prompts_train = [word_tokenize(prompt) for prompt in train_df['clean_prompt']]
cbow_model = Word2Vec(
    tokenized_prompts_train,
    sg=0,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Embed every split with the model trained on the training prompts.
cbow_embeddings_train, cbow_embeddings_val, cbow_embeddings_test = (
    precompute_embeddings(cbow_model, split['clean_prompt'])
    for split in (train_df, val_df, test_df)
)

# Source: https://www.analyticsvidhya.com/blog/2021/07/word2vec-for-word-embeddings-a-beginners-guide/
# Source: https://medium.com/@dilip.voleti/classification-using-word2vec-b1d79d375381

In [None]:
# Sanity check: one row per prompt, one column per embedding dimension (one shape per line).
print(cbow_embeddings_train.shape, cbow_embeddings_val.shape, cbow_embeddings_test.shape, sep='\n')

(12112, 100)
(1514, 100)
(1514, 100)


In [None]:
# SMOTE + Tomek Links applied to Word2Vec embeddings
# Only the training embeddings and labels are resampled here.

smote_tomek = SMOTETomek(random_state=42)
train_embeddings_resampled, train_labels_resampled = smote_tomek.fit_resample(
    cbow_embeddings_train,
    train_df['jailbreak'],
)

# Shapes after resampling (features, then labels), one per line.
print(train_embeddings_resampled.shape, train_labels_resampled.shape, sep='\n')

(21960, 100)
(21960,)


In [None]:
# LDA Topic Modelling

# English stop words, removed from every prompt before topic modelling.
stop_words = set(stopwords.words('english'))

def _lda_tokenize(df):
  """Tokenize df['clean_prompt'] and drop stop words; returns a list of token lists."""
  return [
      [token for token in doc if token not in stop_words]
      for doc in df['clean_prompt'].apply(word_tokenize)
  ]

def _lda_topic_frame(lda_model, corpus, n_topics):
  """Infer the topic distribution of every bag-of-words in corpus as a (docs x topics) DataFrame."""
  topic_matrix = np.zeros((len(corpus), n_topics))
  for row, bow in enumerate(corpus):
    # Topics not returned for a document keep the 0.0 the matrix was initialised with.
    for topic_id, prob in lda_model.get_document_topics(bow, minimum_probability=0):
      topic_matrix[row, topic_id] = prob
  return pd.DataFrame(topic_matrix, columns=[f"topic_{i}" for i in range(n_topics)])

def lda(train, val, test, n_topics):
  """Fit an LDA topic model on the training prompts and featurize all three splits.

  Args:
    train: DataFrame with a 'clean_prompt' column; used to build the dictionary and fit the model.
    val: DataFrame with a 'clean_prompt' column; only transformed.
    test: DataFrame with a 'clean_prompt' column; only transformed.
    n_topics: Number of LDA topics, and therefore of output feature columns.

  Returns:
    Tuple (train_topic_df, val_topic_df, test_topic_df, lda_model). Each DataFrame has one
    row per prompt and columns 'topic_0' ... 'topic_{n_topics - 1}' holding topic probabilities.
  """
  # Dictionary and corpus are built from the training split only.
  train_docs = _lda_tokenize(train)
  train_dictionary = Dictionary(train_docs)
  # Prune the vocabulary by document frequency (no_below=5 documents, no_above=50% of documents).
  train_dictionary.filter_extremes(no_below=5, no_above=0.5)
  train_corpus = [train_dictionary.doc2bow(doc) for doc in train_docs]

  lda_model = LdaModel(
      corpus=train_corpus,
      id2word=train_dictionary,
      chunksize=2000,
      passes=10,
      alpha='auto',
      eta='auto',
      iterations=400,
      num_topics=n_topics,
      eval_every=None,
      random_state=42
  )

  # Inference runs in the order train -> val -> test so the model's random state
  # is consumed in the same order on every run.
  train_topic_df = _lda_topic_frame(lda_model, train_corpus, n_topics)

  # Validation and test prompts are mapped through the training dictionary.
  val_corpus = [train_dictionary.doc2bow(doc) for doc in _lda_tokenize(val)]
  val_topic_df = _lda_topic_frame(lda_model, val_corpus, n_topics)

  test_corpus = [train_dictionary.doc2bow(doc) for doc in _lda_tokenize(test)]
  test_topic_df = _lda_topic_frame(lda_model, test_corpus, n_topics)

  return train_topic_df, val_topic_df, test_topic_df, lda_model

In [None]:
# Sweep the number of LDA topics, scoring each setting on the validation split
# (using Logistic Regression model to demonstrate results).

num_topics = [3, 5, 10, 15, 20, 30]

for n in num_topics:
  # Refit the topic model for this topic count; the topic probabilities are the features.
  train_lda_df, val_lda_df, test_lda_df, lda_model = lda(train_df, val_df, test_df, n)
  train_features, val_features = train_lda_df, val_lda_df

  log_reg = LogisticRegression(class_weight='balanced', max_iter=1000)
  log_reg.fit(train_features, train_df['jailbreak'])

  # Column 1 of predict_proba is used as the positive-class score.
  y_pred_prob = log_reg.predict_proba(val_features)[:, 1]
  results, _ = evaluation_metrics(
      f'Logistic Regression (LDA, n_topics = {n})',
      y_pred_prob,
      val_df['jailbreak'],
      plots=False,
  )



=== Logistic Regression (LDA, n_topics = 3) Classification Report ===
                                     Model  Accuracy    Recall  F1-Score  \
0  Logistic Regression (LDA, n_topics = 3)  0.804491  0.815603  0.437262   

   Precision-Recall AUC  
0              0.409431  
=== Logistic Regression (LDA, n_topics = 5) Classification Report ===
                                     Model  Accuracy    Recall  F1-Score  \
0  Logistic Regression (LDA, n_topics = 5)   0.84148  0.794326  0.482759   

   Precision-Recall AUC  
0              0.455228  
=== Logistic Regression (LDA, n_topics = 10) Classification Report ===
                                      Model  Accuracy    Recall  F1-Score  \
0  Logistic Regression (LDA, n_topics = 10)   0.84214  0.808511  0.488223   

   Precision-Recall AUC  
0              0.524227  
=== Logistic Regression (LDA, n_topics = 15) Classification Report ===
                                      Model  Accuracy    Recall  F1-Score  \
0  Logistic Regression (

In [None]:
# LDA Topic Modelling with 20 topics

train_lda_df, val_lda_df, test_lda_df, lda_model = lda(train_df, val_df, test_df, n_topics=20)

print(train_lda_df.shape)
print(val_lda_df.shape)
print(test_lda_df.shape)

# num_topics=-1 requests every topic. The library default lists at most 20, which would
# silently drop topics from this listing (and from `topics`) if n_topics above were raised.
topics = lda_model.print_topics(num_topics=-1, num_words=10)
for topic_id, topic in topics:
    # Topic ids are zero-based; show them one-based for readability.
    print(f"Topic {topic_id + 1}: {topic}")

(12112, 20)
(1514, 20)
(1514, 20)
Topic 1: 0.041*"yang" + 0.034*"dengan" + 0.031*"dan" + 0.026*"saya" + 0.026*"artikel" + 0.021*"kata" + 0.018*"anda" + 0.016*"judul" + 0.013*"menulis" + 0.012*"jangan"
Topic 2: 0.019*"question" + 0.015*"term" + 0.012*"token" + 0.012*"food" + 0.011*"financial" + 0.010*"time" + 0.009*"system" + 0.008*"ask" + 0.008*"project" + 0.008*"answer"
Topic 3: 0.138*"story" + 0.053*"bird" + 0.047*"chan" + 0.027*"gpt" + 0.022*"lucy" + 0.014*"juice" + 0.012*"harry" + 0.011*"tell" + 0.011*"write" + 0.011*"protagonist"
Topic 4: 0.172*"user" + 0.057*"response" + 0.033*"write" + 0.032*"continue" + 0.031*"name" + 0.027*"wait" + 0.026*"assume" + 0.024*"reaction" + 0.018*"personality" + 0.016*"explicit"
Topic 5: 0.083*"code" + 0.026*"function" + 0.024*"print" + 0.020*"documentation" + 0.016*"expert" + 0.015*"project" + 0.015*"programming" + 0.014*"variable" + 0.013*"prompt" + 0.013*"use"
Topic 6: 0.023*"make" + 0.023*"user" + 0.020*"ask" + 0.018*"like" + 0.018*"answer" + 0.0

In [None]:
# Persist every engineered feature set to the project data folder on Drive.
path = '/content/drive/MyDrive/Fall 2024/SML 312/Final Project/data'

# TF-IDF matrices are converted with .toarray() one at a time, right before saving.
for stem, matrix in (
    ('tfidf_train', tfidf_train),
    ('tfidf_val', tfidf_val),
    ('tfidf_test', tfidf_test),
):
    np.save(f'{path}/{stem}.npy', matrix.toarray())

np.save(f'{path}/tfidf_feature_names.npy', tfidf_feature_names)

# Word2Vec mean embeddings, followed by the SMOTE-Tomek resampled training set.
for stem, array in (
    ('word2vec_train', cbow_embeddings_train),
    ('word2vec_val', cbow_embeddings_val),
    ('word2vec_test', cbow_embeddings_test),
    ('word2vec_smote_tomek_embeddings', train_embeddings_resampled),
    ('word2vec_smote_tomek_labels', train_labels_resampled),
):
    np.save(f'{path}/{stem}.npy', array)

# LDA topic-probability frames, one pickle per split.
for split_name, topic_df in (
    ('train', train_lda_df),
    ('val', val_lda_df),
    ('test', test_lda_df),
):
    topic_df.to_pickle(f'{path}/lda_{split_name}.pkl')

# Topic descriptions returned by print_topics.
with open(f'{path}/lda_topics.pkl', 'wb') as topics_file:
    pickle.dump(topics, topics_file)