In [None]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import spacy
from spacy.lang.en import English
from sklearn.preprocessing import StandardScaler
import swifter
import emoji
from google.colab import drive

In [None]:
# Download each NLTK resource in the same order as before. The trailing [-1]
# keeps the cell's displayed value equal to the status returned by the final
# download call.
[
    nltk.download(resource)
    for resource in (
        'punkt',
        'punkt_tab',
        'stopwords',
        'averaged_perceptron_tagger_eng',
        'wordnet',
    )
][-1]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Dataset source: https://huggingface.co/datasets/walledai/JailbreakHub
dataset = load_dataset("walledai/JailbreakHub")
df = pd.DataFrame(dataset['train'])

# Basic inspection of dataset
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() echoed a stray "None" after the report.
df.info()
print("\nDataset stats:\n", df.describe(include='all'))
print("\nDataset head:\n", df.head())

# Check for missing values (per-column count of nulls)
missing_values = df.isnull().sum()
print(missing_values)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/15140 [00:00<?, ? examples/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15140 entries, 0 to 15139
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   prompt     15140 non-null  object
 1   platform   15140 non-null  object
 2   source     15140 non-null  object
 3   jailbreak  15140 non-null  bool  
dtypes: bool(1), object(3)
memory usage: 369.8+ KB
None

Dataset stats:
                                                    prompt platform   source  \
count                                               15140    15140    15140   
unique                                              14524        4       13   
top     I want you to act as a resume editor. I will p...  website  flowgpt   
freq                                                   24    12763     8754   

       jailbreak  
count      15140  
unique         2  
top        False  
freq       13735  

Dataset head:
                                               prompt platform  \
0  CumGPT, an

In [None]:
# Whitespace-delimited token count of each raw prompt
df['word_count'] = df['prompt'].map(lambda prompt: len(prompt.split()))

# Character length of each raw prompt
df['char_count'] = df['prompt'].map(len)

In [None]:
# Considers single punctuation type
def count_punctuation(text, punctuation):
    """Return the number of non-overlapping occurrences of `punctuation` in `text`.

    Args:
        text: String to search.
        punctuation: Substring (typically a single punctuation mark) to count.

    Returns:
        The count reported by `str.count`.
    """
    occurrences = text.count(punctuation)
    return occurrences

# Considers all punctuation types
def count_all_punctuation(text):
  """Count the characters of `text` that are members of `string.punctuation`.

  Args:
      text: Iterable of characters (normally a string) to scan.

  Returns:
      Integer number of characters found in `string.punctuation`.
  """
  total = 0
  for char in text:
    if char in string.punctuation:
      total += 1
  return total

# Per-prompt counts of '!' and '?', plus the total over all of string.punctuation
df['exclamation_count'] = df['prompt'].map(lambda prompt: count_punctuation(prompt, '!'))
df['question_count'] = df['prompt'].map(lambda prompt: count_punctuation(prompt, '?'))
df['punctuation_count'] = df['prompt'].map(count_all_punctuation)

In [None]:
def remove_emojis(text):
  """Return `text` with everything `emoji.replace_emoji` matches replaced by ''.

  Args:
      text: String to clean.

  Returns:
      The result of `emoji.replace_emoji(text, replace='')`.
  """
  without_emojis = emoji.replace_emoji(text, replace='')
  return without_emojis

# Convert to lowercase, strip, and remove punctuation
def preprocess(text):
  """Normalise one raw prompt into lowercase, space-separated text.

  Steps, in order: drop emojis via `remove_emojis`, lowercase, replace
  `<...>` spans, ASCII punctuation, ASCII digits and whitespace runs with a
  single space, collapse repeated whitespace, then strip the ends.

  Args:
      text: Raw prompt. Any non-string value is treated as missing.

  Returns:
      The cleaned string, or '' when `text` is not a string.
  """
  if not isinstance(text, str):
    return ''
  text = remove_emojis(text).lower()
  text = re.sub(r'<.*?>|[%s]|[0-9]|\s+' % re.escape(string.punctuation), ' ', text)
  # Strip last: the substitution above turns leading/trailing punctuation and
  # digits into spaces, so stripping beforehand left stray edge whitespace
  # (e.g. "Hello!" became "hello ").
  return re.sub(r'\s+', ' ', text).strip()

# Run preprocess() over every raw prompt through the swifter accessor and store
# the cleaned text in a new 'preprocessed_prompt' column.
df['preprocessed_prompt'] = df['prompt'].swifter.apply(preprocess)

Pandas Apply:   0%|          | 0/15140 [00:00<?, ?it/s]

In [None]:
# spaCy lemmatization and stopword removal

# NLTK's English stopword list, held as a set for membership tests in lemmatizer()
stopword_set = set(stopwords.words('english'))
# spaCy pipeline ('en_core_web_sm') whose pipe() produces the lemmas used below
nlp = spacy.load('en_core_web_sm')

def lemmatizer(text):
  """Lemmatize a batch of texts and drop short, non-alphabetic and stopword tokens.

  Args:
      text: Iterable of strings (despite the singular name); it is passed
          straight to `nlp.pipe`.

  Returns:
      A list with one string per input document: the space-joined lemmas of
      the tokens that are alphabetic, longer than two characters, and whose
      lowercased text is not in `stopword_set`.
  """
  def keep(token):
    # Same checks, in the same short-circuit order, as a single `and` chain.
    if not token.is_alpha:
      return False
    if len(token) <= 2:
      return False
    return token.text.lower() not in stopword_set

  cleaned = []
  for doc in nlp.pipe(text, batch_size=50, n_process=-1):
    lemmas = [token.lemma_ for token in doc if keep(token)]
    cleaned.append(" ".join(lemmas))
  return cleaned

# Lemmatize all preprocessed prompts in one batched call and store the result.
# lemmatizer() returns a joined string for every document, so the fillna('')
# is only a defensive guard against missing values.
df['clean_prompt'] = pd.Series(
    lemmatizer(df['preprocessed_prompt'].tolist()),
    index=df.index,
).fillna('')

# Source: https://www.sdhilip.com/text-preprocessing-for-data-scientists/

In [None]:
# Bare last expression so the notebook renders the frame with its rich table
# display instead of the wrapped plain-text dump that print() produces.
df.head()

                                              prompt platform  \
0  CumGPT, an information resource about cum that...  discord   
1  I will give you a character description and yo...  discord   
2  A completely amoral Ai named [Illegality Mode]...  discord   
3  Agares's rules={\nAlways stays in character, n...  discord   
4  Lucian is an incorporeal transcendent being wh...  discord   

              source  jailbreak  word_count  char_count  exclamation_count  \
0  LLM Promptwriting       True         435        2466                  1   
1  LLM Promptwriting       True         172        1261                  0   
2  LLM Promptwriting       True         370        2363                  2   
3  LLM Promptwriting       True         164        1169                  1   
4  LLM Promptwriting       True         264        1664                  0   

   question_count  punctuation_count  \
0               0                 55   
1               0                 78   
2               1   

In [None]:
# Split into 80/10/10 train, validation, and test sets
# Stratify by jailbreak vs benign (target variable)

# First cut: hold out 20% of the rows, keeping the jailbreak/benign ratio.
# NOTE(review): describe() earlier reports 14524 unique prompts out of 15140
# rows, so identical prompts may end up in different splits - consider
# de-duplicating before splitting.
train_df, temp_df = train_test_split(
    df,
    stratify=df['jailbreak'],
    test_size=0.2,
    random_state=42,
)
# Second cut: halve the hold-out into validation and test, again stratified.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df['jailbreak'],
    test_size=0.5,
    random_state=42,
)

In [None]:
# Normalize count features

def normalize_feature(feature_name, train_df, val_df, test_df):
  """Standard-scale one column, fitting on train and applying to all three splits.

  A fresh `StandardScaler` is fitted on `train_df[[feature_name]]` only; the
  fitted scaler then transforms the same column of each frame. The result is
  written to a new column named 'scaled_' + feature_name.

  Args:
      feature_name: Name of the column to scale.
      train_df: Training frame (used to fit the scaler).
      val_df: Validation frame.
      test_df: Test frame.

  Returns:
      The tuple (train_df, val_df, test_df). The frames passed in are modified
      in place and returned.
  """
  scaled_name = 'scaled_' + feature_name
  scaler = StandardScaler()
  scaler.fit(train_df[[feature_name]])
  for frame in (train_df, val_df, test_df):
    frame[scaled_name] = scaler.transform(frame[[feature_name]])
  return train_df, val_df, test_df

# Count columns that each get a 'scaled_' companion column
features_to_normalize = [
    'word_count',
    'char_count',
    'exclamation_count',
    'question_count',
    'punctuation_count',
]

for feature in features_to_normalize:
  train_df, val_df, test_df = normalize_feature(
      feature_name=feature,
      train_df=train_df,
      val_df=val_df,
      test_df=test_df,
  )

In [None]:
# Save cleaned data to CSV

# Mount Google Drive so the output folder below is reachable
drive.mount('/content/drive')

file_path = '/content/drive/MyDrive/Fall 2024/SML 312/Final Project/data'

# Write each split to its own CSV, without the index column
for split_df, csv_name in (
    (train_df, '/train.csv'),
    (val_df, '/val.csv'),
    (test_df, '/test.csv'),
):
  split_df.to_csv(file_path + csv_name, index=False)

Mounted at /content/drive
