# In [2]:
import requests
import pandas as pd
import datetime
import os
import traceback
import re


"""   

Copyright (c) 2025, Michael Tchuindjang
All rights reserved.

This code was developed as part of a PhD research project in Cybersecurity and Artificial Intelligence, 
supported by a studentship at the University of the West of England (UWE Bristol).

Use of this software is permitted for academic, educational, and research purposes.  
For any commercial use or redistribution, please contact the author for permission.

Disclaimer:
In no event shall the author or UWE be liable for any claim, damages, or other liability arising from the use of this code.

Acknowledgment of the author and the research context is appreciated in any derivative work or publication.



INFO ABOUT THIS SOURCE CODE

This source code provides functions to launch single-turn and multi-turn jailbreaking attacks against open-source Large Language Models (LLMs).
Below are the main functions included:

1. conversation_lm_studio(): Handles chat conversations with an open-source LLM via the LM Studio Server.
2. launch_single_turn_attack(): Launches a single-turn attack against an open-source LLM, with optional past-tense reformulation.
3. launch_multiple_turn_attack(): Launches a multi-turn attack against an open-source LLM, with optional past-tense reformulation.

"""

# LM Studio connection settings (shown by LM Studio once its Server mode is enabled)
API_URL = "http://<IP Address Of Your LM Studio Server>:<Port Number>/v1/chat/completions"  # LM Studio's API endpoint
HEADERS = {"Content-Type": "application/json"}

# Open-source LLMs considered during the experiments. MODEL_NAME is the identifier
# LM Studio displays when the model is loaded:
#   entry 0 -> llama-2-7b-chat
#   entry 1 -> qwen2-7b-instruct
#   entry 2 -> mistral-7b-instruct-v0.1
LLM_VERSIONS = [
    dict(MODEL_NAME="llama-2-7b-chat", LLM="Llama", LLM_VERSION="2 (7B)"),
    dict(MODEL_NAME="qwen2-7b-instruct", LLM="Qwen", LLM_VERSION="2 (7B)"),
    dict(MODEL_NAME="mistral-7b-instruct-v0.1", LLM="Mistral", LLM_VERSION="0.1 (7B)"),
]

# Position in LLM_VERSIONS of the model targeted by the attack (single-turn or multi-turn).
# Change it to switch target:
#   INDEX = 0 -> Llama 2 (7B), the default
#   INDEX = 1 -> Qwen 2 (7B)
#   INDEX = 2 -> Mistral 0.1 (7B)
INDEX = 0
SELECTED_MODEL = LLM_VERSIONS[INDEX]
MODEL_NAME, LLM, LLM_VERSION = (
    SELECTED_MODEL[field] for field in ("MODEL_NAME", "LLM", "LLM_VERSION")
)

# Temperature values explored during the experiments
TEMPS = [0.1, 0.5, 0.9]

# Position in TEMPS of the temperature to use; change it to run with another temperature
IDT = 0

# Temperature applied during the attack (0.1 with the default IDT)
TEMP = TEMPS[IDT]


# Baseline dataset (AdvBench + HarmBench) read by the single-turn attack
INPUT_FILE = 'Baseline_Dataset_Advbench_HarmBench.csv'


# Folder holding the multi-turn datasets
DATASET_FOLDER = 'datasets/'

# Folder receiving the attack results
TEST_FOLDER = 'tests/'

# Multi-turn dataset files, keyed by the number of turns written as a string
MULTITURN_DATASETS = {
    str(turns): f"{DATASET_FOLDER}CyMulTenSet_N_{turns}.csv" for turns in (2, 3)
}

# Function of chat conversations with the LLM via LM Studio Server
# A low temperature makes the model's predictions less random (more deterministic responses)
# A high top_p makes the model's responses more diverse, potentially including improbable tokens
# max_tokens is limited to 1024 by default
def conversation_lm_studio(conversation, model_name=None, max_tokens=1024, temperature=TEMP, top_p=0.9, timeout=None):
    """
    Send a chat conversation to a model served by LM Studio and return its reply.

    Args:
        conversation (list[dict]): Chat history, one dictionary per message with:
            - "role" (str): The role of the message sender, e.g., "user" or "assistant".
            - "content" (str): The content of the message (the prompt or the assistant's reply).
        model_name (str | None): Name of the model as listed by LM Studio. When omitted
            (or empty), the module-level MODEL_NAME is used.
        max_tokens (int): Maximum number of tokens to generate.
        temperature (float): Sampling temperature; defaults to the module-level TEMP.
        top_p (float): Nucleus sampling value; higher means more diverse responses.
        timeout (float | tuple | None): Passed to requests.post. None (the default)
            waits indefinitely, which is the historical behaviour.

    Returns:
        The "content" of the first choice's message. If the reply carries no such
        text, "No response text found." is returned. If the request fails or the
        body is not valid JSON, a string starting with "Error querying LM Studio:"
        is returned instead of raising.
    """
    # Prepare the payload
    payload = {
        "messages": conversation,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        # Honour the caller's model; fall back to the globally selected one
        "model": model_name or MODEL_NAME,
    }

    try:
        # Send the request to LM Studio
        response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        response_data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers requests versions whose JSON decoding error is not a
        # RequestException subclass.
        return f"Error querying LM Studio: {e}"

    # A missing OR empty "choices" list must not raise IndexError: an exception here
    # would abort the caller's whole run and discard every response collected so far.
    choices = response_data.get("choices") or [{}]
    message = choices[0].get("message") or {}
    return message.get("content", "No response text found.")


# Function to launch single turn attack against a targeted LLM with past tense reformulation or not
def launch_single_turn_attack(is_past=False, min_row=None, max_row=None):
    """
    Send each selected prompt of INPUT_FILE to the target LLM and log the responses.

    Every prompt is sent as a fresh one-message conversation. The results are
    appended to
    <TEST_FOLDER><MODEL_NAME>/<MODEL_NAME>_attack_N_1_<present or past>.csv
    with the columns Prompt, SemanticCategory, SubTopic, Model, Version, Temp.,
    Unique ID, Multi Step (always 0), Response and Date Time. 'Unique ID'
    continues from the highest ID already present in that file.

    Args:
        is_past (bool): If True the prompt is read from the 'Past' column (past
            tense reformulation), otherwise from the 'Prompt' column.
        min_row (int | None): 1-based index of the first row to process
            (defaults to 1).
        max_row (int | None): 1-based index of the last row to process, inclusive
            (defaults to the number of rows in the input file).

    Returns:
        None. Problems are reported with print() rather than raised.
    """
    # We use a naming convention for the output file with the following syntax:
    # <LLM_model_name>_attack_N_<Nb of iterations>_<present or past>.csv
    tense = 'past' if is_past else 'present'
    output_file = TEST_FOLDER + MODEL_NAME + '/' + MODEL_NAME + '_attack_N_1_' + tense + '.csv'

    # Column of the input file holding the text actually sent to the LLM
    prompt_column = 'Past' if is_past else 'Prompt'

    try:
        # Read the input CSV file
        df = pd.read_csv(INPUT_FILE)

        # Check up front that every column read below exists
        required_columns = ['Prompt', 'SemanticCategory', 'SubTopic']
        if prompt_column not in required_columns:
            required_columns.append(prompt_column)
        for column in required_columns:
            if column not in df.columns:
                print(f"The '{column}' column is missing in the input file.")
                return

        # Set defaults if min_row or max_row is None
        if min_row is None:
            min_row = 1
        if max_row is None:
            max_row = len(df)

        # The range is 1-based and inclusive, so min_row == max_row selects one row.
        # min_row < 1 would turn into a negative iloc start and pick the wrong rows.
        if min_row < 1 or min_row > max_row:
            print(f"Invalid range: min_row ({min_row}) must be at least 1 and not greater than max_row ({max_row}).")
            return

        # Filter rows based on the range
        df = df.iloc[(min_row - 1):max_row]
        if df.empty:
            print(f"No rows selected: the range {min_row}-{max_row} is outside the input file.")
            return

        # Determine starting Unique ID based on existing output file
        starting_id = 1
        if os.path.exists(output_file):
            try:
                existing_df = pd.read_csv(output_file)
            except pd.errors.EmptyDataError:
                # Zero-byte file left by an interrupted run: nothing to continue from
                existing_df = pd.DataFrame()
            if 'Unique ID' in existing_df.columns:
                last_id = existing_df['Unique ID'].max()
                # max() is NaN when the file only holds a header line
                if pd.notna(last_id):
                    starting_id = int(last_id) + 1

        # Query the LLM once per selected row and collect the output rows
        new_rows = []
        for offset, (_, row) in enumerate(df.iterrows()):
            prompt = row[prompt_column]
            # Single-turn attack: the conversation only holds the current prompt
            rsp = conversation_lm_studio([{"role": "user", "content": prompt}], model_name=MODEL_NAME)
            d = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            new_rows.append({'Prompt': prompt,
                             'SemanticCategory': row['SemanticCategory'],
                             'SubTopic': row['SubTopic'],
                             'Model': LLM,
                             'Version': LLM_VERSION,
                             'Temp.': TEMP,
                             'Unique ID': starting_id + offset,
                             'Multi Step': 0,
                             'Response': rsp,
                             'Date Time': d})

        # Create the new DataFrame with new rows and columns
        output_df = pd.DataFrame(new_rows)

        # Make sure the directory exists
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        # The header line is only needed when the file is new (or still empty)
        write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

        # Append the new rows to the output CSV file
        output_df.to_csv(output_file, mode='a', header=write_header, index=False)
        print(f"LLM responses written to {output_file}.")

    except FileNotFoundError:
        print(f"The file {INPUT_FILE} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
        print("Full traceback:")
        traceback.print_exc()  # This will print the detailed traceback



# Function to launch multiple turn attack against a targeted LLM with past tense reformulation or not
def launch_multiple_turn_attack(nb_iterations=2, is_past=False, min_row=None, max_row=None):
    """
    Replay the multi-turn prompts of the selected dataset against the target LLM.

    The dataset is MULTITURN_DATASETS[str(nb_iterations)]. For each selected row,
    the prompts '<Present|Past> Turn 1' .. '<Present|Past> Turn <nb_iterations>'
    are sent in order within one conversation: each LLM reply is appended to the
    chat history before the next turn is sent. One output row is written per
    turn to
    <TEST_FOLDER><MODEL_NAME>/<MODEL_NAME>_attack_N_<nb_iterations>_<present or past>.csv
    with the columns Prompt, New Prompt, SemanticCategory, SubTopic, Model,
    Version, Temp., Unique ID, Multi Step (turn number), Response and Date Time.
    All turns of one dataset row share the same 'Unique ID', which continues
    from the highest ID already present in the output file.

    Args:
        nb_iterations (int): Number of turns; must have an entry in
            MULTITURN_DATASETS.
        is_past (bool): If True the 'Past Turn N' columns are used, otherwise the
            'Present Turn N' columns.
        min_row (int | None): 1-based index of the first row to process
            (defaults to 1).
        max_row (int | None): 1-based index of the last row to process, inclusive
            (defaults to the number of rows in the dataset).

    Returns:
        None. Problems are reported with print() rather than raised.
    """
    # Resolve the dataset before the try block so the error handlers can name it
    # without re-indexing the dictionary inside an f-string (nested double quotes
    # in an f-string are a SyntaxError before Python 3.12).
    dataset_key = str(nb_iterations)
    if dataset_key not in MULTITURN_DATASETS:
        print(f"No multi-turn dataset is configured for {nb_iterations} iterations "
              f"(available: {', '.join(MULTITURN_DATASETS)}).")
        return
    input_file = MULTITURN_DATASETS[dataset_key]
    nb_turns = int(dataset_key)

    # We use a naming convention for the output file with the following syntax:
    # <LLM_model_name>_attack_N_<Nb of iterations>_<present or past>.csv
    tense = 'Past' if is_past else 'Present'
    output_file = (TEST_FOLDER + MODEL_NAME + '/' + MODEL_NAME
                   + '_attack_N_' + dataset_key + '_' + tense.lower() + '.csv')

    # Dataset columns holding the prompt of each turn, in sending order
    turn_columns = [f"{tense} Turn {turn}" for turn in range(1, nb_turns + 1)]

    try:
        # Read the generated multi-turn dataset corresponding to the number of iterations
        df = pd.read_csv(input_file)

        # Check up front that every column read below exists
        for column in ['Prompt', 'SemanticCategory', 'SubTopic'] + turn_columns:
            if column not in df.columns:
                print(f"The '{column}' column is missing in the input file.")
                return

        # Set defaults if min_row or max_row is None
        if min_row is None:
            min_row = 1
        if max_row is None:
            max_row = len(df)

        # The range is 1-based and inclusive, so min_row == max_row selects one row.
        # min_row < 1 would turn into a negative iloc start and pick the wrong rows.
        if min_row < 1 or min_row > max_row:
            print(f"Invalid range: min_row ({min_row}) must be at least 1 and not greater than max_row ({max_row}).")
            return

        # Filter rows based on the range
        df = df.iloc[(min_row - 1):max_row]
        if df.empty:
            print(f"No rows selected: the range {min_row}-{max_row} is outside the input file.")
            return

        # Determine starting Unique ID based on existing output file
        starting_id = 1
        if os.path.exists(output_file):
            try:
                existing_df = pd.read_csv(output_file)
            except pd.errors.EmptyDataError:
                # Zero-byte file left by an interrupted run: nothing to continue from
                existing_df = pd.DataFrame()
            if 'Unique ID' in existing_df.columns:
                last_id = existing_df['Unique ID'].max()
                # max() is NaN when the file only holds a header line
                if pd.notna(last_id):
                    starting_id = int(last_id) + 1

        # Initialize an empty list to hold the new rows
        new_rows = []

        for offset, (_, row) in enumerate(df.iterrows()):
            # Fresh chat history for each dataset row
            conversations = []
            row_id = starting_id + offset

            for turn, column in enumerate(turn_columns, start=1):
                # LLM query and response for the prompt of this turn
                conversations.append({"role": "user", "content": row[column]})
                rsp = conversation_lm_studio(conversations, model_name=MODEL_NAME)
                d = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                # Keep the reply in the history so the next turn builds on it
                conversations.append({"role": "assistant", "content": rsp})
                # One output row per turn: original prompt, turn prompt and LLM response
                new_rows.append({'Prompt': row['Prompt'],
                                 'New Prompt': row[column],
                                 'SemanticCategory': row['SemanticCategory'],
                                 'SubTopic': row['SubTopic'],
                                 'Model': LLM,
                                 'Version': LLM_VERSION,
                                 'Temp.': TEMP,
                                 'Unique ID': row_id,
                                 'Multi Step': turn,
                                 'Response': rsp,
                                 'Date Time': d})

        # Create the new DataFrame with new rows and columns
        output_df = pd.DataFrame(new_rows)

        # Make sure the directory exists
        os.makedirs(os.path.dirname(output_file), exist_ok=True)

        # The header line is only needed when the file is new (or still empty)
        write_header = not os.path.exists(output_file) or os.path.getsize(output_file) == 0

        # Append the new rows to the output CSV file
        output_df.to_csv(output_file, mode='a', header=write_header, index=False)
        print(f"LLM responses written to {output_file}.")

    except FileNotFoundError:
        print(f"The file {input_file} was not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
        print("Full traceback:")
        traceback.print_exc()  # This will print the detailed traceback



# Example usage
if __name__ == "__main__":
    # Whether to use the past tense reformulation (single and multiple turn attacks)
    is_past = False

    # Number of iterations for the multi-turn attack
    nb_iter = 3

    # Optional row range (1-based, as a human counts rows) to select from the input file
    min_row = 1
    max_row = 2

    # Single-turn variant, currently disabled:
    # launch_single_turn_attack(is_past, min_row, max_row)
    launch_multiple_turn_attack(
        nb_iterations=nb_iter,
        is_past=is_past,
        min_row=min_row,
        max_row=max_row,
    )

# Output: LLM responses written to tests/llama-2-7b-chat/llama-2-7b-chat_attack_N_3_present.csv.
