In [None]:
# Install required libraries
!pip install transformers pandas requests



In [None]:
import pandas as pd

# Load the CSV file, skipping lines with errors
df = pd.read_csv('/content/cti-mcq.tsv', sep='\t')

# Print the first few rows of the DataFrame
df.head()

Unnamed: 0,URL,Question,Option A,Option B,Option C,Option D,Prompt,GT
0,https://attack.mitre.org/techniques/T1548/,Which of the following mitigations involves pr...,Audit,Execution Prevention,Operating System Configuration,User Account Control,You are given a multiple-choice question (MCQ)...,B
1,https://attack.mitre.org/techniques/T1548/,Which data source is recommended for monitorin...,Command,File,Process,User Account,You are given a multiple-choice question (MCQ)...,D
2,https://attack.mitre.org/techniques/T1548/,What does mitigation ID M1028 suggest to preve...,Limiting privileges of cloud accounts,Preventing unsigned applications from running,Minimizing applications with setuid or setgid ...,Enforcing the highest UAC level,You are given a multiple-choice question (MCQ)...,C
3,https://attack.mitre.org/techniques/T1548/,Which process creation is an indicator of pote...,C:\Windows\System32\services.exe,C:\Windows\System32\cmd.exe,C:\Windows\System32\rundll32.exe,C:\Windows\System32\notepad.exe,You are given a multiple-choice question (MCQ)...,B
4,https://attack.mitre.org/techniques/T1548/,"In a Linux environment, what is recommended to...",Monitor Windows Registry Key Modification,Monitor OS API Execution,Monitor file metadata for setuid or setgid bit...,Audit process metadata changes,You are given a multiple-choice question (MCQ)...,C


In [None]:
import os
from huggingface_hub import login

# Log in using your Hugging Face API token
login(token="")



The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
# Initialize the Model with Authentication
from transformers import pipeline

# Initialize the pipeline
pipe = pipeline("text-generation", model="meta-llama/Meta-Llama-3-8B")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [None]:
df = df.drop(['URL', 'GT'], axis=1)
df.head()

Unnamed: 0,Question,Option A,Option B,Option C,Option D,Prompt
0,Which of the following mitigations involves pr...,Audit,Execution Prevention,Operating System Configuration,User Account Control,You are given a multiple-choice question (MCQ)...
1,Which data source is recommended for monitorin...,Command,File,Process,User Account,You are given a multiple-choice question (MCQ)...
2,What does mitigation ID M1028 suggest to preve...,Limiting privileges of cloud accounts,Preventing unsigned applications from running,Minimizing applications with setuid or setgid ...,Enforcing the highest UAC level,You are given a multiple-choice question (MCQ)...
3,Which process creation is an indicator of pote...,C:\Windows\System32\services.exe,C:\Windows\System32\cmd.exe,C:\Windows\System32\rundll32.exe,C:\Windows\System32\notepad.exe,You are given a multiple-choice question (MCQ)...
4,"In a Linux environment, what is recommended to...",Monitor Windows Registry Key Modification,Monitor OS API Execution,Monitor file metadata for setuid or setgid bit...,Audit process metadata changes,You are given a multiple-choice question (MCQ)...


In [None]:
!pip install tqdm
from tqdm import tqdm



In [11]:
def format_input(row):
    question = row['Question']
    options = f"Option A: {row['Option A']}, Option B: {row['Option B']}, Option C: {row['Option C']}, Option D: {row['Option D']}"
    return {"question": question, "options": options}

formatted_inputs = df.head(5).apply(format_input, axis=1)

In [12]:
def generate_responses(formatted_inputs, batch_size=10):
    results = []
    num_batches = len(formatted_inputs) // batch_size + 1
    for i in tqdm(range(num_batches)):
        batch = formatted_inputs[i*batch_size:(i+1)*batch_size]
        for item in batch:
            question = item['question']
            options = item['options']
            responses = {}
            for option in options.split(", "):
                input_text = f"{question} {option}"
                response = pipe(input_text, max_length=50, num_return_sequences=1)[0]['generated_text']
                responses[option] = response

            # Implement a better scoring mechanism
            # For simplicity, we'll use the length of the generated text as a heuristic
            best_answer = max(responses, key=lambda x: len(responses[x]))
            results.append({"question": question, "best_answer": best_answer})
    return results

# Generate responses for the dataset
results = generate_responses(formatted_inputs)
print(results)


  0%|          | 0/1 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_

[{'question': "Which of the following mitigations involves preventing applications from running that haven't been downloaded from legitimate repositories?", 'best_answer': 'Option C: Operating System Configuration'}, {'question': 'Which data source is recommended for monitoring commands that may circumvent mechanisms designed to control elevation of privileges?', 'best_answer': 'Option C: Process'}, {'question': 'What does mitigation ID M1028 suggest to prevent privilege escalation exploits on a system?', 'best_answer': 'Option A: Limiting privileges of cloud accounts'}, {'question': 'Which process creation is an indicator of potential SYSTEM privilege escalation according to the detection section?', 'best_answer': 'Option D: C:\\Windows\\System32\\notepad.exe'}, {'question': 'In a Linux environment, what is recommended to monitor for detecting privilege escalation via sudo?', 'best_answer': 'Option A: Monitor Windows Registry Key Modification'}]





In [None]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save to CSV
results_df.to_csv('path_to_save_results.csv', index=False)
