<a href="https://colab.research.google.com/github/Journalwere/wordlist_generator/blob/main/wordlist_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch
!pip install transformers
!pip install tqdm

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.19.1-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.1/311.1 kB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m76.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.2 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Nov 13 23:29:54 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    23W / 300W |      0MiB / 16384MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from google.colab import drive
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import torch

# Specify the path to your wordlist file in Google Drive
file_path = '/content/drive/MyDrive/training_wordlist/directory-list-2.3-small.txt'  # Update with the actual path to your file

# Read data from the file
with open(file_path, 'r', encoding='utf-8') as file:
    wordlist_data = file.readlines()

# Tokenize the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokenized_data = tokenizer(wordlist_data, return_tensors='pt', padding=True, truncation=True)

# Prepare data for training
input_ids = tokenized_data['input_ids']
labels = input_ids.clone()

# Create a masked language model (MLM) task
for i in range(len(labels)):
    # Randomly mask 15% of the tokens in each sequence
    mask_indices = torch.randperm(input_ids.size(1))[:int(0.15 * input_ids.size(1))]
    labels[i, mask_indices] = -100  # Set masked tokens to -100

dataset = TensorDataset(input_ids, labels)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Load pre-trained BERT model
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Fine-tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Training loop
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    for batch in tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}"):
        # Ensure that batch is a dictionary
        if isinstance(batch, list):
            batch = {f: t for f, t in zip(['input_ids', 'labels'], batch)}

        batch = {k: v.to(device) for k, v in batch.items()}

        inputs = batch['input_ids']
        attention_mask = (inputs != tokenizer.pad_token_id).float()
        labels = batch['labels']

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss}")

# Save the fine-tuned model
model.save_pretrained('fine_tuned_model')

# Use the fine-tuned model to generate wordlists
seed_word = "security"
input_ids = tokenizer.encode(seed_word, return_tensors='pt').to(device)
output = model.generate(input_ids, max_length=20, num_beams=5)

generated_wordlist = tokenizer.decode(output[0], skip_special_tokens=True)
print(f"Generated Wordlist: {generated_wordlist}")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Epoch 1/5: 100%|██████████| 10958/10958 [09:09<00:00, 19.93it/s]


Epoch 1/5, Average Loss: 0.046851433631868035


Epoch 2/5: 100%|██████████| 10958/10958 [09:06<00:00, 20.05it/s]


Epoch 2/5, Average Loss: 0.0001693724221622212


Epoch 3/5: 100%|██████████| 10958/10958 [09:15<00:00, 19.72it/s]


Epoch 3/5, Average Loss: 1.404376186541544e-05


Epoch 4/5: 100%|██████████| 10958/10958 [09:15<00:00, 19.72it/s]


Epoch 4/5, Average Loss: 9.061287925672245e-06


Epoch 5/5: 100%|██████████| 10958/10958 [09:11<00:00, 19.87it/s]


Epoch 5/5, Average Loss: 1.378801739792272e-06
Generated Wordlist: security


In [None]:
from transformers import BertForMaskedLM, BertTokenizer

# Load the fine-tuned model
model = BertForMaskedLM.from_pretrained('/content/drive/MyDrive/fine_tuned_model')

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Seed word or phrase
seed_word = "images"

# Encode the seed word
input_ids = tokenizer.encode(seed_word, return_tensors='pt')

# Generate multiple wordlists using the model
output = model.generate(input_ids, max_length=50, num_beams=5, num_return_sequences=3)

# Decode and print the generated wordlists
for i, sequence in enumerate(output):
    generated_wordlist = tokenizer.decode(sequence, skip_special_tokens=True)
    print(f"Generated Wordlist {i + 1}: {generated_wordlist}")

Generated Wordlist 1: images
Generated Wordlist 2: images
Generated Wordlist 3: imagesed
