<a href="https://colab.research.google.com/github/HYAD-Yassin/Password_Factory/blob/main/LTSM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read and write files there.
drive.mount('/content/drive')

# Shell command: list the contents of drive/MyDrive/ (relative to the current working directory).
!ls drive/MyDrive/

Mounted at /content/drive
 100K_LSTM_passwords.txt
 10K_LSTM_passwords.txt
 1M_LSTM_passwords.txt
 acoustique_voy_orales_20loc_ESTER_NCCFr_contexte_freqLex_distCentroide.csv
 AllDataSet_Filtred.txt
 AllDataSet.txt
'Alternance (1).gdoc'
 Alternance.gdoc
 archive.zip
 Ashley-Madison_Ini.txt
 Ashley-Madison.txt
'Colab Notebooks'
 DATABASE_Password.zip
 data.zip
 Filtered-Ashley-Madison.txt
 Filtered_PWD.txt
 generated100k_GRU_passwords.txt
 generated100K_LSTM_passwords.txt
 generated10K_LSTM_passwords.txt
 generated1M_GRU_passwords.txt
 generated200K_LSTM_passwords.txt
 generated2_passwords.txt
 generated50K_LSTM_passwords.txt
 generated_GRU_passwords.txt
 generated_LSTM_passwords.txt
 generated_passwords.txt
 gru_model.pth
 histo2.png
 histogram1.png
 Letter.gdoc
 lstm_model2.pth
 lstm_model.pth
 my_model
 nameGeneration.py
 Passwords.txt
 pwd_2Rnn.pth
 pwd_Rnn.pth
 __pycache__
 reduced_Ashley-Madison.txt
 rnn.pt
'Untitled document (1).gdoc'
'Untitled document.gdoc'


# **Dataset**

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import Counter
import os
from torch.utils.data import DataLoader, TensorDataset

# Check if CUDA is available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

file_path = '/content/drive/My Drive/Ashley-Madison.txt'

# Read the data
with open(file_path, 'r', encoding='utf-8') as file:
    passwords = file.read().splitlines()  # Each password is a line

print(f"The total passwords in The DataSet is: {len(passwords)}")


# Preprocessing: Create a dictionary to convert characters to integers and back.
# The vocabulary is sorted so that the character <-> index mapping is identical
# in every session. Iterating a plain set of strings can yield a different order
# each time the interpreter starts (string hashing is randomized), which would
# make a checkpoint saved in one session decode to the wrong characters after
# being reloaded in another.
# NOTE: checkpoints trained with the previous unsorted mapping do not match
# this ordering and need to be retrained.
all_chars = ''.join(sorted(set(''.join(passwords))))
n_characters = len(all_chars)
char_to_int = {char: i for i, char in enumerate(all_chars)}
int_to_char = {i: char for i, char in enumerate(all_chars)}

The total passwords in The DataSet is: 338333


# **LSTM Model**

In [None]:
class LSTM(nn.Module):
    """Character-level LSTM that consumes one input vector per call.

    Layers (attribute names are part of the saved state dict and must not change):
        lstm    -- stacked ``nn.LSTM`` with ``n_layers`` layers
        i2o     -- linear projection from the hidden state to ``output_size`` scores
        dropout -- dropout (p=0.1) applied to the projected scores
        softmax -- ``LogSoftmax`` over the score dimension

    Args:
        input_size: length of each input vector.
        hidden_size: size of the LSTM hidden and cell states.
        output_size: number of output classes.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=n_layers)
        self.i2o = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run a single time step.

        Args:
            input: tensor holding exactly one input vector; it is reshaped to
                ``(1, 1, -1)`` before being fed to the LSTM.
            hidden: ``(hidden_state, cell_state)`` tuple as returned by
                ``init_hidden`` or by a previous call.

        Returns:
            ``(output, hidden)`` where ``output`` has shape ``(1, output_size)``
            and contains log-probabilities, and ``hidden`` is the updated
            ``(hidden_state, cell_state)`` tuple.
        """
        # One step, one sequence: (seq_len=1, batch=1, features).
        lstm_out, hidden = self.lstm(input.view(1, 1, -1), hidden)
        output = self.i2o(lstm_out.view(1, -1))
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        """Return zeroed ``(hidden_state, cell_state)`` for a batch of one.

        The tensors are created with the same device and dtype as the model's
        parameters, so they can be passed to ``forward`` directly. Callers that
        still call ``.to(device)`` on the result keep working (it is a no-op).
        """
        reference = self.i2o.weight
        shape = (self.n_layers, 1, self.hidden_size)
        return (torch.zeros(shape, dtype=reference.dtype, device=reference.device),
                torch.zeros(shape, dtype=reference.dtype, device=reference.device))

# **Training Loop:**

In [None]:
def char_tensor(string):
    """One-hot encode ``string`` into a ``(len(string), n_characters)`` float tensor.

    Row ``k`` is all zeros except for a 1 at the column given by
    ``char_to_int`` for the k-th character. Relies on the module-level
    ``n_characters`` and ``char_to_int``; a character missing from
    ``char_to_int`` raises ``KeyError``.
    """
    one_hot = torch.zeros(len(string), n_characters)
    for position, symbol in enumerate(string):
        column = char_to_int[symbol]
        one_hot[position][column] = 1
    return one_hot

# Training function
def train_on_full_dataset(passwords, lstm, criterion, optimizer):
    """Train ``lstm`` for one pass over ``passwords``, one password per optimizer step.

    For each password the model is fed characters ``0..n-2`` one at a time and
    is asked to predict characters ``1..n-1``; the per-character losses are
    summed, back-propagated once, and the optimizer takes one step.

    Passwords with fewer than two characters provide no (input, target) pair
    and are skipped. Previously they left ``loss`` as the integer ``0`` and
    crashed on ``loss.backward()``.

    Args:
        passwords: iterable of strings whose characters are all known to
            ``char_tensor``.
        lstm: model exposing ``init_hidden()`` and ``forward(input, hidden)``.
        criterion: loss called as ``criterion(output, target_index)``.
        optimizer: optimizer over the model parameters.

    Returns:
        Mean per-character loss averaged over the passwords that were actually
        trained on, or ``0.0`` when none were usable.
    """
    # Make sure dropout is active even if an earlier cell called lstm.eval().
    lstm.train()

    total_loss = 0.0
    trained_count = 0

    for password in passwords:
        if len(password) < 2:
            continue  # nothing to predict from a 0- or 1-character line

        input_line_tensor = char_tensor(password[:-1]).to(device)  # Move to device
        # Class index of every target character, computed once per password
        # instead of one argmax per time step.
        target_indices = char_tensor(password[1:]).to(device).argmax(dim=1)

        hidden, cell = lstm.init_hidden()
        hidden = hidden.to(device)  # Move to device
        cell = cell.to(device)  # Move to device

        lstm.zero_grad()
        loss = 0
        seq_len = input_line_tensor.size(0)

        for i in range(seq_len):
            input_tensor = input_line_tensor[i].unsqueeze(0)  # already one-hot encoded
            target_char = target_indices[i].unsqueeze(0)

            output, (hidden, cell) = lstm(input_tensor, (hidden, cell))
            loss = loss + criterion(output, target_char)

        loss.backward()
        optimizer.step()

        total_loss += loss.item() / seq_len
        trained_count += 1

    return total_loss / trained_count if trained_count else 0.0

# Initialize network, optimizer, and loss function
# n_characters is recomputed from char_to_int here; it sizes both the one-hot
# input and the output layer of the model.
n_characters = len(char_to_int)  # Number of unique characters
hidden_size = 128
output_size = n_characters
n_layers = 2  # Number of LSTM layers
lstm = LSTM(n_characters, hidden_size, output_size, n_layers).to(device)
optimizer = optim.Adam(lstm.parameters(), lr=0.005)
# NOTE(review): the LSTM class defined earlier already ends in LogSoftmax, while
# nn.CrossEntropyLoss is documented to expect unnormalized scores; nn.NLLLoss
# is the usual pairing for log-probabilities — confirm which is intended.
criterion = nn.CrossEntropyLoss()

# Training loop
# Each epoch calls train_on_full_dataset, which performs one optimizer step per
# password, and prints the value it returns.
n_epochs = 2
for epoch in range(1, n_epochs + 1):
    loss = train_on_full_dataset(passwords, lstm,  criterion, optimizer)
    print(f'Epoch: {epoch} of {n_epochs}, Loss: {loss:.4f}')


KeyboardInterrupt: 

In [None]:
# Persist the trained weights, then reload them and switch to inference mode.
checkpoint_path = '/content/drive/My Drive/lstm_model2.pth'
torch.save(lstm.state_dict(), checkpoint_path)

# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU runtime can still be loaded on a CPU-only one.
lstm.load_state_dict(torch.load(checkpoint_path, map_location=device))
lstm.eval()

LSTM(
  (lstm): LSTM(92, 128, num_layers=2)
  (i2o): Linear(in_features=128, out_features=92, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (softmax): LogSoftmax(dim=1)
)

In [None]:
# Builds a second, freshly initialised LSTM with the same sizes as `lstm`.
# NOTE(review): `rnn` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
rnn = LSTM(n_characters, hidden_size, n_characters, n_layers).to(device)


# **Passwords Generation**

In [None]:
import random
import torch

def generate_password(model, temperature, all_chars, char_to_int, device, int_to_char):
    """Sample one password from ``model``, character by character.

    A random start character is drawn from ``all_chars``, then between 6 and 15
    further characters are sampled, so the result is 7 to 16 characters long.

    Args:
        model: network exposing ``init_hidden()`` and ``forward(input, hidden)``
            that returns ``(log_probabilities, (hidden, cell))``.
        temperature: positive sampling temperature; the model output is divided
            by it before exponentiation, so lower values sharpen the
            distribution.
        all_chars: sequence of characters to pick the start character from.
        char_to_int: mapping from character to one-hot column index; its length
            defines the one-hot width.
        device: torch device on which inference tensors are created.
        int_to_char: mapping from sampled index back to character.

    Returns:
        The generated password as a string.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")

    vocab_size = len(char_to_int)

    def one_hot(char):
        # Same (1, vocab_size) shape on every step, encoded with the mapping
        # that was passed in rather than a module-level global.
        vector = torch.zeros(1, vocab_size, device=device)
        vector[0, char_to_int[char]] = 1
        return vector

    model.eval()
    hidden, cell = model.init_hidden()
    hidden = hidden.to(device)  # move to correct device
    cell = cell.to(device)  # move to correct device

    start_char = random.choice(all_chars)
    predict_len = random.randint(6, 15)

    generated = [start_char]
    current = one_hot(start_char)

    # Inference only: no_grad avoids building an autograd graph that would
    # otherwise grow through the carried hidden state at every step.
    with torch.no_grad():
        for _ in range(predict_len):
            output, (hidden, cell) = model(current, (hidden, cell))
            weights = output.view(-1).div(temperature).exp()
            # Floor every weight at 1e-4 so no character ends up with zero
            # sampling weight; the weights are renormalized right after.
            weights = torch.clamp(weights, min=0.0001, max=1.0)
            weights = weights / torch.sum(weights)
            next_index = torch.multinomial(weights, 1)[0].item()

            next_char = int_to_char[next_index]
            generated.append(next_char)
            current = one_hot(next_char)

    return ''.join(generated)

def generate_passwords_to_file(model, num_pwd, file_path, temperature, all_chars, char_to_int, device, int_to_char):
    """Generate ``num_pwd`` passwords and write them to ``file_path``, one per line.

    Any existing file at ``file_path`` is overwritten. The model is moved to
    ``device`` first; every other argument is forwarded unchanged to
    ``generate_password``.

    Args:
        model: trained network passed to ``generate_password``.
        num_pwd: number of passwords (lines) to write.
        file_path: destination text file.
        temperature: sampling temperature forwarded to ``generate_password``.
        all_chars: characters the start character is drawn from.
        char_to_int: character-to-index mapping.
        device: torch device used for inference.
        int_to_char: index-to-character mapping.
    """
    model.to(device)
    # The training data is read as UTF-8, so generated passwords may contain
    # non-ASCII characters; write with the same explicit encoding instead of
    # the platform default.
    with open(file_path, 'w', encoding='utf-8') as file:
        for _ in range(num_pwd):
            password = generate_password(model, temperature, all_chars, char_to_int, device, int_to_char)
            file.write(password + '\n')

# Example usage:
# Writes num_pwd sampled passwords, one per line, to output_file_path using the
# `lstm` model and a sampling temperature of 0.85.
num_pwd = 1000000
output_file_path = '/content/drive/My Drive/generated1M_LSTM_passwords.txt'
generate_passwords_to_file(lstm, num_pwd, output_file_path, 0.85, all_chars, char_to_int, device, int_to_char)


# **Calculate Accuracy**

In [None]:
def calculate_accuracy_and_matches(original_dataset_path, generated_file_path):
    """Compare generated passwords against a reference dataset.

    Both files are read as UTF-8 text with one password per line, matching how
    the training data is read elsewhere in this notebook.

    Args:
        original_dataset_path: path to the reference password list.
        generated_file_path: path to the generated password list.

    Returns:
        ``(accuracy, matches)`` where ``matches`` lists every generated line
        that also appears in the reference file (duplicates kept, in file
        order) and ``accuracy`` is the number of *distinct* matched passwords
        divided by the total number of generated lines, as a percentage.
        ``accuracy`` is ``0`` when the generated file is empty.
    """
    # Load the original dataset into a set for constant-time membership tests
    with open(original_dataset_path, 'r', encoding='utf-8') as file:
        original_passwords = set(file.read().splitlines())

    # Load generated passwords
    with open(generated_file_path, 'r', encoding='utf-8') as file:
        generated_passwords = file.read().splitlines()

    # Find full-line matches
    matches = [password for password in generated_passwords if password in original_passwords]
    unique_matches = set(matches)

    if not generated_passwords:
        return 0, matches

    accuracy = (len(unique_matches) / len(generated_passwords)) * 100
    return accuracy, matches

# Paths to the files
# NOTE(review): these names differ from the dataset and output paths used in
# the earlier cells — confirm they point at the intended files.
original_dataset_path = '/content/drive/My Drive/Ashley-Madison_Ini.txt'  # Modify as needed
generated_file_path = '/content/drive/My Drive/100K_RNN_Passwords.txt'  # Modify as needed

# Calculate accuracy and get matches
accuracy, matching_passwords = calculate_accuracy_and_matches(original_dataset_path, generated_file_path)
unique_matching_passwords = sorted(set(matching_passwords))

print(f"Accuracy: {accuracy:.2f}%")
print("Matching Passwords Number:", len(matching_passwords))
print("Unique Matching Passwords Number:", len(unique_matching_passwords))
# Show only a small sample: the matches are real passwords from the reference
# dataset, and dumping the whole set bloats the notebook and stores them in its
# saved output.
print("Matching Passwords (first 20):", unique_matching_passwords[:20])

Accuracy: 1.41%
Matching Passwords Number: 9282
Matching Passwords: {'one', '29', 'cola', 'sss3', 'conn', 'pareja', 'cum', 'swt', 'R2', '8757', '96581', 'cera', 's', 'res', 'aff', '1111', '1212', '5903', '110', 'wa', 'care', 'ramadi', '9000', '621', '813', 'tecone', 'ella', 'tire', 'mare', '9448', 'me420', '185', 'gf', 'bone', '3232', '139', 'hamada', '712', 'nani', 'Chen', '121', '469', 'romy', '90', '4570', 'go', 'mrtr', '8083', '74720', 'fang', '3204', '10', '111', '97', '727', 'donde', 'vase', 'chonch', 'jaci', 'miken', 'm', 'ssssss', 'ro', 'nerd', 'sones', '223', 'AS19', '125', 'have', '730', 'ndes', 'lll', '269', 'rocko', '2435', 'lonis', 'nebo', 'nole', 'hire', '3467', '96', '80505', '990', '8995', 'md', '7012', '1009', 'rolla', 'gene', 'febe', 'mack', '814', 'tona', 'sabi', 'adee', 'mally', '217', 'lena', '1220', '2002', 'monet', 'fliar', '2113', '2133', 'mandi', '803', '3007', 'roe', 'assss', '3442', 'wetone', '1245', 're1121', 'nay', '8812', 'cman', '324', 'sowa', 'mone', '52