In [1]:
import ast
import os

from tensorflow import keras
from keras import layers

import pandas as pd
import tensorflow as tf
import numpy as np

In [2]:
# Load the raw passwords (one per line) and drop exact duplicates.
# NOTE(review): read_csv keeps its default ',' separator and NA handling here;
# confirm train.txt has no passwords containing commas or NA-like tokens.
train_data = pd.read_csv("data/train.txt", header=None, names=['input'])
train_data = train_data.drop_duplicates()

# Keep only passwords whose length lies in [4, 20]
password_lengths = train_data["input"].str.len()
train_data = train_data[(password_lengths >= 4) & (password_lengths <= 20)]

# Target sequence: the password followed by the "\n" end token
train_data["target"] = train_data["input"] + "\n"

# Input sequence: the password preceded by the "\t" start token
train_data["input"] = "\t" + train_data["input"]

# Vocabulary: every character seen in the inputs; joining with "\n" also
# places the end token in the set when there is more than one password.
chars = sorted(set("\n".join(train_data["input"])))

print("Number of passwords:", len(train_data))

vocab_size = len(chars)
print("Total chars:", vocab_size)

# Lookup tables between characters and their integer ids
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

# Encode every sequence as a list of character ids
inputs = [[char_indices[ch] for ch in sequence] for sequence in train_data["input"]]
outputs = [[char_indices[ch] for ch in sequence] for sequence in train_data["target"]]

# Variable-length sequences -> ragged tensors, one-hot encoded over the vocabulary
X_train = tf.one_hot(tf.ragged.constant(inputs), depth=vocab_size)
y_train = tf.one_hot(tf.ragged.constant(outputs), depth=vocab_size)

Number of passwords: 374863
Total chars: 94


In [3]:
# Display the sorted character vocabulary
chars


['\t',
 '\n',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '|',
 '~']

In [4]:
def onehot2indices(hotmax):
    """Return the argmax over the last axis of `hotmax` as a flat NumPy array."""
    best_ids = tf.math.argmax(hotmax, axis=-1)
    return best_ids.numpy().flatten()

def onehot2chars(hotmax):
    """Decode `hotmax` into a list of characters using `indices_char`."""
    decoded = []
    for idx in onehot2indices(hotmax):
        decoded.append(indices_char[idx])
    return decoded

def word2indices(word):
    """Encode `word` as a list of character ids looked up in `char_indices`."""
    encoded = []
    for symbol in word:
        encoded.append(char_indices[symbol])
    return encoded

def indices2word(indices):
    """Decode a sequence of character ids into a string using `indices_char`."""
    pieces = []
    for idx in indices:
        pieces.append(indices_char[idx])
    return ''.join(pieces)

In [5]:
# Sanity check: encode a sample word into vocabulary indices
word2indices("Hello")

[41, 70, 77, 77, 80]

In [6]:
# Show the first five (input -> target) pairs decoded back to characters
for i in range(5):
    decoded_input = onehot2chars(X_train[i])
    decoded_target = onehot2chars(y_train[i])
    print(decoded_input, " -> ", decoded_target)

['\t', '-', '1', '2', '6', '9']  ->  ['-', '1', '2', '6', '9', '\n']
['\t', '0', '.', '1', '2', '3', '4', '5']  ->  ['0', '.', '1', '2', '3', '4', '5', '\n']
['\t', '1', '0', '0', '0']  ->  ['1', '0', '0', '0', '\n']
['\t', '1', '0', '0', '1']  ->  ['1', '0', '0', '1', '\n']
['\t', '1', '0', '0', '2']  ->  ['1', '0', '0', '2', '\n']


In [7]:
def get_model(file = None, epoch = None, architecture = "stacked_lstm"):
    """Load a saved model, or build a fresh one.

    Args:
        file: Model name. When given, the model is loaded from
            "models/{file}/{file}_e{epoch}.h5".
        epoch: Epoch number of the checkpoint to load; required with `file`.
        architecture: Which fresh model to build when `file` is None. One of
            "stacked_lstm" (default), "gru", "lstm", "bidirectional_lstm".

    Returns:
        The loaded or newly built Keras model.

    Raises:
        ValueError: If `file` is given without `epoch`, or if `architecture`
            is not one of the supported names.
    """
    if file is not None:
        if epoch is None:
            # Otherwise the path would silently end in "_eNone.h5".
            raise ValueError("epoch is required when loading a saved model")
        return keras.models.load_model("models/{}/{}_e{}.h5".format(file, file, epoch))

    # Builders are resolved at call time, so cell definition order does not matter.
    if architecture == "stacked_lstm":
        return get_stacked_lstm()
    if architecture == "gru":
        return get_gru()
    if architecture == "lstm":
        return get_lstm()
    if architecture == "bidirectional_lstm":
        return get_bidirectional_lstm()
    raise ValueError("unknown architecture: {!r}".format(architecture))

def _build_recurrent_model(recurrent_layers):
    """Assemble, compile and summarize a sequence model around `recurrent_layers`.

    The model takes variable-length sequences of one-hot vectors of size
    `vocab_size`, applies the given layers in order, and ends with a
    per-timestep softmax over the vocabulary.
    """
    model = keras.Sequential()
    model.add(keras.Input(shape=(None, vocab_size)))
    for recurrent_layer in recurrent_layers:
        model.add(recurrent_layer)
    model.add(layers.Dense(vocab_size, activation="softmax"))

    model.compile(loss="categorical_crossentropy", optimizer="adam")

    model.summary()
    return model

def get_stacked_lstm(units=128):
    """Build a model with two stacked LSTM layers of `units` cells each."""
    return _build_recurrent_model([
        layers.LSTM(units, return_sequences=True),
        layers.LSTM(units, return_sequences=True),
    ])

def get_gru(units=128):
    """Build a model with a single GRU layer of `units` cells."""
    return _build_recurrent_model([layers.GRU(units, return_sequences=True)])

def get_lstm(units=128):
    """Build a model with a single LSTM layer of `units` cells."""
    return _build_recurrent_model([layers.LSTM(units, return_sequences=True)])

def get_bidirectional_lstm(units=128):
    """Build a model with a single bidirectional LSTM layer of `units` cells per direction."""
    return _build_recurrent_model([
        layers.Bidirectional(layers.LSTM(units, return_sequences=True)),
    ])

In [8]:
def generate_word(model, max_length=20):
    r"""Sample one word from `model`, one character at a time.

    Generation starts from the "\t" start token. At each step the whole
    sequence so far is fed to the model and the next character is sampled
    from the softmax output of the last timestep. Sampling stops when the
    "\n" end token is drawn or after `max_length` characters.

    Args:
        model: Model mapping a (1, length, vocab_size) one-hot batch to
            per-timestep probabilities over the vocabulary.
        max_length: Maximum number of characters to generate. Values <= 0
            yield an empty string.

    Returns:
        The generated word, without the start and end tokens.
    """
    out = [char_indices["\t"]]  # Start with \t token

    for _ in range(max_length):
        # One-hot encode with a leading batch dimension: (1, len(out), vocab_size)
        sequence = tf.one_hot([out], depth=vocab_size)
        # Calling the model directly avoids the per-call overhead of
        # predict(), which is meant for large batched inputs.
        probabilities = model(sequence, training=False)
        last_step = probabilities[:, -1, :]  # Last letter softmax probabilities
        sampled = tf.random.categorical(tf.math.log(last_step), 1)  # Sample from softmax
        next_id = int(sampled.numpy()[0, 0])

        if indices_char[next_id] == "\n":  # Stop if \n token is predicted
            break

        out.append(next_id)  # Append predicted letter

    # Remove \t token
    return indices2word(out[1:])
        
def generate_n_words(model, n, max_length=20):
    """Generate `n` words with `generate_word`, printing a progress line for each."""
    def _next_word(index):
        # Progress is rewritten in place thanks to the trailing carriage return
        print("Generating word {}/{}".format(index, n), end="\r")
        return generate_word(model, max_length)

    return [_next_word(index) for index in range(n)]

In [9]:
# def get_closest_word_in_test(word):
#     if word in test_data["input"].values:
#         return (word, 0)

#     score = 999
#     closest_word = ""
#     for index, row in test_data.iterrows():
#         lev_score = lev.distance(word, row["input"])
#         score = min(score, lev_score)
#         if score == lev_score:
#             closest_word = row["input"]
#     return (closest_word, score)

# def get_mean_levenshtein_distance(words):
#     total = 0
#     for word in words:
#         total += get_closest_word_in_test(word)[1]
#     return total / len(words)

# def closests_words(words):
#     scores = []
#     for word in words:
#         scores.append(("", 999))

#     for _, row in test_data.iterrows():
#         for i, word in enumerate(words):
#             lev_score = lev.distance(word, row["input"])
#             if lev_score < scores[i][1]:
#                 scores[i] = (row["input"], lev_score)
#     return scores

# def get_mean_levenshtein_distance(words):
#     tuples = closests_words(words)
#     total = 0
#     for tuple in tuples:
#         total += tuple[1]
#     return total / len(tuples)

In [12]:
# Train the model for one more epoch, resuming from the last saved checkpoint.
batch_size = 32
words_each_epochs = (2000, 5)  # (number of words to sample, sample every N epochs)
model_name = "lstm_h256_b32"

model_dir = "models/{}".format(model_name)
loss_path = "{}/{}_loss.txt".format(model_dir, model_name)

# Read the loss history (one value per finished epoch). A missing or empty
# history means no epoch has been trained yet, so training starts from scratch.
try:
    with open(loss_path, "r") as f:
        raw_loss = f.read().strip().removeprefix("[").removesuffix("]")
    loss = [float(x) for x in raw_loss.split(",") if x.strip()]
except FileNotFoundError:
    loss = []

epoch = len(loss)
print("Starting from epoch {}".format(epoch))

if epoch > 0:
    # Resume from the checkpoint written at the end of the previous epoch
    model = get_model(model_name, epoch - 1)
else:
    model = get_model()

history = model.fit(X_train, y_train, batch_size, epochs=1)

loss.append(history.history["loss"][0])

os.makedirs(model_dir, exist_ok=True)
# Save the model before the metrics: if saving fails, the loss history must
# not already claim an epoch whose checkpoint does not exist.
model.save("models/{}/{}_e{}.h5".format(model_name, model_name, epoch))
# Save metrics to file
with open(loss_path, "w") as f:
    f.write(str(loss))

# Generate words
if epoch % words_each_epochs[1] == 0:
    words = generate_n_words(model, words_each_epochs[0])
    with open("models/{}/{}_e{}_words.txt".format(model_name, model_name, epoch), "w") as f:
        f.write(str(words))


Starting from epoch 73
  649/11715 [>.............................] - ETA: 5:10 - loss: 2.3188

KeyboardInterrupt: 

# Plot

In [25]:
from scipy.interpolate import make_interp_spline, BSpline
import matplotlib
import matplotlib.pyplot as plt
# Switch to the PGF backend (figures are saved as .pgf files further down)
matplotlib.use("pgf")

pgf_settings = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.rcParams.update(pgf_settings)

In [None]:
# Read every password as a literal string: without dtype=str and
# keep_default_na=False, pandas may parse numeric-looking passwords as numbers
# and tokens such as "NA" or "null" as missing values.
train_data = pd.read_csv('data/train.txt', names=['password'], dtype=str, keep_default_na=False)
train_data['password'] = train_data['password'].astype(str)

# with open('models\lstm_h256_b32\lstm_h256_b32_e70_words.txt', 'r') as f:
#     train_data = f.read().replace('[', '').replace(']', '').replace('\'', '').split(', ')

# # to dataframe
# train_data = pd.DataFrame(train_data, columns=['password'])

# # Convert to string
# train_data['password'] = train_data['password'].astype(str)


# Keep passwords of 4 to 13 characters: longer ones are too few, shorter
# ones are PIN codes. The copy makes the column assignments below write to an
# independent frame instead of a slice of train_data.
password_lengths = train_data['password'].str.len()
clean_train_data = train_data[(password_lengths < 14) & (password_lengths > 3)].copy()

# Add length of the password as a feature
clean_train_data['length'] = clean_train_data['password'].str.len()

# Add digit only as a feature
clean_train_data['digit_only'] = clean_train_data['password'].str.isdigit()

# Add letter only as a feature
clean_train_data['letter_only'] = clean_train_data['password'].str.isalpha()

# Add special characters as a feature (True when any character is not alphanumeric)
clean_train_data['special'] = ~clean_train_data['password'].str.isalnum()

## password_types

In [20]:
# Plot stacked bar for digit only, letter only and special characters
# Plot stacked bar for digit only, letter only and special characters

def contains_number(string):
    """Return True if at least one character of `string` is a digit."""
    return any(char.isdigit() for char in string)

def contains_letter(string):
    """Return True if at least one character of `string` is a letter."""
    return any(char.isalpha() for char in string)

def contains_special(string):
    """Return True if at least one character of `string` is not alphanumeric."""
    return any(not char.isalnum() for char in string)

# Not used by this cell; kept in case other cells rely on it.
special = clean_train_data[clean_train_data['special'] == True]

# Evaluate each per-password predicate once and reuse the masks below
has_digit = clean_train_data['password'].apply(contains_number)
has_letter = clean_train_data['password'].apply(contains_letter)
has_special = clean_train_data['password'].apply(contains_special)

length_range = range(4, 14)

def count_by_length(mask):
    """Count the passwords selected by `mask` for every length in `length_range`.

    Lengths with no selected password get a count of 0, so all the resulting
    series share the same index and can be added or subtracted without NaN.
    """
    return clean_train_data[mask].groupby('length').size().reindex(length_range, fill_value=0)

digit_only = count_by_length(clean_train_data['digit_only'] == True)
letter_only = count_by_length(clean_train_data['letter_only'] == True)
letter_and_digit_no_special = count_by_length(has_digit & has_letter & ~has_special)
letter_and_special_no_digit = count_by_length(has_special & has_letter & ~has_digit)
special_and_digit_no_letter = count_by_length(has_special & has_digit & ~has_letter)

# Everything not covered by the categories above. The subtraction is done on
# the reindexed series: subtracting before reindexing gives NaN for any length
# missing from one of the categories.
total = clean_train_data.groupby('length').size().reindex(length_range, fill_value=0)
other = total - digit_only - letter_only - letter_and_digit_no_special - letter_and_special_no_digit - special_and_digit_no_letter

stacked_series = [
    (digit_only, 'Digit only'),
    (letter_only, 'Letter only'),
    (letter_and_digit_no_special, 'Letter and \ndigit no special'),
    (letter_and_special_no_digit, 'Letter and \nspecial no digit'),
    (special_and_digit_no_letter, 'Special and \ndigit no letter'),
    (other, 'Mixed'),
]

plt.figure().set_size_inches(w=3.5, h=3.5)
# Each category is drawn on top of the running total of the previous ones
bottom = np.zeros(len(length_range))
for counts, label in stacked_series:
    plt.bar(counts.index, counts.values, bottom=bottom, label=label)
    bottom = bottom + counts.values
plt.xlabel('Length')
plt.ylabel('Number of passwords')
plt.title('Password types')
plt.legend(fontsize = 6)
plt.savefig('fig/password_types.png')
plt.savefig('fig/password_types.pgf')

## loss

In [22]:
# Plot the training loss per epoch of the stacked LSTM model.
# Forward slashes are used in the paths: they work on every platform, whereas
# backslashes are Windows-only and form invalid escape sequences like "\s".
with open('models/stacked_lstm_h128_b32/stacked_lstm_h128_b32_loss.txt') as f:
    raw_loss = f.read().replace('[', '').replace(']', '')

# Skip empty tokens so an empty history ("[]") gives an empty list
loss = [float(i) for i in raw_loss.split(',') if i.strip()]

plt.figure().set_size_inches(w=3.7, h=3.5)
plt.plot(loss)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.savefig('fig/stacked_lstm_h128_b32_loss.pgf')
plt.savefig('fig/stacked_lstm_h128_b32_loss.png')

## in_train / in_rockyou

In [30]:
# For each saved word list, count how many generated words appear in
# rockyou.txt and in train.txt, then write one row per epoch to Res.csv.
train_data = pd.read_csv("data/train.txt", header=None, names=['input'])
with open('data/rockyou.txt', errors="ignore") as f:
    # A set makes each membership test O(1) instead of a scan over every line
    rockyou_data = set(f.read().splitlines())
train_passwords = set(train_data['input'].values)

# Forward slashes work on every platform and avoid invalid escapes like "\g"
path = "models/gru_h128_b32/"
file_name = "gru_h128_b32_e"
max_epoch = 21  # exclusive upper bound; named to avoid shadowing the builtin max()

results = []
for fi in range(0, max_epoch, 5):
    print("File: ", fi, "/", max_epoch)
    with open(path + file_name + str(fi) + '_words.txt') as f:
        # The file holds str(list_of_words); literal_eval restores the exact
        # strings, whereas stripping "[", "]" and "'" also removed those
        # characters from inside the generated words.
        words = ast.literal_eval(f.read())

    in_test = sum(word in rockyou_data for word in words)
    in_train = sum(word in train_passwords for word in words)
    results.append((fi, in_test, in_train))

# Write all rows at once in "w" mode so re-running the cell does not append
# duplicate epochs to the file.
with open("{}Res.csv".format(path), "w") as f:
    for fi, in_test, in_train in results:
        f.write("{}, {}, {}\n".format(fi, in_test, in_train))


File:  20 / 21
1999/2000

In [31]:
# Load the per-epoch match counts (columns: epoch, matches in rockyou.txt,
# matches in train.txt). skipinitialspace handles the ", " separator used
# when the file is written.
data = pd.read_csv(
    "models/gru_h128_b32/Res.csv",
    header=None,
    names=['Epoch', 'Rockyou', 'Train'],
    skipinitialspace=True,
)
# convert to int
data = data.astype({'Epoch': int, 'Rockyou': int, 'Train': int})
# Keep one row per epoch, in increasing order: the file may hold repeated
# epochs if it was appended to several times.
data = data.drop_duplicates(subset='Epoch', keep='last').sort_values('Epoch').reset_index(drop=True)

In [32]:
# Plot, per epoch, how many generated passwords are found in rockyou.txt and
# in train.txt: raw counts (faint line) plus a cubic-spline smoothing.
x_new = np.linspace(data['Epoch'].min(), data['Epoch'].max(), 300)  # 300 evenly spaced points over the epoch range

fig, ax = plt.subplots(figsize=(3.7, 3.5))

plotted_series = (
    ('Rockyou', 'In rockyou.txt', 'purple'),
    ('Train', 'In train.txt', 'green'),
)
for column, label, color in plotted_series:
    spl = make_interp_spline(data['Epoch'], data[column], k=3)  # BSpline object
    y_smooth = spl(x_new)
    ax.plot(x_new, y_smooth, label=label, color=color, alpha=0.9)
    # Raw values, unlabeled so they do not appear in the legend
    ax.plot(data['Epoch'], data[column], label=None, color=color, alpha=0.4)

ax.set_xlabel('Epoch')
ax.set_ylabel('Number of passwords')
ax.legend()

# Forward slashes work on every platform and avoid invalid escapes like "\g"
fig.savefig('fig/gru_h128_b32_in_test_rockyou.pgf')
fig.savefig('fig/gru_h128_b32_in_test_rockyou.png')