# 1. Install Dependencies and Setup

In [None]:
!pip install tensorflow

In [1]:
import tensorflow as tf
import os

## 1.1 Prevent OOM Errors

In [2]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Memory growth can only be changed before the GPUs have been initialized
    # (e.g. this cell was executed after other TensorFlow work in the same
    # kernel). Report it instead of aborting the rest of the notebook.
    print(f'Could not enable GPU memory growth: {err}')

# 2. Load Data

In [3]:
# Locations of the two password word lists inside the local `data` directory.
password_path, password_dup_path = (
    os.path.join('data', filename)
    for filename in ('password-list-1.txt', 'password-list-dup.txt')
)

In [16]:
def read_file(file_path):
    """Read a text file and return its whitespace-separated tokens.

    Args:
        file_path: Path of the file; it is opened in text read mode through
            ``tf.io.gfile.GFile``.

    Returns:
        A list of strings produced by ``str.split()`` with no arguments, i.e.
        the file content split on any run of whitespace (spaces, tabs,
        newlines). An entry that itself contains a space therefore becomes
        several tokens.
    """
    with tf.io.gfile.GFile(file_path, 'r') as handle:
        tokens = handle.read().split()
    return tokens

In [20]:
# Load both word lists; each result is the token list returned by read_file().
password_content, password_dup_content = (
    read_file(path) for path in (password_path, password_dup_path)
)

In [24]:
# Single combined list: entries of the first file followed by those of the second.
passwords = [*password_content, *password_dup_content]

In [None]:
# Preview only the first few entries: the full list can be large, and the
# rendered output is saved (and shared) together with the notebook.
passwords[:10]

## 2.1 Preprocess

In [39]:
from tensorflow.keras.layers import TextVectorization

In [40]:
# Value passed as `max_tokens` to the TextVectorization layer created below.
MAX_FEATURES = 200_000

In [41]:
# Text-to-integer encoder for the password corpus.
# NOTE(review): no `standardize`/`split` arguments are given, so the layer's
# defaults apply — confirm they are appropriate for password strings.
vectorizer = TextVectorization(
    output_sequence_length=1000,  # fixed length requested for every encoded row
    output_mode='int',            # emit integer token ids
    max_tokens=MAX_FEATURES,      # vocabulary size cap
)

In [None]:
# Fit the vectorizer on the combined password list (presumably this builds the
# token vocabulary, capped by max_tokens — confirm against the Keras docs).
vectorizer.adapt(passwords)

In [None]:
# Sanity check: encode one sample string and show the first five ids.
# NOTE(review): the layer was created without a `standardize` argument, so its
# default text standardization applies; presumably that lower-cases and strips
# punctuation (so 'abc-123' would be looked up as 'abc123') — confirm.
vectorizer('password abc-123')[:5]

In [None]:
# Encode the full password list in a single call.
# assumes the result has shape (len(passwords), 1000) given
# output_sequence_length=1000 — TODO confirm
vectorized_text = vectorizer(passwords)

In [None]:
# Display the encoded output for inspection.
vectorized_text

In [None]:
# Number of entries in the combined list (both files together).
len(passwords)

In [None]:
# Input pipeline: slice -> cache -> shuffle -> batch -> prefetch.
original_dataset = tf.data.Dataset.from_tensor_slices(vectorized_text)
cached_dataset = original_dataset.cache()
# reshuffle_each_iteration=False pins ONE shuffled order for every pass over the
# dataset. With the default (True) a new order is drawn on each iteration, so
# subsets carved out of `dataset` with take()/skip() would contain different,
# overlapping examples every time they are iterated (train/val/test leakage).
# Trade-off: the order is no longer re-drawn between epochs; reshuffle the
# training subset itself if per-epoch shuffling is wanted.
shuffle_dataset = cached_dataset.shuffle(200_000, reshuffle_each_iteration=False)
batched_dataset = shuffle_dataset.batch(32)

dataset = batched_dataset.prefetch(16)

In [None]:
# Pull a single batch (as NumPy) from a fresh iterator over the dataset.
batch_X = next(dataset.as_numpy_iterator())

In [None]:
# Batch counts at 90 / 70 / 20 / 10 percent of the dataset length.
# int() truncates, so these counts need not add up to the total number of batches.
num_batches = len(dataset)
percent_90, percent_70, percent_20, percent_10 = (
    int(num_batches * fraction) for fraction in (.9, .7, .2, .1)
)

In [None]:
# Carve the batched dataset into train / validation / test ranges by position:
# the first percent_70 batches, the next percent_20, and percent_10 batches
# starting at offset percent_90.
# NOTE(review): the three subsets are only disjoint if `dataset` yields its
# batches in the same order on every iteration — verify how it is shuffled.
held_out = dataset.skip(percent_70)  # everything after the training batches
train = dataset.take(percent_70)
val = held_out.take(percent_20)
test = dataset.skip(percent_90).take(percent_10)

In [None]:
# Iterator over the training subset that yields its batches as NumPy values.
train_generator = train.as_numpy_iterator()

In [None]:
# Advance the iterator by one step and display the batch it yields.
next(train_generator)