# Password Strength AI - Kaggle Training

This notebook reproduces the training pipeline for the Password Strength AI project. It is designed for Kaggle and walks through loading the data, preparing balanced train/test splits, training the neural network, exporting TensorFlow Lite files, and trying the model interactively.

## How to use this notebook

- Upload or attach a Kaggle dataset that contains the `pass_with_strength.csv` file used by the project.
- Ensure the cracked passwords URL is accessible from Kaggle (the default SecLists URL works at the time of writing).
- Adjust the configuration cell below (paths, max password length, batch size, etc.) before running the workflow top to bottom.
- When the run completes, download the artifacts from the Kaggle `Output` tab (the Keras SavedModel directory, the `model.tflite` export, and `training_history.csv`).

In [None]:
# Uncomment the next line if you need to install or pin specific package versions.
# !pip install --quiet tensorflow==2.15.0 keras==2.15.0 pandas

In [None]:
import os
import random
import string
from urllib.request import urlopen

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

# Seed Python's `random`, NumPy and TensorFlow with the same value so that the
# shuffles and model initialisation below are repeatable between runs.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm
# whether exact run-to-run reproducibility is required.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Record the TensorFlow version in the notebook output for later debugging.
print(tf.__version__)

In [None]:
# ---------------------------------------------------------------------------
# Configuration (edit to match your Kaggle dataset and desired hyperparameters)
# ---------------------------------------------------------------------------
# Labelled passwords; each row is parsed as `password,strength`.
CSV_PATH = "/kaggle/input/pass-strength-ai/pass_with_strength.csv"  # Update to match your input dataset path
# Newline-separated list of known-cracked passwords, used for the "Cracked" class.
CRACKED_PASSWORDS_URL = "https://raw.githubusercontent.com/danielmiessler/SecLists/refs/heads/master/Passwords/Common-Credentials/xato-net-10-million-passwords-100000.txt"

# Passwords longer than this are dropped while loading; it is also the number
# of time steps in the model input.
PASSWORD_MAX_LENGTH = 12
# Fraction of each class held out of training.
TEST_RATIO = 0.25
MAX_SAMPLES_PER_CLASS = 40000  # Set to None to use every available sample up to the balanced count

# Training hyperparameters.
BATCH_SIZE = 512
EPOCHS = 75      # Upper bound; early stopping may end training sooner.
PATIENCE = 12    # Epochs without `val_loss` improvement before stopping.
LEARNING_RATE = 1e-3

# Every artifact (training history, saved model, TFLite file) is written here.
OUTPUT_DIR = "/kaggle/working/pass_strength_model"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Artifacts will be saved to: {OUTPUT_DIR}")

In [None]:
# Alphabet the encoder understands: ASCII letters, digits and punctuation.
# A character's position in this string is its one-hot column.
CHARS = "".join((string.ascii_letters, string.digits, string.punctuation))

# Class index -> human-readable label; index 0 is the known-cracked class.
LABEL_LOOKUP = dict(
    enumerate(
        (
            "Cracked",
            "Ridiculous",
            "Weak",
            "Moderate",
            "Strong",
            "Very strong",
        )
    )
)

def load_cracked_passwords(url: str, max_length: int, timeout: float = 30.0) -> list[str]:
    """Download a newline-separated password list and keep the usable entries.

    Args:
        url: Location of the word list; any URL ``urlopen`` can open.
        max_length: Longest password, in characters, that is kept.
        timeout: Seconds a blocking network operation may take before the
            download is aborted. Without it a stalled connection would hang
            the notebook indefinitely.

    Returns:
        The whitespace-stripped, non-empty passwords of at most ``max_length``
        characters, in the order they appear in the source.

    Raises:
        OSError: Propagated from ``urlopen`` when the URL cannot be fetched
            (``urllib.error.URLError`` and timeouts are subclasses).
    """
    passwords = []
    with urlopen(url, timeout=timeout) as response:
        # Stream line by line instead of buffering the whole body first.
        for raw_line in response:
            # Undecodable bytes are dropped rather than failing the download.
            password = raw_line.decode("utf-8", errors="ignore").strip()
            if password and len(password) <= max_length:
                passwords.append(password)
    return passwords


def load_csv_passwords(csv_path: str, max_length: int) -> dict[int, list[str]]:
    """Read ``password,strength`` rows and bucket the passwords by strength.

    Rows are parsed leniently: blank lines, rows without a comma, rows whose
    strength is not a decimal integer in ``0..4`` (this also skips a header
    row), empty passwords and passwords longer than ``max_length`` are all
    ignored.

    Args:
        csv_path: Path of the CSV file to read.
        max_length: Longest password, in characters, that is kept.

    Returns:
        A dict with the keys ``0`` to ``4``; each value is the list of
        passwords with that strength, shuffled with the ``random`` module.

    Raises:
        OSError: If ``csv_path`` cannot be opened.
    """
    passwords_by_strength: dict[int, list[str]] = {i: [] for i in range(5)}
    with open(csv_path, "r", encoding="utf-8", errors="ignore") as csv_file:
        for raw_line in csv_file:
            line = raw_line.strip()
            # Split on the LAST comma only: passwords may contain commas themselves.
            password, separator, strength_raw = line.rpartition(",")
            if not separator:
                continue
            strength_raw = strength_raw.strip()
            # isdecimal() only accepts characters int() can parse. isdigit()
            # also accepts e.g. superscript digits, which made int() raise
            # ValueError and aborted the whole load on a single stray row.
            if not strength_raw.isdecimal():
                continue
            bucket = passwords_by_strength.get(int(strength_raw))
            if bucket is None:
                continue
            # Empty passwords carry no signal; the cracked-password loader
            # drops them as well.
            if password and len(password) <= max_length:
                bucket.append(password)

    # Shuffle so that slicing a prefix later yields a random sample.
    for bucket in passwords_by_strength.values():
        random.shuffle(bucket)

    return passwords_by_strength


def build_balanced_dataset(
    csv_passwords: dict[int, list[str]],
    cracked_passwords: list[str],
    max_length: int,
    test_ratio: float,
    max_samples_per_class: int | None = None,
):
    class_passwords = {
        0: cracked_passwords,
        1: csv_passwords.get(1, []),
        2: csv_passwords.get(2, []),
        3: csv_passwords.get(3, []),
        4: csv_passwords.get(4, []),
        5: csv_passwords.get(4, []),  # Reuse strongest bucket for the "Very strong" label
    }

    candidate_lengths = [len(passwords) for label, passwords in class_passwords.items() if label != 0 and len(passwords) > 0]
    if not candidate_lengths:
        raise ValueError("No CSV passwords available to build the dataset. Check CSV_PATH and max length filters.")

    base_limit = min(candidate_lengths)
    if max_samples_per_class is not None:
        base_limit = min(base_limit, max_samples_per_class)

    train_passwords, train_labels = [], []
    test_passwords, test_labels = [], []

    for label, passwords in class_passwords.items():
        if not passwords:
            continue
        selected = passwords[:base_limit]
        test_count = max(1, int(len(selected) * test_ratio))
        if test_count >= len(selected):
            test_count = max(1, len(selected) // 2)

        test_passwords.extend(selected[:test_count])
        test_labels.extend([label] * test_count)

        train_passwords.extend(selected[test_count:])
        train_labels.extend([label] * (len(selected) - test_count))

    train_combined = list(zip(train_passwords, train_labels))
    random.shuffle(train_combined)
    if train_combined:
        train_passwords, train_labels = zip(*train_combined)
    else:
        train_passwords, train_labels = [], []

    test_combined = list(zip(test_passwords, test_labels))
    random.shuffle(test_combined)
    if test_combined:
        test_passwords, test_labels = zip(*test_combined)
    else:
        test_passwords, test_labels = [], []

    if not train_passwords or not test_passwords:
        raise ValueError("Failed to generate train/test splits; try lowering PASSWORD_MAX_LENGTH or MAX_SAMPLES_PER_CLASS.")

    return list(train_passwords), list(train_labels), list(test_passwords), list(test_labels)


def one_hot_encode(password: str, max_length: int) -> np.ndarray:
    """Encode one password as a ``(max_length, len(CHARS))`` float32 matrix.

    Row ``i`` holds a single 1.0 in the column of the ``i``-th character.
    Characters that are not in ``CHARS`` leave their row all zeros, as do
    positions past the end of the password. Input beyond ``max_length``
    characters is ignored.
    """
    matrix = np.zeros((max_length, len(CHARS)), dtype=np.float32)
    for position, symbol in enumerate(password[:max_length]):
        column = CHARS.find(symbol)
        # find() reports -1 for characters outside the alphabet.
        if column >= 0:
            matrix[position, column] = 1.0
    return matrix


def vectorize_passwords(passwords: list[str], max_length: int) -> np.ndarray:
    """One-hot encode a batch of passwords.

    Returns a float32 array of shape ``(len(passwords), max_length, len(CHARS))``;
    an empty input yields an array with zero rows.
    """
    batch = np.zeros((len(passwords), max_length, len(CHARS)), dtype=np.float32)
    for row, password in enumerate(passwords):
        batch[row] = one_hot_encode(password, max_length)
    return batch


def summarise_labels(labels: list[int]) -> pd.Series:
    """Count samples per class, indexed by the class name from ``LABEL_LOOKUP``.

    The result is sorted by its index, i.e. alphabetically by label name.
    An empty input gives an empty integer Series.
    """
    if labels:
        label_names = pd.Series(labels).map(LABEL_LOOKUP)
        counts = label_names.value_counts()
        return counts.sort_index()
    return pd.Series(dtype=int)


In [None]:
# Load both password sources; each loader already drops entries longer than
# PASSWORD_MAX_LENGTH characters.
csv_passwords = load_csv_passwords(CSV_PATH, PASSWORD_MAX_LENGTH)
cracked_passwords = load_cracked_passwords(CRACKED_PASSWORDS_URL, PASSWORD_MAX_LENGTH)

# Report how many samples each source provides so that an empty or tiny bucket
# (wrong CSV_PATH, too small PASSWORD_MAX_LENGTH) is visible before training.
print("Password counts per CSV strength bucket (after max length filter):")
for strength, passwords in csv_passwords.items():
    print(f"  Strength {strength}: {len(passwords):,} samples")
print(f"Cracked passwords (<= {PASSWORD_MAX_LENGTH} chars): {len(cracked_passwords):,} samples")

In [None]:
# Turn the raw buckets into shuffled train/test lists of passwords and labels.
train_passwords, train_labels, test_passwords, test_labels = build_balanced_dataset(
    csv_passwords=csv_passwords,
    cracked_passwords=cracked_passwords,
    max_length=PASSWORD_MAX_LENGTH,
    test_ratio=TEST_RATIO,
    max_samples_per_class=MAX_SAMPLES_PER_CLASS,
)

# Per-class sample counts, indexed by label name.
train_summary = summarise_labels(train_labels)
test_summary = summarise_labels(test_labels)

# Show both summaries side by side; a class missing from one split shows as 0.
display(pd.DataFrame({"train": train_summary, "test": test_summary}).fillna(0).astype(int))
print(f"Train samples: {len(train_passwords):,}")
print(f"Test samples: {len(test_passwords):,}")

In [None]:
# One-hot encode the passwords into (samples, PASSWORD_MAX_LENGTH, len(CHARS)) tensors.
train_x = vectorize_passwords(train_passwords, PASSWORD_MAX_LENGTH)
test_x = vectorize_passwords(test_passwords, PASSWORD_MAX_LENGTH)
num_classes = len(LABEL_LOOKUP)

# One-hot encode the integer labels to match the categorical cross-entropy loss.
train_y = keras.utils.to_categorical(train_labels, num_classes=num_classes, dtype="float32")
test_y = keras.utils.to_categorical(test_labels, num_classes=num_classes, dtype="float32")

print("Train tensor shape:", train_x.shape)
print("Test tensor shape:", test_x.shape)
print("Class tensor width:", num_classes)

In [None]:
def build_model(input_length: int, charset_size: int, num_classes: int) -> keras.Model:
    """Assemble the (uncompiled) password classifier.

    The network is a plain stack: three Conv1D blocks, three bidirectional
    LSTM blocks, three L2-regularised Dense blocks and a softmax output.
    Every block is followed by batch normalisation and dropout.

    Args:
        input_length: Number of character positions per password.
        charset_size: Width of the one-hot encoding of a single character.
        num_classes: Number of output classes.

    Returns:
        A ``keras.Model`` mapping ``(batch, input_length, charset_size)``
        inputs to ``(batch, num_classes)`` class probabilities.
    """
    layers = keras.layers
    inputs = layers.Input(shape=(input_length, charset_size))
    features = inputs

    # Convolutional blocks: (filters, pool size, dropout rate).
    # NOTE(review): a pool size of 1 leaves the sequence length unchanged —
    # confirm the first two pooling layers are intentional.
    for filters, pool_size, drop_rate in ((64, 1, 0.25), (128, 1, 0.5), (256, 2, 0.25)):
        features = layers.Conv1D(filters, 3, activation="relu")(features)
        features = layers.BatchNormalization()(features)
        features = layers.MaxPooling1D(pool_size)(features)
        features = layers.Dropout(drop_rate)(features)

    # Recurrent blocks: only the last LSTM collapses the sequence to a vector.
    recurrent_units = (128, 256, 512)
    last_depth = len(recurrent_units) - 1
    for depth, units in enumerate(recurrent_units):
        recurrent = layers.LSTM(units, return_sequences=depth < last_depth)
        features = layers.Bidirectional(recurrent)(features)
        features = layers.BatchNormalization()(features)
        features = layers.Dropout(0.25)(features)

    # Fully connected head: (units, activation), each with L2 weight decay.
    for units, activation in ((1024, "relu"), (512, "tanh"), (256, "tanh")):
        features = layers.Dense(
            units,
            activation=activation,
            kernel_regularizer=keras.regularizers.l2(0.01),
        )(features)
        features = layers.BatchNormalization()(features)
        features = layers.Dropout(0.25)(features)

    outputs = layers.Dense(num_classes, activation="softmax")(features)
    return keras.Model(inputs=inputs, outputs=outputs)


# Instantiate the network for the configured input size and compile it for
# multi-class classification on one-hot labels.
model = build_model(PASSWORD_MAX_LENGTH, len(CHARS), num_classes)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE),
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

In [None]:
# Stop once `val_loss` has not improved for PATIENCE epochs and roll the model
# back to the weights of its best epoch.
early_stopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=PATIENCE,
    verbose=1,
    restore_best_weights=True,
)

# NOTE(review): the validation data is the same test split that is scored
# again in the evaluation cell, so early stopping selects weights on the very
# data the final "test" metrics are reported on. Consider holding out a
# separate validation split if an unbiased test score is needed.
history = model.fit(
    train_x,
    train_y,
    validation_data=(test_x, test_y),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
    verbose=2,  # One log line per epoch.
)


In [None]:
import matplotlib.pyplot as plt

# Persist the per-epoch metrics next to the model artifacts.
history_df = pd.DataFrame(history.history)
history_df.to_csv(os.path.join(OUTPUT_DIR, "training_history.csv"), index=False)

# Left panel: loss curves; right panel: accuracy curves (train vs. validation).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(history_df["loss"], label="train")
axes[0].plot(history_df["val_loss"], label="val")
axes[0].set_title("Loss")
axes[0].set_xlabel("Epoch")
axes[0].set_ylabel("Loss")
axes[0].legend()

axes[1].plot(history_df["accuracy"], label="train")
axes[1].plot(history_df["val_accuracy"], label="val")
axes[1].set_title("Accuracy")
axes[1].set_xlabel("Epoch")
axes[1].set_ylabel("Accuracy")
axes[1].legend()

plt.tight_layout()
plt.show()

In [None]:
# Score the trained model on the held-out split; `evaluate` returns the loss
# followed by the compiled metrics (accuracy only here).
loss, accuracy = model.evaluate(test_x, test_y, verbose=0)
print(f"Test loss: {loss:.4f}")
print(f"Test accuracy: {accuracy:.4f}")

In [None]:
keras_model_path = os.path.join(OUTPUT_DIR, "saved_model")
tflite_path = os.path.join(OUTPUT_DIR, "model.tflite")

# Write the SavedModel directory. Keras 3 (shipped with TensorFlow >= 2.16)
# rejects `model.save()` for paths without a `.keras`/`.h5` suffix and writes
# SavedModel directories through `model.export()` instead. Keras 2 keeps the
# original `model.save()` call, which produces a SavedModel for such paths.
keras_major_version = int(getattr(keras, "__version__", "2").split(".")[0])
if keras_major_version >= 3:
    model.export(keras_model_path)
else:
    model.save(keras_model_path)

# Convert the in-memory model to TensorFlow Lite. SELECT_TF_OPS lets the
# converter fall back to regular TensorFlow ops where no TFLite builtin exists.
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,
    tf.lite.OpsSet.SELECT_TF_OPS,
]
tflite_model = converter.convert()

with open(tflite_path, "wb") as f:
    f.write(tflite_model)

# List everything that ended up in the output directory.
print("Saved artifacts:")
for root, _, files in os.walk(OUTPUT_DIR):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
def predict_password_strength(password: str):
    """Classify one password with the trained model.

    The password is encoded with ``PASSWORD_MAX_LENGTH`` positions, so longer
    input is truncated by the encoder before prediction.

    Returns:
        A ``(label, confidence)`` tuple: the name of the most probable class
        (``"Unknown"`` if the index has no entry in ``LABEL_LOOKUP``) and that
        class's probability expressed as a percentage.
    """
    batch = vectorize_passwords([password], PASSWORD_MAX_LENGTH)
    scores = model.predict(batch, verbose=0)[0]
    best_index = int(np.argmax(scores))
    label = LABEL_LOOKUP.get(best_index, "Unknown")
    confidence = scores[best_index] * 100
    return label, confidence

# A few hand-picked passwords for a quick sanity check of the trained model.
# Edit this list to try your own.
sample_passwords = [
    "password123",
    "Qwerty!23",
    "5uP3r$3cur3",
]

# Print each password (right-aligned) with its predicted class and confidence.
for sample in sample_passwords:
    label, confidence = predict_password_strength(sample)
    print(f"{sample:>15} -> {label} ({confidence:.2f}% confidence)")

## Next steps

- Use the Kaggle *Save Version* button to produce an executable run that captures the model artifacts.
- Download the contents of the `pass_strength_model` folder from the Kaggle `Output` tab.
- Copy the TFLite file into your repository's `models/` directory and update any deployment scripts as needed.
- Iterate on the notebook (hyperparameters, architecture tweaks, dataset sampling) to reach your target accuracy.