1. Patrones morfológicos y de estructura:

In [1]:
# --- Generación de features morfológicas ---
import pandas as pd
import string
from pathlib import Path

HERE = Path.cwd()
ROOT = HERE.parent  # go up to the project root (Proyecto_PasswordStrength)
DATA_PATH = ROOT / "data" / "processed" / "passwords_final.csv"

# Load the dataset.
# - dtype=str keeps all-digit passwords as text (no loss of leading zeros and no
#   mixed int/str chunks from pandas' chunked type inference).
# - keep_default_na=False stops pandas from turning literal passwords such as
#   "NA", "null", "nan" or "None" into NaN (which later became the string "nan").
#   Only truly empty cells are still treated as missing.
df = pd.read_csv(
    DATA_PATH,
    dtype={"password": str},
    keep_default_na=False,
    na_values=[""],
)


In [2]:
import string

def count_digits(pw):
    """Count the characters of str(pw) for which str.isdigit() is true."""
    return sum(1 for ch in str(pw) if ch.isdigit())


def count_upper(pw):
    """Count the characters of str(pw) for which str.isupper() is true."""
    return sum(1 for ch in str(pw) if ch.isupper())


def count_lower(pw):
    """Count the characters of str(pw) for which str.islower() is true."""
    return sum(1 for ch in str(pw) if ch.islower())


def count_symbols(pw):
    """Count the characters of str(pw) that belong to string.punctuation."""
    return sum(1 for ch in str(pw) if ch in string.punctuation)

def charset_size(pw):
    """Estimate the alphabet size implied by the character classes used in pw.

    Each class present adds a fixed pool size: lowercase (26), uppercase (26),
    digits (10) and string.punctuation (its length). Characters that fall in
    none of these classes (e.g. spaces) add nothing, so the result can be 0.
    """
    text = str(pw)
    pools = (
        (lambda ch: ch.islower(), 26),
        (lambda ch: ch.isupper(), 26),
        (lambda ch: ch.isdigit(), 10),
        (lambda ch: ch in string.punctuation, len(string.punctuation)),
    )
    return sum(
        pool_size
        for belongs, pool_size in pools
        if any(belongs(ch) for ch in text)
    )

# Add the new morphological columns (the password text is cast to str once and reused)
_pw_text = df["password"].astype(str)
df["length"] = _pw_text.str.len()
for _column, _counter in (
    ("num_digits", count_digits),
    ("num_upper", count_upper),
    ("num_lower", count_lower),
    ("num_symbols", count_symbols),
    ("charset_size", charset_size),
):
    df[_column] = _pw_text.apply(_counter)

df.head()


Unnamed: 0,password,length,has_seq_alpha,has_seq_num,has_repeat,has_keyboard,has_year,has_common,num_digits,num_upper,num_lower,num_symbols,charset_size
0,martha1987,10,False,False,False,False,True,False,4,0,6,0,36
1,green_issra1234,15,False,True,False,True,False,True,4,0,10,1,68
2,Freedom9,8,False,False,False,False,False,True,1,1,6,0,62
3,agent000,8,False,False,True,False,False,False,3,0,5,0,36
4,Casey_N_Jess,12,False,False,False,False,False,True,0,3,7,2,84


2. Patrones comunes

In [3]:
# Boolean pattern columns expected in the dataset
pattern_cols = [
    "has_seq_alpha",
    "has_seq_num",
    "has_repeat",
    "has_keyboard",
    "has_year",
    "has_common"
]

# Keep only the columns that actually exist, so the conversion and the
# preview below agree (the preview used to raise KeyError on a missing column).
present_pattern_cols = [col for col in pattern_cols if col in df.columns]

# Convert True/False -> 1/0
for col in present_pattern_cols:
    df[col] = df[col].astype(int)

df[present_pattern_cols].head()


Unnamed: 0,has_seq_alpha,has_seq_num,has_repeat,has_keyboard,has_year,has_common
0,0,0,0,0,1,0
1,0,1,0,1,0,1
2,0,0,0,0,0,1
3,0,0,1,0,0,0
4,0,0,0,0,0,1


3. Entropía

In [4]:
import math
from collections import Counter

# Shannon entropy of the per-character frequency distribution
def entropy_shannon(pw):
    """Return the Shannon entropy (bits per character) of the characters in pw.

    Probabilities are the relative frequencies of each distinct character.
    Returns 0 for an empty/falsy input. A password made of a single repeated
    character yields exactly 0.0 (never the IEEE negative zero -0.0).
    """
    if not pw:
        return 0
    n = len(pw)
    weighted_logs = sum((c / n) * math.log2(c / n) for c in Counter(pw).values())
    # weighted_logs is <= 0; negating an exact zero would produce -0.0, which
    # shows up as "-0.0" in exported files, so normalise that case.
    return -weighted_logs if weighted_logs else 0.0

# Rényi entropy of order 2 (collision entropy)
def entropy_renyi2(pw):
    """Return the order-2 Rényi entropy (bits) of the characters in pw.

    Computed as -log2(sum(p_i ** 2)) over the relative character frequencies.
    Returns 0 for an empty/falsy input. A password made of a single repeated
    character yields exactly 0.0 (never the IEEE negative zero -0.0).
    """
    if not pw:
        return 0
    n = len(pw)
    collision_prob = sum((c / n) ** 2 for c in Counter(pw).values())
    log_collision = math.log2(collision_prob)
    # log2(1.0) is 0.0 and its negation would be -0.0; normalise that case.
    return -log_collision if log_collision else 0.0

# "NIST" entropy as used in this notebook: length * log2(charset size)
def entropy_nist(pw):
    """Return len(pw) * log2(charset_size(pw)).

    Returns 0 for an empty/falsy input or when charset_size(pw) is 0.
    """
    if not pw:
        return 0
    alphabet = charset_size(pw)  # reuses the helper defined earlier in the notebook
    if alphabet > 0:
        return len(pw) * math.log2(alphabet)
    return 0

# Combinatorial entropy (identical to entropy_nist in this notebook)
def entropy_combinatorial(pw):
    """Alias of entropy_nist; both feature columns carry the same values."""
    return entropy_nist(pw)

# Perplexity = 2 ** (Shannon entropy)
def perplexity(pw):
    """Return 2 raised to entropy_shannon(pw)."""
    shannon_bits = entropy_shannon(pw)
    return 2 ** shannon_bits

# Add the entropy columns to the DataFrame (same column order as before)
_pw_text = df["password"].astype(str)
for _column, _metric in (
    ("entropy_shannon", entropy_shannon),
    ("entropy_nist", entropy_nist),
    ("entropy_renyi2", entropy_renyi2),
    ("entropy_combinatorial", entropy_combinatorial),
    ("perplexity", perplexity),
):
    df[_column] = _pw_text.apply(_metric)

df.head()


Unnamed: 0,password,length,has_seq_alpha,has_seq_num,has_repeat,has_keyboard,has_year,has_common,num_digits,num_upper,num_lower,num_symbols,charset_size,entropy_shannon,entropy_nist,entropy_renyi2,entropy_combinatorial,perplexity
0,martha1987,10,0,0,0,0,1,0,4,0,6,0,36,3.121928,51.69925,3.058894,51.69925,8.705506
1,green_issra1234,15,0,1,0,1,0,1,4,0,10,1,68,3.506891,91.311943,3.421464,91.311943,11.367874
2,Freedom9,8,0,0,0,0,0,1,1,1,6,0,62,2.75,47.63357,2.678072,47.63357,6.727171
3,agent000,8,0,0,1,0,0,0,3,0,5,0,36,2.405639,41.3594,2.192645,41.3594,5.298702
4,Casey_N_Jess,12,0,0,0,0,0,1,0,3,7,2,84,2.855389,76.707809,2.710493,76.707809,7.236984


3.1 Funciones adicionales

In [5]:
import re

def ends_with_word_plus_digits(s):
    """Return the lower-cased trailing word when str(s) ends in letters + digits.

    The tail must be at least 3 ASCII letters followed by at least 2 digits.
    Returns None when the string does not end that way.
    """
    found = re.search(r"([A-Za-z]{3,})\d{2,}$", str(s))
    return found.group(1).lower() if found else None

def has_long_repeated_substring(s, minlen=3, maxlen=5):
    """Tell whether str(s) contains a chunk immediately followed by itself.

    Parameters:
        s: value to inspect (converted with str()).
        minlen: smallest chunk length considered.
        maxlen: largest chunk length considered. The default of 5 keeps the
            historical behaviour (so "passwordpassword" is NOT flagged);
            pass None to consider every length up to half the string.

    Returns:
        True when some chunk of an allowed length is adjacent to an identical
        copy of itself (e.g. "abcabc"), False otherwise.
    """
    s = str(s)
    half = len(s) // 2
    largest = half if maxlen is None else min(maxlen, half)
    # A zero-length chunk trivially "repeats", so never go below size 1.
    for size in range(max(minlen, 1), largest + 1):
        for start in range(len(s) - 2 * size + 1):
            # Compare the chunk with the slice right after it instead of
            # rescanning the whole string for chunk * 2.
            if s[start:start + size] == s[start + size:start + 2 * size]:
                return True
    return False

def max_digit_run(s):
    """Return the length of the longest run of consecutive digits in str(s) (0 if none)."""
    digit_runs = re.findall(r"\d+", str(s))
    return max(map(len, digit_runs), default=0)

def starts_with_digit_run(s, k=6):
    """Return True when str(s) begins with at least k consecutive digits."""
    leading_digits = re.match(rf"^\d{{{k},}}", str(s))
    return leading_digits is not None

def ends_with_digit_run(s, k=6):
    """Return True when str(s) ends with at least k consecutive digits."""
    trailing_digits = re.search(rf"\d{{{k},}}$", str(s))
    return trailing_digits is not None

def is_strict_email(s: str) -> bool:
    """Pragmatic, deliberately strict check that str(s) looks like an e-mail address.

    Every rule below must hold:
      1. exactly one "@";
      2. local part 1-64 chars, domain 4-253 chars, whole string <= 254 chars;
      3. domain contains a dot, is all lowercase and uses only a-z, 0-9, "-" and ".";
      4. the TLD is 2-24 lowercase letters;
      5. each domain label is 1-63 chars of a-z0-9-, not starting/ending with "-";
      6. local part has no leading/trailing dot, no "..", and only A-Za-z0-9._%+-;
      7. the second-level label contains at least one letter.
    """
    text = str(s)

    # 1) exactly one "@": there is a separator and none is left in the remainder
    local, separator, domain = text.partition("@")
    if not separator or "@" in domain:
        return False

    # 2) reasonable lengths (roughly RFC-sized)
    lengths_ok = (
        1 <= len(local) <= 64
        and 4 <= len(domain) <= 253
        and len(text) <= 254
    )
    if not lengths_ok:
        return False

    # 3) dotted, lowercase-only domain restricted to a-z0-9, "-" and "."
    if "." not in domain or domain != domain.lower():
        return False
    if re.fullmatch(r"[a-z0-9.-]+", domain) is None:
        return False

    # 4) TLD made of 2-24 lowercase letters
    labels = domain.split(".")
    if len(labels) < 2 or re.fullmatch(r"[a-z]{2,24}", labels[-1]) is None:
        return False

    # 5) every label: non-empty, 1-63 chars, no leading/trailing "-"
    label_re = re.compile(r"(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?)")
    if any(label_re.fullmatch(label) is None for label in labels):
        return False

    # 6) pragmatic local part (typical provider rules)
    if local.startswith(".") or local.endswith(".") or ".." in local:
        return False
    if re.fullmatch(r"[A-Za-z0-9._%+-]+", local) is None:
        return False

    # 7) soft requirement: the second-level label needs at least one letter
    return re.search(r"[a-z]", labels[-2]) is not None


# Stand-alone 4-digit year: 19[3-9]x or 20[0-2]x, not glued to other digits.
# (The explicit 2025 alternative is already covered by 20[0-2]\d.)
YEAR4_RE = re.compile(r"(?<!\d)(19[3-9]\d|20[0-2]\d|2025)(?!\d)")
def contains_year4(s):
    """Return True when str(s) contains a 4-digit year token matched by YEAR4_RE."""
    year_hit = YEAR4_RE.search(str(s))
    return year_hit is not None

def contains_word_plus_year2(s):
    """Return True when str(s) has >=3 ASCII letters followed by exactly two digits.

    "Exactly two" means the two digits are not followed by a third digit.
    """
    return re.search(r"[A-Za-z]{3,}(\d{2})(?!\d)", str(s)) is not None

def contains_word_plus_year4(s):
    """Return True when str(s) has >=3 ASCII letters directly followed by a 4-digit year.

    Accepted years are 1930-1999 and 2000-2029. The year must not be followed
    by another digit, so longer digit runs such as "martha19871234" are not
    counted as word + year (the same guard the other year helpers apply).
    """
    return re.search(r"[A-Za-z]{3,}(?:19[3-9]\d|20[0-2]\d)(?!\d)", str(s)) is not None

def max_same_char_run(s):
    """Return the length of the longest run of one repeated character in str(s) (0 if empty)."""
    text = str(s)
    if text == "":
        return 0
    longest = current = 1
    # Walk adjacent character pairs; extend the run on a match, reset otherwise.
    for before, after in zip(text, text[1:]):
        current = current + 1 if after == before else 1
        if current > longest:
            longest = current
    return longest

def _is_increasing_seq4(chunk):
    """Tell whether a 4-character chunk is a strictly consecutive sequence.

    Accepts four digits increasing by one (9 wraps to 0), four lowercase ASCII
    letters or four uppercase ASCII letters increasing by one code point.
    """
    pairs = list(zip(chunk, chunk[1:]))
    if re.fullmatch(r"\d{4}", chunk):
        return all((int(a) + 1) % 10 == int(b) for a, b in pairs)
    if re.fullmatch(r"[a-z]{4}", chunk) or re.fullmatch(r"[A-Z]{4}", chunk):
        return all(ord(a) + 1 == ord(b) for a, b in pairs)
    return False


def starts_with_increasing_seq4(s):
    """Return 1 when str(s) starts with abcd / ABCD / 1234-style sequences, else 0.

    Only the first four characters decide: whatever follows them (including
    more digits or letters that break the sequence) is irrelevant, so "123499"
    and "abcdz1" both count. Digits are compared modulo 10 ("7890" counts).
    """
    s = str(s)
    return int(len(s) >= 4 and _is_increasing_seq4(s[:4]))


def ends_with_increasing_seq4(s):
    """Return 1 when str(s) ends with ...abcd / ...ABCD / ...1234-style sequences, else 0.

    Only the last four characters decide: whatever precedes them is
    irrelevant, so "x991234" and "zzabcd" both count. Digits are compared
    modulo 10 ("7890" counts).
    """
    s = str(s)
    return int(len(s) >= 4 and _is_increasing_seq4(s[-4:]))

def count_separators(s):
    """Count the characters of str(s) that are not alphanumeric (str.isalnum() is false)."""
    non_alnum = [ch for ch in str(s) if not ch.isalnum()]
    return len(non_alnum)




# Auxiliary columns (re-running this cell simply overwrites them).
# Each spec is (column name, per-password function, cast result with .astype(int)?).
_pw_text = df["password"].astype(str)
_aux_specs = [
    ("_max_digit_run", max_digit_run, False),
    ("_is_email_strict", is_strict_email, True),
    ("_ends_word_plus_digits", ends_with_word_plus_digits, False),  # lower-cased word or None
    ("_has_long_repeats", lambda s: has_long_repeated_substring(s, minlen=3), True),
    ("_has_year4", contains_year4, True),
    ("_has_word_year4", contains_word_plus_year4, True),
    ("_has_word_year2", contains_word_plus_year2, True),
    ("_ends_digit_run6", lambda s: int(ends_with_digit_run(s, 6)), False),
    ("_starts_digit_run6", lambda s: int(starts_with_digit_run(s, 6)), False),
    ("_max_same_char_run", max_same_char_run, False),
    ("_starts_inc_seq4", starts_with_increasing_seq4, True),
    ("_ends_inc_seq4", ends_with_increasing_seq4, True),
]
for _column, _func, _cast_to_int in _aux_specs:
    _values = _pw_text.apply(_func)
    df[_column] = _values.astype(int) if _cast_to_int else _values


In [6]:
import re
from datetime import datetime

YEAR_MIN, YEAR_MAX = 1930, 2050  

def _try_date(y, m, d):
    try:
        datetime(year=y, month=m, day=d)
        return True
    except ValueError:
        return False

def _yy_to_year(yy):
    # Mapear yy a 19xx o 20xx para que caiga entre [YEAR_MIN, YEAR_MAX]
    yy = int(yy)
    cand1 = 1900 + yy
    cand2 = 2000 + yy
    # preferir el que esté en rango; si ambos, el más “cercano” a YEAR_MAX
    candidates = [y for y in (cand1, cand2) if YEAR_MIN <= y <= YEAR_MAX]
    if candidates:
        # el más reciente dentro del rango
        return max(candidates)
    # si ninguno cae en rango, devolver fuera de rango para invalidar
    return -1

def _has_date_tokens(s):
    """Scan the digit runs of str(s) for date-like windows.

    Every 8-digit window is read as MMDDYYYY or DDMMYYYY and every 6-digit
    window as MMDDYY or DDMMYY (two-digit years expanded by _yy_to_year).
    A window counts when the year lies in [YEAR_MIN, YEAR_MAX] and the date
    is valid in at least one of the two field orders.

    Returns:
        (has8, has6, date_at_end): whether an 8-digit date was found, whether
        a 6-digit date was found, and whether the string ends with one of the
        matching windows.
    """
    s = str(s)

    def _valid_either_order(year, first, second):
        # first/second are the two leading 2-digit fields of the window.
        if not (YEAR_MIN <= year <= YEAR_MAX):
            return False
        month_first = 1 <= first <= 12 and _try_date(year, first, second)
        day_first = 1 <= second <= 12 and _try_date(year, second, first)
        return month_first or day_first

    found = {8: False, 6: False}
    date_at_end = False

    # Inspect every contiguous run of digits
    for run in re.findall(r"\d+", s):
        for width in (8, 6):
            # Slide a fixed-width window over the run (no windows if run is shorter)
            for start in range(len(run) - width + 1):
                chunk = run[start:start + width]
                first, second = int(chunk[0:2]), int(chunk[2:4])
                if width == 8:
                    year = int(chunk[4:8])
                else:
                    year = _yy_to_year(chunk[4:6])
                if _valid_either_order(year, first, second):
                    found[width] = True
                    # Is this window the tail of the password?
                    if s.endswith(chunk):
                        date_at_end = True

    return found[8], found[6], date_at_end

# Precompute the auxiliary date columns from the (has8, has6, date_at_end) tuples
tmp = df["password"].astype(str).apply(_has_date_tokens)
for _position, _column in enumerate(("_has_date8", "_has_date6", "_date_at_end")):
    df[_column] = tmp.apply(lambda flags, _position=_position: int(flags[_position]))

4. Variable Target (Weak / Strong)

In [8]:
# External labelling with zxcvbn.
# Requires a column holding the raw password, e.g. 'password' or 'pwd'.
PASSWORD_COL_CANDIDATES = ["password", "pwd", "pass", "contraseña"]

# First candidate present in the DataFrame, or None when there is none
PASSWORD_COL = next(
    (name for name in PASSWORD_COL_CANDIDATES if name in df.columns),
    None,
)

if PASSWORD_COL is None:
    raise ValueError(f"No se encontró una columna de contraseña en {PASSWORD_COL_CANDIDATES}. "
                     "Agrega/renombra la columna de texto crudo de la contraseña para poder etiquetar.")

try:
    from zxcvbn import zxcvbn
except Exception as e:
    raise ImportError("No se pudo importar zxcvbn. Instala con: pip install zxcvbn") from e

def label_from_zxcvbn(pw, score_threshold=3):
    """Binary strength label derived from the zxcvbn score.

    Returns 1 when the "score" entry of zxcvbn's result (0..4, defaulting to 0
    when absent) is >= score_threshold, else 0.

    NOTE(review): any exception raised while scoring is swallowed and the
    password is labelled 0, so scoring failures are indistinguishable from
    genuinely weak passwords in the target column.
    """
    try:
        text = pw if isinstance(pw, str) else str(pw)
        score = zxcvbn(text).get("score", 0)  # 0..4
        return int(bool(score >= score_threshold))
    except Exception:
        return 0

df["target_bin_zxcvbn"] = df[PASSWORD_COL].apply(label_from_zxcvbn)

# ====== Plantilla morfológica para GroupSplit ======
import re
def pattern_template(pw):
    """Map each character of pw to a class symbol and return the joined template.

    Uppercase -> "A", lowercase -> "a", digit -> "9", anything else -> "!".
    Non-string inputs are converted with str() first.
    """
    text = pw if isinstance(pw, str) else str(pw)

    def _symbol(ch):
        # Same precedence as the class checks: upper, lower, digit, other.
        if ch.isupper():
            return "A"
        if ch.islower():
            return "a"
        if ch.isdigit():
            return "9"
        return "!"

    return "".join(map(_symbol, text))

df["pattern_template"] = df[PASSWORD_COL].apply(pattern_template)

# ====== Safe export without the raw password ======
cols_to_drop_in_export = [PASSWORD_COL]
df_for_export = df.drop(columns=[c for c in cols_to_drop_in_export if c in df.columns]).copy()

# `_ends_word_plus_digits` holds the lower-cased word taken from the password
# itself (e.g. "martha" for "martha1987"), so exporting it as-is would leak
# password fragments. Keep only the information "pattern present / absent".
if "_ends_word_plus_digits" in df_for_export.columns:
    df_for_export["_ends_word_plus_digits"] = (
        df_for_export["_ends_word_plus_digits"].notna().astype(int)
    )

print("Etiquetado externo listo. target_bin_zxcvbn creado. 'pattern_template' creada.")


Etiquetado externo listo. target_bin_zxcvbn creado. 'pattern_template' creada.


In [11]:
from pathlib import Path
# Export into the project root resolved in the first cell (ROOT) instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
out_dir = ROOT
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "passwords_dataset_target_externo.csv"
df_for_export.to_csv(out_path, index=False)
print(f"Dataset exportado en: {out_path.resolve()}")


Dataset exportado en: D:\Universidad\ULTIMO_SEMESTRE\Tesis\Proyecto_PasswordStrength\passwords_dataset_target_externo.csv
