# CAS KAGGLE: Anàlisi de la seguretat de contrasenyes

### David Candela

In [None]:
#pandas
#numpy
#scipy
#scikit-learn
#pytorch
#from torchvision: datasets, transforms
#tensorflow
#onnx
#xgboost
#catboost
#argparse

https://datauab.github.io/  
https://www.kaggle.com/bhavikbb/password-strength-classifier-dataset

1. Predictability:
 + Mean of predictability for the next character
    - Train to give probabilities for any $p_i$ given $\left\{p_{i-1},\,\ldots,\,p_{i-k}\right\}$
    - Join probabilities: $\sum_i{p_i} < \text{Threshold}$ or $\prod_i{p_i} < \text{Threshold}$
2. Character distribution
 + Extract characteristics of the password (amount of each character type, type sequence lengths, repeated-character count)
 + 

In [None]:
# Character classes accepted inside a password. A password containing any
# character outside these lists is rejected by isValid.
Minuscules = list(map(chr, range(ord('a'), ord('z') + 1)))
Majuscules = list(map(chr, range(ord('A'), ord('Z') + 1)))
Xifres = list(map(str, range(10)))
Especials = [
    '.', ';', '-', '_', '+', '*', '<', '>', '[', ']', '{', '}',
    '(', ')', '@', '#', '$', '%', '&', '/', '\\', '?', '!', '=',
    '^', '~', ' ',
]
# The concatenation order is significant: it is reused as the key order of
# the counting dictionaries and as the tick labels of the plots.
CaractersValids = [*Minuscules, *Majuscules, *Xifres, *Especials]

def isValid(contrasenya):
    """Tell whether a password cell only contains accepted characters.

    Args:
        contrasenya: Value taken from the password column. It is expected to
            be a string, but any other type is tolerated.

    Returns:
        True when `contrasenya` is a string whose characters all belong to
        CaractersValids (the empty string qualifies); False otherwise.
    """
    # Pandas keeps turning some cells into floats even though the password
    # column is requested as strings. Reject those explicitly instead of
    # hiding every possible error behind a bare `except`.
    if not isinstance(contrasenya, str):
        return False
    # Drop passwords holding characters we do not consider valid: whatever
    # decoding is requested, some characters are never read back correctly
    # and show up as '\x03', '\x0f', '\x8d' or as symbols such as §, ¶, þ, ¤.
    return all(c in CaractersValids for c in contrasenya)

In [None]:
import os.path
import pandas as pd
import numpy as np

# Locations of the raw Kaggle CSV and of the filtered copy derived from it.
path = '../data/'
data_name = 'data.csv'
clean_name = 'clean_data.csv'
data_file = os.path.join(path, data_name)
clean_file = os.path.join(path, clean_name)
# Set to True to rebuild the cleaned file even when it already exists.
regenerate_file = False

# Column types shared by both reads, so the raw and the cleaned file are
# parsed identically (passwords always as strings, labels as integers).
column_types = {'password': str, 'strength': np.int64}

if regenerate_file or not os.path.isfile(clean_file):
    # Explicit error instead of `assert`, which is stripped under `python -O`.
    if not os.path.isfile(data_file):
        raise FileNotFoundError(f"Raw dataset not found: {data_file}")
    # Skip the rows whose data is not well formatted.
    dataset = pd.read_csv(data_file, on_bad_lines='skip', encoding='utf-8', dtype=column_types)
    # Drop the rows whose password contains characters we do not accept.
    dataset = dataset[dataset['password'].map(isValid)]
    # Store the filtered dataset in a separate file.
    dataset.to_csv(clean_file, index=False)

# Always work from the cleaned file, whether it was just written or cached.
dataset = pd.read_csv(clean_file, dtype=column_types)
dades = dataset.values
contrasenyes = dades[:,0]
proteccio = dades[:,1]
noms = dataset.columns.values

In [None]:
# Preview the first rows of the cleaned dataset.
dataset.head()

In [None]:
# Summary statistics of the cleaned dataset.
dataset.describe()

In [None]:
# Histogram view of the cleaned dataset.
dataset.hist()

## Predictabilitat

In [None]:
def recompte(contrasenya, individual, predictabilitat):
    """Add the characters of one password to the running counters.

    Both dictionaries are updated in place and nothing is returned.

    Args:
        contrasenya: Iterable of characters (the password).
        individual: Maps each character, plus the key 'total', to the number
            of times it has been seen.
        predictabilitat: Maps a previous character ('' for the start of the
            password) to a dictionary with the same layout as `individual`,
            counting which character followed it.
    """
    anterior = ''  # No previous character at the start of the password.
    for actual in contrasenya:
        transicions = predictabilitat[anterior]
        transicions[actual] += 1
        transicions['total'] += 1
        individual[actual] += 1
        individual['total'] += 1
        anterior = actual

# Occurrences of each valid character, plus a running 'total'.
individual = dict.fromkeys(CaractersValids + ['total'], 0)

# Transition counters: previous character ('' = start of password) -> next
# character. Every transition starts at 1 and every row total at
# len(CaractersValids), so no transition ends up with a zero count.
predictabilitat = {
    anterior: {**dict.fromkeys(CaractersValids, 1), 'total': len(CaractersValids)}
    for anterior in CaractersValids + ['']
}

# Feed every password of the dataset into the counters.
for contrasenya in dades[:,0]:
    recompte(contrasenya, individual, predictabilitat)

# Move the totals out of the counters so that, from here on, the
# dictionaries only hold per-character entries.
individual_total = individual.pop('total')
predictabilitat_total = {
    anterior: fila.pop('total') for anterior, fila in predictabilitat.items()
}

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize

# Bar chart with the number of occurrences of every valid character.
fig, ax = plt.subplots(figsize=(20, 18))
ax.bar(individual.keys(), individual.values())
plt.show()

# Heat map of the transition counters on a logarithmic colour scale.
# Rows are the previous character ('inici' is the start of the password),
# columns are the character that follows it.
fig, ax = plt.subplots(figsize=(20, 18))
sns.heatmap(
    [list(fila.values()) for fila in predictabilitat.values()],
    xticklabels=CaractersValids,
    yticklabels=CaractersValids + ['inici'],
    square=True,
    norm=LogNorm(),
    ax=ax,
)
ax.set_xlabel("Segon")
ax.set_ylabel("Primer")
ax.set_title("Sequencies")
plt.show()

In [None]:
def probabilitat_caracters(x):
    """Return the product of the individual frequencies of the characters of x.

    Each character contributes individual[c] / individual_total; a character
    missing from `individual` contributes a count of 1. The counts are
    multiplied first and divided once at the end.
    """
    recompte_conjunt = 1
    for caracter in x:
        recompte_conjunt *= individual.get(caracter, 1)
    denominador = individual_total ** len(x)
    return recompte_conjunt / denominador

def probabilitat_sequencia(x):
    """Return the probability of x as a chain of character transitions.

    The result is the product, over the characters of x, of
    predictabilitat[previous][current] / predictabilitat_total[previous],
    where the previous character starts as '' (start of password).
    Characters outside CaractersValids are skipped and do not become the
    previous character. Returns 1 when no character contributes.
    """
    acumulat = 1
    anterior = ''
    for actual in x:
        if actual in CaractersValids:
            acumulat *= predictabilitat[anterior][actual] / predictabilitat_total[anterior]
            anterior = actual
    return acumulat

def aleatorietat(x):
    """Return the ratio between standalone and conditional character frequency.

    For every character of x that belongs to CaractersValids, the factor
    (individual[c] / individual_total) divided by
    (predictabilitat[previous][c] / predictabilitat_total[previous]) is
    multiplied into the result; the previous character starts as ''.
    Characters outside CaractersValids are skipped. Returns 1 when no
    character contributes.
    """
    acumulat = 1
    anterior = ''
    for actual in x:
        if actual in CaractersValids:
            # Same left-to-right evaluation order as a single expression.
            ratio = (
                individual[actual]
                / predictabilitat[anterior][actual]
                * predictabilitat_total[anterior]
                / individual_total
            )
            acumulat *= ratio
            anterior = actual
    return acumulat

def llargada(x):
    """Return the length of the password x."""
    nombre_caracters = len(x)
    return nombre_caracters

In [None]:
# Apply each per-password feature function to the whole password column.
extractors = (probabilitat_caracters, probabilitat_sequencia, aleatorietat, llargada)
dades_caracters, dades_sequencia, dades_aleatorietat, dades_llargada = (
    np.vectorize(extractor)(contrasenyes) for extractor in extractors
)

# Feature matrix: one row per password, one column per feature, in the order
# character probability, sequence probability, randomness, length.
dades_predictabilitat = np.stack(
    [dades_caracters, dades_sequencia, dades_aleatorietat, dades_llargada],
    axis=-1,
)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Normalizer

def regression(x, y):
    """Train a logistic regression that predicts y from x and return it.

    Args:
        x: Feature matrix used for training.
        y: Target labels, one per row of x.

    Returns:
        The fitted LogisticRegression model (created with max_iter=1000).
    """
    # `fit` returns the estimator itself, so creation and training chain.
    return LogisticRegression(max_iter=1000).fit(x, y)

def visualize_confusion_matrix(y_pred, y_real):
    """Draw the confusion matrix of the predictions as an annotated heat map.

    Args:
        y_pred: Predicted labels.
        y_real: True labels, in the same order as y_pred.
    """
    matriu = confusion_matrix(y_real, y_pred)
    _, eixos = plt.subplots(figsize=(10, 6))
    sns.heatmap(matriu, annot=True, fmt='g', ax=eixos)
    eixos.set_xlabel("Predicted")
    eixos.set_ylabel("Actual")
    eixos.set_title("Confusion Matrix")
    plt.show()

# Target labels: second column of the dataset, cast to integers.
objectius = dades[:,1].astype(int)
# Features fed to the model: the full predictability feature matrix.
dades_test = dades_predictabilitat
# Disabled experiment: drop the samples labelled 2 and keep only the feature
# at column index 2.
# dades_test = dades_test[objectius != 2, :][:,[2]]
# objectius = objectius[objectius != 2]

# Normalise the features with sklearn's Normalizer, then train the logistic
# regression on the normalised data.
transformer = Normalizer().fit(dades_test)
r = regression(transformer.transform(dades_test), objectius)

%matplotlib inline
# Length-only baseline: fewer than 8 characters -> 0, more than 13 -> 2,
# anything in between -> 1.
# NOTE(review): the confusion matrix below evaluates this baseline, not the
# trained model `r`, which is not used for prediction in the visible code.
prediction = np.array([0 if len(x) < 8 else (2 if len(x) > 13 else 1) for x in dades[:,0]])
visualize_confusion_matrix(prediction, objectius)


In [None]:
# Scatter plot of password length (logarithmic x axis) against the label.
fig, ax = plt.subplots(figsize=(10, 6))
ax.set_xscale('log')
ax.scatter(x=dades_llargada, y=objectius)
plt.show()

#Feature tools

In [None]:
# Shortest and longest password length found for each label 0, 1 and 2.
for classe in range(3):
    llargades_classe = dades_llargada[objectius == classe]
    print(min(llargades_classe), max(llargades_classe))

In [None]:
import ipywidgets as widgets
# Text box where the password to analyse is typed.
password = widgets.Text()
# One label per value reported back: character probability, sequence
# probability, randomness and strength.
probabilitat, sequential, randomnes, strength = (widgets.Label() for _ in range(4))

def form(x):
    """Return the features of password x as a 1x4 array.

    The columns are, in order: character probability, sequence probability,
    randomness and length.
    """
    extractors = (probabilitat_caracters, probabilitat_sequencia, aleatorietat, llargada)
    fila = [extractor(x) for extractor in extractors]
    return np.array([fila])

def on_value_change(change):
    """Refresh the four labels from the new content of the text box.

    Args:
        change: Change notification; its 'new' entry holds the current text.
    """
    text = change['new']
    probabilitat.value = 'probability: ' + str(probabilitat_caracters(text))
    sequential.value = 'sequential: ' + str(probabilitat_sequencia(text))
    randomnes.value = 'randomnes: ' + str(aleatorietat(text))
    # Length-only strength: under 8 characters -> 0, over 13 -> 2, else 1.
    if len(text) < 8:
        nivell = 0
    elif len(text) > 13:
        nivell = 2
    else:
        nivell = 1
    strength.value = 'strength: ' + str(nivell)

# Call on_value_change whenever the 'value' of the text box changes.
password.observe(on_value_change, names='value')
# Show the text box together with the four labels.
display(password, probabilitat, sequential, randomnes, strength)