# CAS KAGGLE: Anàlisi de la seguretat de contrasenyes

### David Candela

# Introducció

Què faig i per què.

# 1. Netejar i visualitzar el dataset

https://www.kaggle.com/bhavikbb/password-strength-classifier-dataset

In [None]:
# Character classes accepted inside a password.
Minuscules = [chr(c) for c in range(ord('a'), ord('z') + 1)]
Majuscules = [chr(c) for c in range(ord('A'), ord('Z') + 1)]
Xifres = [str(i) for i in range(10)]
Especials = ['.', ';', '-', '_', '+', '*', '<', '>', '[', ']', '{', '}',
             '(', ')', '@', '#', '$', '%', '&', '/', '\\', '?', '!', '=',
             '^', '~', ' ']
# Kept as a list: other cells concatenate it with lists and rely on its order.
CaractersValids = Minuscules + Majuscules + Xifres + Especials
# Set copy used only for fast membership tests in isValid.
_CaractersValidsSet = frozenset(CaractersValids)


def isValid(contrasenya):
    """Return True when every character of ``contrasenya`` is in CaractersValids.

    An empty string is considered valid. Values that cannot be iterated
    (for example a float) are reported as invalid instead of raising.
    """
    # Passwords holding characters outside the accepted set are dropped:
    # whatever decoding is requested, some rows end up with characters
    # such as '\x03', '\x0f', '\x8d' or §, ¶, þ, ¤, ...
    try:
        return all(c in _CaractersValidsSet for c in contrasenya)
    except TypeError:
        # Raised for non-iterable values (pandas hands over a float for some
        # cells even though the column is declared as str) and for
        # unhashable items; both mean "not a valid password".
        return False

In [None]:
import os.path
import pandas as pd
import numpy as np

# Locations of the raw Kaggle CSV and of the cached, cleaned copy.
path = '../data/'
data_name = 'data.csv'
clean_name = 'clean_data.csv'
data_file = os.path.join(path, data_name)
clean_file = os.path.join(path, clean_name)
# Set to True to rebuild the cleaned file even when it already exists.
regenerate_file = False
# Used for BOTH reads so that passwords made only of digits are not
# re-inferred as numbers when the cleaned file is loaded back.
column_types = {'password': str, 'strength': np.int64}

if regenerate_file or not os.path.isfile(clean_file):
    # Explicit error instead of `assert`, which is stripped under `python -O`.
    if not os.path.isfile(data_file):
        raise FileNotFoundError(f'Raw dataset not found: {data_file}')
    # Skip the rows whose fields are not well formed.
    dataset = pd.read_csv(data_file, on_bad_lines='skip', encoding='utf-8', dtype=column_types)
    # Drop the rows whose password contains characters we do not accept.
    dataset = dataset[dataset['password'].map(isValid)]
    # Store the cleaned dataset in a separate file.
    dataset.to_csv(clean_file, index=False)

dataset = pd.read_csv(clean_file, dtype=column_types)
dades = dataset.values
# Column 0 holds the passwords, column 1 their strength label.
contrasenyes = dades[:, 0]
dades_proteccio = dades[:, 1].astype(np.int64)
noms = dataset.columns.values

In [None]:
# Preview the first rows of the cleaned dataset.
dataset.head()

In [None]:
# Summary statistics of the dataset (pandas defaults).
dataset.describe()

In [None]:
# Histogram of the dataset columns (pandas defaults).
dataset.hist()

# 2. Extreure dades

## Predictabilitat

In [None]:
def recompte(contrasenya, individual, predictabilitat):
    """Add the character statistics of one password to the running tallies.

    Both dictionaries are updated in place and nothing is returned.

    contrasenya: the password to scan.
    individual: per-character counters plus a 'total' entry.
    predictabilitat: ``predictabilitat[previous][current]`` transition
        counters, each row with its own 'total'; the '' row is used for
        the first character of the password.
    """
    # Pair each character with the one before it; '' marks the start.
    anteriors = [''] + list(contrasenya)
    for anterior, actual in zip(anteriors, contrasenya):
        transicions = predictabilitat[anterior]
        transicions[actual] += 1
        transicions['total'] += 1
        individual[actual] += 1
        individual['total'] += 1

# Occurrence counter for every valid character, plus a grand 'total'.
individual = dict.fromkeys(CaractersValids + ['total'], 0)

# Transition counters, indexed as predictabilitat[first][second]; the ''
# row is the one used for the first character of a password. Every cell
# starts at 1 and every row 'total' starts at len(CaractersValids), i.e.
# the sum of those initial cells.
predictabilitat = {}
for primer in CaractersValids + ['']:
    fila = dict.fromkeys(CaractersValids, 1)
    fila['total'] = len(CaractersValids)
    predictabilitat[primer] = fila

# Accumulate the statistics of every password in the dataset.
for p in dades[:,0]:
    recompte(p, individual, predictabilitat)

# Take the totals out of the tables so that only per-character counts remain.
individual_total = individual.pop('total')
predictabilitat_total = {primer: fila.pop('total') for primer, fila in predictabilitat.items()}

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm, Normalize

# Bar chart with the number of occurrences of each valid character.
figura, eix = plt.subplots(figsize=(20, 18))
eix.bar(individual.keys(), individual.values())
plt.show()

# Heatmap of the transition counters (rows: first character, columns:
# second character), drawn with a logarithmic colour scale.
matriu = [list(fila.values()) for fila in predictabilitat.values()]
figura, eix = plt.subplots(figsize=(20, 18))
sns.heatmap(matriu, xticklabels=CaractersValids, yticklabels=CaractersValids + ['inici'],
            square=True, norm=LogNorm(), ax=eix)
eix.set_xlabel("Segon")
eix.set_ylabel("Primer")
eix.set_title("Sequencies")
plt.show()

In [None]:
def probabilitat_caracters(x):
    """Return the product of the character counts of ``x`` over individual_total ** len(x).

    Characters missing from ``individual`` contribute a factor of 1 to the
    numerator but still count towards the exponent of the denominator.
    """
    numerador = 1
    for caracter in x:
        numerador = numerador * individual.get(caracter, 1)
    denominador = individual_total ** len(x)
    return numerador / denominador

def probabilitat_sequencia(x):
    """Return the product of the transition frequencies along ``x``.

    Each valid character contributes
    ``predictabilitat[previous][current] / predictabilitat_total[previous]``,
    with '' as the previous character at the start. Characters outside
    CaractersValids are skipped and do not become the previous character.
    """
    probabilitat = 1
    anterior = ''
    for actual in x:
        if actual in CaractersValids:
            probabilitat *= predictabilitat[anterior][actual] / predictabilitat_total[anterior]
            anterior = actual
    return probabilitat

def aleatorietat(x):
    """Return the product, along ``x``, of overall frequency over transition frequency.

    For each valid character the factor is its overall frequency
    (``individual[c] / individual_total``) divided by its frequency after
    the previous character
    (``predictabilitat[prev][c] / predictabilitat_total[prev]``).
    Characters outside CaractersValids are skipped.
    """
    resultat = 1
    anterior = ''
    for actual in x:
        if actual in CaractersValids:
            transicions = predictabilitat[anterior]
            # Same operand order as always, to keep the rounding unchanged.
            resultat *= individual[actual] / transicions[actual] * predictabilitat_total[anterior] / individual_total
            anterior = actual
    return resultat

def llargada(x):
    """Return the number of items in ``x`` (characters, for a password)."""
    longitud = len(x)
    return longitud

In [None]:
# Apply every feature function to each password.
dades_caracters, dades_sequencia, dades_aleatorietat, dades_llargada = (
    np.vectorize(funcio)(contrasenyes)
    for funcio in (probabilitat_caracters, probabilitat_sequencia, aleatorietat, llargada)
)

# One row per password: the four features followed by the strength label.
dades_predictabilitat = np.column_stack(
    (dades_caracters, dades_sequencia, dades_aleatorietat, dades_llargada, dades_proteccio)
)

In [None]:
# Short alias for the feature matrix, and a labelled DataFrame view of it.
DP = dades_predictabilitat
nom_columnes = ['caracters', 'sequencia', 'aleatorietat', 'llargada', 'proteccio']
MDP = pd.DataFrame(DP, columns=nom_columnes)

In [None]:
# Distribution of the 'caracters' feature per strength class, on a log scale.
sns.histplot(data=MDP, x="caracters", hue="proteccio", multiple="layer", log_scale=True, binrange=[-39, -7], element="poly")

In [None]:
# Distribution of the 'sequencia' feature per strength class, on a log scale.
sns.histplot(data=MDP, x="sequencia", hue="proteccio", multiple="layer", log_scale=True, binrange=[-36, -5], element="poly")

In [None]:
# Distribution of the 'aleatorietat' feature per strength class, on a log scale.
sns.histplot(data=MDP, x="aleatorietat", hue="proteccio", multiple="layer", log_scale=True, binrange=[-10, 6], element="poly")

In [None]:
# Stacked histogram of password length per strength class, one bin per length.
sns.histplot(data=MDP, x="llargada", hue="proteccio", multiple="stack", discrete=True, binrange=[1, 20])

In [None]:
# Character probability (log x axis) against password length, coloured by
# strength label; the very low alpha lets dense regions stand out.
plt.scatter(x=dades_caracters, y=dades_llargada, c=dades_proteccio, alpha=0.01)
plt.xscale('log')
plt.xlim(1e-60, 1e-4)
plt.ylim(0, 35)

### Provar l'eficàcia de les dades

In [None]:
def _class_balance_weights(y):
    """Return one weight per sample so that every class has the same total weight.

    The weight of a sample of class k is ``len(y) / n_classes / count(k)``.
    """
    unique, counts = np.unique(y, return_counts=True)
    per_class = {u: len(y) / len(unique) / c for u, c in zip(unique, counts)}
    return np.array([per_class[yi] for yi in y])


def make_subset(X, y, p=1, index=None):
    """Return ``(X, y, weights)`` for the whole data or for a random sample.

    X: feature array, indexable with an integer index array.
    y: label array with one entry per row of X.
    p: ``1`` returns X and y untouched (``index`` is ignored); any other
        number draws ``int(len(y) * p)`` rows without replacement.
    index: optional array of candidate row indices to draw from; every row
        is a candidate when it is None.

    Returns the selected rows, their labels and class-balancing sample
    weights computed on the returned labels.

    Raises NotImplementedError when ``p`` is a list (splitting into several
    subsets was never implemented).
    """
    if isinstance(p, list):
        # This branch used to return None, which only failed later when
        # the caller tried to unpack the result.
        raise NotImplementedError('make_subset does not support a list of proportions')
    if p == 1:
        return X, y, _class_balance_weights(y)

    total = len(y)
    new_l = int(total * p)
    # `is None`, not `== None`: comparing an index array with `==` yields an
    # array whose truth value is ambiguous.
    if index is None:
        index = np.arange(total)
    ind = np.random.choice(index, new_l, replace=False)
    new_X = X[ind]
    new_y = y[ind]
    return new_X, new_y, _class_balance_weights(new_y)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import linear_model
from sklearn import svm
from sklearn import preprocessing

class oneVsRest:
    """Classifier for the labels 0, 1 and 2 built from two binary models.

    ``classTo0`` is trained with target 1 for labels not above 0 and -1
    otherwise; ``classTo1`` with target 1 for labels not above 1 and -1
    otherwise. ``predict`` combines both outputs as ``(2 - a - b) / 2``.

    model: class (or factory) of the binary estimator; it must offer
        ``fit(x, y, sample_weight=...)`` and ``predict(x)``.
    **kwargs: keyword arguments forwarded to every ``model(...)`` call.
    """

    def __init__(self, model, **kwargs):
        self.model = model
        self.modelArgs = kwargs

    @staticmethod
    def _binaritza(y, llindar):
        """Return a copy of ``y`` with values above ``llindar`` set to -1 and the rest to 1."""
        objectiu = np.copy(y)
        objectiu[objectiu > llindar] = -1
        # Entries that already equal -1 are left untouched.
        objectiu[objectiu != -1] = 1
        return objectiu

    def fit(self, x, y, w=None):
        """Train both binary models on ``x``; ``w`` is passed on as sample_weight."""
        objectiu0 = self._binaritza(y, 0)
        self.classTo0 = self.model(**self.modelArgs)
        self.classTo0.fit(x, objectiu0, sample_weight=w)

        objectiu1 = self._binaritza(y, 1)
        self.classTo1 = self.model(**self.modelArgs)
        self.classTo1.fit(x, objectiu1, sample_weight=w)

    def predict(self, x):
        """Combine the two binary predictions into a single label value."""
        vot0 = self.classTo0.predict(x)
        vot1 = self.classTo1.predict(x)
        return (2 - vot0 - vot1) / 2

class LogPipe:
    """Stateless transformer that replaces the selected columns by their natural log.

    columns: column index (or list of indices) to transform.
    """

    # Added to the values before the logarithm is taken.
    err = 1e-200

    def __init__(self, columns):
        self.column_transform = columns

    def fit(self, X, y=None, **args):
        """Do nothing and return self; ``y`` is accepted to match ``fit_transform``."""
        return self

    def transform(self, X, copy=True):
        """Return ``X`` with ``log(value + err)`` in the selected columns.

        With ``copy`` true the input is left untouched; otherwise ``X`` is
        modified in place and returned.
        """
        X_tr = np.copy(X) if copy else X
        X_tr[:, self.column_transform] = np.log(X_tr[:, self.column_transform] + LogPipe.err)
        return X_tr

    def fit_transform(self, X, y=None, copy=True, **args):
        """Same as ``transform``; there is nothing to fit."""
        return self.transform(X, copy)

def regression(x, y, model=linear_model.LogisticRegression, w=None, **kwargs):
    """Build a ``oneVsRest`` wrapper around ``model``, train it and return it.

    x, y: training features and labels.
    model: estimator class handed to ``oneVsRest``.
    w: optional sample weights forwarded to ``fit``.
    **kwargs: keyword arguments forwarded to the estimator constructor.
    """
    classificador = oneVsRest(model, **kwargs)
    classificador.fit(x, y, w=w)
    return classificador

def visualize_confusion_matrix(y_pred, y_real):
    """Show the confusion matrix of ``y_pred`` against ``y_real`` as an annotated heatmap."""
    matriu = confusion_matrix(y_real, y_pred)
    _, eix = plt.subplots(figsize=(10, 6))
    sns.heatmap(matriu, annot=True, fmt='g', ax=eix)
    eix.set_xlabel("Predicted")
    eix.set_ylabel("Real")
    eix.set_title("Confusion Matrix")
    plt.show()

In [None]:
# Train on a random 20% sample of the four features; the evaluation set is
# the full dataset (p=1).
# NOTE(review): the evaluation set therefore contains the training rows.
dades_t, objectiu_t, w_t = make_subset(DP[:, 0:4], DP[:, 4], .2)
dades_cv, objectiu_cv, w_cv = make_subset(DP[:, 0:4], DP[:, 4], 1)
log_norm = LogPipe([0, 1, 2]).fit(dades_t)
# Log-transform the training data once and reuse it for scaler and model.
dades_t_log = log_norm.transform(dades_t)
norm = preprocessing.StandardScaler().fit(dades_t_log)
# max_iter is given as an int: scikit-learn documents it as an integer and
# releases with parameter validation reject the float 1e6.
r = regression(norm.transform(dades_t_log), objectiu_t, w=w_t, model=svm.LinearSVC, max_iter=1_000_000)
# Models tried here: linear_model.LogisticRegression, svm.LinearSVC

In [None]:
%matplotlib inline
prediccio = r.predict(norm.transform(log_norm.transform(dades_cv)))
print(f'Accuracy: {sum(objectiu_cv==prediccio) / len(prediccio)}')
print(f'Weighted accuracy: {sum(w_cv * (objectiu_cv==prediccio)) / len(prediccio)}')
visualize_confusion_matrix(prediccio, objectiu_cv)

In [None]:
# Same experiment without the length column: only the first three features.
# NOTE(review): the evaluation set (p=1) contains the training rows, and
# w_t is not passed to the model in this run - confirm that is intended.
dades_t, objectiu_t, w_t = make_subset(DP[:, 0:3], DP[:, 4], .2)
dades_cv, objectiu_cv, w_cv = make_subset(DP[:, 0:3], DP[:, 4], 1)
log_norm = LogPipe([0, 1, 2]).fit(dades_t)
# Log-transform the training data once and reuse it for scaler and model.
dades_t_log = log_norm.transform(dades_t)
norm = preprocessing.StandardScaler().fit(dades_t_log)
# max_iter is given as an int: scikit-learn documents it as an integer and
# releases with parameter validation reject the float 1e6.
r = regression(norm.transform(dades_t_log), objectiu_t, model=svm.LinearSVC, max_iter=1_000_000)
# Models tried here: linear_model.LogisticRegression, svm.LinearSVC

In [None]:
%matplotlib inline
prediccio = r.predict(norm.transform(log_norm.transform(dades_cv)))
print(f'Accuracy: {sum(objectiu_cv==prediccio) / len(prediccio)}')
print(f'Weighted accuracy: {sum(w_cv * (objectiu_cv==prediccio)) / len(prediccio)}')
visualize_confusion_matrix(prediccio, objectiu_cv)

In [None]:
from sklearn.pipeline import Pipeline

# Log transform -> standardisation -> logistic regression, as one estimator.
pipe = Pipeline([
    ('log', LogPipe([0, 1, 2])),
    ('scaler', preprocessing.StandardScaler()),
    ('model', linear_model.LogisticRegression()),
])

# Per-fold labels, weights and predictions, concatenated after the loop.
Y_cv, W_cv, prediccio = [], [], []

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
for train_index, test_index in kf.split(DP):
    # First three features only; make_subset adds class-balancing weights.
    x_t, y_t, w_t = make_subset(DP[train_index, 0:3], DP[train_index, 4])
    x_cv, y_cv, w_cv = make_subset(DP[test_index, 0:3], DP[test_index, 4])
    pipe.fit(x_t, y_t, model__sample_weight=w_t)

    Y_cv.append(y_cv)
    W_cv.append(w_cv)
    prediccio.append(pipe.predict(x_cv))

Y_cv, W_cv, prediccio = (np.concatenate(parts) for parts in (Y_cv, W_cv, prediccio))

encerts = Y_cv == prediccio
n_mostres = len(prediccio)
print(f'Accuracy: {sum(encerts) / n_mostres}')
print(f'Weighted accuracy: {sum(W_cv * encerts) / n_mostres}')
visualize_confusion_matrix(prediccio, Y_cv)

In [None]:
# Password length (log x axis) against strength label.
figura, ax = plt.subplots(figsize=(10, 6))
ax.set_xscale('log')
ax.scatter(x=dades_llargada, y=dades_proteccio)
plt.show()

#Feature tools

In [None]:
# Shortest and longest password length found in each strength class.
for nivell in range(3):
    llargades = dades_llargada[dades_proteccio == nivell]
    print(min(llargades), max(llargades))

In [None]:
%matplotlib inline
prediccio = np.array([0 if len(x) < 8 else (2 if len(x) > 13 else 1) for x in contrasenyes])
visualize_confusion_matrix(prediccio, dades_proteccio)

In [None]:
import ipywidgets as widgets
# Text box for the password and one label per displayed metric.
password = widgets.Text()
probabilitat, sequential, randomnes, strength = (widgets.Label() for _ in range(4))

def form(x):
    """Return the four features of ``x`` as a 1 x 4 array."""
    caracteristiques = [probabilitat_caracters(x), probabilitat_sequencia(x), aleatorietat(x), llargada(x)]
    return np.array([caracteristiques])

def on_value_change(change):
    """Refresh the labels with the metrics of the newly typed text."""
    text = change['new']
    probabilitat.value = f'probability: {probabilitat_caracters(text)!s}'
    sequential.value = f'sequential: {probabilitat_sequencia(text)!s}'
    randomnes.value = f'randomnes: {aleatorietat(text)!s}'
    # Length-only rule: below 8 -> 0, above 13 -> 2, otherwise 1.
    if len(text) < 8:
        nivell = 0
    elif len(text) > 13:
        nivell = 2
    else:
        nivell = 1
    strength.value = f'strength: {nivell!s}'

# Recompute the metrics every time the content of the text box changes.
password.observe(on_value_change, names='value')
display(password, probabilitat, sequential, randomnes, strength)