In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.pipeline import Pipeline

import joblib
import numpy as np
import pandas as pd
import time

In [2]:
df = pd.read_csv("./data/stratified_sample_data.csv")
df.head()

Unnamed: 0,password,strength
0,csillik,0.180594
1,huniihuu,0.177778
2,chaipy,0.172331
3,876876b,0.155556
4,miiwhy,0.154795


In [3]:
X = df["password"]
y = df["strength"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [4]:
class LenTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["len"] = X["password"].apply(lambda x: self._lenTransform(x))
        transformed_X = X["len"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)

    def _lenTransform(self, text: str) -> int:
        return len(text)

In [5]:
class AlphaUCTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["alphaUC"] = X["password"].apply(lambda x: self._alphaUCTransform(x))
        transformed_X = X["alphaUC"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _alphaUCTransform(self, text: str) -> int:
        return sum(1 for a in text if a.isupper())

In [6]:
class AlphaLCTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["alphaLC"] = X["password"].apply(lambda x: self._alphaLCTransform(x))
        transformed_X = X["alphaLC"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)

    def _alphaLCTransform(self, text: str) -> int:
        return sum(1 for a in text if a.islower())

In [7]:
class NumberTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["number"] = X["password"].apply(lambda x: self._numberTransform(x))
        transformed_X = X["number"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _numberTransform(self, text: str) -> int:
        return sum(1 for a in text if a.isdecimal())

In [8]:
class SymbolTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["symbol"] = X["password"].apply(lambda x: self._symbolTransform(x))
        transformed_X =  X["symbol"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _symbolTransform(self, text: str) -> int:
        return sum(a in set("!@#$%^&*") for a in text)

In [9]:
class MidCharTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["midChar"] = X["password"].apply(lambda x: self._midCharTransform(x))
        transformed_X = X["midChar"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _midCharTransform(self, text: str) -> int:
        return sum(
                bool(
                    (a.isdecimal() or (a in set("!@#$%^&*")))
                    and ix > 0
                    and ix < len(text) - 1
                )
                for ix, a in enumerate(text)
            )

In [10]:
class RepCharTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["repChar"] = X["password"].apply(lambda x: self._repCharTransform(x))
        transformed_X = X["repChar"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _repCharTransform(self, text: str) -> int:
        return len(text) - len(list(set(text)))

In [11]:
class UniqueCharTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["uniqueChar"] = X["password"].apply(lambda x: self._uniqueCharTransform(x))
        transformed_X = X["uniqueChar"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _uniqueCharTransform(self, text: str) -> int:
        return len(list(set(text)))

In [12]:
class ConsecAlphaUCTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["consecAlphaUC"] = X["password"].apply(lambda x: self._consecAlphaUCTransform(x))
        transformed_X = X["consecAlphaUC"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _consecAlphaUCTransform(self, text: str) -> int:
        temp = ""
        nConsecAlphaUC = 0
        for a in text:
            if a.isupper():
                if temp and temp[-1] == a:
                    nConsecAlphaUC += 1
                temp = a
        return nConsecAlphaUC


In [13]:
class ConsecAlphaLCTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["consecAlphaLC"] = X["password"].apply(lambda x: self._consecAlphaLCTransform(x))
        transformed_X = X["consecAlphaLC"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _consecAlphaLCTransform(self, text: str) -> int:
        temp = ""
        nConsecAlphaLC = 0
        for a in text:
            if a.islower():
                if temp and temp[-1] == a:
                    nConsecAlphaLC += 1
                temp = a
        return nConsecAlphaLC


In [14]:
class ConsecNumberTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["consecNumber"] = X["password"].apply(lambda x: self._consecNumberTransform(x))
        transformed_X = X["consecNumber"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _consecNumberTransform(self, text: str) -> int:
        temp = ""
        nConsecNumber = 0
        for a in text:
            if a.isdecimal():
                if temp and temp[-1] == a:
                    nConsecNumber += 1
                temp = a
        return nConsecNumber


In [15]:
class ConsecSymbolTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["consecSymbol"] = X["password"].apply(lambda x: self._consecSymbolTransform(x))
        transformed_X = X["consecSymbol"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _consecSymbolTransform(self, text: str) -> int:
        temp = ""
        nConsecSymbol = 0
        for a in text:
            if a in set("!@#$%^&*"):
                if temp and temp[-1] == a:
                    nConsecSymbol += 1
                temp = a
        return nConsecSymbol

In [16]:
class SeqAlphaTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X["seqAlpha"] = X["password"].apply(lambda x: self._seqAlphaTransform(x))
        transformed_X = X["seqAlpha"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _seqAlphaTransform(self, text: str) -> int:
        sAlphas = "abcdefghijklmnopqrstuvwxyz"
        nSeqAlpha = 0
        for s in range(len(sAlphas) - 2):
            sFwd = sAlphas[s : s + 3]
            sRev = sFwd[::-1]
            if sFwd in text.lower() or sRev in text.lower():
                nSeqAlpha += 1
        return nSeqAlpha


In [17]:
class SeqNumberTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["seqNumber"] = X["password"].apply(lambda x: self._seqNumberTransform(x))
        transformed_X = X["seqNumber"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _seqNumberTransform(self, text: str) -> int:
        sNumerics = "01234567890"
        nSeqNumber = 0
        for s in range(len(sNumerics) - 2):
            sFwd = sNumerics[s : s + 3]
            sRev = sFwd[::-1]
            if sFwd in text.lower() or sRev in text.lower():
                nSeqNumber += 1
        return nSeqNumber

In [18]:
class SeqKeyboardTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X["seqKeyboard"] = X["password"].apply(lambda x: self._seqKeyboardTransform(x))
        transformed_X = X["seqKeyboard"].to_numpy()
        return np.array(transformed_X).reshape(-1, 1)
    
    def _seqKeyboardTransform(self, text: str) -> int:
        sTopRow = "qwertyuiop"
        sHomeRow = "asdfghjkl"
        sBottomRow = "zxcvbnm"
        nKeyboard = 0
        sRows = [sTopRow, sHomeRow, sBottomRow]
        
        for sRow in sRows:
            for s in range(len(sRow) - 2):
                sFwd = sRow[s : s + 3]
                sRev = sFwd[::-1]
                if sFwd in text.lower() or sRev in text.lower():
                    nKeyboard += 1
                    
        return nKeyboard

In [19]:
feature = ['password']

preprocess = ColumnTransformer([
    ('len', LenTransform(), feature),
    ('alpha_uc', AlphaUCTransform(), feature),
    ('alpha_lc', AlphaLCTransform(), feature),
    ('number', NumberTransform(), feature),
    ('symbol', SymbolTransform(), feature),
    ('mid_char', MidCharTransform(), feature),
    ('rep_char', RepCharTransform(), feature),
    ('unique_char', UniqueCharTransform(), feature),
    ('consec_alpha_uc', ConsecAlphaUCTransform(), feature),
    ('consec_alpha_lc', ConsecAlphaLCTransform(), feature),
    ('consec_number', ConsecNumberTransform(), feature),
    ('consec_symbol', ConsecSymbolTransform(), feature),
    ('seq_alpha', SeqAlphaTransform(), feature),
    ('seq_number', SeqNumberTransform(), feature),
    ('seq_keyboard', SeqKeyboardTransform(), feature),
])

In [20]:
pipeline = Pipeline([
    ('preprocess', preprocess),
    ('regressor', ExtraTreesRegressor(n_jobs=-1))
])

In [21]:
pipeline

In [22]:
X_train_df = X_train.to_frame()
X_test_df = X_test.to_frame()

In [23]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

In [24]:
mae_scores = []
mse_scores = []
rmse_scores = []
r2_scores = []
tt_scores = []

print("Train model")

for k, (train_index, val_index) in enumerate(k_fold.split(X_train, y_train)):
    X_train_fold, X_val_fold = X_train_df.iloc[train_index], X_train_df.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

    start_time = time.time()
    pipeline.fit(X_train_fold, y_train_fold)
    y_pred = pipeline.predict(X_val_fold)
    end_time = time.time()

    mae_scores.append(round(mean_absolute_error(y_val_fold, y_pred), 4))
    mse_scores.append(round(mean_squared_error(y_val_fold, y_pred), 4))
    rmse_scores.append(round(mean_squared_error(y_val_fold, y_pred, squared=False), 4))
    r2_scores.append(round(r2_score(y_val_fold, y_pred), 4))
    tt_scores.append(round(end_time - start_time, 4))

    print(
        f"Fold: {k}\tMAE: {mae_scores[-1]:.4f}\tMSE: {mse_scores[-1]:.4f}\tRMSE: {rmse_scores[-1]:.4f}\tR2: {r2_scores[-1]:.4f}\tTT: {tt_scores[-1]:.4f}"
    )

print("\nTest model")
start_time = time.time()
y_pred = pipeline.predict(X_test_df)
end_time = time.time()

mae_scores.append(round(mean_absolute_error(y_test, y_pred), 4))
mse_scores.append(round(mean_squared_error(y_test, y_pred), 4))
rmse_scores.append(round(mean_squared_error(y_test, y_pred, squared=False), 4))
r2_scores.append(round(r2_score(y_test, y_pred), 4))
tt_scores.append(round(end_time - start_time, 4))

print(
    f"Fold: -\tMAE: {mae_scores[-1]:.4f}\tMSE: {mse_scores[-1]:.4f}\tRMSE: {rmse_scores[-1]:.4f}\tR2: {r2_scores[-1]:.4f}\tTT: {tt_scores[-1]:.4f}"
)

Train model
Fold: 0	MAE: 0.0002	MSE: 0.0000	RMSE: 0.0009	R2: 1.0000	TT: 1.9374
Fold: 1	MAE: 0.0003	MSE: 0.0000	RMSE: 0.0038	R2: 0.9998	TT: 1.9686
Fold: 2	MAE: 0.0004	MSE: 0.0000	RMSE: 0.0025	R2: 0.9999	TT: 1.4158
Fold: 3	MAE: 0.0002	MSE: 0.0000	RMSE: 0.0011	R2: 1.0000	TT: 1.3921
Fold: 4	MAE: 0.0002	MSE: 0.0000	RMSE: 0.0015	R2: 1.0000	TT: 2.0610
Fold: 5	MAE: 0.0002	MSE: 0.0000	RMSE: 0.0017	R2: 1.0000	TT: 2.1899
Fold: 6	MAE: 0.0003	MSE: 0.0000	RMSE: 0.0033	R2: 0.9998	TT: 2.0749
Fold: 7	MAE: 0.0004	MSE: 0.0000	RMSE: 0.0041	R2: 0.9998	TT: 1.7467
Fold: 8	MAE: 0.0004	MSE: 0.0000	RMSE: 0.0049	R2: 0.9996	TT: 1.7480
Fold: 9	MAE: 0.0002	MSE: 0.0000	RMSE: 0.0017	R2: 1.0000	TT: 1.8086

Test model
Fold: -	MAE: 0.0002	MSE: 0.0000	RMSE: 0.0026	R2: 0.9999	TT: 0.4320


In [38]:
from sklearn.model_selection import GridSearchCV

In [39]:
param_grid = {
    'regressor__n_estimators': [1,5,10,20,50,75,100,150,200]
}

In [40]:
grid_search = GridSearchCV(pipeline, param_grid, scoring='r2', error_score='raise')

In [42]:
grid_search.fit(X_train_df, y_train)

In [43]:
results = grid_search.cv_results_


In [44]:
import plotly.express as px


In [47]:
fig = px.bar(results, x=[1,5,10,20,50,75,100,150,200], y='mean_test_score', error_y='std_test_score')
fig.show()

From fig, we observe that r2 score for n_estimators=1 is equal to n_estimators=200. So we need only one tree to minimize our complexity. Therefore, we will consider using `Decision Tree Regressor`.