In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
df = pd.read_csv("./data/transform_sample_data.csv")
df.head()

Unnamed: 0,password,strength,len,alphaUC,alphaLC,number,symbol,midChar,repChar,uniqueChar,consecAlphaUC,consecAlphaLC,consecNumber,consecSymbol,seqAlpha,seqNumber,seqKeyboard
0,csillik,0.180594,7,0,7,0,0,0,2,5,0,1,0,0,0,0,0
1,huniihuu,0.177778,8,0,8,0,0,0,4,4,0,2,0,0,0,0,0
2,chaipy,0.172331,6,0,6,0,0,0,0,6,0,0,0,0,0,0,0
3,876876b,0.155556,7,0,1,6,0,5,3,4,0,0,0,0,0,1,0
4,miiwhy,0.154795,6,0,6,0,0,0,1,5,0,1,0,0,0,0,0


In [3]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.ensemble import (
    GradientBoostingRegressor,
    RandomForestRegressor,
    ExtraTreesRegressor,
    AdaBoostRegressor,
)
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import (
    LassoLars,
    Ridge,
    BayesianRidge,
    Lasso,
    LinearRegression,
    HuberRegressor,
    PassiveAggressiveRegressor,
    ElasticNet,
)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.dummy import DummyRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [4]:
models = {
    "Gradient Boosting Regressor": GradientBoostingRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "CatBoost Regressor": CatBoostRegressor(verbose=0, allow_writing_files=False),
    "Light Gradient Boosting Machine": LGBMRegressor(),
    "Extra Trees Regressor": ExtraTreesRegressor(),
    "AdaBoost Regressor": AdaBoostRegressor(),
    "Extreme Gradient Boosting": XGBRegressor(),
    "Lasso Least Angle Regression": LassoLars(),
    "Ridge Regression": Ridge(),
    "Bayesian Ridge": BayesianRidge(),
    "Least Angle Regression": LassoLars(),
    "Lasso Regression": Lasso(),
    "Linear Regression": LinearRegression(),
    "Huber Regressor": HuberRegressor(max_iter=1000),
    "Decision Tree Regressor": DecisionTreeRegressor(),
    "Orthogonal Matching Pursuit": PassiveAggressiveRegressor(),
    "Passive Aggressive Regressor": PassiveAggressiveRegressor(),
    "Elastic Net": ElasticNet(),
    "K Neighbors Regressor": KNeighborsRegressor(),
    "Dummy Regressor": DummyRegressor(),
}

In [5]:
k_fold = KFold(n_splits=10, shuffle=True, random_state=42)

In [18]:
X = df.drop(["strength",'password'], axis=1)
y = df["strength"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [7]:
mae = []
mse = []
rmse = []
r2 = []
tt = []

test_mae = []
test_mse = []
test_rmse = []
test_r2 = []
test_tt = []

for model_name, model in models.items():
    mae_scores = []
    mse_scores = []
    rmse_scores = []
    r2_scores = []
    tt_scores = []
    print(f"Model name: {model_name}\n")
    print("Train model")

    for k, (train_index, val_index) in enumerate(k_fold.split(X_train, y_train)):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

        start_time = time.time()
        model.fit(X_train_fold, y_train_fold)
        y_pred = model.predict(X_val_fold)
        end_time = time.time()

        mae_scores.append(mean_absolute_error(y_val_fold, y_pred))
        mse_scores.append(mean_squared_error(y_val_fold, y_pred))
        rmse_scores.append(
            mean_squared_error(y_val_fold, y_pred, squared=False)
        )
        r2_scores.append(r2_score(y_val_fold, y_pred))
        tt_scores.append(round(end_time - start_time, 4))

        print(
            f"Fold: {k}\tMAE: {mae_scores[-1]:.4f}\tMSE: {mse_scores[-1]:.4f}\tRMSE: {rmse_scores[-1]:.4f}\tR2: {r2_scores[-1]:.4f}\tTT: {tt_scores[-1]:.4f}"
        )

    mae.append(np.mean(mae_scores))
    mse.append(np.mean(mse_scores))
    rmse.append(np.mean(rmse_scores))
    r2.append(np.mean(r2_scores))
    tt.append(np.mean(tt_scores))

    print("\nTest model")
    start_time = time.time()
    y_pred = model.predict(X_test)
    end_time = time.time()

    mae_scores.append(mean_absolute_error(y_test, y_pred))
    mse_scores.append(mean_squared_error(y_test, y_pred))
    rmse_scores.append(mean_squared_error(y_test, y_pred, squared=False))
    r2_scores.append(r2_score(y_test, y_pred))
    tt_scores.append(round(end_time - start_time, 4))

    print(
        f"Fold: -\tMAE: {mae_scores[-1]:.4f}\tMSE: {mse_scores[-1]:.4f}\tRMSE: {rmse_scores[-1]:.4f}\tR2: {r2_scores[-1]:.4f}\tTT: {tt_scores[-1]:.4f}"
    )

    test_mae.append(mae_scores[-1])
    test_mse.append(mse_scores[-1])
    test_rmse.append(rmse_scores[-1])
    test_r2.append(r2_scores[-1])
    test_tt.append(tt_scores[-1])

    print("\n" + "=" * 100 + "\n")

Model name: Gradient Boosting Regressor

Train model
Fold: 0	MAE: 0.0021	MSE: 0.0000	RMSE: 0.0030	R2: 0.9999	TT: 4.7752
Fold: 1	MAE: 0.0023	MSE: 0.0000	RMSE: 0.0048	R2: 0.9996	TT: 1.7018
Fold: 2	MAE: 0.0024	MSE: 0.0000	RMSE: 0.0041	R2: 0.9997	TT: 1.2483
Fold: 3	MAE: 0.0024	MSE: 0.0000	RMSE: 0.0033	R2: 0.9998	TT: 1.5315
Fold: 4	MAE: 0.0023	MSE: 0.0000	RMSE: 0.0032	R2: 0.9998	TT: 1.9080
Fold: 5	MAE: 0.0024	MSE: 0.0000	RMSE: 0.0036	R2: 0.9998	TT: 1.9622
Fold: 6	MAE: 0.0022	MSE: 0.0000	RMSE: 0.0042	R2: 0.9997	TT: 1.7978
Fold: 7	MAE: 0.0025	MSE: 0.0000	RMSE: 0.0040	R2: 0.9998	TT: 1.3138
Fold: 8	MAE: 0.0026	MSE: 0.0000	RMSE: 0.0041	R2: 0.9997	TT: 1.5807
Fold: 9	MAE: 0.0022	MSE: 0.0000	RMSE: 0.0031	R2: 0.9999	TT: 1.8202

Test model
Fold: -	MAE: 0.0021	MSE: 0.0000	RMSE: 0.0035	R2: 0.9998	TT: 0.0118


Model name: Random Forest Regressor

Train model
Fold: 0	MAE: 0.0002	MSE: 0.0000	RMSE: 0.0015	R2: 1.0000	TT: 4.7250
Fold: 1	MAE: 0.0003	MSE: 0.0000	RMSE: 0.0044	R2: 0.9997	TT: 3.1737
Fold: 2	MAE: 

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 


Fold: 0	MAE: 0.2262	MSE: 0.0647	RMSE: 0.2543	R2: -0.0002	TT: 1.0692
Fold: 1	MAE: 0.2289	MSE: 0.0663	RMSE: 0.2575	R2: -0.0005	TT: 0.0391
Fold: 2	MAE: 0.2262	MSE: 0.0655	RMSE: 0.2559	R2: -0.0000	TT: 0.0396
Fold: 3	MAE: 0.2234	MSE: 0.0655	RMSE: 0.2559	R2: -0.0008	TT: 0.0339
Fold: 4	MAE: 0.2152	MSE: 0.0607	RMSE: 0.2464	R2: -0.0022	TT: 0.0231


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)



Fold: 5	MAE: 0.2221	MSE: 0.0645	RMSE: 0.2540	R2: -0.0021	TT: 0.0097
Fold: 6	MAE: 0.2335	MSE: 0.0691	RMSE: 0.2628	R2: -0.0020	TT: 0.0182
Fold: 7	MAE: 0.2329	MSE: 0.0688	RMSE: 0.2622	R2: -0.0003	TT: 0.0206
Fold: 8	MAE: 0.2278	MSE: 0.0660	RMSE: 0.2568	R2: -0.0005	TT: 0.0170
Fold: 9	MAE: 0.2300	MSE: 0.0677	RMSE: 0.2601	R2: -0.0010	TT: 0.0136

Test model
Fold: -	MAE: 0.2252	MSE: 0.0652	RMSE: 0.2554	R2: -0.0001	TT: 0.0032


Model name: Ridge Regression

Train model


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)



Fold: 0	MAE: 0.0327	MSE: 0.0019	RMSE: 0.0433	R2: 0.9710	TT: 0.2675
Fold: 1	MAE: 0.0329	MSE: 0.0026	RMSE: 0.0510	R2: 0.9607	TT: 0.0298
Fold: 2	MAE: 0.0342	MSE: 0.0028	RMSE: 0.0530	R2: 0.9571	TT: 0.0153
Fold: 3	MAE: 0.0331	MSE: 0.0033	RMSE: 0.0570	R2: 0.9503	TT: 0.0165
Fold: 4	MAE: 0.0340	MSE: 0.0035	RMSE: 0.0593	R2: 0.9419	TT: 0.0140
Fold: 5	MAE: 0.0327	MSE: 0.0025	RMSE: 0.0504	R2: 0.9605	TT: 0.0078
Fold: 6	MAE: 0.0339	MSE: 0.0024	RMSE: 0.0490	R2: 0.9652	TT: 0.0079
Fold: 7	MAE: 0.0332	MSE: 0.0023	RMSE: 0.0483	R2: 0.9661	TT: 0.0078
Fold: 8	MAE: 0.0334	MSE: 0.0018	RMSE: 0.0426	R2: 0.9725	TT: 0.0081
Fold: 9	MAE: 0.0312	MSE: 0.0018	RMSE: 0.0424	R2: 0.9734	TT: 0.0078

Test model
Fold: -	MAE: 0.0333	MSE: 0.0027	RMSE: 0.0522	R2: 0.9583	TT: 0.0037


Model name: Bayesian Ridge

Train model
Fold: 0	MAE: 0.0327	MSE: 0.0019	RMSE: 0.0433	R2: 0.9710	TT: 0.2553
Fold: 1	MAE: 0.0329	MSE: 0.0026	RMSE: 0.0510	R2: 0.9607	TT: 0.0185
Fold: 2	MAE: 0.0342	MSE: 0.0028	RMSE: 0.0530	R2: 0.9571	TT: 0.0209
Fold: 3	

If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)

Set parameter alpha to: original_alpha * np.sqrt(n_samples). 
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LassoLars())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)



Fold: 8	MAE: 0.2278	MSE: 0.0660	RMSE: 0.2568	R2: -0.0005	TT: 0.0091
Fold: 9	MAE: 0.2300	MSE: 0.0677	RMSE: 0.2601	R2: -0.0010	TT: 0.0159

Test model
Fold: -	MAE: 0.2252	MSE: 0.0652	RMSE: 0.2554	R2: -0.0001	TT: 0.0121


Model name: Lasso Regression

Train model
Fold: 0	MAE: 0.1659	MSE: 0.0338	RMSE: 0.1838	R2: 0.4776	TT: 0.1966
Fold: 1	MAE: 0.1685	MSE: 0.0347	RMSE: 0.1864	R2: 0.4757	TT: 0.0211
Fold: 2	MAE: 0.1659	MSE: 0.0341	RMSE: 0.1847	R2: 0.4788	TT: 0.0152
Fold: 3	MAE: 0.1631	MSE: 0.0338	RMSE: 0.1839	R2: 0.4833	TT: 0.0147
Fold: 4	MAE: 0.1578	MSE: 0.0314	RMSE: 0.1773	R2: 0.4808	TT: 0.0071
Fold: 5	MAE: 0.1634	MSE: 0.0338	RMSE: 0.1837	R2: 0.4757	TT: 0.0079
Fold: 6	MAE: 0.1709	MSE: 0.0359	RMSE: 0.1896	R2: 0.4785	TT: 0.0081
Fold: 7	MAE: 0.1708	MSE: 0.0361	RMSE: 0.1901	R2: 0.4741	TT: 0.0094
Fold: 8	MAE: 0.1675	MSE: 0.0347	RMSE: 0.1863	R2: 0.4735	TT: 0.0072
Fold: 9	MAE: 0.1696	MSE: 0.0356	RMSE: 0.1887	R2: 0.4732	TT: 0.0070

Test model
Fold: -	MAE: 0.1659	MSE: 0.0343	RMSE: 0.1853	R2: 0.4734	TT

In [8]:
results = pd.DataFrame(
    {
        "Model": list(models.keys()),
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "R2": r2,
        "TT (Sec)": tt,
    }
)

results.sort_values("R2", axis=0, ascending=False).reset_index(drop=True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,TT (Sec)
0,Extra Trees Regressor,0.000263,6e-06,0.002164,0.999914,2.44686
1,Extreme Gradient Boosting,0.000392,7e-06,0.002214,0.999895,1.32255
2,CatBoost Regressor,0.000503,7e-06,0.002415,0.999894,6.08424
3,Random Forest Regressor,0.000284,1e-05,0.002744,0.999855,2.97326
4,Decision Tree Regressor,0.00027,1.2e-05,0.003137,0.999816,0.0301
5,Light Gradient Boosting Machine,0.000833,1.3e-05,0.00339,0.999808,1.55079
6,Gradient Boosting Regressor,0.002327,1.4e-05,0.00374,0.999783,1.96395
7,K Neighbors Regressor,0.00448,0.000125,0.011127,0.998096,0.14613
8,AdaBoost Regressor,0.018504,0.000559,0.023626,0.99148,1.45317
9,Bayesian Ridge,0.033145,0.002494,0.049634,0.961869,0.04461


In [9]:
test_results = pd.DataFrame(
    {
        "Model": list(models.keys()),
        "MAE": test_mae,
        "MSE": test_mse,
        "RMSE": test_rmse,
        "R2": test_r2,
        "TT (Sec)": test_tt,
    }
)

test_results.sort_values("R2", axis=0, ascending=False).reset_index(drop=True)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,TT (Sec)
0,Extra Trees Regressor,0.000217,4e-06,0.002004,0.999938,0.1229
1,Extreme Gradient Boosting,0.00046,8e-06,0.002846,0.999876,0.003
2,CatBoost Regressor,0.000478,9e-06,0.002924,0.999869,0.0396
3,Gradient Boosting Regressor,0.002099,1.2e-05,0.003453,0.999817,0.0118
4,Random Forest Regressor,0.000345,1.3e-05,0.003614,0.9998,0.1055
5,Decision Tree Regressor,0.000301,1.4e-05,0.003797,0.999779,0.0088
6,Light Gradient Boosting Machine,0.000858,1.6e-05,0.00403,0.999751,0.0044
7,K Neighbors Regressor,0.004112,9.5e-05,0.009759,0.998539,0.0955
8,AdaBoost Regressor,0.01717,0.000514,0.022663,0.992123,0.0604
9,Bayesian Ridge,0.033253,0.002721,0.052168,0.958261,0.0032


In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [15]:
# Create subplots with 5 rows and 1 column
fig = make_subplots(rows=5, cols=1, subplot_titles=['MAE', 'MSE', 'RMSE', 'R2', 'TT (Sec)'])

# Add bar plots to each subplot
fig.add_trace(go.Bar(x=results['Model'], y=results['MAE']), row=1, col=1)
fig.add_trace(go.Bar(x=results['Model'], y=results['MSE']), row=2, col=1)
fig.add_trace(go.Bar(x=results['Model'], y=results['RMSE']), row=3, col=1)
fig.add_trace(go.Bar(x=results['Model'], y=results['R2']), row=4, col=1)
fig.add_trace(go.Bar(x=results['Model'], y=results['TT (Sec)']), row=5, col=1)

# Update layout
fig.update_layout(height=3000, width=1300, showlegend=False, title_text='Performance Metrics for train data')

# Show the plot
fig.show()

In [16]:
# Create subplots with 5 rows and 1 column
fig = make_subplots(rows=5, cols=1, subplot_titles=['MAE', 'MSE', 'RMSE', 'R2', 'TT (Sec)'])

# Add bar plots to each subplot
fig.add_trace(go.Bar(x=test_results['Model'], y=test_results['MAE']), row=1, col=1)
fig.add_trace(go.Bar(x=test_results['Model'], y=test_results['MSE']), row=2, col=1)
fig.add_trace(go.Bar(x=test_results['Model'], y=test_results['RMSE']), row=3, col=1)
fig.add_trace(go.Bar(x=test_results['Model'], y=test_results['R2']), row=4, col=1)
fig.add_trace(go.Bar(x=test_results['Model'], y=test_results['TT (Sec)']), row=5, col=1)

# Update layout
fig.update_layout(height=3000, width=1300, showlegend=False, title_text='Performance Metrics for test data')

# Show the plot
fig.show()

In [26]:
import plotly.graph_objects as go

feature_names = X.columns.to_list()
regressor = ExtraTreesRegressor(n_estimators=100, random_state=0)
regressor.fit(X, y)
model = regressor.estimators_[0]

In [37]:
labels = [""] * model.tree_.node_count
parents = [""] * model.tree_.node_count
labels[0] = "root"


for i, (f, t, l, r) in enumerate(
    zip(
        model.tree_.feature,
        model.tree_.threshold,
        model.tree_.children_left,
        model.tree_.children_right,
    )
):
    if l != r:
        labels[l] = f"{feature_names[f]} <= {t:g}"
        labels[r] = f"{feature_names[f]} > {t:g}"
        parents[l] = parents[r] = labels[i]

fig = go.Figure(
    go.Treemap(
        branchvalues="total",
        labels=labels,
        parents=parents,
        values=model.tree_.n_node_samples,
        textinfo="label+value+percent root",
        marker=dict(colors=model.tree_.impurity),
        customdata=list(map(str, model.tree_.value)),
        hovertemplate="""
            <b>%{label}</b><br>
            impurity: %{color}<br>
            samples: %{value} (%{percentRoot:%.2f})<br>
            value: %{customdata}""",
    )
)

# Update layout
fig.update_layout(height=2500, width=1300, showlegend=False, title_text='Tree Map')

fig.show()

In [38]:
# Get the feature importances
importances = regressor.feature_importances_

# Create a horizontal bar chart
fig = go.Figure(data=go.Bar(
    x=importances,
    y=feature_names,
    orientation='h'
))

# Set plot layout
fig.update_layout(
    title="Feature Importances",
    xaxis_title="Importance",
    yaxis_title="Features"
)

# Show the plot
fig.show()

## Conclusion

By comparing all the performance measures of model we select `Extra Trees Regressor`