### This password evaluator is a support function to be used in the project of fit5120/22

In [1]:
# For Data manipulation
import pandas as pd
import numpy as np

# For Data Visulization
import matplotlib.pyplot as plt
import seaborn as sns

#For Modelling and evaluation
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score,KFold
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier,StackingClassifier,BaggingClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#For text preprocessing
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

#For model parameter saving and loading
import pickle

In [2]:
# Import dataset
df = pd.read_csv('data.csv',on_bad_lines='skip') #ignore badline from dataset
df.isna().sum()
df.duplicated().sum()
#only 1 missing value, and no duplicate found,will drop the bad record directly
df.dropna(inplace = True)

In [3]:
#Import text data for all the weak passwords from rock you leak
df2 = pd.read_csv('rockyou.txt',delimiter='\t',header = None, names = ['password'],encoding='ISO-8859-1')
df2.dropna(inplace = True)
df2.drop_duplicates(inplace = True)
df2['strength'] = 0

In [4]:
df_full = pd.concat([df,df2],ignore_index=True)

# Compute the value counts of the Gender column
value_counts = df_full['strength'].value_counts()

# Set the number of samples to be drawn from each group
n_samples = value_counts.min()

# Group the dataframe by Gender and sample n_samples from each group
sampled_df = df.groupby('strength').apply(lambda x: x.sample(n=n_samples)).reset_index(drop=True)

# Print the sampled dataframe
print(sampled_df)

                password  strength
0                 wwf598         0
1                tommy93         0
2                menelg1         0
3                 kenar1         0
4                xj66dkv         0
...                  ...       ...
249406  ZEmAYPzQxMggt8VF         2
249407  zrF8CLTgyNAkiC0p         2
249408  gcrM80zcyNQXMGGq         2
249409  tE1QyfTAzMQ2C2pu         2
249410    550AjApYMoqAqy         2

[249411 rows x 2 columns]


In [5]:
sampled_df.isna().sum()

password    0
strength    0
dtype: int64

In [6]:
X = sampled_df['password']
y = sampled_df['strength']
#tokenize password
def word_to_char(word):
    return list(word)
vectorizer = TfidfVectorizer(tokenizer=word_to_char)
X = vectorizer.fit_transform(X)

with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)



In [7]:
#Data preprocessing
def train_val_test_split(X,y,ratio):
    X_train,X_,y_train,y_ = train_test_split(X,y,test_size=ratio,stratify=y,random_state=1)
    X_val,X_test,y_val,y_test = train_test_split(X_,y_,test_size=.5,stratify=y_,random_state=1)
    return X_train,X_val,X_test,y_train,y_val,y_test

In [8]:
X_train,X_val,X_test,y_train,y_val,y_test = train_val_test_split(X,y,ratio=.3)

In [9]:
#Model training with CV

models = []  # Empty list to store all the models

# Appending models into the list

models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("lgbm", lgb.LGBMClassifier(random_state=1, eval_metric="logloss")))

results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
score = []

# loop through all models to get the mean cross validated score

print("\n" "Cross-Validation Performance:" "\n")

for name, model in models:
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring='f1_macro', cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean()))


Cross-Validation Performance:

Random forest: 0.9442035681894781
Bagging: 0.9219457394123618
Xgboost: 0.9810177984098212
lgbm: 0.9621568742404788


In [10]:
print("\n" "Training Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = metrics.f1_score(y_train, model.predict(X_train),average='macro')
    print("{}: {}".format(name, scores))


Training Performance:

Random forest: 0.999891171571467
Bagging: 0.9956067805243928
Xgboost: 0.9938909830656067
lgbm: 0.9688391856539122


In [11]:
print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = metrics.f1_score(y_val, model.predict(X_val),average='macro')
    print("{}: {}".format(name, scores))


Validation Performance:

Random forest: 0.9487219579052363
Bagging: 0.9246642958210404
Xgboost: 0.9821373154455623
lgbm: 0.9641671748892238


In [12]:
#Using Xgboost provided the best f1_marco result, thus we can fine tune it
# defining model - XGBoost Hyperparameter Tuning
model = XGBClassifier(random_state=1, eval_metric="logloss")

# Parameter grid to pass in RandomizedSearchCV
param_grid = {
    "n_estimators": np.arange(150, 300, 50),
    "learning_rate": [0.0001,0.001,0.01],
    "gamma": [0, 3, 5],
    "subsample": [0.5, 0.9,0.2],
}
# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=1,
    n_jobs=-1,
)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)

Best parameters are {'subsample': 0.2, 'n_estimators': 250, 'learning_rate': 0.01, 'gamma': 3} with CV score=0.9028494908952561:


In [13]:
# building model with best parameters
xgb_tuned = XGBClassifier(
**randomized_cv.best_params_,eval_metric = 'logloss',random_state = 1
)

# Fit the model on training data
xgb_tuned.fit(X_train, y_train)

In [14]:
y_pred = xgb_tuned.predict(X_test)
metrics.f1_score(y_test,y_pred,average='macro')



0.9034945949285352

In [15]:
metrics.recall_score(y_test,y_pred,average='macro')

0.9040950862724322

In [16]:
metrics.precision_score(y_test,y_pred,average='macro')

0.9034720781264691

In [17]:
metrics.accuracy_score(y_test,y_pred)


0.9040949427991019

In [18]:
# Save the model to a file
with open("xgb_model.pkl", "wb") as f:
    pickle.dump(xgb_tuned, f)

In [19]:
import pickle

# Load the model from the file
with open("xgb_model.pkl", "rb") as f:
    model = pickle.load(f)

In [22]:
df

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1
...,...,...
669635,10redtux10,1
669636,infrared1,1
669637,184520socram,1
669638,marken22a,1


In [34]:
text = '123456'

def word_to_char(word):
    return list(word)
# Load the vectorizer from the file
with open("vectorizer.pkl", "rb") as f:
    vectorizer = pickle.load(f)

X  = vectorizer.transform([text])
y_pred = model.predict_proba(X)
y_pred

array([[0.77070576, 0.16072197, 0.06857225]], dtype=float32)

In [46]:
max(y_pred[0])

0.77070576