### This password evaluator is a support function to be used in the project of fit5120/22

In [1]:
# For Data manipulation
import pandas as pd
import numpy as np

# For Data Visulization
import matplotlib.pyplot as plt
import seaborn as sns

#For Modelling and evaluation
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,BaggingClassifier,StackingClassifier
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#For text preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

#For model parameter saving and loading
import pickle

In [2]:
# Import dataset
df = pd.read_csv('data.csv',on_bad_lines='skip') #ignore badline from dataset
df.isna().sum()
df.duplicated().sum()
#only 1 missing value, and no duplicate found,will drop the bad record directly
df.dropna(inplace = True)

In [3]:
#Import text data for all the weak passwords from rock you leak
df2 = pd.read_csv('rockyou.txt',delimiter='\t',header = None, names = ['password'],encoding='ISO-8859-1')
df2.dropna(inplace = True)
df2.drop_duplicates(inplace = True)
df2['strength'] = 0

In [4]:
df_full = pd.concat([df,df2],ignore_index=True)

# Compute the value counts of the Gender column
value_counts = df_full['strength'].value_counts()

# Set the number of samples to be drawn from each group
n_samples = value_counts.min()

# Group the dataframe by Gender and sample n_samples from each group
sampled_df = df.groupby('strength').apply(lambda x: x.sample(n=n_samples)).reset_index(drop=True)

# Print the sampled dataframe
print(sampled_df)

                password  strength
0                lipo3ak         0
1                 lynd0n         0
2                pallek1         0
3                qqnanm3         0
4                 josh20         0
...                  ...       ...
249406  JEW7LwjEyMQLnm4h         2
249407  N19emMjYwOABRfZZ         2
249408   736DOceQoZYxyKy         2
249409    418ezEdiRucugy         2
249410  0wubFZjU2MgUn8pa         2

[249411 rows x 2 columns]


In [5]:
X = sampled_df['password']
y = sampled_df['strength']
#tokenize password

vectorizer = TfidfVectorizer(analyzer = 'char')
X = vectorizer.fit_transform(X)

with open("vectorizer2.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

In [6]:
#Data preprocessing
def train_val_test_split(X,y,ratio):
    X_train,X_,y_train,y_ = train_test_split(X,y,test_size=ratio,stratify=y,random_state=1)
    X_val,X_test,y_val,y_test = train_test_split(X_,y_,test_size=.5,stratify=y_,random_state=1)
    return X_train,X_val,X_test,y_train,y_val,y_test

In [7]:
X_train,X_val,X_test,y_train,y_val,y_test = train_val_test_split(X,y,ratio=.25)

In [8]:
#Model training with CV

models = []  # Empty list to store all the models

# Appending models into the list

models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("lgbm", lgb.LGBMClassifier(random_state=1)))

results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
score = []

# loop through all models to get the mean cross validated score

print("\n" "Cross-Validation Performance:" "\n")

for name, model in models:
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring='f1_macro', cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean()))


Cross-Validation Performance:

Random forest: 0.9456721865928752
Bagging: 0.9218988232481464
Xgboost: 0.9817256899409784
lgbm: 0.9635410536172795


In [9]:
print("\n" "Training Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = metrics.f1_score(y_train, model.predict(X_train),average='macro')
    print("{}: {}".format(name, scores))


Training Performance:



In [None]:
print("\n" "Validation Performance:" "\n")

for name, model in models:
    model.fit(X_train, y_train)
    scores = metrics.f1_score(y_val, model.predict(X_val),average='macro')
    print("{}: {}".format(name, scores))

In [None]:
#Using Xgboost provided the best f1_marco result, thus we can fine tune it
# defining model - XGBoost Hyperparameter Tuning
model = XGBClassifier(random_state=1, eval_metric="logloss")

# Parameter grid to pass in RandomizedSearchCV
param_grid = {
    "n_estimators": np.arange(150, 300, 50),
    "learning_rate": [0.0001,0.001,0.01,0.0015],
    "gamma": [0, 3, 5,7],
    "subsample": [0.5, 0.9,0.2,0.35],
    'reg_alpha':[0,1],
    'reg_lambda':[0,1]
}
# Calling RandomizedSearchCV
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    scoring='f1_macro',
    cv=3,
    random_state=1,
    n_jobs=-1,
)

# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)

In [None]:
est = [('lgbm', lgb.LGBMClassifier(eval_metric='logloss', random_state=1))]

In [None]:
final_model = XGBClassifier(**randomized_cv.best_params_,eval_metric = 'logloss',random_state = 1) #use best performed model as final_model
estimator = models #droping XGB and use others as init estimators

reg = StackingClassifier(
estimators=est,
final_estimator=final_model)

In [None]:
#Evaluate the stakeclassifer using validation set
reg.fit(X_train,y_train)
y_pred = reg.predict(X_val)
print(f"Stacking Method, F1: {metrics.f1_score(y_val,y_pred,average='macro')}")

In [None]:
# building model with best parameters
xgb_tuned = XGBClassifier(
**randomized_cv.best_params_,eval_metric = 'logloss',random_state = 1
)
# Fit the model on training data
xgb_tuned.fit(X_train, y_train)

In [None]:
y_pred = xgb_tuned.predict(X_test)
metrics.f1_score(y_test,y_pred,average='macro')



In [None]:
y_pred = reg.predict(X_test)
metrics.f1_score(y_test,y_pred,average='macro')


In [None]:
# Save the model to a file
with open("stack_model4.pkl", "wb") as f:
    pickle.dump(reg, f)

In [1]:
import pickle

# Load the model from the file
with open("xgb_model2.pkl", "rb") as f:
    model = pickle.load(f)

In [2]:
text = 'apple123'

# Load the vectorizer from the file
with open("vectorizer2.pkl", "rb") as f:
    vectorizer = pickle.load(f)
X  = vectorizer.transform([text])
y_pred = model.predict(X)
y_pred

array([1])