In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,StratifiedKFold, cross_val_score,KFold
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier,StackingClassifier,BaggingClassifier
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


In [None]:
# Read the dataset from disk; malformed lines are skipped (on_bad_lines="skip")
df = pd.read_csv("data.csv", on_bad_lines="skip")

In [None]:
# Discard every row containing at least one missing value (mutates df in place)
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Share of each strength class (0, 1, 2) among all rows of df.
# The counts are computed once, and classes are looked up with .get() so that
# a class missing from the data is reported as 0.00 instead of raising KeyError.
strength_counts = df['strength'].value_counts()
n_rows = len(df)
ratio_0, ratio_1, ratio_2 = (
    strength_counts.get(label, 0) / n_rows for label in (0, 1, 2)
)
print(f"Result Ratio: 0 - {ratio_0:.2f}, 1 - {ratio_1:.2f}, 2 - {ratio_2:.2f}")

In [None]:
def word_to_char(word):
    """Split ``word`` into a list of its individual characters.

    Used as the tokenizer for the TF-IDF vectorizer so that every character
    becomes its own token.
    """
    return [char for char in word]

In [None]:
# Feature column is 'password'; target column is 'strength'
X, y = df['password'], df['strength']

In [None]:
# Character-level TF-IDF features built from the 'password' column.
#  - lowercase=False: the default (True) lower-cases every password before it
#    reaches the tokenizer, which erases the upper/lower-case mix.
#  - token_pattern=None: the default pattern is unused when a custom tokenizer
#    is supplied; setting it to None silences the resulting UserWarning.
#  - The input is read from df['password'] instead of rebinding X from itself,
#    so this cell can be re-run without failing on an already-vectorised X.
#  - astype(str) guards against non-string values in the column
#    (assumes the column may contain such values -- TODO confirm).
# NOTE(review): the vectorizer is fitted on all rows before the
# train/validation/test split, so IDF weights also see held-out rows;
# consider fitting on the training split only.
vectorizer = TfidfVectorizer(
    tokenizer=word_to_char,
    lowercase=False,
    token_pattern=None,
)
X = vectorizer.fit_transform(df['password'].astype(str))

In [None]:
def train_val_test_spliter(X, y, ratio, random_state=23, stratify=False):
    """Split ``X``/``y`` into train, validation and test subsets.

    A fraction ``ratio`` of the samples is held out first; that held-out part
    is then divided in half to form the validation and test subsets.

    Parameters
    ----------
    X, y : features and labels, in any form accepted by ``train_test_split``.
    ratio : value passed as ``test_size`` for the first split (the combined
        validation + test portion).
    random_state : seed used for both splits. Defaults to 23, the value that
        was previously hard-coded.
    stratify : when True, both splits are stratified on the labels so each
        subset keeps the class proportions. Defaults to False, which
        reproduces the previous (non-stratified) behaviour.

    Returns
    -------
    X_train, X_val, X_test, y_train, y_val, y_test
    """
    X_train, X_rest, y_train, y_rest = train_test_split(
        X,
        y,
        test_size=ratio,
        random_state=random_state,
        stratify=y if stratify else None,
    )
    # Halve the held-out portion: one half for validation, one for testing.
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest,
        y_rest,
        test_size=.5,
        random_state=random_state,
        stratify=y_rest if stratify else None,
    )
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Hold out 20% of the samples for the validation and test subsets
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = train_val_test_spliter(X, y, ratio=0.20)

In [None]:
# Candidate models, all with default hyper-parameters and a fixed seed.
models_un = [
    ("Random forest", RandomForestClassifier(random_state=1)),
    ("Bagging", BaggingClassifier(random_state=1)),
    # The target has three classes (0, 1, 2), so the multi-class log-loss
    # metric is used instead of the binary "logloss".
    ("Xgboost", XGBClassifier(random_state=1, eval_metric="mlogloss")),
    ('LGBMclassifier', lgb.LGBMClassifier(random_state=1)),
    # verbose=0 keeps CatBoost's per-iteration log out of the cell output.
    ('CatBoostClassifier', CatBoostClassifier(random_state=1, verbose=0)),
]

results_un = []  # per-model arrays of fold scores
names_un = []  # model names, in the same order as results_un
score_un = []  # not populated in this cell; kept in case other cells use it

# The fold definition does not depend on the model, so build it once.
# With an integer random_state every split() call yields the same folds,
# so all models are still scored on identical partitions.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Report the mean cross-validated accuracy of each model on the training split.
# NOTE(review): the final test-set metric in this notebook is macro F1 while
# model comparison uses accuracy -- confirm this difference is intentional.
print("\n" "Cross-Validation Performance:" "\n")

for name, model in models_un:
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring='accuracy', cv=kfold
    )
    results_un.append(cv_result)
    names_un.append(name)
    print("{}: {}".format(name, cv_result.mean()))


In [None]:
# XGBoost hyper-parameter tuning with a randomized search.
# The target has three classes (0, 1, 2), so the multi-class log-loss metric
# is used instead of the binary "logloss".
model = XGBClassifier(random_state=1, eval_metric="mlogloss")

# Search space sampled by RandomizedSearchCV.
# 'scale_pos_weight' is intentionally absent: it re-weights the positive class
# of a binary problem and has no role for a three-class target, so including
# it only spent search iterations on a parameter that has no effect.
param_grid = {
    'n_estimators': np.arange(100, 1001, 100),
    'learning_rate': np.linspace(0.01, 0.5, 50),
    'max_depth': np.arange(3, 10),
    'min_child_weight': np.arange(1, 6),
    'subsample': np.linspace(0.1, 1.0, 10),
    'colsample_bytree': np.linspace(0.1, 1.0, 10),
    'gamma': np.arange(0, 6),
    'reg_alpha': np.logspace(-4, 0, 50),
    'reg_lambda': np.logspace(-4, 0, 50),
}

# Metric used to rank the sampled parameter combinations.
# NOTE(review): the final test-set metric in this notebook is macro F1 --
# confirm that tuning on accuracy is intentional.
scorer = 'accuracy'

# 20 random combinations, 3-fold CV each, using all available cores.
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    scoring=scorer,
    cv=3,
    random_state=1,
    n_jobs=-1,
)

# Run the search on the training split only.
randomized_cv.fit(X_train, y_train)

print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)

In [None]:
# Use the estimator that RandomizedSearchCV already refit on the whole training
# split with the best parameters (refit is left at its default of True above).
# Rebuilding XGBClassifier from best_params_ alone dropped the base settings
# used during the search (random_state=1, eval_metric), so the rebuilt model
# was not the configuration that had been cross-validated, and fitting it
# again repeated work the search had already done.
xgb_tuned = randomized_cv.best_estimator_
xgb_tuned

In [None]:
# Predict labels for the held-out test split with the tuned XGBoost model
y_pred = xgb_tuned.predict(X_test)

In [None]:
# Macro-averaged F1 on the test split (unweighted mean of the per-class F1 scores)
metrics.f1_score(y_test, y_pred, average="macro")