In [33]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
data = pd.read_csv('/kaggle/input/custom/data.csv', on_bad_lines='skip')
data.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [5]:
data.dtypes

password    object
strength     int64
dtype: object

In [6]:
data['password'].dropna(inplace=True)
X = data['password']
data['strength'].dropna(inplace=True)
y = data['strength']

In [7]:
data["length"] = data["password"].str.len()
data["length"]

0          8.0
1          8.0
2          9.0
3          8.0
4         11.0
          ... 
669635    10.0
669636     9.0
669637    12.0
669638     9.0
669639     8.0
Name: length, Length: 669640, dtype: float64

In [8]:
data = data.dropna(subset=['length'])
data['length'] = data['length'].astype(int)

In [9]:
data.dtypes

password    object
strength     int64
length       int64
dtype: object

In [10]:
# Frequency of Lowercase Characters
def freq_lowercase(row):
    return len([char for char in row if char.islower()])/len(row)

In [11]:
# Frequency of uppercase Characters
def freq_uppercase(row):
    return len([char for char in row if char.isupper()])/len(row)

In [12]:
# Frequency of Numeric Characters 
def freq_numerical_case(row):
    return len([char for char in row if char.isdigit()])/len(row)

In [13]:
# Frequency of Special-case Characters 
def freq_special_case(row):
    special_chars = [] 
    for char in row:
        if not char.isalpha() and not char.isdigit():
            special_chars.append(char) 
    return len(special_chars) 

In [14]:
data["lowercase_freq"] = np.round(data["password"].apply(freq_lowercase) , 3)

data["uppercase_freq"] = np.round(data["password"].apply(freq_uppercase) , 3)

data["digit_freq"] = np.round(data["password"].apply(freq_numerical_case) , 3)

data["special_char_freq"] = np.round(data["password"].apply(freq_special_case) , 3)

data["special_char_freq"] = data["special_char_freq"]/data["length"] 

In [15]:
data

Unnamed: 0,password,strength,length,lowercase_freq,uppercase_freq,digit_freq,special_char_freq
0,kzde5577,1,8,0.500,0.0,0.500,0.0
1,kino3434,1,8,0.500,0.0,0.500,0.0
2,visi7k1yr,1,9,0.778,0.0,0.222,0.0
3,megzy123,1,8,0.625,0.0,0.375,0.0
4,lamborghin1,1,11,0.909,0.0,0.091,0.0
...,...,...,...,...,...,...,...
669635,10redtux10,1,10,0.600,0.0,0.400,0.0
669636,infrared1,1,9,0.889,0.0,0.111,0.0
669637,184520socram,1,12,0.500,0.0,0.500,0.0
669638,marken22a,1,9,0.778,0.0,0.222,0.0


In [16]:
dataframe = data.sample(frac=1)
dataframe

Unnamed: 0,password,strength,length,lowercase_freq,uppercase_freq,digit_freq,special_char_freq
179271,zlwang1983,1,10,0.600,0.000,0.400,0.0
635165,yhXkqBTAxNQqX2Ll,2,16,0.438,0.500,0.062,0.0
388312,s1mkuring,1,9,0.889,0.000,0.111,0.0
234405,n19fzrfl,1,8,0.750,0.000,0.250,0.0
470337,secco09,0,7,0.714,0.000,0.286,0.0
...,...,...,...,...,...,...,...
42193,Tr1sx45lioqbijq,2,15,0.733,0.067,0.200,0.0
640926,sevenfold777,1,12,0.750,0.000,0.250,0.0
148719,kqjlvfa4,1,8,0.875,0.000,0.125,0.0
243555,DWBeOeDg5MwhAdlw,2,16,0.500,0.438,0.062,0.0


In [17]:
dataframe.isnull().sum()

password             0
strength             0
length               0
lowercase_freq       0
uppercase_freq       0
digit_freq           0
special_char_freq    0
dtype: int64

In [18]:
vectorizer = TfidfVectorizer(analyzer="char")
x = list(dataframe["password"])
X = vectorizer.fit_transform(x)

In [19]:
dataframe["password"].shape

(669639,)

In [20]:
df2 = pd.DataFrame(X.toarray() , columns=vectorizer.get_feature_names_out())

In [22]:
df2

Unnamed: 0,,,,,,,,,,,...,ƒ,—,‚,‡,…,‹,›,™,length,lowercase_freq
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.500
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.500
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.778
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.625
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.0,0.909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.833
669635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.600
669636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.889
669637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.500


In [21]:
df2["length"] = dataframe['length']
df2["lowercase_freq"] = dataframe['lowercase_freq']
y = dataframe["strength"]

In [23]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df2, y, test_size=0.2, random_state=42)

In [24]:
X_train[["length","lowercase_freq"]].isnull().sum()

length            1
lowercase_freq    1
dtype: int64

In [25]:
X_train.dropna(inplace=True)

In [26]:
X_train[["length","lowercase_freq"]].isnull().sum()

length            0
lowercase_freq    0
dtype: int64

In [27]:
print(X_train.shape)
print(y_train.shape)
y_train = y_train.drop(0)
print(y_train.shape)

(535710, 155)
(535711,)
(535710,)


In [28]:
X_test.shape

(133928, 155)

In [29]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [30]:
rf_classifier.fit(X_train, y_train)

In [31]:
y_pred = rf_classifier.predict(X_test)

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.73      0.83     18053
           1       0.93      0.99      0.96     99286
           2       0.97      0.87      0.92     16589

    accuracy                           0.94    133928
   macro avg       0.96      0.87      0.90    133928
weighted avg       0.94      0.94      0.94    133928



In [42]:
def predict():
    password = input("Enter a password : ")
    sample_array = np.array([password])
    
    # 151 dimension
    sample_matrix = vectorizer.transform(sample_array) 
    
    # +2 dimension
    length_pass = len(password)
    length_normalised_lowercase = len([char for char in password if char.islower()])/len(password)
    
    # 151 + 2 
    new_matrix2 = np.append(sample_matrix.toarray() , (length_pass , length_normalised_lowercase)).reshape(1,155)
    
    result = rf_classifier.predict(new_matrix2)
    
    if result == 0 :
        return "Password is weak"
    elif result == 1 :
        return "Password is normal"
    else:
        return "password is strong"

In [56]:
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

In [55]:
predict()

Enter a password :  asdas


'Password is weak'

In [57]:
predict()

Enter a password :  asbdasd@123


'Password is normal'

In [60]:
predict()

Enter a password :  ajshdja2@!2312!&^7


'password is strong'

In [53]:
print("Accuracy for Random Forest Model :",round(accuracy_score(y_test , y_pred),2)*100 ,"%")

Accuracy for Random Forest Model : 94.0 %


In [54]:
from joblib import dump

dump(rf_classifier, 'keykeg_rf.joblib', compress=3)

['keykeg_rf.joblib']

In [None]:
dump(vectorizer, 'keykeg_rf_vectorizer.joblib', compress=3)

In [None]:
import pickle

In [None]:
pickle.dump(rf_classifier, open('keykeg_rf.pkl','wb'))

In [None]:
pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))