In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,precision_score,recall_score
from sklearn.feature_extraction.text import CountVectorizer
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

In [4]:
# Load the data
data = pd.read_csv("pass.csv", on_bad_lines='skip')

# Display the first few rows of the dataframe to understand its structure
# print(data.head())
# df = pd.DataFrame(data)

columns_to_remove = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6']

# Remove the specified columns
data = data.drop(columns=columns_to_remove, errors='ignore')

In [6]:
data["strength"].unique()

array([1, 2, 0], dtype=int64)

0 means->the password strenght is weak

1 means->the password strenght is medium

2 means->the password strenght is strong

In [7]:
data = data.dropna().sample(frac=1).reset_index(drop=True) # Remove null values and shuffle the data

In [8]:
data.isnull().sum()

password    0
strength    0
dtype: int64

In [9]:
data[data["password"].isnull()]

Unnamed: 0,password,strength


In [10]:
#check duplicate
data.duplicated().sum()

32

In [11]:
# count of strengths or class value to be predicted
data['strength'].value_counts()

strength
1    496784
0     89687
2     83126
Name: count, dtype: int64

In [12]:
password_tuple=np.array(data)

In [13]:
password_tuple

array([['b4k3xz', 0],
       ['balle1990', 1],
       ['Suburutoyota123', 2],
       ...,
       ['fnzjotcxv219', 1],
       ['mylife7', 0],
       ['vovannhi1989', 1]], dtype=object)

In [14]:
import random
random.shuffle(password_tuple)

In [15]:
x=[labels[0] for labels in password_tuple]
y=[labels[1] for labels in password_tuple]

In [16]:
def word_divide_char(inputs):
  character=[]
  for i in inputs:
    character.append(i)
  return character
# Convert a given word (string) to a list of individual characters
# def word_to_char(word):
#     return list(word)

In [17]:
vectorizer = TfidfVectorizer(tokenizer=word_divide_char)
X = vectorizer.fit_transform(x)

In [18]:
vectorizer.get_feature_names_out()

array(['\x02', '\x05', '\x06', '\x0f', '\x10', '\x11', '\x12', '\x17',
       '\x19', '\x1c', ' ', '!', '"', '#', '$', '%', '&', '(', ')', '*',
       '+', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       '9', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`',
       'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
       'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
       '{', '|', '}', '~', '\x7f'], dtype=object)

In [19]:
first_document_vector=X[0] 
first_document_vector 

<1x77 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [20]:
first_document_vector.T.todense()

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.33713378],
        [0.37136951],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [21]:
df=pd.DataFrame(first_document_vector.T.todense(),index=vectorizer.get_feature_names_out(),columns=['TF-IDF'])
df.sort_values(by=['TF-IDF'],ascending=False)

Unnamed: 0,TF-IDF
x,0.476118
z,0.444062
b,0.418745
k,0.386262
4,0.371370
...,...
0,0.000000
/,0.000000
.,0.000000
-,0.000000


In [22]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [23]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
y_pred = dt_clf.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score

print('Accuracy (Decision Tree):', accuracy_score(y_test, y_pred))

Accuracy (Decision Tree): 0.9745967741935484


In [25]:
import pickle 

In [26]:
pickle.dump(vectorizer, open("tfidf_password_strength.pickle", "wb"))
pickle.dump(dt_clf, open("final_model.pickle", "wb"))

In [37]:
# with open("tfidf_password_strength.pickle", 'rb') as file:
#     saved_vectorizer = pickle.load(file) # Load the vectorizer from the pickle file

# with open("final_model.pickle", 'rb') as file:
#     final_model = pickle.load(file) 

In [27]:
def test_password_strength(password, vectorizer, model):
    X_password = np.array([password]) # Convert the password to a numpy array
    X_predict = vectorizer.transform(X_password) # Transform the password using the loaded vectorizer
    y_pred = model.predict(X_predict) # Predict the password strength using the loaded model
    return y_pred

In [39]:
# Print the first password and its predicted strength
password1 = 'abc123'
strength1 = test_password_strength(password1, saved_vectorizer, final_model)
print(f'Password: {password1}, Strength: {strength1}')


Password: abc123, Strength: [1]
