# Using Machine Learning to Predict Password Strength

In [1]:
'''
Steps
1- Purpose
2- Data Collection
3- Data Preprocessing 
4- Feature Engineering
5- Data Analysis
6- Model Training & Evaluation
'''

'\nSteps\n1- Purpose\n2- Data Collection\n3- Data Preprocessing \n4- Feature Engineering\n5- Model Selection\n6- Training\n7- Model Evaluation\n'

# Purpose

In [2]:
'''
The purpose of this project is to build a model that assesses the strength of user's passwords and 
provides a feedback on whether they are secure or vulnerable.
'''

"\nThe purpose of this project is to build a model that assesses the strength of user's passwords and \nprovides a feedback on whether they are secure or vulnerable.\n"

# Data Collection

In [3]:
# Dataset source- Kaggle (https://www.kaggle.com/datasets/bhavikbb/password-strength-classifier-dataset)
# I would import the necessary packages/libraries before importing the data from my system downloads

# Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import string
import re


In [4]:
# Importing my data using pandas
data = pd.read_csv(r"C:\Users\makeushola\Downloads\data_re - data_re.csv")
print(data.head())

# OBSERVATION_1: The data contains 7 columns with 5 of these columns containing multiple NAN values.

  data = pd.read_csv(r"C:\Users\makeushola\Downloads\data_re - data_re.csv")


      password strength Unnamed: 2 Unnamed: 3  Unnamed: 4 Unnamed: 5  \
0     kzde5577        1        NaN        NaN         NaN        NaN   
1     kino3434        1        NaN        NaN         NaN        NaN   
2    visi7k1yr        1        NaN        NaN         NaN        NaN   
3     megzy123        1        NaN        NaN         NaN        NaN   
4  lamborghin1        1        NaN        NaN         NaN        NaN   

   Unnamed: 6  
0         NaN  
1         NaN  
2         NaN  
3         NaN  
4         NaN  


# Data Cleaning

In [5]:
# Creating a dataframe that contains only the two columns currently needed 
# print(data.columns)
df = data[["password","strength"]]
print(df.head())

      password strength
0     kzde5577        1
1     kino3434        1
2    visi7k1yr        1
3     megzy123        1
4  lamborghin1        1


In [6]:
# Checking the dtype and other information about the columns

df.info()

# OBSERVATION_2: both columns are not of the same length but they are both of datatype object, password contains one null.

# Deleting null values
df = df.dropna()
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669879 entries, 0 to 669878
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   password  669878 non-null  object
 1   strength  669879 non-null  object
dtypes: object(2)
memory usage: 10.2+ MB
(669878, 2)


In [7]:
"""
According to the data details, 
1) Strength - three values(0 , 1 , 2) i.e. 0 for weak, 1 for medium, 2 for strong.
2) Strength of the password based on rules(such as containing digits, special symbols , etc.)
Checking the ranges of values in strength column.
"""
len(df['strength'].unique())

# OBSERVATION_3: The strength column contains 239 unique values instead of 3

# Filtering out rows with strength not (0,1 and 2)

filtered_df = df['strength'].isin(['0','1', '2'])

df = df[filtered_df]

len(df['strength'].unique())
print(df.shape)

# OBSERVATION_4: The shape of the dataset has reduced from 669879 to 669639

(669639, 2)


In [8]:
# Checking for duplicates

duplicates = df.duplicated()
print(duplicates)

total_duplicates = duplicates.sum()
print(total_duplicates)

# OBSERVATION_5: There are 38 duplicates
    
# Removing duplicates

df.drop_duplicates(inplace=True)

# OBSERVATION_6: Total dataset ready for feature engineering is 669601

0         False
1         False
2         False
3         False
4         False
          ...  
669874    False
669875    False
669876    False
669877    False
669878    False
Length: 669639, dtype: bool
38


# Feature Engineering 1

In [9]:
"""
The password_features() function takes a single argument password, 
which is a string representing a password, and returns a dictionary containing various features extracted from the password. 
This is to extract useful features from the password string to aid password strength analysis.
"""


def password_features(password):
    features = {
        'length': len(password),
        'num_digits': sum(c.isdigit() for c in password),
        'num_uppercase': sum(c.isupper() for c in password),
        'num_lowercase': sum(c.islower() for c in password),
        'num_special': sum(c in string.punctuation for c in password)
    }
    
    return features

In [10]:
# Apply the password_features function to each row in the DataFrame
features_df = df['password'].apply(lambda x: pd.Series(password_features(x)))

# Concatenate the original DataFrame with the new features DataFrame
df = pd.concat([df, features_df], axis=1)

# Display the updated DataFrame
print(df)

            password strength  length  num_digits  num_uppercase  \
0           kzde5577        1       8           4              0   
1           kino3434        1       8           4              0   
2          visi7k1yr        1       9           2              0   
3           megzy123        1       8           3              0   
4        lamborghin1        1      11           1              0   
...              ...      ...     ...         ...            ...   
669874    10redtux10        1      10           4              0   
669875     infrared1        1       9           1              0   
669876  184520socram        1      12           6              0   
669877     marken22a        1       9           2              0   
669878      fxx4pw4g        1       8           2              0   

        num_lowercase  num_special  
0                   4            0  
1                   4            0  
2                   7            0  
3                   5            0 

# Data Analysis

In [16]:
"""
The next step of data exploration allows us to know more about the features,
and how it contributes to the strength of a password. 
"""
# Feature 1

df[["length" , "strength"]].groupby("strength").agg(["min", "max" , "mean" , "median"])

Unnamed: 0_level_0,length,length,length,length
Unnamed: 0_level_1,min,max,mean,median
strength,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1,7,6.549483,7.0
1,6,13,9.618844,9.0
2,11,220,15.93213,16.0


In [17]:
# Feature 2
df[["num_digits" , "strength"]].groupby("strength").agg(["min", "max" , "mean" , "median"])

Unnamed: 0_level_0,num_digits,num_digits,num_digits,num_digits
Unnamed: 0_level_1,min,max,mean,median
strength,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,7,1.774886,2.0
1,0,12,3.449593,3.0
2,0,39,3.100577,3.0


In [18]:
# Feature 3
df[["num_uppercase" , "strength"]].groupby("strength").agg(["min", "max" , "mean" , "median"])

Unnamed: 0_level_0,num_uppercase,num_uppercase,num_uppercase,num_uppercase
Unnamed: 0_level_1,min,max,mean,median
strength,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,7,0.07865,0.0
1,0,12,0.081772,0.0
2,0,80,5.824167,6.0


In [19]:
# Feature 4
df[["num_lowercase" , "strength"]].groupby("strength").agg(["min", "max" , "mean" , "median"])

Unnamed: 0_level_0,num_lowercase,num_lowercase,num_lowercase,num_lowercase
Unnamed: 0_level_1,min,max,mean,median
strength,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,7,4.67368,5.0
1,0,12,6.068845,6.0
2,0,164,6.771791,6.0


In [20]:
# Feature 5
df[["num_special" , "strength"]].groupby("strength").agg(["min", "max" , "mean" , "median"])

Unnamed: 0_level_0,num_special,num_special,num_special,num_special
Unnamed: 0_level_1,min,max,mean,median
strength,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,0,7,0.019612,0.0
1,0,10,0.017275,0.0
2,0,21,0.231409,0.0


# Feature Engineering 2

In [23]:
"""
Calculating password entropy is a useful way to measure its complexity and unpredictability.
Shannon entropy is a common method for estimating password entropy. 
It takes into account the length of the password and the character set used. 
so i will be adding this so that it can improve my model's performance
"""
def password_entropy(password, eps=1e-12):
    # Define character sets
    lowercase = 'abcdefghijklmnopqrstuvwxyz'
    uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '0123456789'
    special_characters = '!@#$%^&*()-_=+[]{}|;:,.<>/?`~'

    # Calculate the length of each character set in the password
    n_lowercase = sum(1 for char in password if char in lowercase)
    n_uppercase = sum(1 for char in password if char in uppercase)
    n_digits = sum(1 for char in password if char in digits)
    n_special_characters = sum(1 for char in password if char in special_characters)

    # Calculate the total length of the password
    password_length = len(password)

    # Calculate the probability of each character set
    p_lowercase = (n_lowercase + eps) / (password_length + 4 * eps)
    p_uppercase = (n_uppercase + eps) / (password_length + 4 * eps)
    p_digits = (n_digits + eps) / (password_length + 4 * eps)
    p_special_characters = (n_special_characters + eps) / (password_length + 4 * eps)

    # Calculate the entropy using the Shannon entropy formula
    entropy = -(p_lowercase * np.log2(p_lowercase) +
                 p_uppercase * np.log2(p_uppercase) +
                 p_digits * np.log2(p_digits) +
                 p_special_characters * np.log2(p_special_characters))

    return entropy



In [38]:
# Applying the passsword_entrophy function on my df

df['password_entropy'] = df['password'].apply(password_entropy)

# print(df.head())
# print(df.describe())

# OBSERVATION_12: password_entrophy ranges from -1.890578e-11 to 2.000000e+00 

              length     num_digits  num_uppercase  num_lowercase  \
count  669601.000000  669601.000000  669601.000000  669601.000000   
mean        9.991511       3.181949       0.794264       5.969243   
std         2.819827       2.001442       2.285231       2.379281   
min         1.000000       0.000000       0.000000       0.000000   
25%         8.000000       2.000000       0.000000       5.000000   
50%         9.000000       3.000000       0.000000       6.000000   
75%        11.000000       4.000000       0.000000       7.000000   
max       220.000000      39.000000      80.000000     164.000000   

         num_special  password_entropy  
count  669601.000000      6.696010e+05  
mean        0.044173      8.728052e-01  
std         0.331841      2.527010e-01  
min         0.000000      1.890578e-11  
25%         0.000000      7.219281e-01  
50%         0.000000      8.812909e-01  
75%         0.000000      9.709506e-01  
max        21.000000      2.000000e+00  


# Model Training & Evaluation

In [39]:
# Prepare data for training

X = df[['length', 'num_digits', 'num_uppercase', 'num_lowercase', 'num_special','password_entropy']]
y = df['strength']


# Standardizing the data
scaler = StandardScaler()
X_standardized = scaler.fit_transform(X)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_standardized, y, test_size=0.2, random_state=42)

# Train a Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)




In [40]:
# Predict on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     17799
           1       1.00      1.00      1.00     99369
           2       1.00      1.00      1.00     16753

    accuracy                           1.00    133921
   macro avg       1.00      1.00      1.00    133921
weighted avg       1.00      1.00      1.00    133921



In [42]:
from sklearn.metrics import confusion_matrix ,  accuracy_score , classification_report
accuracy_score(y_test , y_pred)

0.9999850658223878

# Validating the accuracy of my model using new data

In [61]:
pip install pwgen
import random
import string
import secrets

def generate_password(length=12, use_special_chars=False):
    chars = string.ascii_letters + string.digits
    if use_special_chars:
        chars += string.punctuation

    password = ''.join(secrets.choice(chars) for _ in range(length))
    return password

# Generate 100 passwords with varying lengths and features
passwordss = []
for _ in range(100):
    length = random.randint(8, 16)  # Randomly choose a length between 8 and 16 characters
    use_special_chars = random.choice([True, False])  # Randomly decide whether to include special characters
    password = generate_password(length, use_special_chars)
    passwordss.append(password)

# Print the generated passwords
print(passwordss)


           Password  Strength
0          LgGE2Jgb         2
1      -~SeVPco?IFc         2
2       p8't-%Z|g4m         2
3  v.gk8aC-LT^r`Gnl         2
4         bZK3OMJxh         2


In [None]:
import pandas as pd
import zxcvbn
from google.colab import files

# Define the strength categories
def categorize_strength(score):
    if score == 0:
        return 0  # Weak
    elif score == 1:
        return 1  # Medium
    else:
        return 2  # Strong


passwords =passwordss
# List to store results
results = []

for password in passwords:
    result = zxcvbn.zxcvbn(password)
    strength_score = result['score']
    categorized_strength = categorize_strength(strength_score)  
    results.append({'Password': password, 'Strength': categorized_strength})

# Create DataFrame
df = pd.DataFrame(results)

# Save DataFrame to CSV
file_path = 'password_strengths.csv'
df.to_csv(file_path, index=False)

# Provide a download link
files.download(file_path)

In [71]:
import pandas as pd
import numpy as np
import string
import math
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

def calculate_features(df, password_col):
    
    def count_uppercase(password):
        return sum(1 for char in password if char.isupper())

    def count_lowercase(password):
        return sum(1 for char in password if char.islower())

    def count_digits(password):
        return sum(1 for char in password if char.isdigit())

    def count_special_chars(password):
        return sum(1 for char in password if char in string.punctuation)

    def calculate_entropy(password):
        # Count character frequencies
        freq = {}
        for char in password:
            if char in freq:
                freq[char] += 1
            else:
                freq[char] = 1
        # Calculate entropy
        length = len(password)
        entropy = 0
        for count in freq.values():
            p = count / length
            entropy -= p * math.log2(p)
        return entropy

    # Apply functions to each password
    df['length'] = df[password_col].apply(len)
    df['num_uppercase'] = df[password_col].apply(count_uppercase)
    df['num_lowercase'] = df[password_col].apply(count_lowercase)
    df['num_digits'] = df[password_col].apply(count_digits)
    df['num_special'] = df[password_col].apply(count_special_chars)
    df['password_entropy'] = df[password_col].apply(calculate_entropy)
    
    return df

def standardize_features(df):
    X = df[['length', 'num_digits', 'num_uppercase', 'num_lowercase', 'num_special', 'password_entropy']]
    scaler = StandardScaler()
    X_standardized = scaler.fit_transform(X)
    return X_standardized

def predict_strength(model, X_standardized):
    return model.predict(X_standardized)



In [72]:

dataad=pd.read_csv(r"C:\Users\makeushola\Downloads\password_strengths (1).csv")
df = pd.DataFrame(dataad)

# Calculate features
df = calculate_features(df, 'Password')

# Standardize features
X_standardized = standardize_features(df)

#Fit the model
model = clf
X_train = X_standardized
y_train = df['Strength']
model.fit(X_train, y_train)

# Predict strength
df['predicted_strength'] = predict_strength(model, X_standardized)

print(df)


            Password  Strength  length  num_uppercase  num_lowercase  \
0           LgGE2Jgb         2       8              4              3   
1       -~SeVPco?IFc         2      12              5              4   
2        p8't-%Z|g4m         2      11              1              4   
3   v.gk8aC-LT^r`Gnl         2      16              4              7   
4          bZK3OMJxh         2       9              5              3   
..               ...       ...     ...            ...            ...   
95       %ocf7eXc$ev         2      11              1              7   
96          sBZ9XIgr         2       8              4              3   
97     :&E3_`"27ZD!0         2      13              3              0   
98   hiGi0G97fLNSgCn         2      15              6              6   
99         6JDU1@]VC         2       9              5              0   

    num_digits  num_special  password_entropy  predicted_strength  
0            1            0          2.750000                   2  

In [75]:
from sklearn.metrics import classification_report
y_true = df['Strength']
y_pred = df['predicted_strength']
report = classification_report(y_true, y_pred)
print(report)

              precision    recall  f1-score   support

           2       1.00      1.00      1.00       100

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100

