In [95]:
# import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier

In [20]:
# display all columns of the dataframe
pd.options.display.max_columns = None

# display all rows of the dataframe
pd.options.display.max_rows = 50 # None

# returns an output value upto 6 decimals
pd.options.display.float_format = '{:.6f}'.format

In [2]:
# load the data into a dataframe variable
data = pd.read_csv(r'C:\Users\vasud\Password_Strength\passwords.csv')

In [3]:
# check the top of the dataframe
data.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [4]:
# code to create a list of all possible characters in the password column
import string
string.ascii_letters
chars = string.printable
print(chars)

In [8]:
# create a copy of the data before further operations
df = data.copy(deep=True)

In [23]:
# check the info of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669876 entries, 0 to 669875
Columns: 102 entries, password to 
dtypes: int64(101), object(1)
memory usage: 521.3+ MB


In [10]:
# check the target column unique values
df['strength'].value_counts()

1    496801
0     89703
2     83372
Name: strength, dtype: int64

In [None]:
# changing the dtype of password column to string
df['password'] = df['password'].astype(str)

In [13]:
# adding columns to df with occurrences of each character in a password
for char in chars:    
    df[char] = df['password'].apply(lambda x: x.count(char))

In [25]:
# check out top of the new dataframe
df.head()

Unnamed: 0,password,strength,0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,!,"""",#,$,%,&,',(,),*,+,",",-,.,/,:,;,<,=,>,?,@,[,\,],^,_,`,{,|,},~,Unnamed: 97,\t,\n,\r,Unnamed: 101,Unnamed: 102
0,kzde5577,1,0,0,0,0,0,2,0,2,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,kino3434,1,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,visi7k1yr,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,megzy123,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,lamborghin1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [65]:
# check some statistics of the new dataframe
stats = df.describe().T
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
strength,669876.000000,0.990549,0.508212,0.000000,1.000000,1.000000,1.000000,2.000000
0,669876.000000,0.373650,0.699819,0.000000,0.000000,0.000000,1.000000,10.000000
1,669876.000000,0.575499,0.717343,0.000000,0.000000,0.000000,1.000000,11.000000
2,669876.000000,0.419242,0.630297,0.000000,0.000000,0.000000,1.000000,8.000000
3,669876.000000,0.320634,0.565532,0.000000,0.000000,0.000000,1.000000,8.000000
...,...,...,...,...,...,...,...,...
\t,669876.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
\n,669876.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
\r,669876.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
,669876.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


We can see some of the characters have zero occurrences, hence there is no point in keeping them

In [70]:
# create a list to drop the character columns which have no occurrences in the passwords
cols_to_drop = stats[stats['max']==0].index
cols_to_drop

Index([''', ',', ':', '\t', '\n', '\r', '
', '
'], dtype='object')

In [71]:
# dropping the columns
df.drop(cols_to_drop, axis=1, inplace=True)

In [73]:
# checking the stats of the df again
df.describe()

Unnamed: 0,strength,0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,!,"""",#,$,%,&,(,),*,+,-,.,/,;,<,=,>,?,@,[,\,],^,_,`,{,|,},~,Unnamed: 93
count,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0,669876.0
mean,0.990549,0.37365,0.575499,0.419242,0.320634,0.253787,0.253772,0.22451,0.224668,0.242252,0.293186,0.609338,0.156006,0.19346,0.194476,0.42734,0.115984,0.188623,0.174862,0.378531,0.141908,0.203753,0.251596,0.239186,0.305242,0.372185,0.165999,0.082333,0.29734,0.288767,0.237783,0.239831,0.121324,0.145222,0.103749,0.195096,0.135041,0.064796,0.020071,0.020632,0.044195,0.036484,0.019408,0.020566,0.019895,0.034384,0.020323,0.02026,0.021659,0.060669,0.052495,0.042218,0.020031,0.04853,0.022479,0.023606,0.044457,0.031503,0.018583,0.018845,0.01855,0.031654,0.018757,0.003259,1.3e-05,0.001806,0.001702,0.000611,0.000994,0.00047,0.000545,0.002802,0.001067,0.0067,0.008897,0.001091,0.000545,0.000176,0.000358,0.00014,0.000749,0.008136,0.000224,6e-05,0.000197,0.000505,0.004371,1.9e-05,5.8e-05,5.5e-05,5.5e-05,0.000139,0.001636
std,0.508212,0.699819,0.717343,0.630297,0.565532,0.520092,0.535484,0.511147,0.504291,0.522189,0.589896,0.784936,0.402028,0.441624,0.44383,0.656067,0.353057,0.440005,0.415834,0.595242,0.381828,0.457601,0.509745,0.483202,0.540843,0.619607,0.417007,0.303112,0.521594,0.539414,0.484653,0.488461,0.351129,0.394373,0.33903,0.446723,0.383669,0.297285,0.152051,0.153354,0.231766,0.212996,0.149314,0.153412,0.150783,0.206464,0.152385,0.152168,0.159223,0.282672,0.253451,0.230514,0.151007,0.254334,0.160609,0.166201,0.233391,0.197208,0.145773,0.147085,0.147123,0.19743,0.14702,0.072837,0.005326,0.050374,0.055964,0.031311,0.037764,0.024826,0.027668,0.075453,0.043408,0.106374,0.122978,0.044327,0.028096,0.0163,0.022855,0.014248,0.037849,0.108979,0.018528,0.011064,0.015835,0.028545,0.078986,0.008196,0.007823,0.008552,0.007823,0.01366,0.053679
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,2.0,10.0,11.0,8.0,8.0,8.0,8.0,7.0,9.0,8.0,10.0,10.0,6.0,6.0,28.0,9.0,34.0,8.0,8.0,7.0,11.0,10.0,8.0,8.0,6.0,7.0,6.0,24.0,7.0,23.0,9.0,9.0,6.0,10.0,10.0,10.0,8.0,6.0,6.0,4.0,4.0,5.0,5.0,6.0,4.0,6.0,8.0,6.0,4.0,4.0,7.0,8.0,6.0,6.0,5.0,6.0,5.0,4.0,4.0,6.0,6.0,5.0,6.0,6.0,3.0,6.0,8.0,12.0,6.0,3.0,4.0,9.0,9.0,6.0,8.0,8.0,4.0,5.0,6.0,3.0,6.0,16.0,6.0,3.0,3.0,6.0,9.0,6.0,2.0,2.0,2.0,3.0,8.0


In [76]:
df.head()

Unnamed: 0,password,strength,0,1,2,3,4,5,6,7,8,9,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,P,Q,R,S,T,U,V,W,X,Y,Z,!,"""",#,$,%,&,(,),*,+,-,.,/,;,<,=,>,?,@,[,\,],^,_,`,{,|,},~,Unnamed: 94
0,kzde5577,1,0,0,0,0,0,2,0,2,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,kino3434,1,0,0,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,visi7k1yr,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,1,0,0,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,megzy123,1,0,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,lamborghin1,1,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,1,1,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [84]:
# we separate the target and features
X = df.drop(['strength','password'], axis=1)
y = df['strength']

In [85]:
#  we separate the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# print dimension of predictors train set
print("The shape of X_train is:",X_train.shape)

# print dimension of predictors test set
print("The shape of X_test is:",X_test.shape)

# print dimension of target train set
print("The shape of y_train is:",y_train.shape)

# print dimension of target test set
print("The shape of y_test is:",y_test.shape)

The shape of X_train is: (468913, 92)
The shape of X_test is: (200963, 92)
The shape of y_train is: (468913,)
The shape of y_test is: (200963,)


### CREATING DIFFERENT ML MODELS FOR PREDICTION OF THE STRENGTH

In [86]:
# creating a decision tree model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_pred = dt.predict(X_test)

In [88]:
print(classification_report(y_test, dt_pred))

              precision    recall  f1-score   support

           0       0.51      0.63      0.56     26958
           1       0.92      0.89      0.90    148896
           2       0.98      0.95      0.96     25109

    accuracy                           0.86    200963
   macro avg       0.80      0.82      0.81    200963
weighted avg       0.87      0.86      0.87    200963



In [91]:
# creating a random forests model
rf = RandomForestClassifier(n_estimators=1000,random_state=42)
rf.fit(X_train, y_train)
rf_pred = dt.predict(X_test)

In [92]:
print(classification_report(y_test, rf_pred))

              precision    recall  f1-score   support

           0       0.51      0.63      0.56     26958
           1       0.92      0.89      0.90    148896
           2       0.98      0.95      0.96     25109

    accuracy                           0.86    200963
   macro avg       0.80      0.82      0.81    200963
weighted avg       0.87      0.86      0.87    200963



In [96]:
# creating a gaussian naive bayes model
gnb = GaussianNB()
GNB = OneVsRestClassifier(gnb)
GNB.fit(X_train, y_train)
pred = GNB.predict(X_test)

In [98]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.25      0.00      0.01     26958
           1       0.84      0.97      0.90    148896
           2       0.84      0.95      0.89     25109

    accuracy                           0.84    200963
   macro avg       0.64      0.64      0.60    200963
weighted avg       0.76      0.84      0.78    200963



In [102]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score 

In [103]:
# consider an empty list to store error rate
error_rate = []

# use for loop to build a knn model for each K
for i in np.arange(1,30,2):
    
    # setup a knn classifier with k neighbors
    # use the 'euclidean' metric 
    knn = KNeighborsClassifier(i, metric = 'manhattan')
   
    # fit the model using 'cross_val_score'
    # pass the knn model as 'estimator'
    # use 5-fold cross validation
    score = cross_val_score(knn, X_train, y_train, cv = 5)
    
    # calculate the mean score
    score = score.mean()
    
    # compute error rate 
    error_rate.append(1 - score)

# plot the error_rate for different values of K 
plt.plot(range(1,30,2), error_rate)

# add plot and axes labels
# set text size using 'fontsize'
plt.title('Error Rate', fontsize = 15)
plt.xlabel('K', fontsize = 15)
plt.ylabel('Error Rate', fontsize = 15)

# set the x-axis labels
plt.xticks(np.arange(1, 30, step = 2))

# plot a vertical line across the minimum error rate
plt.axvline(x = 21, color = 'red')

# display the plot
plt.show()

KeyboardInterrupt: 