In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import plotly.express as px
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
import random
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report


warnings.filterwarnings('ignore')

### Reading dataset

In [8]:
data = pd.read_csv('data.csv',error_bad_lines=False)
print(data.shape)
data.head()

Skipping line 2810: expected 2 fields, saw 5
Skipping line 4641: expected 2 fields, saw 5
Skipping line 7171: expected 2 fields, saw 5
Skipping line 11220: expected 2 fields, saw 5
Skipping line 13809: expected 2 fields, saw 5
Skipping line 14132: expected 2 fields, saw 5
Skipping line 14293: expected 2 fields, saw 5
Skipping line 14865: expected 2 fields, saw 5
Skipping line 17419: expected 2 fields, saw 5
Skipping line 22801: expected 2 fields, saw 5
Skipping line 25001: expected 2 fields, saw 5
Skipping line 26603: expected 2 fields, saw 5
Skipping line 26742: expected 2 fields, saw 5
Skipping line 29702: expected 2 fields, saw 5
Skipping line 32767: expected 2 fields, saw 5
Skipping line 32878: expected 2 fields, saw 5
Skipping line 35643: expected 2 fields, saw 5
Skipping line 36550: expected 2 fields, saw 5
Skipping line 38732: expected 2 fields, saw 5
Skipping line 40567: expected 2 fields, saw 5
Skipping line 40576: expected 2 fields, saw 5
Skipping line 41864: expected 2 field

(669640, 2)


Skipping line 525174: expected 2 fields, saw 5
Skipping line 526251: expected 2 fields, saw 5
Skipping line 529611: expected 2 fields, saw 5
Skipping line 531398: expected 2 fields, saw 5
Skipping line 534146: expected 2 fields, saw 5
Skipping line 544954: expected 2 fields, saw 5
Skipping line 553002: expected 2 fields, saw 5
Skipping line 553883: expected 2 fields, saw 5
Skipping line 553887: expected 2 fields, saw 5
Skipping line 553915: expected 2 fields, saw 5
Skipping line 554172: expected 2 fields, saw 5
Skipping line 563534: expected 2 fields, saw 5
Skipping line 565191: expected 2 fields, saw 5
Skipping line 574108: expected 2 fields, saw 5
Skipping line 574412: expected 2 fields, saw 5
Skipping line 575985: expected 2 fields, saw 5
Skipping line 580091: expected 2 fields, saw 5
Skipping line 582682: expected 2 fields, saw 5
Skipping line 585885: expected 2 fields, saw 5
Skipping line 590171: expected 2 fields, saw 5
Skipping line 591924: expected 2 fields, saw 5
Skipping line

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 669640 entries, 0 to 669639
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   password  669639 non-null  object
 1   strength  669640 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 10.2+ MB


In [10]:
data['strength'].unique()

array([1, 2, 0], dtype=int64)

### Check all the missing values in my dataset

In [11]:
data.isna().sum()

password    1
strength    0
dtype: int64

In [12]:
data[data['password'].isnull()]

Unnamed: 0,password,strength
367579,,0


In [13]:
data.dropna(inplace=True)

In [14]:
data.isnull().sum()

password    0
strength    0
dtype: int64

In [52]:
px.histogram(data_frame=data,x='strength')

In [53]:
password_tuple = np.array(data)

In [54]:
password_tuple

array([['kzde5577', 1],
       ['kino3434', 1],
       ['visi7k1yr', 1],
       ...,
       ['184520socram', 1],
       ['marken22a', 1],
       ['fxx4pw4g', 1]], dtype=object)

### Shuffling randomly for robustness

In [55]:
import random
random.shuffle(password_tuple)

In [56]:
x=[labels[0] for labels in password_tuple]
y=[labels[1] for labels in password_tuple]


In [78]:
x[:5]

['kzde5577', 'kzde5577', 'visi7k1yr', 'megzy123', 'kino3434']

### Create a custom function to split input into characters of list

In [58]:
def word_divide_char(inputs):
    character=[]
    for i in inputs:
        character.append(i)
    return character

In [59]:
word_divide_char('kzde5577')

['k', 'z', 'd', 'e', '5', '5', '7', '7']

#### Use TF-IDF vectorizer to convert String data into numerical data

In [62]:
vectorizer=TfidfVectorizer(tokenizer=word_divide_char)

### Apply TF-IDF vectorizer on data

In [63]:
X=vectorizer.fit_transform(x)

In [64]:
X.shape

(669639, 129)

In [65]:
vectorizer.get_feature_names_out()

array(['\x02', '\x04', '\x05', '\x06', '\x08', '\x0e', '\x10', '\x11',
       '\x16', '\x17', '\x19', '\x1b', '\x1c', '\x1e', ' ', '!', '"', '#',
       '$', '%', '&', '(', ')', '*', '+', '-', '.', '/', '0', '1', '2',
       '3', '4', '5', '6', '7', '8', '9', ';', '<', '=', '>', '?', '@',
       '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g',
       'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
       'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x7f', '\x81',
       '\xa0', '¡', '¨', '«', '°', '±', '²', '³', '´', 'µ', '·', 'º', '¾',
       '¿', '×', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é',
       'ê', 'í', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', '÷', 'ù', 'ú',
       'û', 'ü', 'ý', 'þ', 'ÿ', '—', '‚', '…'], dtype=object)

In [66]:
first_document_vector=X[0]
first_document_vector

<1x129 sparse matrix of type '<class 'numpy.float64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [67]:
first_document_vector.T.todense()

matrix([[0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.56688024],
        [0.        ],
        [0.59170913],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [68]:
df=pd.DataFrame(first_document_vector.T.todense(),index=vectorizer.get_feature_names_out(),columns=['TF-IDF'])
df.sort_values(by=['TF-IDF'],ascending=False)

Unnamed: 0,TF-IDF
7,0.591709
5,0.566880
z,0.335793
k,0.291964
d,0.286049
...,...
;,0.000000
9,0.000000
8,0.000000
6,0.000000


### Split data into train & test
    train---> To learn the relationship within data, 
    test-->  To do predictions, and this testing data will be unseen to my model

In [69]:
from sklearn.model_selection import train_test_split

In [70]:
X_train, X_test, y_train, y_test=train_test_split(X,y,test_size=0.2)

In [71]:
X_train.shape

(535711, 129)

#### Apply Logistic on data as use-case is Classification

In [72]:
clf=LogisticRegression(random_state=0,multi_class='multinomial')

In [73]:
clf.fit(X_train,y_train)

#### doing prediction for specific custom data

In [74]:
dt=np.array(['%@123abcd'])
pred=vectorizer.transform(dt)
clf.predict(pred)

array([1])

#### doing prediction on X-Test data

In [75]:
y_pred=clf.predict(X_test)
y_pred

array([1, 1, 2, ..., 1, 1, 0])

#### Check Accuracy of your model using confusion_matrix,accuracy_score

In [76]:
cm=confusion_matrix(y_test,y_pred)
print(cm)
print(accuracy_score(y_test,y_pred))

[[ 5366 12769    16]
 [ 3768 92952  2649]
 [   38  4988 11382]]
0.8190968281464668


##### Create report of your model

In [77]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.59      0.30      0.39     18151
           1       0.84      0.94      0.88     99369
           2       0.81      0.69      0.75     16408

    accuracy                           0.82    133928
   macro avg       0.74      0.64      0.68    133928
weighted avg       0.80      0.82      0.80    133928

