In [2]:
import pandas as pd
import subprocess

In [3]:
# import data
df = pd.read_csv('./Data/passwords_per_country.csv')
df

Unnamed: 0,country_code,country,Rank,Password,User_count,Time_to_crack,Global_rank,Time_to_crack_in_seconds
0,au,Australia,1,123456,308483,< 1 second,1.0,0
1,au,Australia,2,password,191880,< 1 second,5.0,0
2,au,Australia,3,lizottes,98220,3 Hours,,10800
3,au,Australia,4,password1,86884,< 1 second,16.0,0
4,au,Australia,5,123456789,75856,< 1 second,2.0,0
...,...,...,...,...,...,...,...,...
9795,vn,Vietnam,196,hongngoc,2660,3 Hours,,10800
9796,vn,Vietnam,197,anhtien,2628,17 Minutes,,1020
9797,vn,Vietnam,198,lanhuong,2620,3 Hours,,10800
9798,vn,Vietnam,199,congacon,2584,2 Hours,,7200


In [4]:
# keep only the data we care about
df = df[['country_code', 'country', 'Rank', 'Password', 'User_count']]
df

Unnamed: 0,country_code,country,Rank,Password,User_count
0,au,Australia,1,123456,308483
1,au,Australia,2,password,191880
2,au,Australia,3,lizottes,98220
3,au,Australia,4,password1,86884
4,au,Australia,5,123456789,75856
...,...,...,...,...,...
9795,vn,Vietnam,196,hongngoc,2660
9796,vn,Vietnam,197,anhtien,2628
9797,vn,Vietnam,198,lanhuong,2620
9798,vn,Vietnam,199,congacon,2584


In [8]:
# 0 cannot be a password (min 6 character) let's replace it with 000000
df['Password'] = df['Password'].apply(lambda x : '000000' if x == '0' else x)
df['Length'] = df['Password'].apply(lambda x : len(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Password'] = df['Password'].apply(lambda x : '000000' if x == '0' else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Length'] = df['Password'].apply(lambda x : len(x))


In [9]:
df

Unnamed: 0,country_code,country,Rank,Password,User_count,Length
0,au,Australia,1,123456,308483,6
1,au,Australia,2,password,191880,8
2,au,Australia,3,lizottes,98220,8
3,au,Australia,4,password1,86884,9
4,au,Australia,5,123456789,75856,9
...,...,...,...,...,...,...
9795,vn,Vietnam,196,hongngoc,2660,8
9796,vn,Vietnam,197,anhtien,2628,7
9797,vn,Vietnam,198,lanhuong,2620,8
9798,vn,Vietnam,199,congacon,2584,8


In [8]:
def passw_type(passw):
    '''
    Given a password it determine if contains only numbers, only text, numbers and text or special characters as well
    '''
    
    if passw.isdigit():
        return 'Numeric'
    elif passw.isalpha():
        return 'Text String'
    elif passw.isalnum():
        return 'Alphanumeric'
    
    return 'Special'

In [9]:
df['Password_type'] = df['Password'].apply(lambda x: passw_type(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Password_type'] = df['Password'].apply(lambda x: passw_type(x))


In [10]:
df

Unnamed: 0,country_code,country,Rank,Password,User_count,Length,Password_type
0,au,Australia,1,123456,308483,6,Numeric
1,au,Australia,2,password,191880,8,Text String
2,au,Australia,3,lizottes,98220,8,Text String
3,au,Australia,4,password1,86884,9,Alphanumeric
4,au,Australia,5,123456789,75856,9,Numeric
...,...,...,...,...,...,...,...
9795,vn,Vietnam,196,hongngoc,2660,8,Text String
9796,vn,Vietnam,197,anhtien,2628,7,Text String
9797,vn,Vietnam,198,lanhuong,2620,8,Text String
9798,vn,Vietnam,199,congacon,2584,8,Text String


In [11]:
def keepassxc_entropy(passw):
    '''
    Given a password it compute the entropy with keepassxc cli
    (only tested on linux systems)
    '''
    cl = "keepassxc-cli estimate '" + passw + "'"
    ent = subprocess.check_output([cl], shell=True)
    
    return round(float(ent.decode('utf-8').split('\t')[1][8:]),1)

In [13]:
df['Entropy'] = df['Password'].apply(lambda x: keepassxc_entropy(x))

In [14]:
df

Unnamed: 0,country_code,country,Rank,Password,User_count,Length,Password_type,Entropy
0,au,Australia,1,123456,308483,6,Numeric,0.0
1,au,Australia,2,password,191880,8,Text String,1.0
2,au,Australia,3,lizottes,98220,8,Text String,14.5
3,au,Australia,4,password1,86884,9,Alphanumeric,4.0
4,au,Australia,5,123456789,75856,9,Numeric,2.3
...,...,...,...,...,...,...,...,...
9795,vn,Vietnam,196,hongngoc,2660,8,Text String,21.9
9796,vn,Vietnam,197,anhtien,2628,7,Text String,23.2
9797,vn,Vietnam,198,lanhuong,2620,8,Text String,22.2
9798,vn,Vietnam,199,congacon,2584,8,Text String,24.6


In [15]:
df.to_csv('./Data/password_cleaned.csv', index=False)