In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
import re

In [157]:
# DATA QUALITY ANALYSIS WITH TWO TYPES OF ERRORS: NA AND ERROR

# Raw export of users, read in a single pass (low_memory disabled)
df = pd.read_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/original_data/users_activated_v8.csv', low_memory = False)

# Remove blocked users: only rows whose NameRu status is one of the four below are kept
kept_statuses = ['Активирован', 'Зарегистрирован', 'На проверке', 'Приостановлен']
df = df.loc[df.NameRu.isin(kept_statuses)]
# Format datetime data: values that cannot be parsed become NaT (errors='coerce').
# birthday and license_set_date are parsed into new *_dt columns,
# psp_issue_date and LicenseStartDate are parsed in place.
for src_col, dst_col in [('birthday', 'birthday_dt'),
                         ('license_set_date', 'license_set_dt'),
                         ('psp_issue_date', 'psp_issue_date'),
                         ('LicenseStartDate', 'LicenseStartDate')]:
    df[dst_col] = pd.to_datetime(df[src_col], errors = 'coerce')
# Format docs image availability data: a missing flag is replaced with False
for doc_col in ('passport', 'drivers', 'selfie'):
    df[doc_col] = df[doc_col].fillna(False)
# Marking null values: every checked field gets a boolean '<field>_null' column.
# For the three name fields the placeholder strings below also count as missing.
name_placeholders = ['Нет', 'нет', 'Отсутствует', 'отсутствует', 'В', 'Процессе', 'Регистрации']
name_fields = ('first_name', 'patronymic_name', 'last_name')
checked_fields = ['license_set_dt', 'License', 'PassportNumber', 'PassportDepartmentCode', 'PassportDepartment',
                  'PassportRegistration', 'birth_place', 'birthday_dt', 'sex',
                  'first_name', 'patronymic_name', 'last_name']
for field in checked_fields:
    is_missing = pd.isnull(df[field])
    if field in name_fields:
        is_missing = is_missing | df[field].isin(name_placeholders)
    df[field + '_null'] = is_missing
# Marking incorrect datetime data
# "today" is evaluated once so that every comparison below uses the same reference instant.
today_ts = pd.to_datetime('today')
# The license issue date is flagged when, by calendar-year difference, it is less than 1
# or more than 64 years old, or lies less than 16 years after the birth year.
# Comparisons against missing dates evaluate to False, so missing dates are not flagged here.
lcns_age_years = today_ts.year - df.license_set_dt.dt.year
df['lcns_incorrect_dt'] = (lcns_age_years < 1.0)|\
                          (lcns_age_years > 64.0)|\
                          (df.license_set_dt.dt.year-df.birthday_dt.dt.year < 16.0)
# Age in completed years. The former .astype('timedelta64[Y]') cast is rejected by
# pandas >= 2.0 (only s/ms/us/ns resolutions are supported), so the timedelta is divided
# by the mean Gregorian year (365.2425 days, the length numpy uses for the 'Y' unit)
# and floored, which reproduces the truncated year count the cast used to return.
age_full_years = np.floor((today_ts - df['birthday_dt']) / pd.Timedelta(days = 365.2425))
# Flag birthdays that put the user under 19 or over 80 completed years
df['bd_incorrect_dt'] = (age_full_years < 19.0)|(age_full_years > 80.0)
# Marking incorrect pattern/length
for col in ['PassportNumber']:
    # Both checks run on the value with spaces removed (literal replacement, computed once)
    compact = df[col].str.replace(' ', '', regex = False)
    # A missing value has NaN length, so it is flagged by the length check as well
    df[col+'_incorrect_len'] = compact.str.len()!=10.0
    # Raw string: '\w' inside a plain literal is an invalid escape sequence and triggers a
    # DeprecationWarning/SyntaxWarning on recent Python versions.
    # The pattern is unanchored: a value is flagged only when it contains no run of
    # 10 word characters. Missing values give NaN here, and NaN == False is False.
    df[col+'_incorrect_ptrn'] = compact.str.contains(pat = r'\w{10}') == False

# Removing "№" symbol from the license number field
# df.License = df.License.str.replace('№','')

# Marking incorrect pattern of license number
# Two accepted formats: 2 digits + 2 Cyrillic letters + 6 digits, or 10 digits
lcns_ptrn_old =  r'^[0-9][0-9][а-яА-Я][а-яА-Я][0-9]{6}$'
lcns_ptrn_crnt =  r'^[0-9]{10}$'
# Licenses whose start date is before 2011-03-01 are labelled 'd&c'; every other row,
# including rows with a missing start date (the comparison is False for NaT), is labelled 'd'.
df['License_format_standard'] = np.where(df.LicenseStartDate<'2011-03-01','d&c','d')
# Compare with spaces removed; a missing license gives NaN, and NaN == False is False
license_compact = df['License'].str.replace(' ','')
fails_old_ptrn = license_compact.str.contains(pat = lcns_ptrn_old, regex = True) == False
fails_crnt_ptrn = license_compact.str.contains(pat = lcns_ptrn_crnt, regex = True) == False
# 'd&c' rows are validated against the letter format, all other rows against the digit format
df['License_incorrect_ptrn'] = np.where(df.License_format_standard == 'd&c', fails_old_ptrn, fails_crnt_ptrn)
# Marking errors in the passport department code
dep_code = df['PassportDepartmentCode']
# Expected length is 7 characters; a missing value has NaN length and is flagged too
df['psp_dep_code_incorrect_len'] = dep_code.str.len()!=7.0
# Raw string: '\d' inside a plain literal is an invalid escape sequence and triggers a
# DeprecationWarning/SyntaxWarning on recent Python versions.
# The pattern (three digits, hyphen, three digits) is unanchored; missing values give NaN
# here, and NaN == False is False.
df['psp_dep_code_incorrect_ptrn'] = dep_code.str.contains(pat = r'\d\d\d-\d\d\d') == False
# Pattern to find errors in FIO: one or more Cyrillic words, joined either by hyphens
# or by single whitespace characters
fio_ptrn = r'^[а-яА-ЯёЁ]+(?:-[а-яА-ЯёЁ]+)+$|^[а-яА-ЯёЁ]+(?:\s[а-яА-ЯёЁ]+)+$|^[а-яА-ЯёЁ]+$'
# Substrings whose presence anywhere in a name field sets its '<field>_nick' flag.
# They are escaped and joined into one alternation so each column is scanned once
# instead of eleven times, and so the parentheses need no hand-written '\(' / '\)'
# literals (invalid escape sequences in a plain string).
nick_markers = ['Мойка', 'Каршайн', 'Карсервис', 'Омывайка', 'Нафта', 'Пройл', 'Пролив', 'Механик', 'Спб', '(', ')']
nick_ptrn = '|'.join(re.escape(marker) for marker in nick_markers)
# Marking errors with name
for i in ['first_name', 'patronymic_name', 'last_name']:
    # na=False: a missing name is reported by '<field>_null', not as a nickname, and the
    # column is a plain boolean instead of the result of OR-ing NaN-containing series.
    df[i+'_nick'] = df[i].str.contains(pat = nick_ptrn, regex = True, na = False)
    # True when the value does not match the FIO pattern; missing values give NaN, and NaN == False is False
    df[i+'_format'] = df[i].str.contains(pat = fio_ptrn, regex = True) == False

# Flag groups. Each aggregate column created at the end of this section is True for a row
# when the row-wise sum of the listed flag columns is greater than zero.

# Marking errors in name
name_cols = [
    'first_name_null', 'patronymic_name_null', 'last_name_null',
    'first_name_nick', 'first_name_format',
    'patronymic_name_nick', 'patronymic_name_format',
    'last_name_nick', 'last_name_format',
]

# Marking errors in passport: the name flags plus the passport-specific ones
psp_cols = name_cols + [
    'PassportNumber_null', 'PassportDepartmentCode_null', 'PassportDepartment_null',
    'PassportRegistration_null', 'birth_place_null', 'sex_null', 'birthday_dt_null',
    'PassportNumber_incorrect_len', 'PassportNumber_incorrect_ptrn',
    'psp_dep_code_incorrect_len', 'psp_dep_code_incorrect_ptrn', 'bd_incorrect_dt',
]

# Marking errors in driving license
lcns_cols = ['License_null', 'license_set_dt_null', 'License_incorrect_ptrn', 'lcns_incorrect_dt']

# Marking errors in driving license number
lcns_nmbr_cols = ['License_null', 'License_incorrect_ptrn']

# Marking errors in driving license set date
lcns_set_date_cols = ['license_set_dt_null', 'lcns_incorrect_dt']

# Marking errors in fields used for KBM request
KBM_cols = [
    'first_name_null', 'patronymic_name_null', 'last_name_null', 'birthday_dt_null', 'License_null',
    'first_name_nick', 'first_name_format', 'patronymic_name_nick', 'patronymic_name_format',
    'last_name_nick', 'last_name_format', 'bd_incorrect_dt', 'License_incorrect_ptrn',
]

# Marking errors in birthday date
bd_cols = ['birthday_dt_null', 'bd_incorrect_dt']

# Marking errors in fields used for scoring
scoring_cols = [
    'sex_null', 'birthday_dt_null', 'license_set_dt_null', 'birth_place_null',
    'bd_incorrect_dt', 'lcns_incorrect_dt',
]

# Marking rows with any errors: every individual flag column
cols = [
    'first_name_null', 'patronymic_name_null', 'last_name_null', 'sex_null', 'birthday_dt_null',
    'License_null', 'license_set_dt_null', 'PassportNumber_null', 'PassportDepartmentCode_null',
    'PassportDepartment_null', 'PassportRegistration_null', 'birth_place_null',
    'first_name_nick', 'first_name_format', 'patronymic_name_nick', 'patronymic_name_format',
    'last_name_nick', 'last_name_format', 'bd_incorrect_dt',
    'License_incorrect_ptrn', 'lcns_incorrect_dt',
    'PassportNumber_incorrect_len', 'PassportNumber_incorrect_ptrn',
    'psp_dep_code_incorrect_len', 'psp_dep_code_incorrect_ptrn',
]
# Names of the aggregate columns, in the order they are created below
aggs = [
    'FIO_error', 'passport_error', 'driving_license_error',
    'driving_license_number_error', 'driving_license_set_date_error',
    'KBM_cols_error', 'bd_error', 'scoring_cols_error',
    'any_error',
]

# Create the aggregate columns, one per flag group
for agg_name, flag_group in [('FIO_error', name_cols),
                             ('passport_error', psp_cols),
                             ('driving_license_error', lcns_cols),
                             ('driving_license_number_error', lcns_nmbr_cols),
                             ('driving_license_set_date_error', lcns_set_date_cols),
                             ('KBM_cols_error', KBM_cols),
                             ('bd_error', bd_cols),
                             ('scoring_cols_error', scoring_cols),
                             ('any_error', cols)]:
    df[agg_name] = df[flag_group].sum(axis = 1)>0

# Marking users with expired docs
one_day = pd.to_timedelta(1, unit='D')
# Ages are expressed in years of 365.25 days
df['age_when_psp_issued'] = ((df['psp_issue_date']-df['birthday_dt'])/one_day)/365.25
df['age'] = ((pd.to_datetime('today')-df['birthday_dt'])/one_day)/365.25
# A passport counts as expired when the holder has passed the 20.082 or 45.082 age
# threshold but the passport was issued before age 20 or 45 respectively.
# NOTE(review): the .082 part is presumably a ~30-day grace period (30/365.25) - confirm.
issued_before_20 = (df.age > 20.082)&(df.age < 45.082)&(df.age_when_psp_issued < 20.0)
issued_before_45 = (df.age > 45.082)&(df.age_when_psp_issued < 45.0)
df['passport_expired'] = np.where(issued_before_20|issued_before_45,True,False)
# A license counts as expired when more than 3653 days have passed since LicenseStartDate
days_since_license_start = (pd.to_datetime('today')-df.LicenseStartDate)/one_day
df['license_expired'] = np.where(days_since_license_start > 3653,True,False)

# Leaving only users registered before 2020-04-30 (rows whose new_users value is not 1)
not_new_user = df['new_users'].ne(1)
df_old = df.loc[not_new_user]
# Leaving only users that are Russian citizens (rows whose rus_ctzn value equals 1.0)
is_rus_citizen = df_old['rus_ctzn'].eq(1.0)
df_rus = df_old.loc[is_rus_citizen]

# Creating dataframe with error stats
def _error_stats(frame, flag_cols):
    """Count False/True occurrences of every flag column of ``frame``.

    Parameters
    ----------
    frame : DataFrame holding the flag columns.
    flag_cols : list of flag column names to count.

    Returns
    -------
    DataFrame with two rows: an 'error' column (False, then True) followed by one
    integer count column per flag, 'license_set_dt_null' first and the remaining
    flags in the order given. A value that never occurs gets a count of 0.
    """
    # Keep the original layout (license_set_dt_null first) without repeating a column
    ordered = list(dict.fromkeys(['license_set_dt_null', *flag_cols]))
    # Building the frame from the value_counts Series aligns every count on its own
    # False/True label. Assigning the Series into a frame indexed 0/1 instead aligns the
    # labels against row numbers, so the counts are not tied to the 'error' column.
    counts = pd.DataFrame({name: frame[name].value_counts() for name in ordered})
    counts = counts.reindex([False, True]).fillna(0).astype(int)
    return counts.rename_axis('error').reset_index()

stats_rus = _error_stats(df_rus, cols+aggs)
stats = _error_stats(df, cols+aggs)

# stats_rus.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/data_quality_results/error_stats/data_quality_results_active_users_rus_after.csv', index = False)
# stats.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/stage_2/data_quality_results_active_users_all_after_new_algo.csv', index = False)

# Selecting and saving users with KBM cols error
has_kbm_error = df['KBM_cols_error'].eq(True)
kbm = df.loc[has_kbm_error]
# Short view: identifiers, the fields used for the KBM request and the three related error flags
kbm_short_cols = ['user_id', 'login', 'first_name', 'patronymic_name', 'last_name',
                  'License', 'LicenseStartDate', 'birthday_dt',
                  'FIO_error', 'bd_error', 'driving_license_number_error']
kbm_short = kbm[kbm_short_cols]
# kbm.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/stage_2/KBM_cols_error_new_algo.csv', index = False)
# kbm_short.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/stage_2/KBM_cols_error_new_algo_short.csv', index = False)

# Selecting and saving users with expired docs (passport or license)
has_expired_doc = df['passport_expired'].eq(True)|df['license_expired'].eq(True)
expired = df.loc[has_expired_doc]
# expired.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/stage_2/users_w_expired_docs.csv', index = False)

In [158]:
# AGGREGATING ERROR STATISTICS

# replacing NA with FALSE
for doc_col in ('passport', 'drivers', 'selfie'):
    df[doc_col] = df[doc_col].fillna(False)

# Both pivots count user_id per (passport, drivers, any_error) combination
pivot_index = ['passport', 'drivers', 'any_error']
pivot_source_cols = ['user_id'] + pivot_index

# Aggregating error statistics. RUS
df_any_rus = df_rus[pivot_source_cols]
pvt_any_rus = df_any_rus.pivot_table(index = pivot_index, aggfunc = 'count')
# pvt_any_rus.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/data_quality_results/all_docs/all_docs_data_quality_results_pvt_active_users_rus_after.csv')

# Aggregating error statistics. ALL
df_any = df[pivot_source_cols]
pvt_any = df_any.pivot_table(index = pivot_index, aggfunc = 'count')
# pvt_any.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/stage_2/act_usrs_data_quality_pvt_all_after_new_algo.csv')

# Aggregating KBM cols error statistics. ALL
# True is replaced with 1 and each column is summed, giving the number of flagged rows per error type
df_kbm = kbm[['FIO_error','bd_error','driving_license_number_error']].replace(True,1)
pvt_kbm = df_kbm.sum()
# pvt_kbm.to_csv('C:/Users/sgulbin/Work/Analysis/DataQualityAnalysis/stage_2/KBM_cols_error_pvt_new_algo.csv')
pvt_kbm

FIO_error                        73636.0
bd_error                          1200.0
driving_license_number_error    127227.0
dtype: float64

In [159]:
# Display the rows selected into kbm (rows whose KBM_cols_error flag is True).
# NOTE(review): this renders raw user rows (login, License, ...) into the saved output;
# consider kbm.head() and clearing the output before sharing the notebook.
kbm

Unnamed: 0,user_id,user_ext,login,sumsub_applicant_id,NameRu,activation_dtime,registration_dtime,License,license_set_date,LicenseStartDate,...,driving_license_number_error,driving_license_set_date_error,KBM_cols_error,bd_error,scoring_cols_error,any_error,age_when_psp_issued,age,passport_expired,license_expired
36,19500174,963,79260013332,5a96ac3b0a975a5163be3c3a,Активирован,2015-08-08 21:52:11,2015-08-08 00:00:00,77 МЕ 464683,1970-01-01,2018-03-27,...,True,True,True,False,True,True,20.052019,30.848551,False,False
136,19500598,2875,79269806655,5e85198ad5ea4874e2e4f2d4,Активирован,2015-08-28 13:46:34,2015-08-28 00:00:00,7 АА 117663,1992-01-01,2012-12-12,...,True,False,True,False,False,True,42.535250,50.265389,True,False
174,19500768,3637,79160222820,5c45fc9a0a975a5b15d26e5e,Активирован,2015-09-09 18:19:18,2015-09-09 00:00:00,77ОВ 506758,2009-01-01,2019-01-11,...,True,False,True,False,False,True,20.205339,30.191467,False,False
185,19500833,3960,79035487865,5ad0a2560a975a3cecafe619,Активирован,2015-09-10 12:50:58,2015-09-10 00:00:00,,2006-01-01,2016-06-04,...,True,False,True,False,True,True,15.600274,36.228428,True,False
272,19501256,5584,79636869996,5b0e8aba0a975a28b9259a8f,Активирован,2015-09-10 16:24:13,2015-09-10 00:00:00,77ОЕ215919,2009-01-01,2019-01-26,...,True,False,True,False,True,True,20.120465,30.172302,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1950173,1400000814,31421416,79060612477,5f0c3a94bb03f175d96a5496,Активирован,2020-07-13 14:56:14,2020-07-13 13:34:59,0357 02506,2003-01-01,2003-12-11,...,True,False,True,False,True,True,,47.606936,False,True
1950220,1400001550,31426789,79500054507,5f0c80f453d48659a91d24d8,Активирован,2020-07-13 18:46:05,2020-07-13 18:35:54,AA05 19429,2018-01-01,2018-01-10,...,True,False,True,False,True,True,,26.645950,False,False
1950242,1400001897,31429618,79997727398,5f0ca8b3bb03f10279c9bc4d,Активирован,2020-07-13 21:45:02,2020-07-13 21:23:04,RH37 3310,2014-01-01,2018-04-16,...,True,False,True,False,True,True,,24.469359,False,False
1950247,1400001949,31430006,79993457669,5f0cae61bb03f10279ca2a83,Активирован,2020-07-13 22:03:02,2020-07-13 21:48:15,AA04 02822,2016-01-01,2017-07-03,...,True,False,True,False,True,True,,35.404335,False,False


In [81]:
# TESTING REGULAR EXPRESSIONS. FIO
import re

# Sample inputs: Cyrillic names (single word, space- and hyphen-separated), Latin names,
# digits, punctuation-only strings and mixed-script values
names = [
    'Фикрет Оглы', 'Фикрет-Оглы', 'Fikret Ogly', 'Fikret-Ogly', '-', '000', 'Johnny', 'Ванюся',
    'Саид Аб.', 'Михаи\'л', '^', '[', '*', '$', '<', ',', '?', '!', '.', '\'', 'Вfнюся', 'Фёдор',
]
# One or more Cyrillic words joined either by hyphens or by single whitespace characters
ptrn = r'^[а-яА-ЯёЁ]+(?:-[а-яА-ЯёЁ]+)+$|^[а-яА-ЯёЁ]+(?:\s[а-яА-ЯёЁ]+)+$|^[а-яА-ЯёЁ]+$'
nf = pd.DataFrame({'name': names})
# check is True when the name does NOT match the pattern
nf['check'] = ~nf['name'].str.contains(pat = ptrn, regex = True)
nf
# The same pattern tried with plain re (prints True on a match):
# for i in names:
#     print(i+': '+str(bool(re.compile(ptrn).match(i))))

Unnamed: 0,name,check
0,Фикрет Оглы,False
1,Фикрет-Оглы,False
2,Fikret Ogly,True
3,Fikret-Ogly,True
4,-,True
5,000,True
6,Johnny,True
7,Ванюся,False
8,Саид Аб.,True
9,Михаи'л,True


In [144]:
# TESTING REGULAR EXPRESSIONS. LICENSE NUMBER
import re

# Sample license numbers: Cyrillic and Latin series letters, digit-only values,
# different space placements, hyphens and an over-long value
lns = [
    '00ЯЯ123456', '00FF123456', '1234567890', '00 ЯЯ 123456', '00ЯЯ 123456', '00 ЯЯ123456',
    '00 FF 123456', '00FF 123456', '00 FF123456', 'FFFF123456', '00FF-123456', '1234-567890',
    '00ЯЯ-123456', '012345678901', '42 04 437168',
]
# Letter format: 2 digits + 2 Cyrillic letters + 6 digits; digit format: exactly 10 digits
ptrn_old = r'^[0-9][0-9][а-яА-Я][а-яА-Я][0-9]{6}$'
ptrn_crnt = r'^[0-9]{10}$'
ln = pd.DataFrame({'ln': lns})
# Both patterns are checked against the value with spaces removed
ln_compact = ln['ln'].str.replace(' ','')
ln['check_old'] = ln_compact.str.contains(pat = ptrn_old, regex = True)
ln['check_crnt'] = ln_compact.str.contains(pat = ptrn_crnt, regex = True)
ln

Unnamed: 0,ln,check_old,check_crnt
0,00ЯЯ123456,True,False
1,00FF123456,False,False
2,1234567890,False,True
3,00 ЯЯ 123456,True,False
4,00ЯЯ 123456,True,False
5,00 ЯЯ123456,True,False
6,00 FF 123456,False,False
7,00FF 123456,False,False
8,00 FF123456,False,False
9,FFFF123456,False,False
