## 身份证识别模型试验

### 数据

#### 字符集
- 取ppocr中的字符集，去掉身份证中不可能出现的字符
#### 生成

- 地址 
    - 实际发布地址信息与身份证空白模板图，用文字生成器，生成识别模型数据
    - [x] 选中复选框
- 姓名
    - 使用Faker包，随机生成名字语料信息，再用文字识别数据生成器，生成识别数据
- 民族
    - 使用56个民族作为语料库，再用生成器生成数据
- 性别
    - 男、女用生成器生成
- 生日
    - 用Faker包，生成随机的生日语料(XX年XX月XX日)，再用生成器生成数据
- 身份证号
    - 用Faker包，生成随机的身份证号语料，再用生成器生成数据
- 发证机关
    - 用实际发布的发证机关语料，再用生成器随机生成数据
- 有效日期
    - 用Faker包，生成时间格式，中间用"."隔开，时间串间用"-"隔开，生成语料后，再用生成器生成数据
- 字符随机生成
    - 统计已生成的数据中所有字符的出现次数，对低于某值的字符，取出后组成一个字符集合，利用这个字符集合，通过生成器进行无语料的随机字符生成

In [2]:
import json
import os.path as osp
import os
from faker import Faker

In [5]:
# Issuing-authority corpus: write the values of the JSON mapping to a text
# file, one entry per line.
unit_path = 'data/unit.json'
unit_store_path = 'data/unit.txt'

# UTF-8 is requested explicitly on both sides: the corpus is Chinese text and
# open() otherwise falls back to the platform's default encoding.
with open(unit_path, encoding='utf-8') as f:
    content = json.load(f)
    unit = list(content.values())  # issuing-authority corpus
with open(unit_store_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(unit))

In [84]:
# Address corpus: input JSON file and output text file.
address_path = 'data/repitle_address_extract.json'
address_store_path = 'data/address.txt'
nodes_names = []  # filled in place by collect_address()

def find_chinese(file):
    """Return ``file`` with every character outside U+4E00..U+9FA5 removed.

    Args:
        file: The text to filter (a string, despite the parameter name).

    Returns:
        The characters that remain, in their original order.
    """
    # Imported here because this definition sits above the file's only
    # ``import re`` statement; a top-to-bottom run would otherwise raise
    # NameError on the first call.
    import re

    return re.sub(r'[^\u4e00-\u9fa5]', '', file)

def collect_address(data):
    """Recursively gather names from nested JSON data into ``nodes_names``.

    List items and dict keys are each reduced with find_chinese(); results
    that end up empty are skipped. Dict values are walked recursively. Any
    other kind of value is ignored.

    Args:
        data: A decoded JSON value (list, dict, or scalar).
    """
    if isinstance(data, list):
        # Filter every item once, instead of once for the emptiness test and
        # a second time for the stored value.
        cleaned = (find_chinese(item) for item in data)
        nodes_names.extend(name for name in cleaned if name)
    elif isinstance(data, dict):
        for key, value in data.items():
            name = find_chinese(key) if key else ''
            if name:
                nodes_names.append(name)
            collect_address(value)

with open(address_path, 'r', encoding='utf-8') as f:
    content = json.load(f)
    collect_address(content)
# De-duplicate, then sort so the corpus file is identical from run to run
# (iteration order of a set of strings differs between interpreter sessions).
# The result is kept as a list so it is still usable after it has been written.
nodes_names = sorted(set(nodes_names))
# Written with the same encoding the input was read with.
with open(address_store_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(nodes_names))

In [48]:
# 姓名
from faker import Faker
name_store_path = 'data/name.txt'

fake = Faker(locale='zh_CN')
# Append mode: every run adds another 1000 batches of 500 names to the file.
with open(name_store_path, 'a', encoding='utf-8') as f:
    for _ in range(1000):
        names = [fake.name() for _ in range(500)]
        # Terminate each batch with a newline; without it the last name of one
        # batch and the first name of the next are fused on a single line.
        f.write('\n'.join(names) + '\n')

In [52]:
# Ethnic-group corpus: the 56 officially recognised groups, one per line.
# Two obsolete names are corrected so the list matches the current official
# names: "毛难族" is written "毛南族", and "崩龙族" (the former name of "德昂族",
# which is already listed) is left out because it is a duplicate.
nationality = "汉族 彝族 侗族 蒙古族 回族 藏族 维吾尔族 苗族 壮族 朝鲜族 满族 瑶族 白族 土家族 哈尼族 哈萨克族 黎族 " \
              "傈僳族 佤族 畲族 高山族 拉祜族 水族 东乡族 纳西族 " \
              "景颇族 柯尔克孜族 土族 达斡尔族 羌族 撒拉族 毛南族 仫佬族 " \
              "仡佬族 锡伯族 阿昌族 普米族 塔吉克族 怒族 乌孜别克族 " \
              "俄罗斯族 德昂族 保安族 裕固族 独龙族 鄂伦春族 赫哲族 " \
              "门巴族 珞巴族 基诺族 鄂温克族 傣族 京族 塔塔尔族 布朗族 布依族"
with open('data/nationality.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(nationality.split(' ')))

In [60]:
# 有效日期
from faker import Faker
name_store_path = 'data/validity.txt'
# The keyword must be spelled ``locale``; Faker does not treat ``local=`` as
# the locale argument, so the requested locale would not take effect.
fake = Faker(locale='zh_CN')
# NOTE(review): the notes at the top of this notebook describe two dates joined
# by "-"; this cell writes single dates only - confirm whether the pairs are
# assembled elsewhere.
with open(name_store_path, 'w', encoding='utf-8') as f:
    # date.isoformat() yields YYYY-MM-DD; the separator is swapped for ".".
    birth = [fake.date_object().isoformat().replace('-', '.')
             for _ in range(20000)]
    f.write('\n'.join(birth))

In [61]:
# 生日
from faker import Faker
name_store_path = 'data/birth.txt'
# The keyword must be spelled ``locale``; Faker does not treat ``local=`` as
# the locale argument, so the requested locale would not take effect.
fake = Faker(locale='zh_CN')

with open(name_store_path, 'w', encoding='utf-8') as f:
    birth = [str(fake.date_object()) for _ in range(20000)]
    # Split each YYYY-MM-DD string once and place year, month and day into
    # the target layout.
    birth = ['{}年{}月{}日'.format(*d.split('-')) for d in birth]
    f.write('\n'.join(birth))

In [77]:
# 身份证号
from faker import Faker
number_store_path = 'data/number.txt'
# The keyword must be spelled ``locale``; Faker does not treat ``local=`` as
# the locale argument, so the requested locale would not take effect.
fake = Faker(locale='zh_CN')


def gen_num():
    """Return a string of 18 random decimal digits (no check digit is computed)."""
    return ''.join(str(fake.random_digit()) for _ in range(18))


with open(number_store_path, 'w', encoding='utf-8') as f:
    number = [gen_num() for _ in range(20000)]
    # Extra samples ending in "X" so that this character is represented in
    # the corpus as a final character.
    number.extend([gen_num()[:17] + 'X' for _ in range(2000)])
    f.write('\n'.join(number))

In [80]:
# 字符集
import re

def find_chinese(file):
    """Strip everything except characters in the range U+4E00..U+9FA5.

    ``file`` is the input string; the filtered string is returned.
    """
    non_chinese = re.compile(r'[^\u4e00-\u9fa5]')
    return non_chinese.sub('', file)

# Build the ID-card character set: the Chinese characters found in the ppocr
# key file, followed by the digits, the upper-case letters, "." and "-".
# Both files are opened as UTF-8 explicitly instead of relying on the
# platform's default encoding.
with open('data/ppocr_keys_v1.txt', encoding='utf-8') as f:
    chars = ''.join(f.read().split('\n'))
chars_ch = find_chinese(chars)
chars_idcard = chars_ch + '0123456789' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + '.-'
# Joining a string with "\n" puts one character on each line.
with open('data/idcard_keys.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(chars_idcard))


In [13]:
# 统计生成的数据字符，与字符集进行对比，当某字符在生成数据中出现的次数小于某值时，再对这些字符进行随机生成

from glob import glob
import os.path as osp
import os

# Count how often each character of the character set occurs in the labels of
# the generated images.
root = '/mnt/data/rz/data/idCard/rec/gen/20220124'
char_key_path = '/mnt/data/rz/programe/KREP/idCard/data/idcard_keys_new.txt'

# The key file holds Chinese characters, so it is read as UTF-8 explicitly
# instead of relying on the platform's default encoding.
with open(char_key_path, encoding='utf-8') as f:
    chars = ''.join(f.read().strip().split('\n'))

img_names = os.listdir(root)
# Every character of the set starts at zero, so characters that never occur
# are present in the result as well.
char_num = dict.fromkeys(chars, 0)

# Characters seen in file names but absent from the character set
# (repeats are kept).
except_char = []
for name in img_names:
    # The part of the file name before the first "_" is taken as the label.
    ct = name.split('_')[0]
    for ch in ct:
        if ch in char_num:
            char_num[ch] += 1
        else:
            except_char.append(ch)

# Disabled step: merges the unexpected characters into the character set and
# writes the result to data/idcard_keys_new.txt.
# except_char = list(set(except_char))
# new_chars = chars + ''.join(except_char)
# new_chars = ''.join(list(set(new_chars)))

# with open('data/idcard_keys_new.txt', 'w') as f:
#     f.write('\n'.join(new_chars))

In [17]:
import numpy as np

# Build a corpus of random 5-character strings from the rarely seen characters
# (those counted fewer than 20 times in char_num).
litter_chars = [ch for ch, count in char_num.items() if count < 20]
# Draw all 50000 x 5 characters in one call. Sampling happens before the file
# is opened, so an empty litter_chars (np.random.choice raises ValueError)
# fails without touching the output file.
samples = np.random.choice(litter_chars, (50000, 5))
# Append mode: each run adds another 50000 lines to the file.
with open('data/idcard_litter_char.txt', 'a', encoding='utf-8') as f:
    f.writelines(''.join(row) + '\n' for row in samples)
    

In [33]:
# 对生成的数据进行label文件生成
from glob import glob
import os.path as osp
import os

root = '/mnt/data/rz/data/idCard/rec/exp/'
suffix = 'gen_idcard_number/'
label_store_path = '/mnt/data/rz/data/idCard/rec/exp/gen_train_rec_gt_number_label.txt'
img_names = os.listdir(osp.join(root, suffix))

# One "<image path relative to root>\t<label>" entry per image; the label is
# the part of the file name that precedes the first "_".
lines = [osp.join(suffix, imgn) + '\t' + imgn.partition('_')[0]
         for imgn in img_names]

with open(label_store_path, 'w') as f:
    f.write('\n'.join(lines))