<a href="https://colab.research.google.com/github/YapingWu/GoogleColab/blob/main/genpass/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 数据准备
需要的数据：
1. 将`myspace`和`phpbb`的数据按照`7：3`的比例划分为训练集`xxx_7`和测试集`xxx_3`（例如`myspace_7.txt`、`myspace_3.txt`），用于PL模型的`one-site test`。
2. 按照PCFG规则对`myspace`、`phpbb`和`xxx_7`进行编码并生成密码频次表。

In [10]:
!mkdir 'raw_data'
!mv '/content/myspace.txt' '/content/raw_data/myspace.txt'
!mv '/content/phpbb.txt' '/content/raw_data/phpbb.txt'

In [13]:
!mkdir 'pcfg'

In [4]:
!mkdir 'freq'

mkdir: cannot create directory ‘freq’: File exists


## 划分数据集

In [12]:
import numpy as np

# Seed NumPy's global random generator with a fixed value so that code below
# relying on np.random (e.g. shuffling) gives the same result on every run.
seed = 7
np.random.seed(seed)

def split_dataset(name='myspace', test_ratio=0.3,
                  in_dir='/content/raw_data', out_dir='./raw_data'):
  """Shuffle a password list and split it into train and test files.

  Reads ``{in_dir}/{name}.txt`` (one password per line, UTF-8), shuffles the
  passwords with NumPy's global random generator, and writes the first
  ``1 - test_ratio`` share to ``{out_dir}/{name}_7.txt`` and the remainder to
  ``{out_dir}/{name}_3.txt``. The output file suffixes are fixed and do not
  change with ``test_ratio``.

  Args:
    name: Base name of the dataset file, without the ``.txt`` extension.
    test_ratio: Fraction of the passwords written to the test file.
    in_dir: Directory containing the input file.
    out_dir: Directory that receives the two output files; it must exist.

  Raises:
    ValueError: If ``test_ratio`` is not within ``[0, 1]``.
  """
  if not 0 <= test_ratio <= 1:
    raise ValueError('test_ratio must be within [0, 1], got {}'.format(test_ratio))

  filename = '{}/{}.txt'.format(in_dir, name)
  # Read the file line by line rather than with np.genfromtxt: genfromtxt
  # treats '#' as a comment marker by default and strips surrounding spaces,
  # both of which would silently alter passwords.
  with open(filename, encoding='utf-8') as f:
    lines = [line.rstrip('\n') for line in f]
  # Blank lines carry no password and are dropped.
  all_pwd_list = np.array([pwd for pwd in lines if pwd], dtype=str)
  np.random.shuffle(all_pwd_list)
  print('{} all_pwd_list.shape：{}'.format(name, all_pwd_list.shape))

  # Index of the first test sample; everything before it is training data.
  cut = int(all_pwd_list.shape[0] * (1 - test_ratio))
  pwd_list_7 = all_pwd_list[:cut]
  pwd_list_3 = all_pwd_list[cut:]

  np.savetxt('{}/{}_7.txt'.format(out_dir, name), pwd_list_7, fmt='%s', delimiter='\n', encoding='utf-8')
  np.savetxt('{}/{}_3.txt'.format(out_dir, name), pwd_list_3, fmt='%s', delimiter='\n', encoding='utf-8')

# Produce the train/test split files for both password dumps.
for dataset in ('myspace', 'phpbb'):
  split_dataset(name=dataset)

myspace all_pwd_list.shape：(37119,)
phpbb all_pwd_list.shape：(184325,)


## PCFG编码、生成频率表

In [9]:
import itertools
import pandas as pd

def get_type(ch):
    """Classify a character for PCFG encoding.

    Returns 'L' when ``ch.isalpha()`` is true, 'D' when ``ch.isdigit()`` is
    true, and 'S' for everything else.
    """
    if ch.isalpha():
        return 'L'
    return 'D' if ch.isdigit() else 'S'

def encode(unit_list, text):
    """Encode a password as a PCFG structure string.

    ``text`` is converted to ``str`` and cut into maximal runs of characters
    that share the same ``get_type`` class. Each run becomes a tag made of the
    class letter followed by the run length (e.g. ``L3``).

    Args:
        unit_list: List that is extended in place with one ``(tag, run_text)``
            tuple per run, in order of appearance.
        text: The password to encode.

    Returns:
        All tags in order, each followed by a single space (so a non-empty
        result ends with a trailing space).
    """
    tokens = []
    for char_class, run in itertools.groupby(str(text), key=get_type):
        segment = "".join(run)
        tag = char_class + str(len(segment))
        unit_list.append((tag, segment))
        tokens.append(tag + ' ')
    return "".join(tokens)

def produce_prop_table(pcfg_list, pcfg_freq_file, pwd_freq_file):
    """Build two frequency tables from (tag, segment) pairs and save them as CSV.

    Args:
        pcfg_list: Sequence of ``(tag, segment)`` pairs, e.g. ``('L3', 'abc')``.
        pcfg_freq_file: Output path for the per-tag table with columns
            ``pcfg``, ``pcfg_cnt`` and ``pcfg_prop`` (share of all rows),
            sorted by ``pcfg_prop`` in descending order.
        pwd_freq_file: Output path for the per-segment table with columns
            ``pcfg``, ``password``, ``pwd_cnt_per_pcfg`` and ``pwd_prop``
            (share of the segment within its tag), sorted by tag and count in
            descending order. A final row with tag ``<END>``, segment ``'\\n'``
            and probability 1.0 is added; its count is left empty.
    """
    # Passing the column names to the constructor also works for an empty list.
    df = pd.DataFrame(pcfg_list, columns=['pcfg', 'password'])

    # Probability of each tag over all (tag, segment) rows.
    total_cnt = df.shape[0]  # number of rows, i.e. segments (not whole passwords)
    pcfg_cnt = df.groupby(['pcfg']).size().rename('pcfg_cnt').reset_index()  # occurrences per tag
    pcfg_cnt['pcfg_prop'] = pcfg_cnt['pcfg_cnt'] / total_cnt
    pcfg_cnt.sort_values('pcfg_prop', ascending=False, inplace=True, ignore_index=True)
    print("每个pcfg出现的概率保存到文件：%s" % pcfg_freq_file)
    pcfg_cnt.to_csv(pcfg_freq_file, index=False)

    # Probability of each segment within its tag.
    pwd_cnt_per_pcfg = df.groupby(['pcfg', 'password']).size() \
        .rename('pwd_cnt_per_pcfg').reset_index()  # occurrences per (tag, segment)
    pwd_prop = pwd_cnt_per_pcfg.merge(pcfg_cnt, how='left', on='pcfg')
    pwd_prop.sort_values(['pcfg', 'pwd_cnt_per_pcfg'], ascending=False, inplace=True, ignore_index=True)
    pwd_prop['pwd_prop'] = pwd_prop['pwd_cnt_per_pcfg'] / pwd_prop['pcfg_cnt']

    # Add the end-of-sequence entry. DataFrame.append was removed in
    # pandas 2.0, so the row is attached with pd.concat instead.
    end_row = pd.DataFrame([{'pcfg': '<END>', 'password': '\n', 'pwd_prop': 1.0}])
    pwd_prop = pd.concat([pwd_prop, end_row], ignore_index=True)
    pwd_prop.to_csv(pwd_freq_file
                    , columns=['pcfg', 'password', 'pwd_cnt_per_pcfg', 'pwd_prop']
                    , index=False)
    print("pcfg中每个密码出现的概率保存到文件：%s" % pwd_freq_file)

In [None]:
# PCFG-encode every dataset and generate its frequency tables.
for name in ['myspace', 'phpbb', 'myspace_7', 'phpbb_7']:
  filename = '/content/raw_data/{}.txt'.format(name)
  # Read one password per line as plain text. pd.read_csv would parse the file
  # as CSV: numeric passwords lose leading zeros, values such as "NA" become
  # NaN, and commas or quotes inside a password split or mangle the line.
  with open(filename, encoding='utf-8') as f:
    pwds = [line.rstrip('\n') for line in f]
  pwd_df = pd.DataFrame({'pwd': [pwd for pwd in pwds if pwd]})  # blank lines are skipped
  pwd_df['len'] = pwd_df['pwd'].apply(len)  # password length
  # Drop abnormally long entries; copy so the column added below is written
  # to an independent frame rather than a slice of the original.
  pwd_df = pwd_df[pwd_df['len'] <= 40].copy()
  pcfg_unit_list = []  # filled by encode() with one (tag, segment) tuple per run
  pwd_df['pcfg'] = pwd_df['pwd'].apply(lambda x: encode(pcfg_unit_list, x))

  # De-duplicate the encoded structures to build the word list.
  pcfg_list = pwd_df['pcfg'].drop_duplicates()
  print("去重后的序列个数：%d" % len(pcfg_list))
  pcfg_list_file = './pcfg/{}.txt'.format(name)
  print("pcfg编码结果保存到文件：%s" % pcfg_list_file)
  pcfg_list.to_csv(pcfg_list_file, index=False)

  produce_prop_table(pcfg_unit_list, './freq/{}_pcfg_freq.txt'.format(name), './freq/{}_pwd_freq.txt'.format(name))