# setup

In [1]:
# dependencies
import random
import string
import hashlib

import pandas as pd

In [2]:
# support methods
def pickone(coll):
    """Draw a single random element from a collection.

    - `coll`: the sequence to draw from (indexable and non-empty).
    """
    chosen = random.choice(coll)
    return chosen


def pickn(coll, n):
    """Draw several random elements from a collection, with replacement.

    - `coll`: the sequence to draw from.
    - `n`: how many draws to make.

    Returns a list of the drawn items (empty when `n` is not positive); the
    same item may appear more than once.
    """
    drawn = []
    for _ in range(n):
        drawn.append(random.choice(coll))
    return drawn


def maketoken(coll, nchars=None):
    """Make a 'word' token from a collection of string characters.

    - `coll`: the collection to draw from to build the token.
    - `nchars`: the number of 'letters' to use in the token. If None (or 0) is
    passed, `nchars` will be a random whole number between 1 and half the
    size of `coll`, and never less than 1.

    Returns the drawn characters joined into a single string.
    """
    if not nchars:
        # randint needs integer bounds. The former float bound len(coll)*(.5)
        # was non-integral for odd-sized collections, dropped below 1 for a
        # single-element collection, and is rejected by newer Python versions.
        upper = max(1, len(coll) // 2)
        nchars = random.randint(1, upper)
    # Characters are drawn independently, so repeats are possible.
    return ''.join(random.choice(coll) for _ in range(nchars))


def makeline(coll, ntokens=None):
    """Make a line from a collection of string characters.

    - `coll`: the collection to draw from to build the 'words'.
    - `ntokens`: the number of 'words' to include in the line. If None is
    passed, a random number between 1 and 10 is used. Any value below 1
    (including 0) produces a bare newline.

    Returns the space-separated 'words' followed by one random punctuation
    character, or '\\n' when no words were requested.
    """
    # Test for None explicitly: a plain truthiness check would also replace an
    # explicit 0 with a random count, making the blank-line branch unreachable.
    if ntokens is None:
        ntokens = random.randint(1, 10)
    if ntokens < 1:
        return '\n'
    punc = pickone(string.punctuation)
    line = ' '.join(maketoken(coll) for _ in range(ntokens)) + punc
    return line


def addnoise(coll, line):
    """Add noisy string characters to a line.

    - `coll`: the collection of noisy characters to draw from.
    - `line`: the line to modify.

    Between 1 and 5 characters are inserted. Insert positions are drawn from
    the indices of the line as it was passed in; an empty line receives its
    noise at position 0. The original characters keep their relative order.
    """
    n = random.randint(1, 5)
    # max(1, ...) leaves position 0 available when the line is empty, where
    # there would otherwise be no index to draw.
    npositions = max(1, len(line))
    idxs = [random.randrange(npositions) for _ in range(n)]
    for i in idxs:
        # Draw from the caller's collection rather than a notebook-level
        # variable, so the `coll` argument is actually honoured.
        char = random.choice(coll)
        line = f"{line[:i]}{char}{line[i:]}"
    return line


def hashline(line):
    """Return a short hexadecimal identifier for a line.

    - `line`: the text to hash.

    The line is UTF-8 encoded and hashed with SHA-1; the first 8 characters
    of the hex digest are returned.
    """
    return hashlib.sha1(line.encode('utf-8')).hexdigest()[:8]

In [3]:
# main
# Alphabet the random 'words' are built from: lowercase then uppercase ASCII.
letters = string.ascii_lowercase + string.ascii_uppercase
# Characters used as noise: digits, punctuation and whitespace.
noise = ''.join((string.digits, string.punctuation, string.whitespace))

# Number of rows generated per table.
K = 100

# demo basic support methods

In [4]:
# Build one clean sample line from the letter alphabet (random word count).
testline = makeline(letters)

In [5]:
# Insert noise characters into the clean sample line for comparison.
noisytest = addnoise(noise, testline)

In [6]:
# Show the clean sample line.
testline

'znyoOTBbL OiQgES cXvRn uzycyjIEBKqK yQTEjjnnyYJRzRfztE SaGtAZaJeQzqVvTpGKqXF wl,'

In [7]:
# Show the same line after noise was inserted.
noisytest

'znyoOTBbL OiQgES cXvRn uzyc\nyjIEBKq(K yQTEjjnn#yYJRzRfztE SaGtAZaJe"QzqVvTpGKqXF wl,'

# generate random line data

In [8]:
# review existing data instead of creating a new table
# (reads a previously saved table, so the file must already exist at this
# relative path)
lines = pd.read_parquet("../output/lines.parquet")

# Generation recipe, kept for reference: uncomment to build a fresh table of
# K noisy lines with per-line counts of alphabetic and digit characters and a
# short hash id of each line.
#lines = pd.DataFrame({
#    i: {'line': addnoise(noise, makeline(letters))} for i in range(K)
#}).T
#lines['n_alpha'] = lines.line.apply(lambda x: sum(c.isalpha() for c in x))
#lines['n_digit'] = lines.line.apply(lambda x: sum(c.isdigit() for c in x))
#lines['hashid'] = lines.line.apply(hashline)

In [9]:
# Spot-check five randomly chosen rows of the line table.
lines.sample(5)

Unnamed: 0,line,n_alpha,n_digit,hashid
84,"fyAs oUQapAZgTZxFvPuFh FmBZqBCgjgTvb Jis K,uXx...",96,1,70881980
49,GiP5pKM lcBXXlTDzDXicugnfD roPheNXBdTgwrMjvZkR...,75,1,feaae651
29,sqPOjxcRj~pHW}HGOFy HsLyruaJaM]DpEySup=OOzYAW ...,72,0,4c95d5fc
76,PeyKrL?HienmpXI}O6CF OpnerUqNwntBwLUi fempUtMX...,70,1,38e0fac8
70,RJsWLpJDaNjPfER CFECzTuoKXT*SpWicsrnvAuLQo 2n ...,89,1,c1c11af5


In [10]:
# Saving is disabled; uncomment to write the line table to this path.
#lines.to_parquet("../output/lines.parquet")

# generate pseudo-name data

In [11]:
# name-like support
def makenametoken(coll, nchars=None):
    """Build one name-like token out of randomly drawn characters.

    - `coll`: the collection of characters to draw from.
    - `nchars`: the token length. A falsy value (None or 0) means a random
    length between 1 and 25 is chosen.
    """
    length = nchars if nchars else random.randint(1, 25)
    return ''.join(pickn(coll, length))


def makefullname(coll, ntokens=None):
    """Build a name-like line of space-separated name tokens.

    - `coll`: the collection of characters the tokens are drawn from.
    - `ntokens`: how many tokens to include. A falsy value (None or 0) means
    a random count between 1 and 4 is chosen.
    """
    count = ntokens or random.randint(1, 4)
    parts = []
    for _ in range(count):
        parts.append(makenametoken(coll))
    return ' '.join(parts)


def formatname(line):
    """Apply one randomly chosen casing to a name-like line.

    - `line`: the name-like line to format.

    Returns the line lower-cased, upper-cased or title-cased, each with equal
    probability.
    """
    roll = random.randint(1, 3)
    if roll == 1:
        cased = line.lower()
    elif roll == 2:
        cased = line.upper()
    else:
        cased = line.title()
    return cased

In [12]:
# review existing data instead of creating a new table
# (reads a previously saved table, so the file must already exist at this
# relative path)
names = pd.read_parquet("../output/names.parquet")

# Generation recipe, kept for reference: uncomment to build a fresh table of
# K lowercase name-like lines, a randomly cased copy of each, and a short
# hash id computed from the unformatted line.
#names = pd.DataFrame({
#    i: {'line': makefullname(string.ascii_lowercase)} for i in range(K)
#}).T
#names['formline'] = names.line.apply(formatname)
#names['hashid'] = names.line.apply(hashline)

In [13]:
# Spot-check five randomly chosen rows of the name table.
names.sample(5)

Unnamed: 0,line,formline,hashid
88,xsoukoova wapecfvbktlg tsrskzbdqugtfhqyukcyjsd,XSOUKOOVA WAPECFVBKTLG TSRSKZBDQUGTFHQYUKCYJSD,27be93d6
63,nrvxbecrdceicxdvetmfjh,NRVXBECRDCEICXDVETMFJH,d0f452ed
52,h rwsljrswgjametjprgqqw kjesugx bhusbrxxplwexo...,H Rwsljrswgjametjprgqqw Kjesugx Bhusbrxxplwexo...,209f37df
83,sgszwlmb sksg p,SGSZWLMB SKSG P,ccb945f7
22,peeauri,Peeauri,03d72752


In [14]:
# Saving is disabled; uncomment to write the name table to this path.
#names.to_parquet("../output/names.parquet")

# generate numeric data

In [15]:
# badge-like data
def makebadge(coll, nchars=None):
    """Build a numeric 'badge' from randomly drawn characters.

    - `coll`: the collection of characters to draw from. The joined result is
    passed to int(), so these need to be digit characters.
    - `nchars`: how many characters to draw. A falsy value (None or 0) means
    a random count between 4 and 6 is chosen.

    Returns an int, so any leading zeros that were drawn are not preserved.
    """
    ndigits = nchars or random.randint(4, 6)
    digits = pickn(coll, ndigits)
    return int(''.join(digits))

In [16]:
# Build K random badges. The dict is keyed by row number, so the frame is
# created with one column per badge and then transposed into one row per badge.
badges = pd.DataFrame({
    i: {'badge': makebadge(string.digits)} for i in range(K)
}).T

In [17]:
# Spot-check five randomly chosen rows of the badge table.
badges.sample(5)

Unnamed: 0,badge
78,95057
0,43523
59,40234
56,5898
86,898708
