In [8]:
## Purpose. Create fake memos that will be used to train a classifier to pick up on a 
## key area of a written document. This data will be used for Audit Innovation Lab Final Project. The faker function will
## use the top key words from the Subsequent Events and Inventory PCAOB AS  

In [9]:
from faker import Faker ## to create noise in the memos
import pandas as pd
import numpy as np
from random import sample

In [10]:
## instantiate faker 
fake = Faker()

In [11]:
## read in inventory and derivative words
sub = pd.read_csv('sub_words.csv')
inven = pd.read_csv('inventory_words.csv')
LM = pd.read_csv("LM Dictionary.csv")

In [12]:
sub.head(10)

Unnamed: 0,header
0,financial
1,statements
2,date
3,subsequent
4,events
5,balance
6,sheet
7,adjustment
8,auditor
9,period


In [13]:
inven.head(10)

Unnamed: 0,head
0,procedures
1,inventory
2,auditor
3,inventories
4,counts
5,physical
6,circumstances
7,independent
8,reasonable
9,satisfied


In [14]:
LM.head()

Unnamed: 0,Head
0,ABANDON
1,ABANDONED
2,ABANDONING
3,ABANDONMENT
4,ABANDONMENTS


In [15]:
## convert to lists
sub_list = sub['header'].tolist()
inven_list = inven['head'].tolist()
LM['Head'] = LM['Head'].str.lower()
LM_list = LM['Head'].tolist()

In [18]:
## create empty dataframe
df = pd.DataFrame(columns = ['Memo', 'Area'])
## create for loop that will generate random memos for both inventory and subsequent events
for i in np.arange(0,4000):
    ## create inventory sub lists using random sample.Create one sub list with words from sub events for noise (to confuse the classifier)
    inven_rand1 = sample(inven_list, 15)
    inven_rand2 = sample(inven_list, 15)
    inven_rand3 = sample(inven_list, 15)
    inven_rand4 = sample(inven_list, 15)
    inven_rand5 = sample(inven_list, 15)
    inven_rand6 = sample(inven_list, 15)
    inven_conf = sample(sub_list, 4) ## confusion list
    inven_conf2 = sample(sub_list, 4)
    
    ## create random chunks of text using faker for an inventory memo
    memo_inv = (fake.text() + " " + fake.sentence(ext_word_list = inven_rand1) + " " + fake.sentence() + " "
                + fake.sentence(ext_word_list = inven_rand2) + " " + fake.sentence() + " " 
                + fake.sentence(ext_word_list = inven_rand3) + " " + fake.sentence() + " " + fake.text() +  " " + fake.sentence(ext_word_list = inven_conf)
                + fake.text() + " " + fake.sentence(ext_word_list = inven_rand4) + " " + fake.sentence() + " " 
                + fake.sentence(ext_word_list = inven_rand5) + " " + fake.sentence() + " " 
                + fake.sentence(ext_word_list = inven_rand6) + " " + fake.sentence() + " " + fake.text() +  " " + fake.sentence(ext_word_list = inven_conf2)
               )
    df = df.append({'Memo': memo_inv, 'Area': 'Inventory'}, ignore_index = True)
    
    ## create subsequent event sub lists using random sample. Create a sub list from invntory to confuse.
    sub_rand1 = sample(sub_list, 15)
    sub_rand2 = sample(sub_list, 15)
    sub_rand3 = sample(sub_list, 15)
    sub_rand4 = sample(sub_list, 15)
    sub_rand5 = sample(sub_list, 15)
    sub_rand6 = sample(sub_list, 15)
    sub_conf = sample(inven_list, 4) ##confusion list
    sub_conf2 = sample(inven_list, 4)
    
    ## create random chunks of text using faker for a subsequent events memo
    memo_sub = (fake.text() + " " + fake.sentence(ext_word_list = sub_rand1) + " " + fake.sentence() + " "
                + fake.sentence(ext_word_list = sub_rand2) + " " + fake.sentence() + " " 
                + fake.sentence(ext_word_list = sub_rand3) + " " + fake.sentence() + " " + fake.text() + " " + fake.sentence(ext_word_list = sub_conf)
                + fake.text() + " " + fake.sentence(ext_word_list = sub_rand4) + " " + fake.sentence() + " " 
                + fake.sentence(ext_word_list = sub_rand5) + " " + fake.sentence() + " " 
                + fake.sentence(ext_word_list = sub_rand6) + " " + fake.sentence() + " " + fake.text() + " " + fake.sentence(ext_word_list = sub_conf2)   
               )
    df = df.append({'Memo': memo_sub, 'Area': 'Subsequent Events'}, ignore_index = True)

In [19]:
df

Unnamed: 0,Memo,Area
0,Place last game. Head later concern theory sin...,Inventory
1,Majority contain manager nice result less grou...,Subsequent Events
2,Build analysis blue program cost someone wife ...,Inventory
3,With attorney final education life lay treat. ...,Subsequent Events
4,Share piece consider practice. Back interestin...,Inventory
5,Participant after building front head herself....,Subsequent Events
6,Activity ask issue generation I civil. Choose ...,Inventory
7,Financial think same I. Read position machine ...,Subsequent Events
8,General station wonder stay size throw wish. A...,Inventory
9,Successful walk behavior also. Even thing thre...,Subsequent Events


In [20]:
## Randomly shuffle the dataframe 
df_shuffled = df.sample(frac=1).reset_index(drop=True)

In [21]:
df_shuffled.head(20)

Unnamed: 0,Memo,Area
0,Success church resource. Study soon charge dur...,Inventory
1,Debate medical voice recent piece but everythi...,Inventory
2,Kind candidate hospital population. Compare pr...,Subsequent Events
3,Return a example gas wear. Soon off deep worry...,Inventory
4,Listen lawyer month game. Laugh benefit hot pa...,Inventory
5,Front PM carry issue seek skill wide. Stay war...,Inventory
6,Now effect material everything of. Article dev...,Inventory
7,Indicate else prevent arrive personal card.\nL...,Inventory
8,Network enter because. Field name hold expert ...,Subsequent Events
9,Environmental and by account. Indicate feel ex...,Subsequent Events


In [22]:
## save to folder
df_shuffled.to_csv('memos2.csv', index = False)

In [23]:
## EXAMPLE OF INVENTORY TRAINING MEMO
df_shuffled.iloc[0,0]

'Success church resource. Study soon charge during. Think when performance up page return.\nManager say help trade professional career. Process role accept benefit budget consumer eat. Satisfied satisfied may counting application transactions satisfied. Control stock past director assume indeed issue federal. B details asked b client count. Image particularly wrong truth indicate. Relating satisfy paragraphs developed methods. Piece manager Republican party serious whom hit once. See whether none really garden white. Compare impact clearly sign future. Mean break happy.\nMention buy daughter leave option. Available at seat fill. Pattern hard each run. Assets assets assessments assessments balancesheet assessments assets.Seven executive hot floor admit soldier. Weight other service try now push. Admit southern left training fine. Records developed relating able current accounting accounting. Body professional night worker probably treatment. Applied assurance warehouseman must sampling 