In [1]:
# This code utilizes the Python package 'Faker'
# Faker is a Python package that generates fake data for you,
# whether you need to bootstrap your database, create good-looking XML documents,
# fill in your persistence layer to stress test it, or anonymize data taken from
# a production service.

# I am specifically using it to create dummy data to use for testing
# the Go.Data ArcGIS Pro toolbox.

# https://faker.readthedocs.io/en/master/

# Author: Amy Louise Lang
# https://github.com/LangsterGA

# Before running script - update the 'start_dt' and 'end_dt' range for randomly generated report dates.

import pandas as pd
from faker import Faker
import random
from datetime import timedelta, date

# Generate Indian (English-language) names via Faker's 'en_IN' locale
fake = Faker('en_IN')

# Seed BOTH random sources so a fresh "Restart & Run All" reproduces the same dataset.
# Faker.seed() seeds Faker's own shared generator; the ages, locations, dates,
# occupations and outcomes in this notebook are drawn with the standard-library
# `random` module, which has to be seeded separately.
Faker.seed(0)
random.seed(0)

In [2]:
# Candidate case ages: every whole number from 0 through 79 (80 values in total)
lst = [age for age in range(80)]
print(lst)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79]


In [3]:
# Draw 500 ages (sampling with replacement) from the candidate age list
RandAges = random.choices(population=lst, k=500)


In [4]:
# Build 500 surnames and 500 given names (250 male draws, then 250 female draws).
# The draw order is kept as-is so a seeded Faker yields the same names on every run.
LnameData = [fake.last_name() for _ in range(500)]
FnameMale = [fake.first_name_male() for _ in range(250)]
FnameFemale = [fake.first_name_female() for _ in range(250)]

# Female names occupy the first 250 positions of the combined list, male names the last 250
FnameData = [*FnameFemale, *FnameMale]


Choose random names from list

In [5]:
# Import my adm2 names for Sendak Island - my fake country using Philadelphia area geoids

In [6]:
# Read the ADM2 reference table and pull its 'Name' column out as a plain Python list
data = pd.read_csv('SendakADM2_names.csv')
CityNames = data.loc[:, 'Name'].tolist()


In [7]:
# Wrap the imported ADM2 names in a single-column dataframe and display it
ADMdf = pd.DataFrame(data=CityNames, columns=['name'])
ADMdf

Unnamed: 0,name
0,Aaronchester
1,Aaronport
2,Aaronstad
3,Acevedoland
4,Adamshire
...,...
1003,Williamsonburgh
1004,Williamsshire
1005,Williamstown
1006,Wolfeton


In [8]:
# Pick 25 ADM2 areas at random (sampled without replacement); these are the only
# areas that cases get assigned to. Restricting the pool to 25 keeps the outbreak
# clustered instead of scattering single cases across every ADM2 area in the file.
rand = random.sample(CityNames, k=25)
rand

['Lake Edward',
 'North Aaron',
 'Sheryltown',
 'Gibbsmouth',
 'Port Dustinstad',
 'Lake Matthewfurt',
 'Banksside',
 'Patelchester',
 'South Kimberly',
 'Jonesmouth',
 'Davidside',
 'Wallaceville',
 'Lake Julie',
 'North Vanessashire',
 'North Rachelshire',
 'Toddfurt',
 'Port Brittanyview',
 'Lake Stacymouth',
 'Huntbury',
 'Port Matthewside',
 'Lake Derekside',
 'New William',
 'East Lisa',
 'Oneillville',
 'South Megan']

In [9]:
# Draw 500 locations (with replacement) from the 25 ADM2 areas chosen in the previous step
rand3 = random.choices(population=rand, k=500)


In [10]:
# Combine the first names, last names, ages and locations into one row per case.
# zip() stops at the shortest input, so all four lists must hold the same number of items.
case_rows = list(zip(FnameData, LnameData, RandAges, rand3))
Zipdf = pd.DataFrame(case_rows)
Zipdf.columns = ['FName', 'LName', 'Age', 'Locations']

# Append two constant-valued columns at the end: an empty occupation placeholder
# followed by the case classification
Zipdf = Zipdf.assign(**{'Occ': '', 'class': 'Confirmed'})

# Slot an empty report-date placeholder in as the third column (right after the names)
Zipdf.insert(2, 'ReportDt', '')


In [11]:
# This section sets up variables and attributes related to dates
# Need to generate a list of dates between assigned start date and end date

from datetime import timedelta, date

def daterange(date1, date2):
    """Yield each consecutive day from date1 through date2, both ends included.

    Nothing is yielded when date2 falls before date1.
    """
    span_days = (date2 - date1).days
    for offset in range(span_days + 1):
        yield date1 + timedelta(days=offset)
# edit start and end dates
start_dt = date(2022, 11, 27)
end_dt = date(2022, 12, 27)

# Every calendar day in the reporting window, both endpoints included
RD_List = list(daterange(start_dt, end_dt))

# Draw 500 report dates (with replacement) from that window
randDates = random.choices(RD_List, k=500)

# Fill in the report date, then derive onset (3 days earlier) and outcome (7 days later)
Zipdf['ReportDt'] = randDates
Zipdf['OnsetDt'] = Zipdf['ReportDt'] - timedelta(days=3)
# NOTE(review): 'OutcometDt' looks like a typo for 'OutcomeDt'; left unchanged because the
# exported column name may be relied on downstream - confirm before renaming.
Zipdf['OutcometDt'] = Zipdf['ReportDt'] + timedelta(days=7)

In [12]:
# Load the occupation lookup table and pull its 'Type' column out as a plain list.
# Uses the `pd` alias imported at the top of the notebook rather than
# `from pandas import *`, which floods the namespace with every public pandas name
# and hides where names such as read_csv come from.
Occs = pd.read_csv("occupations.csv")
OccsList = Occs['Type'].tolist()

# Draw 500 occupations (with replacement) from the occupation list and store them in Occ
randOccs = random.choices(population=OccsList, k=500)
Zipdf['Occ'] = randOccs

# Anyone aged 12 or younger is recorded as 'Child', overriding the occupation drawn above
is_child = Zipdf['Age'] <= 12
Zipdf.loc[is_child, 'Occ'] = 'Child'

# CASES only: pick one of the three possible outcomes at random for each of the 500 rows
outcomes = ["Alive", "Recovered", "Deceased"]
randOuts = random.choices(population=outcomes, k=500)
Zipdf['Outcome'] = randOuts

# Every row gets the same placeholder matching the outbreak's case-ID input mask;
# per the original author's note, Go.Data then assigns the ID based on the last entered ID
Zipdf['Case_ID'] = 'CASE-2021-9999'

In [13]:
# Display the assembled case table for a visual check before it is exported
Zipdf

Unnamed: 0,FName,LName,ReportDt,Age,Locations,Occ,class,OnsetDt,OutcometDt,Outcome,Case_ID
0,Kismat,Uppal,2022-11-29,69,Gibbsmouth,Traditional healer,Confirmed,2022-11-26,2022-12-06,Deceased,CASE-2021-9999
1,Miraya,Hans,2022-12-11,34,North Aaron,Mining,Confirmed,2022-12-08,2022-12-18,Deceased,CASE-2021-9999
2,Hansh,Sibal,2022-12-09,9,Lake Matthewfurt,Child,Confirmed,2022-12-06,2022-12-16,Recovered,CASE-2021-9999
3,Mohanlal,Walia,2022-12-06,50,Oneillville,Traditional healer,Confirmed,2022-12-03,2022-12-13,Deceased,CASE-2021-9999
4,Samarth,Kade,2022-12-11,68,Port Matthewside,Hunter,Confirmed,2022-12-08,2022-12-18,Deceased,CASE-2021-9999
...,...,...,...,...,...,...,...,...,...,...,...
495,Anaya,Bumb,2022-11-27,38,Davidside,Hunter,Confirmed,2022-11-24,2022-12-04,Recovered,CASE-2021-9999
496,Vanya,Borde,2022-11-19,47,Davidside,Civil servant,Confirmed,2022-11-16,2022-11-26,Alive,CASE-2021-9999
497,Indrajit,Manne,2022-11-27,31,Lake Matthewfurt,Mining,Confirmed,2022-11-24,2022-12-04,Recovered,CASE-2021-9999
498,Akarsh,Dhillon,2022-11-30,63,Sheryltown,Civil servant,Confirmed,2022-11-27,2022-12-07,Recovered,CASE-2021-9999


In [14]:
# Write the case table to a UTF-8 CSV for bulk upload to Go.Data; the file name
# carries the end date of the reporting window.
# NOTE(review): the '100' prefix does not match the 500 rows generated above, and the
# dataframe index is written as an extra unnamed first column - confirm both are intended.
export_name = '100BulkCases_{}.csv'.format(end_dt)
Zipdf.to_csv(export_name, encoding='utf-8')