In [16]:
from olpy import simulate
from sqlalchemy import create_engine
import numpy as np
import pandas as pd
import uuid
from datetime import datetime
import time
import random
from olpy.flight import Flight
from shapely.geometry import Point, Polygon
from geopy.geocoders import Nominatim
locator = Nominatim(user_agent="myGeocoder")

# Get schema of flight

# Make polygons for different beats

beat1 = Polygon([(-122.3786768, 37.6033406),
(-122.4109677, 37.5878350),
(-122.3965398, 37.5701493),
(-122.3797073, 37.5587194),
(-122.3336756, 37.5859306),
(-122.3597832, 37.5902835),
(-122.3786768, 37.6033406)])

beat2 = Polygon([(-122.3336756, 37.5859306),
(-122.3232582, 37.5807729),
(-122.3177618, 37.5839018),
(-122.2983529, 37.5723379),
(-122.3366555, 37.5080918),
(-122.3797073, 37.5587194),
(-122.3336756, 37.5859306)])

beat3 = Polygon([(-122.3366555, 37.5080918),
(-122.2983529, 37.5723379),
(-122.2626490, 37.5698889),
(-122.2489082, 37.5600919),
(-122.2542144, 37.5397113),
(-122.2947682, 37.5016914),
(-122.3366555, 37.5080918)])

beat4 = Polygon([(-122.2947682, 37.5016914),
(-122.2542144, 37.5397113),
(-122.2434810, 37.5486604),
(-122.2299119, 37.5419912),
(-122.2542643, 37.5120407),
(-122.2781174, 37.4854836),
(-122.2947682, 37.5016914)])

# Util functions

def generate_random_lat_long(number, polygon):
    list_of_points = []
    minx, miny, maxx, maxy = polygon.bounds
    counter = 0
    while counter < number:
        pnt = Point(random.uniform(minx, maxx), random.uniform(miny, maxy))
        if polygon.contains(pnt):
            list_of_points.append((pnt.x, pnt.y))
            counter += 1
    return list_of_points

def random_date_in_date_range(start, end):
    #current time in unix utc
    start_timestamp = datetime.strptime(start,'%Y-%m-%d').timestamp()
    end_timestamp = datetime.strptime(end,'%Y-%m-%d').timestamp()

    random_timestamp = np.random.randint(start_timestamp, end_timestamp)

    random_date = datetime.fromtimestamp(random_timestamp).strftime('%Y-%m-%d')

    return random_date

def random_date_up_to_x_days_after_previous_date(date, days):
    timestamp = datetime.strptime(date,'%Y-%m-%d').timestamp()
    days_in_seconds = days*24*60*60
    one_day = 24*60*60
    time_difference = np.random.randint(days_in_seconds) + one_day #add a day so possible range is 1-x days rather than 0-x
    new_date = datetime.fromtimestamp(timestamp + time_difference).strftime('%Y-%m-%d')
    return new_date

# Dicts for creating classes

providers = {
    'Joyland Healthcare': 0.05 ,
    'Peaceful Play': 0.25,
    'Sharing Center': 0.15,
    'Main St. Group Home': 0.25,
    'Saint Marys': 0.15,
    'Road Home': 0.15
}

council_districts = {
    'Santa Clara': 0.2,
    'San Mateo': 0.5,
    'San Francisco': 0.3
}

stay_away_dict = {
    50: 0.4,
    100: 0.3,
    200: 0.2,
    500: 0.1
}

beat_dict = {
    'beat1': 0.2,
    'beat2': 0.3,
    'beat3': 0.3,
    'beat4': 0.2
}

#Classes which generate data

class GuidesPerson():
    '''
    Generate Person Info (ID, FirstName, MiddleName, LastName, SSN, Sex, DOB)
    Include Mugshot and Drivers License in Person Info (ImageID, Image, DLNo)
    '''
    def __init__(self, key):

        self.inmate = simulate.subjectgenerator.Subject()
        self.inmate.generate()
        self.inmate = vars(self.inmate)['subject']
        self.inmate['SubjectIdentification'] = str(uuid.uuid1())

        self.inmate['ImageID'] = str(uuid.uuid1())
        self.inmate['Image'] = 'Bytes'

        self.inmate['DLNo'] = np.random.randint(1000000,9999999)

        today = datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d')

        probation = np.random.choice(['Y','N'], p = [0.5,0.5])

        if probation == 'Y':
            self.inmate['ProbationID'] = str(uuid.uuid1())
            self.inmate['ProbationStart'] = random_date_in_date_range('2015-01-01', today)
            self.inmate['ProbationEnd'] = random_date_up_to_x_days_after_previous_date(self.inmate['ProbationStart'], 1000)


        provider = np.random.choice(['Y','N'], p = [0.5,0.5])

        if provider == 'Y':
            self.inmate['Provider'] = np.random.choice(list(providers.keys()), p = list(providers.values()))

    @property
    def __dict__(self):
        return self.inmate

class Case():
    '''
    Generate Cases and assign to Council Districts (CaseID, CouncilDistrict) and randomly distribute them to people
    '''
    def __init__(self, key):

        self.case = {}
        self.case['CaseID'] = str(uuid.uuid1())
        self.case['CouncilDistrict'] = np.random.choice(list(council_districts.keys()), p = list(council_districts.values()))

    @property
    def __dict__(self):
        return self.case

class ServiceOfProcess():
    '''
    Generate serviceofprocess (StayAwayOrderID,  StayAwayRadius,StayAwayRadiusStr, StayAwayUnits)
    Include Location and Beat in generation of serviceofprocess (Coordinates, Lat, Long, Address, City, State)
    '''
    def __init__(self, key):

        self.serviceofprocess = {}
        self.serviceofprocess['StayAwayOrderID'] = str(uuid.uuid1())
        self.serviceofprocess['StayAwayRadius'] = np.random.choice(list(stay_away_dict.keys()), p = list(stay_away_dict.values()))
        sar = self.serviceofprocess['StayAwayRadius']
        self.serviceofprocess['StayAwayRadiusStr'] = f"Radius = {sar} yards"
        self.serviceofprocess['StayAwayUnits'] = "Yards"


        self.serviceofprocess['Beat'] = np.random.choice(list(beat_dict.keys()), p = list(beat_dict.values()))

        self.beat = self.serviceofprocess['Beat']

        self.serviceofprocess['Coordinates'] = generate_random_lat_long(1, eval(self.beat))[0]
        self.serviceofprocess['Lat'] = self.serviceofprocess['Coordinates'][0]
        self.serviceofprocess['Long'] = self.serviceofprocess['Coordinates'][1]


        self.sdlat = self.serviceofprocess['Lat']
        self.sdlong = self.serviceofprocess['Long']

        self.coordinates = f"{self.sdlong},{self.sdlat}"
        try:

            self.location = locator.reverse(self.coordinates)
            self.address = ""

            self.city = ""

            if 'house_number' in self.location.raw['address'].keys():
                self.address += self.location.raw['address']['house_number']+ " "

            if 'road' in self.location.raw['address'].keys():
                self.address += self.location.raw['address']['road']

            if 'town' in self.location.raw['address'].keys():
                self.city = self.location.raw['address']['town']

            if 'city' in self.location.raw['address'].keys():
                self.city = self.location.raw['address']['city']


            self.serviceofprocess['Address'] = self.address

            self.serviceofprocess['City'] = self.city
            self.serviceofprocess['State'] = self.location.raw['address']['state']

        except:
            print('this request didnt work')




    @property
    def __dict__(self):
        return self.serviceofprocess

# Main methods which create and join dataframes based on classes

def make_df(Enttype, keylist):

    dictlist = []

    for i in keylist:
        data = Enttype(i).__dict__ # right now i have certain dict methods returning a list -
                                    #this is a bit confusing and i would like to find a better way to design

        if isinstance(data, list):
            dictlist.extend(data)

        else:
            dictlist.append(data)

    return pd.DataFrame(dictlist).astype('object')

def join_rows_probablistically(df1,df2,n,join_p):
    df1_copy = df1.copy()
    df2_copy = df2.copy()
    dictlist = []
    for i in range(n):
        has_case = np.random.choice(['Y','N'], p = [join_p,1-join_p])
        df1_sample = df1_copy.sample().to_dict('record')[0]
        df2_sample = df2_copy.sample().to_dict('record')[0]

        if has_case == 'Y':
            result = {**df1_sample,**df2_sample}


        else:
            result = {**df1_sample}

        dictlist.append(result)
    return pd.DataFrame(dictlist).astype('object')

# Make and join all the data

lbpdf = make_df(GuidesPerson, list(range(200)))
case_df = make_df(Case, list(range(200)))
sop_df = make_df(ServiceOfProcess, list(range(30)))

case_sop_df = join_rows_probablistically(case_df, sop_df, 200, 0.5)
full = join_rows_probablistically(lbpdf, case_sop_df, 200, 0.8)

date_columns = ['dob']

for col in date_columns:
    full[col] = full[col].astype(np.datetime64).dt.strftime('%Y-%m-%d')
    full.loc[full[col] == 'NaT', col] = np.nan

date_to_datetime_columns = ['ProbationEnd','ProbationStart']
datetime_columns = [k for (k,v) in full.dtypes.items() if v.type == np.datetime64] + date_to_datetime_columns


for col in datetime_columns:
    full[col] = full[col].apply(lambda x: pd.Timestamp(x).tz_localize("America/Los_Angeles").to_pydatetime())

def make_assn_hash(df, col1, col2, name):
    cols = [col1,col2]
    c1nn = df.loc[df[cols].notnull().all(axis=1), col1].astype(str)
    c2nn = df.loc[df[cols].notnull().all(axis=1), col2].astype(str)
    combined_cols =  c1nn + c2nn
    assn_hash = combined_cols.apply(lambda x: hash(x+name))
    return assn_hash

def make_assn_cols(df, fd):
    for k, v in fd['associationDefinitions'].items():
        col_string = f"assn_{k}"
        src, dst = v['src'], v['dst']
        srccol = fd['entityDefinitions'][src]['properties'][0]['column']
        dstcol = fd['entityDefinitions'][dst]['properties'][0]['column']
        df[col_string] = make_assn_hash(df, srccol, dstcol, k)


fl = Flight()
fl.deserialize('guides_demo.yaml')
guides_fd = fl.schema

make_assn_cols(full, guides_fd)



Finished deserializing!


In [17]:
full

Unnamed: 0,firstName,lastName,middleName,sex,ssn,dob,race,ethnicity,SubjectIdentification,ImageID,Image,DLNo,Provider,ProbationID,ProbationStart,ProbationEnd,CaseID,CouncilDistrict,StayAwayOrderID,StayAwayRadius,StayAwayRadiusStr,StayAwayUnits,Beat,Coordinates,Lat,Long,Address,City,State,assn_appearsin,assn_ispictureof,assn_filedfor,assn_filedfor2,assn_locatedat,assn_locatedat2,assn_subjectof,assn_reported,assn_registeredfor
0,Christopher,Carlisle,Edward,M,780-25-3818,1967-03-15,White,Non-Hispanic,a8c2831e-6597-11ea-9fd7-acde48001122,a8c2838c-6597-11ea-9fd7-acde48001122,Bytes,7161665,,a8c28602-6597-11ea-9fd7-acde48001122,2016-11-21 00:00:00-08:00,2019-08-02 00:00:00-07:00,a95d9de0-6597-11ea-9fd7-acde48001122,San Francisco,,,,,,,,,,,,8.826550e+18,6651180436254062548,,,,-3.128812e+18,7.521969e+18,-3013193792761493484,
1,Kelly,Cave,Shanta,F,180-48-2633,1938-09-21,White,Unknown,a8a80ae8-6597-11ea-9fd7-acde48001122,a8a80b56-6597-11ea-9fd7-acde48001122,Bytes,6397238,,a8a80d90-6597-11ea-9fd7-acde48001122,2015-05-06 00:00:00-07:00,2015-05-27 00:00:00-07:00,,,,,,,,,,,,,,,-5166585693543557701,,,,,-2.869728e+18,-5514853238578688526,
2,David,Williams,Eddie,M,766-87-2412,2004-05-26,Asian,Hispanic,a9240026-6597-11ea-9fd7-acde48001122,a924008a-6597-11ea-9fd7-acde48001122,Bytes,6416916,,,NaT,NaT,a95e31ba-6597-11ea-9fd7-acde48001122,San Mateo,,,,,,,,,,,,-1.639909e+18,-8441147480216967678,,,,6.288223e+18,,-7422561030065827306,
3,Elena,Lindley,Alice,F,277-63-2453,1971-04-28,Other,Hispanic,a8ca98c4-6597-11ea-9fd7-acde48001122,a8ca9928-6597-11ea-9fd7-acde48001122,Bytes,2130347,,,NaT,NaT,a95e39da-6597-11ea-9fd7-acde48001122,San Francisco,,,,,,,,,,,,1.852059e+18,8946852917661453040,,,,-4.972480e+18,,5338713139784783294,
4,Charles,Recker,Robert,M,638-32-1615,1947-05-22,Other,Hispanic,a8c46de6-6597-11ea-9fd7-acde48001122,a8c46e5e-6597-11ea-9fd7-acde48001122,Bytes,5770908,,,NaT,NaT,a95d88b4-6597-11ea-9fd7-acde48001122,Santa Clara,,,,,,,,,,,,-3.386209e+18,2511648920226939202,,,,-3.189240e+18,,-5257987545947986516,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Kevin,Smith,Henry,M,282-15-0616,1985-04-21,White,Hispanic,a90204f8-6597-11ea-9fd7-acde48001122,a902055c-6597-11ea-9fd7-acde48001122,Bytes,1116968,Sharing Center,,NaT,NaT,,,,,,,,,,,,,,,-5393282011313939542,,,,,,5772851886414962215,
196,Carroll,Olson,Wilbur,M,406-78-4136,1946-06-01,Other,Hispanic,a8e102c6-6597-11ea-9fd7-acde48001122,a8e103d4-6597-11ea-9fd7-acde48001122,Bytes,2687809,,,NaT,NaT,a95e2058-6597-11ea-9fd7-acde48001122,San Mateo,aeb80f28-6597-11ea-9fd7-acde48001122,200,Radius = 200 yards,Yards,beat4,"(-122.28728689631788, 37.50228553520403)",-122.287,37.5023,1400 Alameda de las Pulgas,Belmont,California,-2.461876e+18,6990622538252295014,-4.891055e+18,5.070394e+18,8.285600e+18,-1.234555e+18,,-3003883929884120027,-8.105893e+18
197,Claude,Bond,Dustin,M,364-61-5355,1945-03-07,White,Hispanic,a9176622-6597-11ea-9fd7-acde48001122,a9176686-6597-11ea-9fd7-acde48001122,Bytes,7970488,,a91768ca-6597-11ea-9fd7-acde48001122,2018-04-12 00:00:00-07:00,2018-09-07 00:00:00-07:00,a95e2896-6597-11ea-9fd7-acde48001122,Santa Clara,,,,,,,,,,,,3.019962e+17,6384910745869940084,,,,9.223041e+18,-3.589866e+18,6033486635287889661,
198,Joseph,Mckinney,Kristopher,M,345-84-0533,1952-02-07,White,Hispanic,a8e14290-6597-11ea-9fd7-acde48001122,a8e142fe-6597-11ea-9fd7-acde48001122,Bytes,5660018,Main St. Group Home,,NaT,NaT,,,,,,,,,,,,,,,-6369001740237337668,,,,,,6007671499098588135,


In [3]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
pd.set_option('display.max_columns', None)

In [15]:
full['ProbationEnd'] = full['ProbationEnd'].apply(lambda x: pd.Timestamp(x).tz_localize("America/Los_Angeles").to_pydatetime())

In [8]:
datetime.combine('2020-10-19', datetime.time(0))

TypeError: descriptor 'time' requires a 'datetime.datetime' object but received a 'int'

In [4]:
full

Unnamed: 0,firstName,lastName,middleName,sex,ssn,dob,race,ethnicity,SubjectIdentification,ImageID,Image,DLNo,ProbationID,ProbationStart,ProbationEnd,Provider,CaseID,CouncilDistrict,StayAwayOrderID,StayAwayRadius,StayAwayRadiusStr,StayAwayUnits,Beat,Coordinates,Lat,Long,Address,City,State,assn_appearsin,assn_ispictureof,assn_filedfor,assn_filedfor2,assn_locatedat,assn_locatedat2,assn_subjectof,assn_reported,assn_registeredfor
0,Eric,Luna,Willie,M,837-12-7017,1995-07-11,White,Hispanic,d9ef17e2-6595-11ea-9fd7-acde48001122,d9ef1850-6595-11ea-9fd7-acde48001122,Bytes,7710276,d9ef1a9e-6595-11ea-9fd7-acde48001122,2019-03-07,2020-10-19,,dac7a24c-6595-11ea-9fd7-acde48001122,San Francisco,dcbff31a-6595-11ea-9fd7-acde48001122,50,Radius = 50 yards,Yards,beat1,"(-122.38699294351764, 37.57908019816505)",-122.387,37.5791,2887 Hillside Drive,Burlingame,California,4.268701e+18,3779992156689224579,3.213795e+18,2.639666e+18,3.108165e+18,1.071079e+18,-8.751623e+18,8846513726058345975,2.491211e+17
1,Christopher,Feder,Frank,M,153-87-3620,1969-10-06,White,Non-Hispanic,da125914-6595-11ea-9fd7-acde48001122,da125978-6595-11ea-9fd7-acde48001122,Bytes,4254703,da125bb2-6595-11ea-9fd7-acde48001122,2016-12-22,2017-09-02,,dac84fe4-6595-11ea-9fd7-acde48001122,San Francisco,e32e8c34-6595-11ea-9fd7-acde48001122,100,Radius = 100 yards,Yards,beat4,"(-122.27865486127585, 37.50479879932981)",-122.279,37.5048,154 Wessex Way,San Carlos,California,-2.045771e+18,8492739791804270227,2.981676e+18,3.413453e+18,-9.096443e+16,-5.832331e+18,-8.734031e+18,-3006453347489913162,-6.591085e+18
2,Joanna,Sideris,Gwen,F,078-63-4631,1987-11-22,White,Hispanic,d9ea9af0-6595-11ea-9fd7-acde48001122,d9ea9b5e-6595-11ea-9fd7-acde48001122,Bytes,8685966,,,,Road Home,dac85408-6595-11ea-9fd7-acde48001122,San Francisco,e32e8c34-6595-11ea-9fd7-acde48001122,100,Radius = 100 yards,Yards,beat4,"(-122.27865486127585, 37.50479879932981)",-122.279,37.5048,154 Wessex Way,San Carlos,California,-1.294635e+18,686519245094235503,2.981676e+18,-2.495861e+18,-9.096443e+16,-1.036526e+18,,7672862081025069299,-6.591085e+18
3,Laura,Fair,Cynthia,F,538-60-1885,1925-05-21,Declined,Non-Hispanic,da791208-6595-11ea-9fd7-acde48001122,da79128a-6595-11ea-9fd7-acde48001122,Bytes,4025418,,,,Road Home,dac86af6-6595-11ea-9fd7-acde48001122,San Mateo,dcbff31a-6595-11ea-9fd7-acde48001122,50,Radius = 50 yards,Yards,beat1,"(-122.38699294351764, 37.57908019816505)",-122.387,37.5791,2887 Hillside Drive,Burlingame,California,-4.581253e+17,-1732276128219841203,3.213795e+18,-5.293296e+18,3.108165e+18,-5.766407e+18,,-8716251997085100705,2.491211e+17
4,Jeremy,Toledo,Alonzo,M,546-75-7743,1964-12-29,Multiracial,Hispanic,da2a18c4-6595-11ea-9fd7-acde48001122,da2a1932-6595-11ea-9fd7-acde48001122,Bytes,2433936,,,,Joyland Healthcare,dac85408-6595-11ea-9fd7-acde48001122,San Francisco,dfd229ba-6595-11ea-9fd7-acde48001122,50,Radius = 50 yards,Yards,beat2,"(-122.32247035511487, 37.543601528863036)",-122.322,37.5436,Murphy Drive,San Mateo,California,-7.966995e+18,-2036361746721548383,2.964161e+18,8.323974e+18,5.061411e+18,-1.036526e+18,,7416008264771513309,-2.800844e+18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,Robert,Yazzie,Rafael,M,885-80-1461,2005-06-04,White,Hispanic,da8b1f48-6595-11ea-9fd7-acde48001122,da8b1fac-6595-11ea-9fd7-acde48001122,Bytes,3574765,,,,,,,,,,,,,,,,,,,2947212772384091466,,,,,,8061203099420881943,
196,Gayle,Kelton,Melva,F,538-70-8138,1997-07-27,Black,Hispanic,da8a33f8-6595-11ea-9fd7-acde48001122,da8a3466-6595-11ea-9fd7-acde48001122,Bytes,2216993,,,,Saint Marys,dac8465c-6595-11ea-9fd7-acde48001122,San Mateo,e17f0e72-6595-11ea-9fd7-acde48001122,50,Radius = 50 yards,Yards,beat4,"(-122.28719394315954, 37.50326318212147)",-122.287,37.5033,1400 Alameda de las Pulgas,Belmont,California,7.325437e+18,132793820472196312,-4.175329e+18,4.388928e+18,-3.588184e+18,-2.335473e+18,,-8590843158931404934,5.143740e+18
197,Christopher,Beales,Joe,M,564-27-0070,1991-09-03,White,Hispanic,da598dde-6595-11ea-9fd7-acde48001122,da598e56-6595-11ea-9fd7-acde48001122,Bytes,9618195,da5990b8-6595-11ea-9fd7-acde48001122,2019-06-24,2020-02-22,Road Home,,,,,,,,,,,,,,,8957269529122965991,,,,,3.851573e+18,6781173783319326945,
198,Jose,Arnold,Ralph,M,232-54-8348,1970-01-20,Black,Hispanic,da43c7ba-6595-11ea-9fd7-acde48001122,da43c832-6595-11ea-9fd7-acde48001122,Bytes,1110115,da43cada-6595-11ea-9fd7-acde48001122,2016-08-07,2018-03-04,Saint Marys,dac83176-6595-11ea-9fd7-acde48001122,Santa Clara,db572642-6595-11ea-9fd7-acde48001122,50,Radius = 50 yards,Yards,beat2,"(-122.32499248200747, 37.578737422497376)",-122.325,37.5787,457 North Idaho Street,San Mateo,California,2.000915e+18,-8271525824819858068,-8.780538e+18,3.675579e+18,-2.548280e+18,4.174960e+18,-5.209042e+18,5270950320142268110,-2.117330e+18
