In [None]:
import numpy as np
import pandas as pd
import itertools


In [None]:
class Population:
    """
    Simulation environment that generates a synthetic population.

    Individuals are generated per location and per (wealth, employment) subgroup
    from two input tables (see ``__init__``). After construction the object exposes:

        simulation_name     : label of this simulation run.
        location_attributes : copy of the location-level input table.
        location_profiles   : copy of the subgroup-level input table.
        location_names      : list of location names, in input order.
        people              : pandas.DataFrame with one row per generated person.
        census_calibration  : expected subgroup totals implied by the inputs.
        census_simulation   : subgroup totals actually observed in ``people``.
    """

    # Class-wide counters: they keep growing across instances so that default
    # simulation names and person names never repeat within one kernel session.
    SIMULATIONS = 0
    PEOPLE = 0

    def __init__(self,location_attributes,location_profiles,simulation_name=None,random_state=None):
        """
            Object that defines the environment of a simulation and generates a population.

            The location attributes are fixed for each location and are used for high-level calibration
                of the approximate distributions of key features.

            The location profiles are used to determine characteristics of individuals generated
                in population subgroups of each location.

            :param location_attributes: pandas.DataFrame with the following fields:
                location_name : (string) location identifier.
                density : (string) rural or urban.
                population : (int) Number of residents.
                employment_rate : (float) Rate between 0 and 1 of employed (vs. unemployed) residents.
                wealth_rate : (float) Rate between 0 and 1 of high-income (vs. low-income) residents.

            :param location_profiles: pandas.DataFrame with the following fields:
                location_name : (string) location identifier.
                wealth_status : (int or bool) 0 represents low income and 1 represents high income.
                employment_status : (int or bool) 0 represents unemployed and 1 represents employed.
                phoneownership_rate : (float) Probability between 0 and 1 that a resident owns a cellphone.
                worktravel_baseline : (float) Distance in miles of baseline for work-related travel.
                socialtravel_baseline : (float) Distance in miles of baseline for social-related travel.
                grocerytravel_baseline : (float) Distance in miles of baseline for grocery-related travel.

            :param simulation_name: (string) Unique identifier of simulation run, or None
                (in which case "simulation<N>" is used, N being the class-wide run counter).

            :param random_state: integer representing the random state, or None.

            :return: Population object.

            :raises AssertionError: if the location profiles fail check_location_profiles().
        """

        # Private generator instead of np.random.seed(): the process-wide NumPy
        # random state is left untouched. RandomState is the same legacy generator
        # that np.random.seed() drives, so a given seed yields the same draws.
        rng = np.random.RandomState(random_state)

        # Increment counter and store simulation label:
        Population.SIMULATIONS += 1
        if simulation_name is None:
            simulation_name = "simulation{}".format(Population.SIMULATIONS)
        self.simulation_name = simulation_name

        # Define lookup dictionaries (0/1 and False/True are both accepted):
        wealth_labels = {
            0 : "lowincome", False : "lowincome",
            1 : "highincome", True : "highincome",
        }
        employment_labels = {
            0 : "unemployed", False : "unemployed",
            1 : "employed", True : "employed",
        }

        # Verify and store input data (copies, so the caller's frames are not shared):
        self.location_attributes = location_attributes.copy()
        self.location_profiles = location_profiles.copy()
        self.check_location_profiles()

        location_names = []
        people = []              # one record per generated person
        census_calibration = []  # one record per (location, subgroup): expected totals

        for _,attribute_row in self.location_attributes.iterrows():
            location_name = attribute_row['location_name']
            location_names.append(location_name)
            wealth_rate = attribute_row['wealth_rate']
            employment_rate = attribute_row['employment_rate']

            # Iterate through the profiles that belong to this location:
            in_location = self.location_profiles['location_name']==location_name
            for _,profile_row in self.location_profiles[in_location].iterrows():
                # Build subgroup labels:
                wealth_status = profile_row['wealth_status']
                employment_status = profile_row['employment_status']
                wealth_label = wealth_labels[wealth_status]
                employment_label = employment_labels[employment_status]

                # Expected subgroup size: location population split by the
                # employment share and then by the wealth share of this subgroup.
                employment_share = employment_rate if employment_status==1 else (1-employment_rate)
                wealth_share = wealth_rate if wealth_status==1 else (1-wealth_rate)
                subgroup_population = attribute_row['population'] * employment_share * wealth_share

                # Distribution parameters for this subgroup:
                location_density = attribute_row['density']
                phoneownership_prob = profile_row['phoneownership_rate']
                worktravel_mean = profile_row['worktravel_baseline']
                socialtravel_mean = profile_row['socialtravel_baseline']
                grocerytravel_mean = profile_row['grocerytravel_baseline']
                # These are passed as the `scale` (standard deviation) of the normal
                # draws below, i.e. the spread is one fifth of the baseline.
                worktravel_sd = worktravel_mean/5
                socialtravel_sd = socialtravel_mean/5
                grocerytravel_sd = grocerytravel_mean/5

                # Generate people; the head count is the expected size rounded to an integer:
                subgroup_population_rounded = int(np.round(subgroup_population,0))
                for _ in range(subgroup_population_rounded):
                    Population.PEOPLE += 1
                    people.append({
                        'person_name' : "person{}".format(Population.PEOPLE),
                        'simulation_name' : self.simulation_name,
                        'location_name' : location_name,
                        'location_density' : location_density,
                        'wealth' : wealth_label,
                        'employment' : employment_label,
                        # Bernoulli draw: 1 if the person owns a phone.
                        'phoneownership' : rng.binomial(1,phoneownership_prob),
                        # Normal draws around the baseline, floored at zero.
                        'worktravel' : max(0,rng.normal(worktravel_mean,worktravel_sd)),
                        'socialtravel' : max(0,rng.normal(socialtravel_mean,socialtravel_sd)),
                        'grocerytravel' : max(0,rng.normal(grocerytravel_mean,grocerytravel_sd)),
                    })

                # Expected totals use the unrounded subgroup size.
                census_calibration.append({
                    'location_name' : location_name,
                    'location_density' : location_density,
                    'wealth' : wealth_label,
                    'employment' : employment_label,
                    'people' : subgroup_population,
                    'phoneownership' : subgroup_population*phoneownership_prob,
                    'worktravel' : subgroup_population*worktravel_mean,
                    'socialtravel' : subgroup_population*socialtravel_mean,
                    'grocerytravel' : subgroup_population*grocerytravel_mean,
                })
        census_calibration = pd.DataFrame(census_calibration)
        people = pd.DataFrame(people)

        self.location_names = location_names
        self.people = people

        # Compute census-like figures (for simulated data):
        group_cols = ['location_name','location_density','wealth','employment']
        value_cols = ['people','phoneownership','worktravel','socialtravel','grocerytravel']
        travel_cols = ['worktravel','socialtravel','grocerytravel']
        census_simulation = self.people.assign(people=1)
        census_simulation = census_simulation.groupby(group_cols)[value_cols].sum()
        census_simulation['travel'] = census_simulation[travel_cols].sum(axis=1)
        census_simulation = census_simulation.sort_index()

        # Same layout for the census-like figures implied by the inputs (calibration data):
        census_calibration = census_calibration.set_index(group_cols)
        census_calibration['travel'] = census_calibration[travel_cols].sum(axis=1)
        census_calibration = census_calibration.sort_index()

        self.census_calibration = census_calibration
        self.census_simulation = census_simulation

    # Check validity of subgroup profiles based on hardcoded rules:
    def check_location_profiles(self):
        """
            Validate self.location_profiles against a fixed set of rules.

            Rules, checked per location:
                - every (employment_status, wealth_status) combination built from the
                  values seen anywhere in the table is present;
                - phoneownership_rate takes a single value per wealth_status;
                - worktravel_baseline takes a single value per employment_status;
                - socialtravel_baseline takes a single value per wealth_status;
                - grocerytravel_baseline takes a single value.
            The table as a whole must not contain blank values.

            Only location_profiles is inspected: a location that appears in
            location_attributes but has no profile rows is not reported here.

            :raises AssertionError: on the first violated rule.
        """
        location_profiles = self.location_profiles
        subgroup_cols = ['employment_status','wealth_status']
        assert pd.isnull(location_profiles).sum().sum()==0, "Location profile table has blank values."

        def subgroup_label(group_cols,group_vals):
            # Render "col=val,col=val"; flatten() accepts scalars, lists and tuples alike.
            group_cols = np.array([group_cols]).flatten()
            group_vals = np.array([group_vals]).flatten()
            return ",".join(
                "{}={}".format(group_col,group_val)
                for group_col,group_val in zip(group_cols,group_vals)
            )

        def verify_unique(subgroup_profiles,value_col,condition_cols):
            # Assert that value_col is constant, overall or within each condition group.
            if (condition_cols is None) or (len(condition_cols)==0):
                # Unconditional:
                vals = set(subgroup_profiles[value_col])
                assert len(vals)==1, "Found multiple values for column {} : {}".format(
                    value_col,vals
                )
            else:
                # Conditional:
                for g,grp in subgroup_profiles.groupby(condition_cols):
                    vals = set(grp[value_col])
                    assert len(vals)==1, "Found multiple values for column {} conditional on {}: {}".format(
                        value_col,subgroup_label(condition_cols,g),vals
                    )

        # All combinations of the grouping values found in the whole table
        # (independent of the location, so computed once):
        possible_values = [sorted(set(location_profiles[col])) for col in subgroup_cols]
        possible_combos = list(itertools.product(*possible_values))

        for location_name,subgroup_profiles in location_profiles.groupby('location_name'):
            # Make sure that each combination is represented:
            actual_combos = subgroup_profiles[subgroup_cols].to_records(index=False)
            actual_combos = [tuple(actual_combo) for actual_combo in actual_combos]
            for possible_combo in possible_combos:
                assert possible_combo in actual_combos, "Missing combination in {}: {}".format(
                    location_name,subgroup_label(subgroup_cols,possible_combo)
                )
            # Make sure that phoneownership_rate depends only on wealth_status (for this location):
            verify_unique(subgroup_profiles,'phoneownership_rate',['wealth_status'])
            # Make sure that worktravel_baseline depends only on employment_status (for this location):
            verify_unique(subgroup_profiles,'worktravel_baseline',['employment_status'])
            # Make sure that socialtravel_baseline depends only on wealth_status (for this location):
            verify_unique(subgroup_profiles,'socialtravel_baseline',['wealth_status'])
            # Make sure that grocerytravel_baseline is unconditional (for this location):
            verify_unique(subgroup_profiles,'grocerytravel_baseline',None)


In [None]:
# Fixed, location-level attributes (one row per location):
location_attributes = pd.DataFrame(
    data=[
        ('loc1', 'rural', 3000, 0.7, 0.6),
        ('loc2', 'rural', 7000, 0.7, 0.7),
        ('loc3', 'rural', 4000, 0.6, 0.4),
        ('loc4', 'rural', 3000, 0.6, 0.5),
        ('loc5', 'urban', 1000, 0.6, 0.5),
        ('loc6', 'urban', 2000, 0.6, 0.5),
        ('loc7', 'urban', 3000, 0.8, 0.5),
        ('loc8', 'urban', 2000, 0.7, 0.5),
    ],
    columns=['location_name', 'density', 'population', 'employment_rate', 'wealth_rate'],
)
location_attributes


In [None]:
# Profiles of the sub-populations of each location, one row per
# (location, wealth_status, employment_status) combination:
location_profiles = pd.DataFrame(
    data=[
        # name, wealth, employment, phone, work, social, grocery
        ('loc1', 0, 0, 0.7,   0, 50, 10),
        ('loc1', 0, 1, 0.7, 100, 50, 10),
        ('loc1', 1, 0, 0.9,   0, 50, 10),
        ('loc1', 1, 1, 0.9, 100, 50, 10),
        ('loc2', 0, 0, 0.7,   0, 50, 10),
        ('loc2', 0, 1, 0.7, 100, 50, 10),
        ('loc2', 1, 0, 0.9,   0, 50, 10),
        ('loc2', 1, 1, 0.9, 100, 50, 10),
        ('loc3', 0, 0, 0.7,   0, 50, 10),
        ('loc3', 0, 1, 0.7, 100, 50, 10),
        ('loc3', 1, 0, 0.9,   0, 50, 10),
        ('loc3', 1, 1, 0.9, 100, 50, 10),
        ('loc4', 0, 0, 0.7,   0, 50, 10),
        ('loc4', 0, 1, 0.7, 100, 50, 10),
        ('loc4', 1, 0, 0.9,   0, 50, 10),
        ('loc4', 1, 1, 0.9, 100, 50, 10),
        ('loc5', 0, 0, 0.7,   0, 50,  5),
        ('loc5', 0, 1, 0.7, 100, 50,  5),
        ('loc5', 1, 0, 0.9,   0, 50,  5),
        ('loc5', 1, 1, 0.9, 100, 50,  5),
        ('loc6', 0, 0, 0.7,   0, 50,  5),
        ('loc6', 0, 1, 0.7, 100, 50,  5),
        ('loc6', 1, 0, 0.9,   0, 50,  5),
        ('loc6', 1, 1, 0.9, 100, 50,  5),
        ('loc7', 0, 0, 0.7,   0, 50,  5),
        ('loc7', 0, 1, 0.7, 100, 50,  5),
        ('loc7', 1, 0, 0.9,   0, 50,  5),
        ('loc7', 1, 1, 0.9, 100, 50,  5),
        ('loc8', 0, 0, 0.7,   0, 50,  5),
        ('loc8', 0, 1, 0.7, 100, 50,  5),
        ('loc8', 1, 0, 0.9,   0, 50,  5),
        ('loc8', 1, 1, 0.9, 100, 50,  5),
    ],
    columns=[
        'location_name',
        'wealth_status',
        'employment_status',
        'phoneownership_rate',
        'worktravel_baseline',
        'socialtravel_baseline',
        'grocerytravel_baseline',
    ],
)

location_profiles


In [None]:
# Build the simulated population from the two tables above, with a fixed seed
# for the random draws, then list the locations and show the generated people:
population = Population(
    location_attributes=location_attributes,
    location_profiles=location_profiles,
    random_state=221,
)
print(population.location_names)

population.people


In [None]:
# Responses:
#

# Measures:
#   - Standard (perfect measurement).
#   - Stochastic phone measurement.
#   - Stochastic phone measurement, inflating grocery (e.g. if data is from store coupon app).
