# Load Data

> Functions that load the data for the map.   

**Contents**

`Solomon Geospatial Data`
- `SolomonGeo`: A class that cleans the Solomon Islands census and geography data
- `SolomonGeo.read_test`: Loads and transforms the test data
- `SolomonGeo.get_geojson`: Returns the geo_df as a geojson dataset

In [None]:
#| default_exp load_data

In [None]:
#| export
from nbdev.showdoc import *
import geopandas as gpd
import pandas as pd
import numpy as np
from git import Repo
import json
from fastcore import *
from fastcore.basics import patch
from fastcore.test import *
import sys
import topojson as tp
import pickle
from urllib.request import urlopen
import boto3
from dotenv import load_dotenv
from dash import dcc
import os
import copy


load_dotenv()

True

In [None]:
#| hide
# Locate the root of the enclosing git repository, searching upwards from here
repo = Repo('.', search_parent_directories=True)
# Folder inside the repository that holds the test data
fp = f"{repo.working_tree_dir}/testData/"
# Load the 2009 census extract covering all aggregation levels
const_df = pd.read_csv(f"{fp}sol_census_all_2009_v2.csv")
# Check that the files exist using fastcore (both census and geo)

## Solomon Geospatial Data
> Load the geography and census data
### Geography Data
Solomon Islands geography data is organised at the following levels:
 - adm0 - The country as a whole, Solomon Islands
 - adm1 - Also referred to as the province e.g. Honiara, Malaita
 - adm2 - The Constituency e.g. Central Honiara
 - adm3 - Ward, the smallest geography I am reporting. E.g. Cruz

### Census Data
 Solomon Islands census data has been used from the 2009 and 2019 censuses. For the respective census:
2009
 - We have the total population for each of the administration regions
2019
 - There is only data available down to the province level

### Test that the .env variables exist

In [None]:
#| hide
# Read the AWS credentials; load_dotenv() has already copied any .env entries
# into the process environment.
ACCESS_KEY = os.getenv("ACCESS_KEY")
SECRET_ACCCESS_KEY = os.getenv("SECRET_ACCESS_KEY")
REGION_NAME = os.getenv("REGION_NAME")
# os.getenv returns None for a missing variable, so test truthiness rather than
# len(), which raises a TypeError on None.
if not ACCESS_KEY:
    # If not in .env, then use environment variables. A direct lookup raises a
    # KeyError naming the variable that is missing.
    ACCESS_KEY = os.environ["ACCESS_KEY"]
    SECRET_ACCCESS_KEY = os.environ["SECRET_ACCESS_KEY"]
    REGION_NAME = os.environ["REGION_NAME"]

In [None]:
# Test that the environment variables can be loaded
test(ACCESS_KEY, None, nequals)
test(SECRET_ACCCESS_KEY, None, nequals)
test(REGION_NAME, None, nequals)
test_eq(REGION_NAME, 'ap-southeast-2')

## Function for connecting to AWS S3 client

In [None]:
#| export
def s3_client()-> boto3.client:
    '''
    Return a connection to the AWS s3 client.

    The credentials are read from the `ACCESS_KEY`, `SECRET_ACCESS_KEY` and
    `REGION_NAME` environment variables. A `KeyError` is raised if the access
    key is missing or empty and the direct environment lookup also fails.
    '''
    ACCESS_KEY = os.getenv("ACCESS_KEY")
    SECRET_ACCCESS_KEY = os.getenv("SECRET_ACCESS_KEY")
    REGION_NAME = os.getenv("REGION_NAME")
    # os.getenv returns None for a missing variable, so test truthiness rather
    # than len(), which raises a TypeError on None.
    if not ACCESS_KEY:
        # If not in .env, then use environment variables
        ACCESS_KEY = os.environ["ACCESS_KEY"]
        SECRET_ACCCESS_KEY = os.environ["SECRET_ACCESS_KEY"]
        REGION_NAME = os.environ["REGION_NAME"]
    session = boto3.Session(region_name='ap-southeast-2')
    # Creating the low level functional client
    return session.client(
        's3',
        endpoint_url='https://s3.ap-southeast-2.amazonaws.com',
        aws_access_key_id = ACCESS_KEY,
        aws_secret_access_key = SECRET_ACCCESS_KEY,
        region_name = REGION_NAME,
    )

In [None]:
#| hide
show_doc(s3_client)

---

### s3_client

>      s3_client ()

Return a connection to teh AWS s3 client

In [None]:
#| export
class SolomonGeo:
    # TODO work out how to format the attributes
    # Look at nbdev docs maybe?
    # TODO change all data to int?
    # TODO - should I make this a dataclass for the auto functionaliy? potentially should try it out
    '''
    Store the Solomon Islands census, population projection and geography data.

    Attributes:
        census    Census dataframe with two level hierarchical columns
        population    Population projection dataframe with two level hierarchical columns
        geo    Geodataframe containing the geographies
        geo_levels    The unique aggregation levels found in the census data
        census_vars    A dictionary mapping upper level census columns to their lower level columns
        population_vars    As `census_vars` for the population data, excluding `core` and `Age`
        ages    A list of the values in the (`Age`, `Age_Bracket`) population column
        pop_years    A list of the unique years in the population data
        data_type    The unique values of the (`core`, `type`) census column
        locations    A dictionary of locations accessed by the geography level
        type_default    The default data type, 'Total'
    '''
    def __init__(self, 
                cen_df:pd.DataFrame, # A dataset containing the census data,
                pop_df:pd.DataFrame, # A dataset contain the population projection data
                geos:gpd.GeoDataFrame, # A geodataframe containing geographies of data
    ):
        self.census = cen_df
        self.population = pop_df
        self.geo = geos

        # variable that tracks the types of aggregations
        self.geo_levels = cen_df.loc[:, ('core', 'agg')].unique()

        # Save the census variables, ignoring the core variables
        self.census_vars = self._group_columns(cen_df.drop(columns = "core", level=0))

        # Save the population variables, ignoring the core and age variables
        self.population_vars = self._group_columns(pop_df.drop(columns = ["core", "Age"], level=0))
        # Seperately save the age groupings
        self.ages = list(pop_df['Age']['Age_Bracket'].values)
        self.pop_years = list(np.unique(pop_df['core']['year'].values))

        # TODO should captialise first letter
        self.data_type = cen_df.loc[:, ('core', 'type')].unique()

        # save a list of locations as a dictionary access by geography level
        self.locations = {
            geo: cen_df.loc[cen_df['core']['agg'] == geo, ('core', 'location')].unique()
            for geo in self.geo_levels
        }
    
        # TODO: need a list of column sub headings: get from column name split by `:`

        self.type_default = 'Total'

    @staticmethod
    def _group_columns(df:pd.DataFrame, # A dataframe with two level hierarchical columns
                      ) -> dict: # Maps each upper level column name to a list of its lower level names
        '''
        Group the lower level column names of `df` under their upper level column name
        '''
        grouped = {}
        for col in df.columns:
            grouped.setdefault(col[0], []).append(col[1])
        return grouped


    @classmethod
    def read_test(cls,
                 ): # A solmon geo class TODO work out how to return self here... (can't?)
        '''
        Constructor that initialises the object from files using the local testing data.
        The files are read from the `testData` folder of the enclosing git repository.
        '''

        repo = Repo('.', search_parent_directories=True)
        pw = str(repo.working_tree_dir) + "/testData/"
        df = pd.read_csv(pw + 'sol_census_all_2009_v2.csv')
        pop = pd.read_csv(pw + 'solo_pop_proj_2009.csv')
        # One geography file is read per aggregation level found in the census data
        aggs = df.loc[:, 'agg'].unique()
        geos = []
        for agg in aggs:
            geo = gpd.read_file(pw + 'sol_geo_' + agg.lower() + '.json')
            # Add an agg column, as the data and geometry need to be joined by id and agg
            geo.loc[:, 'agg'] = agg
            geos.append(geo)

        ret = cls.__transform(df, pop, geos)
        return cls(
            cen_df = ret[0],
            pop_df = ret[1],
            geos = ret[2],
        )
    
    @classmethod
    def load_pickle(cls,
                    folder:str = "/testData/", # folder inside the repo to load from, only used when aws is False
                    aws:bool = True, # Whether to load from the AWS S3 bucket (True) or the local repo (False)
                    file_name:str = 'sol_geo.pickle' # file name of the saved class
                 ):
        '''
        A constructor that initialises the object from a pickle, stored either in the
        AWS S3 bucket `hobby-data` or in the local git repository.

        Raises a `ValueError` if the downloaded AWS object cannot be unpickled.
        '''
        # NOTE(review): unpickling can execute arbitrary code, only load pickles
        # from a bucket or folder that is trusted.
        if aws:
            # Only connect to AWS when it is needed, so that local loading
            # works without any credentials being configured
            client = s3_client()

            # Create the S3 object
            obj = client.get_object(
                Bucket = 'hobby-data',
                Key = file_name, 
            )

            # Read in the pickle
            try:
                tmp_geo = pickle.load(obj['Body'])
            except Exception as err:
                raise ValueError("Issue dowloading pickle file from AWS.") from err
                
        else:
            # TODO work out how to make this a class method
            repo = Repo('.', search_parent_directories=True)
            pw = str(repo.working_tree_dir) + folder + file_name
            
            with open(pw, 'rb') as f:
                tmp_geo = pickle.load(f)
 
        
        return cls(
            cen_df = gpd.GeoDataFrame(tmp_geo['census']),
            pop_df = gpd.GeoDataFrame(tmp_geo['population']),
            geos = gpd.GeoDataFrame(tmp_geo['geo']),
        )
        
    
    @classmethod
    def gen_stored(cls,
                  json_sol:dict, # A dictionary with a "data" entry holding the "census", "population" and "geojson" data
                 ): # A solmon geo class TODO work out how to return self here... (can't?)
        '''
        A constructor that creates a SolomonGeo object from its JSON serialised form.
        The purpose of this is to allow the object to be stored JSON serialised in a DCC.Store object in 
        the browser before being deserialised as an object.

        Note that storing and the reloading, will result in dropping the geometry.
        '''
        def df_to_hier(df:pd.DataFrame, # dataframe to convert to hierarchical
                       ) -> pd.DataFrame: # Converted dataframe back to hierachical
            # Split each "upper: lower" column name into its two levels
            cols = df.columns.str.extract(r'(.*): (.+)', expand=True)
            df.columns = pd.MultiIndex.from_arrays((cols[0], cols[1]))
            df.columns.names = [None]*2
            return df
        
        json_sol = json_sol["data"]

        census = pd.DataFrame(json_sol['census'])
        census = df_to_hier(census)
        census.index.name = 'pk'

        population = pd.DataFrame(json_sol['population'])
        population = df_to_hier(population)

        geo = gpd.GeoDataFrame(json_sol['geojson'])

        return cls(
            cen_df = census,
            pop_df = population,
            geos = geo,
        )
    
    @classmethod
    def __transform(cls, 
                    df:pd.DataFrame, # The dataframe containing census data
                    pop_df:pd.DataFrame, # The dataset containing the population projection data
                    l_geos:[gpd.GeoDataFrame], # A list of geopandas dataframes containing 
                                                # the geographies 
                 ) -> tuple: # Returns the cleaned census, population and geography datasets, in that order
        '''
        Clean the raw input datasets, assuming they are in the correct format, and
        return them as a tuple of (census, population, geography).

        Raises a `ValueError` if the columns cannot be converted to a hierarchy.
        '''
        # TODO seperate out the geometry from the data.
        # TODO - make a function that tests that the geo and datasets both join

        geos = gpd.GeoDataFrame()
        for geo in l_geos:
            # Before combining, need to rename like columns
            # Rename columns and keep only necessary ones, Note that id can be province id, contsituency id etc.
            geo.columns = geo.columns.str.replace(r'^[a-zA-Z]+name$', 'geo_name', case = False, regex = True)
            # TODO this assumes the id key column is the first one (which so far it is...)
            geo.rename(columns = {geo.columns[0]:'id'}, inplace=True)

            geo = geo.loc[:, ['id', 'agg', 'geometry']] 

            # simplify the geography, use topo to preserver the topology between shapes
            topo = tp.Topology(geo, prequantize=False)
            geo = topo.toposimplify(720/43200).to_gdf() # old 360/43200

            geos = pd.concat([geos, geo])
            
        # Clean the geospatial dataframe
        geos.loc[:, 'year'] = '2009'
        
        # Clean the census data
        df = df.dropna()
        # Rename columns to be consistent across geography
        df.columns = df.columns.str.replace(r'^[a-zA-Z]+_name$', 'location', case = False, regex = True)
        # id needs to change types twice so that it is a string of an int
        df = df.astype({'id': 'int'})#, 'male_pop':'int', 	'female_pop':'int', 'total_pop':'int'})
        df = df.astype({'id': 'str'})

        pop_df = pop_df.astype({'core: id': 'int'})
        pop_df = pop_df.astype({'core: id': 'str'})

        # Add location names to geography dataset
        locations = copy.copy(df)
        locations = locations.loc[:, ['id', 'agg', 'location']].drop_duplicates()
        geos = geos.merge(locations, on=['id', 'agg'], how = 'left')

        # Index is unique by type and geoname
        df['pk'] = df['location'] + "_" + df["type"] 
        df = df.set_index("pk") 

        # Rename some of the census data
        df = df.rename(columns = {
                                'id':'core: id', 'agg':'core: agg', 'location':'core: location',
                                'year':'core: year', 'type':'core: type'})

        # Test that the datasets all have geographies
        test_geo(df, geos)
        test_geo(pop_df, geos.loc[geos['agg'] == 'Province'])         

        # Convert into a multiindex dataframe, with hiearchical columns
        try:
            cols = df.columns.str.extract(r'(.*): (.+)', expand=True)
            df.columns = pd.MultiIndex.from_arrays((cols[0], cols[1]))
            df.columns.names = [None]*2

            cols2 = pop_df.columns.str.extract(r'(.*): (.+)', expand=True)
            pop_df.columns = pd.MultiIndex.from_arrays((cols2[0], cols2[1])) 
            pop_df.columns.names = [None]*2
        except Exception as err:
            raise ValueError("Issue converting geopandas dataframe to multindex. \
                             Check that all columns have \': \' beside the following\
                             core columns: geometry, id, agg, year, type.") from err
        
        # Set index of geography and population data
        geos = geos.set_index(geos.loc[:, 'location']) 
        pop_df.set_index(('core', 'location'), inplace = True)

        # Set all non core and age columns of population to int variables
        # TODO must be a better way to do this
        cols = pop_df.columns.get_level_values(0)
        ignore = ['core', 'Age']
        cols = [c for c in cols if c not in ignore]
        cols = list(set(cols))
        for c1 in cols:
            to_change = pop_df[c1].columns
            for c2 in to_change:
                # Keep the first whitespace separated token and strip thousands separators
                pop_df[(c1, c2)] = pop_df[(c1, c2)].apply(lambda x: int(x.split()[0].replace(',', '')))
        
                
        # return the transformed dataset
        return df, pop_df, geos


# SolomonGeo Data Class
This data class manipulates and stores the geospatial and regular data relating to each geography. It includes
the necessary data and methods to manipulate that data to build the resulting dash app.

In [None]:
#| hide
show_doc(SolomonGeo)

---

### SolomonGeo

>      SolomonGeo (cen_df:pandas.core.frame.DataFrame,
>                  pop_df:pandas.core.frame.DataFrame,
>                  geos:geopandas.geodataframe.GeoDataFrame)

Load the solomon islands geography data 
Attributes:
    cen_df    Geopandas dataframe containing geographies and census data
    geo_levels    A list of the types of available aggregations
    census_vars    A dictionary of census variables in the dataset 
    data_type   Specifies whether the variable is a percentage or number
    locations A dictionary of locations accessed by the geography level

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| cen_df | DataFrame | A dataset containing the census data, |
| pop_df | DataFrame | A dataset contain the population projection data |
| geos | GeoDataFrame | A geodataframe containing geographies of data |

## Constructors for SolomonGeo class
These constructors are used to build the SolomonGeo objects from various inputs

In [None]:
#| hide
show_doc(SolomonGeo.read_test)

---

### SolomonGeo.read_test

>      SolomonGeo.read_test ()

Contsructor that initialises the object from files using the local testing data

In [None]:
show_doc(SolomonGeo.load_pickle)

---

### SolomonGeo.load_pickle

>      SolomonGeo.load_pickle (folder:str='/testData/', aws:bool=True,
>                              file_name:str='sol_geo.pickle')

A constuctor that initialises the object from aws pickle

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| folder | str | /testData/ | file path of the folder to save in |
| aws | bool | True | Whether to load from github or local |
| file_name | str | sol_geo.pickle | file name of the saved class |

In [None]:
#| hide
show_doc(SolomonGeo.gen_stored)

---

### SolomonGeo.gen_stored

>      SolomonGeo.gen_stored (json_sol:dict)

A constructor that creates a JSON serialised SolomonGeo object from a stored geopandas dataframe.
The purpose of this is to allow the object to be stored JSON serialised in a DCC.Store object in 
the browser before being deserialised and as an object.

Note that storing and the reloading, will result in dropping the geometry.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| json_sol | dict | A geojson dataset |

## Test that dataset and geography merge perfectly

In [None]:
# TODO should this method be callable from the class
def test_geo(df:pd.DataFrame, # The dataframe to test
                geo:gpd.GeoDataFrame, # The geographys
                ):
    '''
    Check that the data and the geography line up exactly when joined on the
    id and the aggregation level, so that no row on either side is unmatched.
    '''
    # An outer join marks rows present on only one side in the `_merge` column
    merged = geo.merge(
        df,
        how = 'outer',
        left_on = ['id', 'agg'],
        right_on = ['core: id', 'core: agg'],
        indicator = True,
    )
    unmatched = merged['_merge'] != 'both'
    # The total of non-joined rows should be 0
    test_eq(sum(unmatched), 0)



## Save SolomonGeo
Save as a pickled object

In [None]:
#| export
@patch
def save_pickle(self:SolomonGeo,
                aws:bool = True, # Whether to save to aws or locally
                folder:str = "/testData/", #file path of the folder to save in, only necesasry for local saving
                file_name:str = 'sol_geo.pickle' # file name of the saved class
             ):
    '''
    Save a pickle of the attributes of the SolomonGeo object, either to the
    AWS S3 bucket `hobby-data` or to a file inside the local git repository.

    Raises a `ValueError` if the upload to AWS fails.
    '''
    if aws:
        body_pickle = pickle.dumps(self.__dict__)
        try:
            client = s3_client()
            client.put_object(
                Bucket = 'hobby-data',
                Key = file_name, 
                Body = body_pickle
            )
        except Exception as err:
            raise ValueError("Issue uploading pickle file to AWS.") from err
    else:
        repo = Repo('.', search_parent_directories=True)
        pw = str(repo.working_tree_dir) + folder + file_name

        # The context manager closes the file even if pickling fails
        with open(pw, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)


In [None]:
show_doc(SolomonGeo.save_pickle)

---

### SolomonGeo.save_pickle

>      SolomonGeo.save_pickle (aws:bool=True, folder:str='/testData/',
>                              file_name:str='sol_geo.pickle')

Save a pickle of the SolomonGeo class in backblaze b2

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| aws | bool | True | Whether to save to aws or locally |
| folder | str | /testData/ | file path of the folder to save in, only necesasry for local saving |
| file_name | str | sol_geo.pickle | file name of the saved class |

## Get Geo JSON
A getter method for the geometry portion of the dataset that returns a GeoJSON formatted geography. 

It only includes the geography and location name as id

In [None]:
#| export
@patch
def get_geojson(self:SolomonGeo, 
                geo_filter:str = None, # Filters the geojson to the requested aggregation 
               ) -> dict: # Geo JSON formatted dataset
    '''
    Return the stored geometries as a Geo JSON formatted dictionary, optionally
    restricted to a single aggregation level.
    '''
    geo_df = self.geo
    # Narrow to the requested aggregation when one is given
    if geo_filter is not None:
        wanted = geo_df['agg'] == geo_filter
        geo_df = geo_df.loc[wanted, :]
    # Serialise only the geometry column to keep the payload small
    geometry_json = geo_df.loc[:, 'geometry'].to_json()
    return json.loads(geometry_json)

In [None]:
#| hide
show_doc(SolomonGeo.get_geojson)

---

### SolomonGeo.get_geojson

>      SolomonGeo.get_geojson (geo_filter:str=None)

A getter method for the SolomonGeo class that returns a Geo JSON formatted dataset

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| geo_filter | str | None | Filters the geojson to the requested aggregation |
| **Returns** | **dict** |  | **Geo JSON formatted dataset** |

## Store JSON
A getter method that returns a dcc.Store object with the data of the `SolomonGeo` class
converted to a dictionary for storing with dash. We use a dictionary instead of geojson
as converting a dictionary back to geopandas is much faster than converting geojson (0.01s vs 1.2s). This 
makes the map much more snappy and responsive.

On storing, we drop the geometry as it won't store as json. Note that you can get around this
by using a geojson, but this increases reload time by a lot. 

In [None]:
#| export
@patch
def get_store(self:SolomonGeo, 
            ) -> dcc.Store: # A dash store holding the serialised data
    '''
    Build a dcc.Store with id `geo_df` that holds the census, population and
    geography data of the `SolomonGeo` object as plain dictionaries.
    '''
    def _flatten_columns(frame:pd.DataFrame) -> pd.DataFrame:
        # Join the two column levels into single "upper: lower" names
        upper = frame.columns.droplevel(1)
        lower = frame.columns.droplevel(0)
        frame.columns = (upper + ": " + lower).tolist()
        return frame

    # Work on copies so the column names of the stored frames are left alone
    census_flat = _flatten_columns(copy.copy(self.census))
    population_flat = _flatten_columns(copy.copy(self.population))

    geo_flat = copy.copy(self.geo)
    # Need to drop geometry as it won't serialize
    geo_flat.drop(columns = 'geometry', inplace = True)

    payload = {
        "census": census_flat.to_dict("records"),
        "population": population_flat.to_dict("records"),
        "geojson": geo_flat.to_dict(),
    }
    return dcc.Store(id="geo_df", data={"data": payload})

In [None]:
#| hide
show_doc(SolomonGeo.get_store)


---

### SolomonGeo.get_store

>      SolomonGeo.get_store ()

A getter method that returns a dcc.Store object with the data of the `SolomonGeo` class
converted to a dictionary for storing with dash.

## Get Census Data
Returns the census dataset for a particular aggregation

In [None]:
#| export
@patch
def get_census(self:SolomonGeo, 
                geo_filter:str = None, # Filters the dataframe to the requested geography 
                var:str = None, # Selects an upper level 
                measure:str = None, # Selects the lower level variable, if var 1 is used, measure must be used.
                loc_filter:[str] = None, # Filters one of more locations
                # TODO remove hardcoding here?
                type_filter:str = 'Total', # Return either number of proportion
                agg:bool = False, # Whether to return the dataset aggregated for the given selection
               ) -> pd.DataFrame: # Pandas Dataframe containing population data
    '''
    A getter method for the SolomonGeo class that returns the requested census
    data, indexed by location and without the core columns. This is the minimal
    data required to display on the map. 
    - Optionally aggregates the dataset: totals are summed per column, proportions
      are returned as each column sum as a percentage of the sum of all columns.
      The aggregated result is what pandas returns from `DataFrame.sum()`.

    Raises a `ValueError` if `geo_filter` is not a known geography, if `measure`
    is given without a matching `var`, or if `agg` is set and `type_filter` is
    neither 'Total' nor 'Proportion'.
    '''
    ret = self.census
    ret = ret.loc[ret['core']['type'] == type_filter, :] 
    ret = ret.set_index(ret.loc[:, ('core', 'location')]) # Change index to location as it's more desriptive
    if geo_filter is not None:
        # Validate explicitly: an assert is stripped under -O and the error was
        # previously built but never raised
        if geo_filter not in ['Ward', 'Constituency', 'Province']:
            raise ValueError("Geo filter must be one of: ['Ward', 'Constituency', 'Province']")
        ret = ret.loc[ret['core']['agg'] == geo_filter, :]

    if loc_filter is not None:
        ret = ret.loc[ret['core']['location'].isin(loc_filter), :]

    # Return no core data to minimise the html size
    ret = ret.drop(columns = 'core', level=0)

    # Keep only selected column if required
    if measure is not None:
        if var is None or measure not in self.census_vars.get(var, []):
            raise ValueError("If measure is set, var 1 must be set and the key value pair of var and measure must match")
        ret = ret[var].filter(items = [measure])
    elif var is not None:
        # Keep all values from upper level column
        ret = ret[var]

    ret = pd.DataFrame(ret)

    # If required, aggregate dataset based on data type
    if agg:
        if type_filter == 'Total':
            ret = ret.sum()
        elif type_filter == 'Proportion':
            ret = ret.sum() / ret.sum().sum() * 100
        else:
            raise ValueError('The type passed to the aggregate function must be one of the following: \'Total\', \'Proportion\'.')
    
    return ret

In [None]:
#| hide
show_doc(SolomonGeo.get_census)

---

### SolomonGeo.get_census

>      SolomonGeo.get_census (geo_filter:str=None, var:str=None,
>                             measure:str=None, loc_filter:[<class'str'>]=None,
>                             type_filter:str='Total', agg=False)

A getter method for the SolomonGeo class that returns a pandas dataset containg
the id variable and the requested census data. This is the minimal data required
to display on the map. 
- Optionally can aggregate the dataset, uses weighted aggregation for proportional data

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| geo_filter | str | None | Filters the dataframe to the requested geography |
| var | str | None | Selects an upper level |
| measure | str | None | Selects the lower level variable, if var 1 is used, measure must be used. |
| loc_filter | [<class 'str'>] | None | Filters one of more locations |
| type_filter | str | Total | Return either number of proportion |
| agg | bool | False | Whether to return the dataset aggregated for the given selection |
| **Returns** | **DataFrame** |  | **Pandas Dataframe containing population data** |

## Get Population Data
Returns the population projection data for a given aggregation

In [None]:
#| export
@patch
def get_pop(self:SolomonGeo, 
                years:[int], # Selects the year/years of data to return, a single year is also accepted
                var:str = None, # Selects an upper level variable
                measure:str = None, # Selects the lower level variable, if var 1 is used, measure must be used.
                loc_filter:[str] = None, # Filters one of more locations
                type_filter:str = 'Total', # Return either number of proportion
                agg:bool = False, # Whether to return the dataset aggregated for the given selection
                ages:[int] = None, # Filters for one or more Age Brackets, if none returns all
               ) -> pd.DataFrame: # Pandas Dataframe containing population data
    '''
    A getter method for the SolomonGeo class that returns the requested
    population data at the province level, with the year and age bracket
    alongside the selected population columns. This is the minimal data
    required to display on the map. 
    - Optionally aggregates the numeric columns per year.

    Raises a `ValueError` if `measure` is given without a matching `var`, or if
    `agg` is set and `type_filter` is neither 'Total' nor 'Proportion'.
    '''
    # Population data is only filtered to the province level
    geo_filter = 'Province'
    ret = self.population
    ret = ret.loc[ret['core']['type'] == type_filter, :] 
    ret = ret.loc[ret['core']['agg'] == geo_filter, :]

    # Filter to years, `isin` needs a list-like so wrap a single year
    if not pd.api.types.is_list_like(years):
        years = [years]
    ret = ret.loc[ret['core']['year'].isin(years), :]

    if loc_filter is not None:
        ret = ret.loc[ret.index.isin(loc_filter), :]

    if ages is not None:
        ret = ret.loc[ret['Age']['Age_Bracket'].isin(ages), :]

    # Return no core data to minimise the html size
    age_year = ret[[('core', 'year'), ('Age', 'Age_Bracket')]]
    # WARNING - don't reorder before the concat below
    ret = ret.drop(columns = ['core', 'Age'], level=0)
    ret = ret.drop(columns = "Numerical_Bracket", level = 1)

    # Keep only selected column if required
    if measure is not None:
        # Validate against the population variables, the census variables
        # describe a different dataset
        if var is None or measure not in self.population_vars.get(var, []):
            raise ValueError("If measure is set, var 1 must be set and the key value pair of var and measure must match")
        ret = ret[var].filter(items = [measure])
    elif var is not None:
        # Keep all values from upper level column
        ret = ret[var]

    ret = pd.concat([age_year, ret], axis = 1)
    ret = pd.DataFrame(ret)
    # Flatten the dataset
    ret.columns = ret.columns.get_level_values(1)
    # If required, aggregate dataset based on data type
    if agg:
        if type_filter == 'Total':
            ret = ret.groupby(['year']).sum(numeric_only= True)
        elif type_filter == 'Proportion':
            ret = ret.groupby('year').sum(numeric_only= True) / ret.groupby('year').sum(numeric_only= True).sum(numeric_only= True) * 100
        else:
            raise ValueError('The type passed to the aggregate function must be one of the following: \'Total\', \'Proportion\'.')
        
    return ret

# Testing

In [None]:
sol_geo = SolomonGeo.read_test()

KeyError: 'Age_Bracket'

In [None]:
sol_geo.population

Unnamed: 0_level_0,core,core,core,core,Age,Age,Population,Population,Population
Unnamed: 0_level_1,id,agg,type,year,Age_Bracket,Numerical_Bracket,Males,Females,Total
"(core, location)",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Choiseul,1,Province,Total,2010,0-4,0,2279,2060,4339
Choiseul,1,Province,Total,2010,5-9,5,2131,2002,4133
Choiseul,1,Province,Total,2010,10-14,10,1778,1640,3418
Choiseul,1,Province,Total,2010,15-19,15,1379,1342,2721
Choiseul,1,Province,Total,2010,20-24,20,1192,1083,2275
...,...,...,...,...,...,...,...,...,...
Honiara,10,Province,Total,2025,60-64,60,1406,1236,2642
Honiara,10,Province,Total,2025,65-69,65,955,791,1746
Honiara,10,Province,Total,2025,70-74,70,619,466,1085
Honiara,10,Province,Total,2025,75-79,75,318,275,593


In [None]:
tester = sol_geo.get_pop(years = [2010, 2011, 2012])
tester
# TODO work out how to aggregate all the columns other than age.

Unnamed: 0_level_0,year,Age_Bracket,Males,Females,Total
"(core, location)",Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Choiseul,2010,0-4,2279,2060,4339
Choiseul,2010,5-9,2131,2002,4133
Choiseul,2010,10-14,1778,1640,3418
Choiseul,2010,15-19,1379,1342,2721
Choiseul,2010,20-24,1192,1083,2275
...,...,...,...,...,...
Honiara,2012,60-64,628,447,1075
Honiara,2012,65-69,368,303,671
Honiara,2012,70-74,226,162,388
Honiara,2012,75-79,110,78,188


In [None]:
sol_geo.get_pop(years = [2010, 2011, 2012], loc_filter = ['Honiara', 'Choiseul'], agg = True)

Unnamed: 0_level_0,Males,Females,Total
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010,51714,46769,98483
2011,53138,48238,101370
2012,54568,49707,104278


In [None]:
sol_geo.locations

{'Province': array(['Choiseul', 'Honiara', 'Western', 'Isabel', 'Central',
        'Rennell-Bell', 'Guadalcanal', 'Malaita', 'Makira-Ulawa', 'Temotu'],
       dtype=object),
 'Constituency': array(['South Choiseul', 'North West Choiseul', 'East Choiseul',
        'East Honiara', 'Central Honiara', 'West Honiara',
        'Shortland Islands', 'North Vella la Vella',
        'South Vella la Vella', 'Ranongga-Simbo', 'Gizo-Kolombangara',
        'West New Georgia-Vonavona', 'North New Georgia',
        'South New Georgia-Rendova', 'Marovo', 'Hograno-Kia-Havulei',
        'Marigne-Kokota', 'Gao-Bugotu', 'Nggela', 'Savo-Russull Islands',
        'Rennell-Bellona', 'North West Guadalcanl', 'West Guadalcanal',
        'South Guadalcanal', 'East Guadalcanal', 'East-Central',
        'North-East', 'North Guadalcanal', 'Central Guadalcanal',
        'North Malaita', 'Lau-Mbaelela', 'Baegu-Asifola', 'Fataleka',
        "West Kwara'ae", "Central Kwara'ae", 'East Malaita',
        'Auki-Langalanga'

## Save and Load

Test that the newly created solomon geo object can be saved to aws

In [None]:
sol_geo.save_pickle()

Test that we can connect to the aws s3 bucket

In [None]:
# Create the S3 object
# Fetch the test object from the S3 bucket
obj = s3_client().get_object(
    Bucket = 'hobby-data',
    Key = 'test.txt', 
)

# Read in the contents of the test file
try:
    data = obj['Body'].read()
except Exception as err:
    # Keep the original failure attached rather than swallowing everything
    raise ValueError("Issue dowloading test file from AWS.") from err

In [None]:
#| slow
SolomonGeo.load_pickle('/testData/', aws=True)

<__main__.SolomonGeo>

Test that gen_stored creates a copy correctly from a json serialised dataframe

In [None]:
stored_geo = sol_geo.get_store()
# TODO I need to potentially created a function to check if two objects are the same

In [None]:
restored_geo = SolomonGeo.gen_stored(stored_geo.data)


In [None]:
restored_geo.get_census(geo_filter = 'Constituency', loc_filter=['Nggela', 'East AreAre'])

Unnamed: 0_level_0,Key Statistics,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,...,Household money received from remittances,Household money received from remittances,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months
Unnamed: 0_level_1,Total Households,metered SIWA drinking water,communal standpipe,private water tank,communal water tank,well - protected,well - unprotected,river or stream,bottled water,other source of drinking water,...,1000 - 1499 S.I. dollars,more than 1500 S.I. dollars,No income,Wages Salary,Own business,Sale fish crop craft,Land lease,House rent,Remittances,Other source
"(core, location)",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Nggela,3315,26.0,1557.0,258.0,437.0,52.0,13.0,868.0,1.0,103.0,...,25.0,59.0,49.0,558.0,327.0,1878.0,0.0,4.0,75.0,424.0
East AreAre,1105,1.0,815.0,12.0,9.0,1.0,3.0,223.0,0.0,41.0,...,41.0,15.0,37.0,89.0,101.0,650.0,0.0,2.0,31.0,195.0


In [None]:
# TODO create a test to check that two object are the same. Probably need to write defualt behaviour into
# object. Maybe use fastcore test_eq
#test_eq(stored_geo.geo_df, sol_geo.geo_df)

Test filtering down to multiple locations

In [None]:
# Disabled check, kept for reference:
#test_eq(sol_geo.get_census(geo_filter = 'constituency')['total_pop'].sum(), const_df['total_pop'].sum())
# Restrict the constituency-level census table to two named locations.
sol_geo.get_census(
    geo_filter='Constituency',
    loc_filter=['Nggela', 'East AreAre'],
)

Unnamed: 0_level_0,Key Statistics,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,...,Household money received from remittances,Household money received from remittances,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months
Unnamed: 0_level_1,Total Households,metered SIWA drinking water,communal standpipe,private water tank,communal water tank,well - protected,well - unprotected,river or stream,bottled water,other source of drinking water,...,1000 - 1499 S.I. dollars,more than 1500 S.I. dollars,No income,Wages Salary,Own business,Sale fish crop craft,Land lease,House rent,Remittances,Other source
"(core, location)",Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Nggela,3315,26.0,1557.0,258.0,437.0,52.0,13.0,868.0,1.0,103.0,...,25.0,59.0,49.0,558.0,327.0,1878.0,0.0,4.0,75.0,424.0
East AreAre,1105,1.0,815.0,12.0,9.0,1.0,3.0,223.0,0.0,41.0,...,41.0,15.0,37.0,89.0,101.0,650.0,0.0,2.0,31.0,195.0


In [None]:
# TODO work out how to check that save and load give the same result as a test
# Might need to write an equality function

In [None]:
# TODO should do a count check for the geojson similar to this
# Count the non-null entries per column for the constituency-level rows.
census_table = sol_geo.census
is_constituency = census_table['core']['agg'] == 'Constituency'
census_table[is_constituency].count()

core                                               id                      100
                                                   location                100
                                                   agg                     100
                                                   type                    100
Key Statistics                                     Total Households        100
                                                                          ... 
Main source of household income in last 12 months  Sale fish crop craft    100
                                                   Land lease              100
                                                   House rent              100
                                                   Remittances             100
                                                   Other source            100
Length: 70, dtype: int64

# TODO check that all proportions are in the valid range (non-negative and no more than 100%)

In [None]:
# TODO filter and test equality?
# Build the geojson for the Ward aggregation only; the result is stored in
# `test` but nothing is asserted about it yet.
test = sol_geo.get_geojson(geo_filter = 'Ward')

In [None]:
# Display the geography table: one row per location, with geometry, id, agg,
# year and location columns.
sol_geo.geo

Unnamed: 0_level_0,geometry,id,agg,year,location
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Choiseul,"MULTIPOLYGON (((157.56463 -7.49272, 157.56005 ...",1,Province,2009,Choiseul
Honiara,"MULTIPOLYGON (((159.91806 -9.42379, 159.97668 ...",10,Province,2009,Honiara
Western,"MULTIPOLYGON (((158.25148 -8.78861, 158.24885 ...",2,Province,2009,Western
Isabel,"MULTIPOLYGON (((159.86906 -8.57748, 159.86852 ...",3,Province,2009,Isabel
Central,"MULTIPOLYGON (((160.24338 -9.19871, 160.24522 ...",4,Province,2009,Central
...,...,...,...,...,...
Duff Islands,"MULTIPOLYGON (((167.24674 -9.93260, 167.24473 ...",913,Ward,2009,Duff Islands
Utupua,"MULTIPOLYGON (((166.50509 -11.30801, 166.50669...",914,Ward,2009,Utupua
Vanikoro,"MULTIPOLYGON (((166.98326 -11.67945, 166.98258...",915,Ward,2009,Vanikoro
Tikopia,"MULTIPOLYGON (((168.84036 -12.28504, 168.82691...",916,Ward,2009,Tikopia


The sum after merging and filtering should equal the sum from the raw dataset.

In [None]:
# The aggregated result is a Series labelled by measure name, so read its only
# entry by position with `.iloc[0]`; a bare `[0]` on a label-indexed Series
# relies on pandas' deprecated positional fallback and emits a warning.
aggregated_households = sol_geo.get_census(
    geo_filter='Constituency',
    var='Key Statistics',
    measure='Total Households',
    agg=True,
).iloc[0]

# Expected value: the same column summed over the raw constituency 'Total'
# rows. Both conditions are combined into one mask so it aligns with const_df.
raw_constituency_households = const_df.loc[
    (const_df['type'] == 'Total') & (const_df['agg'] == 'Constituency'),
    'Key Statistics: Total Households',
]
test_eq(aggregated_households, raw_constituency_households.sum())

  test_eq(sol_geo.get_census(geo_filter = 'Constituency',var = 'Key Statistics', measure = 'Total Households', agg = True)[0],


In [None]:
# Total household count aggregated over every constituency.
sol_geo.get_census(
    geo_filter='Constituency',
    var='Key Statistics',
    measure='Total Households',
    agg=True,
)

Total Households    91251
dtype: int64

In [None]:
# Household total for a single constituency; `.values[0]` takes the first row
# of the underlying array.
west_guadalcanal_households = sol_geo.get_census(
    geo_filter='Constituency',
    var='Key Statistics',
    measure='Total Households',
    loc_filter=['West Guadalcanal'],
)
west_guadalcanal_households.values[0]

array([1671])

In [None]:
# Show only the `id` column of the 'core' block of the census table.
core_columns = sol_geo.census['core']
core_columns.filter(items=['id'])

Unnamed: 0_level_0,id
pk,Unnamed: 1_level_1
Choiseul_Total,1
Honiara_Total,10
Western_Total,2
Isabel_Total,3
Central_Total,4
...,...
Duff Islands_Proportion,913
Utupua_Proportion,914
Vanikoro_Proportion,915
Tikopia_Proportion,916


In [None]:
# The aggregated proportions across all measures of one variable should add up
# to 100 (percent).
test_agg = sol_geo.get_census(
    geo_filter='Constituency',
    type_filter='Proportion',
    var="Main source of household drinking water",
    agg=True,
)
# Compare with a tolerance rather than exact equality: the total is a sum of
# floats, so rounding error could otherwise make this check fail spuriously.
test_close(test_agg.sum(), 100.0)
test_agg

metered SIWA drinking water        5.939895
communal standpipe                36.295381
private water tank                13.917023
communal water tank               12.631202
well - protected                   1.820029
well - unprotected                 1.608226
river or stream                   23.762780
bottled water                      0.550409
other source of drinking water     3.475056
dtype: float64

In [None]:
# Show the total of the aggregated proportions held in `test_agg`.
test_agg.sum()

100.0

In [None]:
# Display the mapping from each population variable to its list of measures.
sol_geo.population_vars

{'Population': ['Males', 'Females', 'Total']}

In [None]:
# NOTE(review): the saved output of this cell is a NameError for `sol_geo`, so
# it was apparently last run in a session where `sol_geo` had not been
# created; re-run it after the cell that builds `sol_geo` to refresh the output.
sol_geo.pop_years

NameError: name 'sol_geo' is not defined

In [None]:
#| hide
# Run nbdev's export step so the cells marked `#| export` are written out to
# the Python module.
import nbdev; nbdev.nbdev_export()