# Load Data

> Functions that load the data for the map.   

**Contents**

`Solomon Geospatial Data`
- `SolomonGeo`: A class that cleans the Solomon Islands census and geography data
- `SolomonGeo.read_test`: Loads and transforms the test data
- `SolomonGeo.get_geojson`: Returns the geo_df as a geojson dataset

In [None]:
#| default_exp load_data

In [None]:
#| export
from nbdev.showdoc import *
import geopandas as gpd
import pandas as pd
from git import Repo
import json
from fastcore import *
from fastcore.basics import patch
from fastcore.test import *
import sys
import topojson as tp
import pickle
from urllib.request import urlopen
import boto3
from dotenv import load_dotenv
from dash import dcc
import os
import copy

load_dotenv()

True

In [None]:
#| hide
# Locate the repository root so the test data can be found from any working directory
repo = Repo('.', search_parent_directories=True)
fp = f"{repo.working_tree_dir}/testData/"
# Raw 2009 census extract, kept for comparison against the transformed data
const_df = pd.read_csv(f"{fp}sol_census_all_2009.csv")
# TODO: check that the files exist using fastcore (both census and geo)

## Solomon Geospatial Data
> Load the geography and census data
### Geography Data
Solomon Islands geography data is organised at the following levels:
 - adm0 - The country as a whole, Solomon Islands
 - adm1 - Also referred to as the province e.g. Honiara, Malaita
 - adm2 - The Constituency e.g. Central Honiara
 - adm3 - Ward, the smallest geography I am reporting. E.g. Cruz

### Census Data
 Solomon Islands census data from the 2009 and 2019 censuses is used. For the respective census:
2009
 - We have the total population for each of the administration regions
2019
 - There is only data available down to the province level

### Test that the .env variables exist

In [None]:
#| hide
# os.getenv returns None (not "") for an unset variable, so the previous
# `len(ACCESS_KEY) == 0` check raised TypeError instead of reporting a missing
# credential. The old fallback re-read the very same os.environ mapping that
# os.getenv consults, so it could never produce a different value and has been
# dropped. Missing values stay None and are caught by the checks that follow.
ACCESS_KEY = os.getenv("ACCESS_KEY")
SECRET_ACCCESS_KEY = os.getenv("SECRET_ACCESS_KEY")
REGION_NAME = os.getenv("REGION_NAME")

In [None]:
# Every credential must have resolved to something other than None
for _env_value in (ACCESS_KEY, SECRET_ACCCESS_KEY, REGION_NAME):
    test(_env_value, None, nequals)
# The region must match the value this project expects
test_eq(REGION_NAME, 'ap-southeast-2')

In [None]:
#| export
def s3_client()-> boto3.client:
    '''
    Return a connection to the AWS s3 client.

    Credentials are read from the `ACCESS_KEY`, `SECRET_ACCESS_KEY` and
    `REGION_NAME` environment variables. A variable that is unset or empty is
    passed to boto3 as `None`, leaving boto3 to apply its own defaults.
    '''
    # os.getenv returns None for an unset variable, so test truthiness instead of
    # calling len() (which raised TypeError). The former os.environ fallback read
    # the same mapping as os.getenv and could never yield a different value.
    access_key = os.getenv("ACCESS_KEY") or None
    secret_key = os.getenv("SECRET_ACCESS_KEY") or None
    region = os.getenv("REGION_NAME") or None
    session = boto3.Session(region_name='ap-southeast-2')
    # Creating the low level functional client
    return session.client(
        's3',
        endpoint_url='https://s3.ap-southeast-2.amazonaws.com',
        aws_access_key_id = access_key,
        aws_secret_access_key = secret_key,
        region_name = region,
    )

In [None]:
#| hide
show_doc(s3_client)

---

### s3_client

>      s3_client ()

Return a connection to teh AWS s3 client

In [None]:
#| export
class SolomonGeo:
    # TODO work out how to format the attributes
    # Look at nbdev docs maybe?
    # TODO change all data to int?
    # TODO - should I make this a dataclass for the auto functionaliy? potentially should try it out
    '''
    Load the Solomon Islands geography data 
    Attributes:
        geo_df    Geopandas dataframe containing geographies and census data
        geo_levels    A list of the types of available aggregations
        census_vars    A dictionary of census variables in the dataset 
        data_type   Specifies whether the variable is a percentage or number
        locations A dictionary of locations accessed by the geography level
        type_default    The data type used when none is requested ('Total')
    '''
    def __init__(self, 
                geo_df:gpd.GeoDataFrame): # A geopandas dataset containing population and geography boundaries for each aggregation
        self.geo_df = geo_df

        # variable that tracks the types of aggregations
        agg_col = geo_df.loc[:, ('core', 'agg')]
        self.geo_levels = agg_col.unique()

        # Save a list of census variables, ignoring the core variables
        # Use a dictionary that maps the upper level column names to lower level ones
        var_df = geo_df.drop(columns = "core", level=0)
        census_vars = {}
        for col in var_df.columns:
            census_vars.setdefault(col[0], []).append(col[1])
        self.census_vars = census_vars

        # TODO should captialise first letter
        self.data_type = geo_df.loc[:, ('core', 'type')].unique()

        # save a list of locations as a dictionary access by geography level
        # (the stored values are the sorted unique index labels of the rows at each level)
        locations = {}
        for geo in self.geo_levels:
            locations[geo] = geo_df.loc[agg_col == geo].index.unique().sort_values()
        self.locations = locations
        # TODO: need a list of column sub headings: get from column name split by `:`

        self.type_default = 'Total'


    @classmethod
    def read_test(cls,
                 ): # A solmon geo class TODO work out how to return self here... (can't?)
        '''
        Constructor that initialises the object from files using the local testing data
        '''

        repo = Repo('.', search_parent_directories=True)
        pw = str(repo.working_tree_dir) + "/testData/"
        df = pd.read_csv(pw + 'sol_census_all_2009.csv')
        aggs = df.loc[:, 'agg'].unique()
        geos = []
        for agg in aggs:
            # One geography file per aggregation level, named after the lower-cased level
            geo = gpd.read_file(pw + 'sol_geo_' + agg.lower() + '.json')
            # Add an agg column, as the data and geometry need to be joined by id and agg
            geo.loc[:, 'agg'] = agg
            geos.append(geo)

        gdf = cls.__transform(df, geos)
        return cls(
            geo_df = gdf
        )
    
    @classmethod
    def load_pickle(cls,
                    folder:str = "/testData/", #file path of the folder to save in
                    aws:bool = True, # Whether to load from github or local
                    file_name:str = 'sol_geo.pickle' # file name of the saved class
                 ):
        '''
        A constructor that initialises the object from a pickle, stored either in the
        AWS bucket (`aws=True`) or in `folder` inside the local repository.
        Raises a ValueError if the pickle fetched from AWS cannot be read.
        '''
        # NOTE: unpickling can execute arbitrary code - only load pickles from a trusted
        # bucket or a trusted local file.
        if aws:
            # Connect only when AWS is actually used, so that local loads do not
            # require any AWS credentials
            client = s3_client()

            # Create the S3 object
            obj = client.get_object(
                Bucket = 'hobby-data',
                Key = file_name, 
            )

            # Read in the pickle
            try:
                tmp_geo = pickle.load(obj['Body'])
            except Exception as err:
                # Keep the underlying error attached as the cause for debugging
                raise ValueError("Issue dowloading pickle file from AWS.") from err
                
        else:
            # TODO work out how to make this a class method
            repo = Repo('.', search_parent_directories=True)
            pw = str(repo.working_tree_dir) + folder + file_name
            
            with open(pw, 'rb') as f:
                tmp_geo = pickle.load(f)
 
        # The pickle holds the instance __dict__; only geo_df is needed as the
        # remaining attributes are rebuilt by __init__
        return cls(
            geo_df = gpd.GeoDataFrame(tmp_geo['geo_df'])
        )
        
    
    @classmethod
    def gen_stored(cls,
                  json_sol:dict, # A geojson dataset
                 ): # A solmon geo class TODO work out how to return self here... (can't?)
        '''
        A constructor that creates a SolomonGeo object from a stored, JSON serialised geopandas dataframe.
        The purpose of this is to allow the object to be stored JSON serialised in a DCC.Store object in 
        the browser before being deserialised as an object.
        '''
        # NOTE(review): __init__ expects two-level ('core', ...) columns - confirm that
        # the data passed in here arrives in that layout.
        gdf = gpd.GeoDataFrame(json_sol)
        gdf.index.name = 'pk'
        return cls(
            geo_df = gdf
        )
    
    @classmethod
    def __transform(cls, 
                    df:pd.DataFrame, # The dataframe containing census data
                    l_geos:[gpd.GeoDataFrame], # A list of geopandas dataframes containing 
                                                # the geographies 
                 ) -> gpd.GeoDataFrame: # Returns combined dataset
        '''
        Transform the given raw census and geography datasets into a cleaned and combined
        geopandas dataframe with two-level columns. Assumes correct format of the input datasets.
        Note that the frames in `l_geos` have their columns renamed in place.
        Raises a ValueError if the columns cannot be converted to a multiindex.
        '''
        # TODO - if I add more years, loop per year

        simplified = []
        for geo in l_geos:
            # Before combining, need to rename like columns
            # Rename columns and keep only necessary ones, Note that id can be province id, contsituency id etc.
            geo.columns = geo.columns.str.replace(r'^[a-zA-Z]+name$', 'geo_name', case = False, regex = True)
            # TODO this assumes the id key column is the first one (which so far it is...)
            geo.rename(columns = {geo.columns[0]:'id'}, inplace=True)

            geo = geo.loc[:, ['id', 'agg', 'geometry']] 

            # simplify the geography, use topo to preserve the topology between shapes
            topo = tp.Topology(geo, prequantize=False)
            simplified.append(topo.toposimplify(360/43200).to_gdf())

        # Concatenate once rather than growing a frame inside the loop; fall back to
        # an empty frame because pd.concat rejects an empty list
        geos = pd.concat(simplified) if simplified else gpd.GeoDataFrame()
            
        # Clean the geospatial dataframe
        geos.loc[:, 'year'] = '2009'
        
        # Clean the census data
        df = df.dropna()
        # Rename columns to be consistent across geography
        df.columns = df.columns.str.replace(r'^[a-zA-Z]+_name$', 'geo_name', case = False, regex = True)
        # id needs to change types twice so that it is a string of an int
        df = df.astype({'id': 'int'})#, 'male_pop':'int', 	'female_pop':'int', 'total_pop':'int'})
        df = df.astype({'id': 'str'})
        
        # Merge the data together
        geo_df = geos.merge(df, on=['id', 'agg'])

        # Index is unique by type and geoname
        geo_df['pk'] = geo_df['geo_name'] + "_" + geo_df["type"] 
        geo_df = geo_df.set_index("pk") 

        # Convert into a multiindex dataframe, with hiearchical columns
        try:
            geo_df = geo_df.rename(columns = {'geometry':'core: geometry', 
                                          'id':'core: id', 'agg':'core: agg', 'geo_name':'core: location',
                                          'year':'core: year', 'type':'core: type'})
            # Split every 'upper: lower' column name into its two levels
            cols = geo_df.columns.str.extract(r'(.*): (.+)', expand=True)
            geo_df.columns = pd.MultiIndex.from_arrays((cols[0], cols[1]))
            geo_df.columns.names = [None]*2
        except Exception as err:
            raise ValueError("Issue converting geopandas dataframe to multindex. \
                             Check that all columns have \': \' beside the following\
                             core columns: geometry, id, agg, year, type.") from err
        # Return the transformed dataset
        return geo_df


    

In [None]:
#| hide
show_doc(SolomonGeo)

---

### SolomonGeo

>      SolomonGeo (geo_df:geopandas.geodataframe.GeoDataFrame)

Load the solomon islands geography data 
Attributes:
    geo_df    Geopandas dataframe containing geographies and census data
    geo_levels    A list of the types of available aggregations
    census_vars    A dictionary of census variables in the dataset 
    data_type   Specifies whether the variable is a percentage or number
    locations A dictionary of locations accessed by the geography level

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| geo_df | GeoDataFrame | A geopandas dataset containing population and geography boundaries for each aggregation |

In [None]:
#| hide
show_doc(SolomonGeo.read_test)

---

### SolomonGeo.read_test

>      SolomonGeo.read_test ()

Contsructor that initialises the object from files using the local testing data

In [None]:
#| hide
show_doc(SolomonGeo.gen_stored)

---

### SolomonGeo.gen_stored

>      SolomonGeo.gen_stored (json_sol:dict)

A constructor that creates a JSON serialised SolomonGeo object from a stored geopandas dataframe.
The purpose of this is to allow the object to be stored JSON serialised in a DCC.Store object in 
the browser before being deserialised and as an object.

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| json_sol | dict | A geojson dataset |

## Save and Load

### Save Solomon Geo
Saves a `SolomonGeo` object as a pickle

In [None]:
#| export
@patch
def save_pickle(self:SolomonGeo,
                aws:bool = True, # Whether to save to aws or locally
                folder:str = "/testData/", #file path of the folder to save in, only necesasry for local saving
                file_name:str = 'sol_geo.pickle' # file name of the saved class
             ):
    '''
    Save a pickle of the SolomonGeo instance attributes, either to the AWS S3 bucket
    (`aws=True`) or to `folder` inside the local repository.
    Raises a ValueError if the upload to AWS fails.
    '''
    if aws:
        body_pickle = pickle.dumps(self.__dict__)
        try:
            client = s3_client()
            client.put_object(
                Bucket = 'hobby-data',
                Key = file_name, 
                Body = body_pickle
            )
        except Exception as err:
            # Keep the underlying error attached as the cause for debugging
            raise ValueError("Issue uploading pickle file to AWS.") from err
    else:
        repo = Repo('.', search_parent_directories=True)
        pw = str(repo.working_tree_dir) + folder + file_name

        # The context manager closes the file even if pickling fails part way
        with open(pw, 'wb') as f:
            pickle.dump(self.__dict__, f, 2)


In [None]:
show_doc(SolomonGeo.save_pickle)

---

### SolomonGeo.save_pickle

>      SolomonGeo.save_pickle (aws:bool=True, folder:str='/testData/',
>                              file_name:str='sol_geo.pickle')

Save a pickle of the SolomonGeo class in backblaze b2

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| aws | bool | True | Whether to save to aws or locally |
| folder | str | /testData/ | file path of the folder to save in, only necesasry for local saving |
| file_name | str | sol_geo.pickle | file name of the saved class |

In [None]:
show_doc(SolomonGeo.load_pickle)

---

### SolomonGeo.load_pickle

>      SolomonGeo.load_pickle (folder:str='/testData/', aws:bool=True,
>                              file_name:str='sol_geo.pickle')

A constuctor that initialises the object from aws pickle

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| folder | str | /testData/ | file path of the folder to save in |
| aws | bool | True | Whether to load from github or local |
| file_name | str | sol_geo.pickle | file name of the saved class |

## Get Geo JSON
A getter method for the geometry portion of the dataset that returns a GeoJSON formatted geography. 

It only includes the geography and location name as id

In [None]:
#| export
@patch
def get_geojson(self:SolomonGeo, 
                geo_filter:str = None, # Filters the geojson to the requested aggregation 
               ) -> dict: # Geo JSON formatted dataset
    '''
    Return the geometry of the dataset as a Geo JSON formatted dictionary, optionally
    restricted to a single aggregation level.
    '''
    # Each location has one row per data type, so keep the default type only
    # to get a single geometry per location
    is_default_type = self.geo_df['core']['type'] == self.type_default
    subset = self.geo_df.loc[is_default_type, :]
    if geo_filter is not None:
        at_level = subset['core']['agg'] == geo_filter
        subset = subset.loc[at_level, :]
    # Serialise only the geometry column to minimise the html size
    geometry = subset.loc[:, ('core', 'geometry')]
    return json.loads(geometry.to_json())

In [None]:
#| hide
show_doc(SolomonGeo.get_geojson)

---

### SolomonGeo.get_geojson

>      SolomonGeo.get_geojson (geo_filter:str=None)

A getter method for the SolomonGeo class that returns a Geo JSON formatted dataset

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| geo_filter | str | None | Filters the geojson to the requested aggregation |
| **Returns** | **dict** |  | **Geo JSON formatted dataset** |

## Store JSON
Returns a geojson file that can be stored in dcc.Store

In [None]:
#| export
@patch
def get_store(self:SolomonGeo, 
            ) -> dcc.Store: # A dash Store holding the geo_df serialised as geojson
    '''
    A getter method that returns a dcc.Store object with the data of the `SolomonGeo` class
    converted to json for storing with dash.
    '''
    # Work on a copy so that flattening the columns never touches self.geo_df
    df = self.geo_df.copy()
    # Flatten the two column levels into single 'upper: lower' names
    cols = df.columns.droplevel(1) + ": " + df.columns.droplevel(0)
    cols = cols.tolist()
    # The geometry column must be called plain 'geometry' for the geojson; find it by
    # its label rather than assuming it is the first column
    geometry_pos = list(df.columns).index(('core', 'geometry'))
    cols[geometry_pos] = 'geometry'
    df.columns = cols
    return dcc.Store(id="geo_df", data={"geojson": df.to_json()})

## Get Dataframe
Returns a particular subset of the geo_df for further analysis

In [None]:
#| export
@patch
def get_df(self:SolomonGeo, 
                geo_filter:str = None, # Filters the dataframe to the requested geography 
                var:str = None, # Selects an upper level 
                measure:str = None, # Selects the lower level variable, if var 1 is used, measure must be used.
                loc_filter:[str] = None, # Filters one of more locations
                # TODO remove hardcoding here?
                type_filter:str = 'Total', # Return either number of proportion
                agg_flag = False, # Whether to return the dataset aggregated for the given selection
               ) -> pd.DataFrame: # Pandas Dataframe containing population data
    '''
    A getter method for the SolomonGeo class that returns the census data as a pandas
    dataframe without the core columns, filtered by data type and optionally by
    geography level, location, upper level variable and measure.
    `agg_flag` is accepted but currently not used.
    Raises a ValueError if `geo_filter` is not a known geography level, or if `measure`
    is given without a `var` that contains it.
    '''
    # Validate up front and actually raise: the errors were previously built but never
    # raised, so invalid input silently produced empty or misleading results
    if geo_filter is not None and geo_filter not in ['Ward', 'Constituency', 'Province']:
        raise ValueError("Geo filter must be one of: ['Ward', 'Constituency', 'Province']")
    if measure is not None and (var is None or measure not in self.census_vars.get(var, [])):
        raise ValueError("If measure is set, var 1 must be set and the key value pair of var and measure must match")

    ret = self.geo_df
    ret = ret.loc[ret['core']['type'] == type_filter, :]
    if geo_filter is not None:
        ret = ret.loc[ret['core']['agg'] == geo_filter, :]

    if loc_filter is not None:
        ret = ret.loc[ret['core']['location'].isin(loc_filter), :]

    # Return no core data to minimise the html size
    ret = ret.drop(columns = 'core', level=0)

    # Keep only selected column if required
    if measure is not None:
        ret = ret[var].filter(items = [measure])
    elif var is not None:
        # Keep all values from upper level column
        ret = ret[var]
        
    return pd.DataFrame(ret)

In [None]:
#| hide
show_doc(SolomonGeo.get_df)

---

### SolomonGeo.get_df

>      SolomonGeo.get_df (geo_filter:str=None, var:str=None, measure:str=None,
>                         loc_filter:[<class'str'>]=None,
>                         type_filter:str='Total', agg_flag=False)

A getter method for the SolomonGeo class that returns a pandas dataset containg
the id variable and the total population variable. This is the minimal data required
to display on the map.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| geo_filter | str | None | Filters the dataframe to the requested geography |
| var | str | None | Selects an upper level |
| measure | str | None | Selects the lower level variable, if var 1 is used, measure must be used. |
| loc_filter | [<class 'str'>] | None | Filters one of more locations |
| type_filter | str | Total | Return either number of proportion |
| agg_flag | bool | False | Whether to return the dataset aggregated for the given selection |
| **Returns** | **DataFrame** |  | **Pandas Dataframe containing population data** |

## Aggregate Dataframe
Returns an aggregate pandas series 

In [None]:
#| export
@patch
def agg_df(self:SolomonGeo, 
                geo_filter:str = None, # Filters the dataframe to the requested geography 
                var:str = None, # Selects an upper level 
                measure:str = None, # Selects the lower level variable, if var 1 is used, measure must be used.
                loc_filter:[str] = None, # Filters one of more locations
                # TODO remove hardcoding here?
                type_filter:str = 'Total', # Return either number of proportion
               ) -> pd.Series: # Pandas data series containing aggregated and filtered data
    '''
    Aggregate the selection returned by `get_df` into a single series of column sums.
    For 'Proportion' each column sum is expressed as a percentage of the sum over all
    selected columns, which gives a weighted proportion.
    Raises a ValueError for any other `type_filter`.
    '''
    # type_filter is not forwarded, so get_df's default data type is what gets summed
    selection = self.get_df(geo_filter, var, measure, loc_filter)

    if type_filter not in ('Total', 'Proportion'):
        raise ValueError('The type passed to the aggregate function must be one of the following: \'Total\', \'Proportion\'.')

    column_sums = selection.sum()
    if type_filter == 'Total':
        return column_sums
    # Share of each column in the overall sum, as a percentage
    return column_sums / column_sums.sum() * 100

In [None]:
# agg_df is patched onto the class, so document it through SolomonGeo
show_doc(SolomonGeo.agg_df)

---

[source](https://github.com/fastai/nbdev/blob/master/nbdev/showdoc.py#LNone){target="_blank" style="float:right; font-size:smaller"}

### show_doc

>      show_doc (sym, renderer=None, name:str|None=None, title_level:int=3)

Show signature and docstring for `sym`

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| sym |  |  | Symbol to document |
| renderer | NoneType | None | Optional renderer (defaults to markdown) |
| name | str \| None | None | Optionally override displayed name of `sym` |
| title_level | int | 3 | Heading level to use for symbol name |

# Testing

In [None]:
sol_geo = SolomonGeo.read_test()

In [None]:
sol_geo.geo_df

Unnamed: 0_level_0,core,core,core,core,core,core,Key Statistics,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,...,Household money received from remittances,Household money received from remittances,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months
Unnamed: 0_level_1,geometry,id,agg,year,location,type,Total Households,metered SIWA drinking water,communal standpipe,private water tank,...,1000 - 1499 S.I. dollars,more than 1500 S.I. dollars,No income,Wages Salary,Own business,Sale fish crop craft,Land lease,House rent,Remittances,Other source
pk,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
Choiseul_Total,"MULTIPOLYGON (((157.56463 -7.49272, 157.56005 ...",1,Province,2009,Choiseul,Total,4712,2.00,1416.00,1137.00,...,209.00,179.00,88.00,892.00,206.00,2443.00,6.00,16.00,206.00,855.00
Choiseul_Proportion,"MULTIPOLYGON (((157.56463 -7.49272, 157.56005 ...",1,Province,2009,Choiseul,Proportion,100,0.04,30.05,24.13,...,4.44,3.80,1.87,18.93,4.37,51.85,0.13,0.34,4.37,18.15
Honiara_Total,"MULTIPOLYGON (((159.91806 -9.42379, 159.92354 ...",10,Province,2009,Honiara,Total,8981,6771.00,351.00,861.00,...,203.00,395.00,75.00,6984.00,783.00,393.00,54.00,179.00,48.00,465.00
Honiara_Proportion,"MULTIPOLYGON (((159.91806 -9.42379, 159.92354 ...",10,Province,2009,Honiara,Proportion,100,75.39,3.91,9.59,...,2.26,4.40,0.84,77.76,8.72,4.38,0.60,1.99,0.53,5.18
Western_Total,"MULTIPOLYGON (((158.25148 -8.78861, 158.24885 ...",2,Province,2009,Western,Total,13762,37.00,4151.00,4596.00,...,530.00,478.00,182.00,3732.00,618.00,7518.00,8.00,48.00,410.00,1246.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vanikoro_Proportion,"MULTIPOLYGON (((166.98326 -11.67945, 166.98258...",915,Ward,2009,Vanikoro,Proportion,100,0.38,62.78,3.38,...,0.75,0.00,4.89,4.89,6.39,46.99,0.00,0.00,0.00,36.84
Tikopia_Total,"MULTIPOLYGON (((168.84036 -12.28504, 168.82691...",916,Ward,2009,Tikopia,Total,262,0.00,255.00,1.00,...,10.00,4.00,39.00,19.00,2.00,87.00,0.00,0.00,26.00,89.00
Tikopia_Proportion,"MULTIPOLYGON (((168.84036 -12.28504, 168.82691...",916,Ward,2009,Tikopia,Proportion,100,0.00,97.33,0.38,...,3.82,1.53,14.89,7.25,0.76,33.21,0.00,0.00,9.92,33.97
Neo_Total,"MULTIPOLYGON (((165.81001 -10.65326, 165.79693...",917,Ward,2009,Neo,Total,301,0.00,1.00,22.00,...,3.00,4.00,4.00,22.00,5.00,251.00,0.00,3.00,7.00,9.00


In [None]:
sol_geo.locations

{'Province': Index(['Central_Proportion', 'Central_Total', 'Choiseul_Proportion',
        'Choiseul_Total', 'Guadalcanal_Proportion', 'Guadalcanal_Total',
        'Honiara_Proportion', 'Honiara_Total', 'Isabel_Proportion',
        'Isabel_Total', 'Makira-Ulawa_Proportion', 'Makira-Ulawa_Total',
        'Malaita_Proportion', 'Malaita_Total', 'Rennell-Bell_Proportion',
        'Rennell-Bell_Total', 'Temotu_Proportion', 'Temotu_Total',
        'Western_Proportion', 'Western_Total'],
       dtype='object', name='pk'),
 'Constituency': Index(['Auki-Langalanga_Proportion', 'Auki-Langalanga_Total',
        'Baegu-Asifola_Proportion', 'Baegu-Asifola_Total',
        'Central Guadalcanal_Proportion', 'Central Guadalcanal_Total',
        'Central Honiara_Proportion', 'Central Honiara_Total',
        'Central Kwara'ae_Proportion', 'Central Kwara'ae_Total',
        'Central Makira_Proportion', 'Central Makira_Total',
        'East AreAre_Proportion', 'East AreAre_Total',
        'East Choiseul_Prop

## Save and Load

Test that the newly created solomon geo object can be saved to aws

In [None]:
sol_geo.save_pickle()

Test that we can connect to the aws s3 bucket

In [None]:
# Create the S3 object
obj = s3_client().get_object(
    Bucket = 'hobby-data',
    Key = 'test.txt', 
)

# Read in the test file, keeping the underlying error attached as the cause
try:
    data = obj['Body'].read()
except Exception as err:
    raise ValueError("Issue dowloading test file from AWS.") from err

In [None]:
#| slow
SolomonGeo.load_pickle('/testData/', aws=True)

<__main__.SolomonGeo>

Test that gen_stored creates a copy correctly from a json serialised dataframe

In [None]:
stored_geo = sol_geo.get_store()
# TODO I need to potentially created a function to check if two objects are the same

TypeError: 'module' object is not callable

In [None]:
stored_geo

Store(id='geo_df', data={'geojson': '{"type": "FeatureCollection", "features": [{"id": "Choiseul_Total", "type": "Feature", "properties": {"core: id": "1", "core: agg": "Province", "core: year": "2009", "core: location": "Choiseul", "core: type": "Total", "Key Statistics: Total Households": 4712, "Main source of household drinking water: metered SIWA drinking water": 2.0, "Main source of household drinking water: communal standpipe": 1416.0, "Main source of household drinking water: private water tank": 1137.0, "Main source of household drinking water: communal water tank": 840.0, "Main source of household drinking water: well - protected": 14.0, "Main source of household drinking water: well - unprotected": 2.0, "Main source of household drinking water: river or stream": 1184.0, "Main source of household drinking water: bottled water": 20.0, "Main source of household drinking water: other source of drinking water": 97.0, "Main source of household washing water: piped - private": 89.0,

In [None]:
stored_geo.get_df(geo_filter = 'Constituency', loc_filter=['Nggela', 'East AreAre'])

Unnamed: 0_level_0,Key Statistics,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,Main source of household drinking water,...,Household money received from remittances,Household money received from remittances,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months,Main source of household income in last 12 months
Unnamed: 0_level_1,Total Households,metered SIWA drinking water,communal standpipe,private water tank,communal water tank,well - protected,well - unprotected,river or stream,bottled water,other source of drinking water,...,1000 - 1499 S.I. dollars,more than 1500 S.I. dollars,No income,Wages Salary,Own business,Sale fish crop craft,Land lease,House rent,Remittances,Other source
pk,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
East AreAre_Total,1105,1.0,815.0,12.0,9.0,1.0,3.0,223.0,0.0,41.0,...,41.0,15.0,37.0,89.0,101.0,650.0,0.0,2.0,31.0,195.0
Nggela_Total,3315,26.0,1557.0,258.0,437.0,52.0,13.0,868.0,1.0,103.0,...,25.0,59.0,49.0,558.0,327.0,1878.0,0.0,4.0,75.0,424.0


In [None]:
# TODO create a test to check that two object are the same. Probably need to write defualt behaviour into
# object. Maybe use fastcore test_eq
#test_eq(stored_geo.geo_df, sol_geo.geo_df)

Test filtering down to multiple locations

In [None]:
#test_eq(sol_geo.get_df(geo_filter = 'constituency')['total_pop'].sum(), const_df['total_pop'].sum())
sol_geo.get_df(geo_filter = 'Constituency', loc_filter=['Nggela', 'East AreAre'])

KeyError: 'core'

In [None]:
# TODO work out how to check save and load are the same as a test
# Might need to wreite and equality function

In [None]:
# TODO should do a count check for the geojson similar to this
sol_geo.geo_df[sol_geo.geo_df['core']['agg'] == 'Constituency'].count()

core                                               geometry                100
                                                   id                      100
                                                   agg                     100
                                                   year                    100
                                                   location                100
                                                                          ... 
Main source of household income in last 12 months  Sale fish crop craft    100
                                                   Land lease              100
                                                   House rent              100
                                                   Remittances             100
                                                   Other source            100
Length: 72, dtype: int64

# TODO check that all proportions are between 0 and 100

In [None]:
# TODO filter and test equality?
# Use a dedicated name (assigning to `test` would shadow fastcore's test function)
# and the capitalised level name that the 'agg' column actually holds
ward_geojson = sol_geo.get_geojson(geo_filter = 'Ward')

The sum after merging and filtering should equal the sum from the raw dataset.

In [None]:
# agg_df returns a label-indexed series, so take the first value by position with
# .iloc instead of relying on integer-key fallback
const_totals = const_df.loc[(const_df['type'] == 'Total') & (const_df['agg'] == 'Constituency')]
test_eq(sol_geo.agg_df(geo_filter = 'Constituency',var = 'Key Statistics', measure = 'Total Households').iloc[0], 
        const_totals['Key Statistics: Total Households'].sum())

KeyError: 'core'

In [None]:
sol_geo.agg_df(geo_filter = 'Constituency', var='Key Statistics', measure = 'Total Households')

KeyError: 'core'

In [None]:
sol_geo.get_df(geo_filter = 'Constituency', var='Key Statistics', measure = 'Total Households', loc_filter=['West Guadalcanal']).values[0]

KeyError: 'core'

In [None]:
sol_geo.geo_df['core'].filter(items = ['id'])

KeyError: 'core'

In [None]:
# The proportions are percentages, so together they should sum to 100
test_agg = sol_geo.agg_df(geo_filter = 'Constituency', type_filter='Proportion', var = "Main source of household drinking water")
# Compare with a tolerance: the sum of floating point shares is rarely exactly 100.0
test_close(test_agg.sum(), 100.0)
test_agg

KeyError: 'core'

In [None]:
test_agg.sum()

NameError: name 'test_agg' is not defined

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()