# Load Data

> Functions that load the data for the map.   

**Contents**

`Solomon Geospatial Data`
- `SolomonGeo`: A class that cleans the Solomon Islands census and geography data
- `SolomonGeo.read_test`: Loads and transforms the test data
- `SolomonGeo.get_geojson`: Returns the geo_df as a geojson dataset

In [None]:
#| default_exp load_data

In [None]:
#| export
from nbdev.showdoc import *
import geopandas as gpd
import pandas as pd
from git import Repo
import json
from fastcore import *
from fastcore.basics import patch
from fastcore.test import *
import sys
#import topojson as tp

In [None]:
#| hide
# Find the git working tree by searching upwards from the current directory,
# so the test data path does not depend on where the notebook is run from
repo = Repo('.', search_parent_directories=True)
fp = f"{repo.working_tree_dir}/testData/"
# Raw 2009 constituency census table, compared against the merged data in the tests below
const_df = pd.read_csv(f"{fp}sol_census_constituency_2009.csv")
# Check that the files exist using fastcore (both census and geo)

## Solomon Geospatial Data
> Load the geography and census data
### Geography Data
Solomon Islands geography data is organised at the following levels:
 - adm0 - The country as a whole, Solomon Islands
 - adm1 - Also referred to as the province e.g. Honiara, Malaita
 - adm2 - The Constituency e.g. Central Honiara
 - adm3 - Ward, the smallest geography I am reporting. E.g. Cruz

### Census Data
 Solomon Islands census data from the 2009 and 2019 censuses is used. For each census:
2009
 - We have the total population for each of the administration regions
2019
 - There is only data available down to the province level

In [None]:
#| export
class SolomonGeo:
    # TODO work out how to format the attributes
    # Look at nbdev docs maybe?
    # TODO change all data to int?
    # TODO - should I make this a dataclass for the auto functionality? potentially should try it out
    '''
    Load the Solomon Islands geography data 
    Attributes:
        geo_df    Geopandas dataframe containing geographies and census data
    '''
    def __init__(self, 
                geo_df:gpd.GeoDataFrame): # A geopandas dataset containing population and geography boundaries for each aggregation
        # The object is a thin wrapper: it only stores the combined dataset
        self.geo_df = geo_df

    @classmethod
    def read_test(cls,
                 )-> 'SolomonGeo': # A SolomonGeo object wrapping the combined test dataset
        '''
        Initialise the object using the local testing data. 
        The 2009 ward, constituency and province datasets are each extracted and 
        transformed, stacked into one geodataframe, and the geometries simplified.
        '''
        # TODO might need to further abstract this concatenation process
        # Build one cleaned geodataframe per aggregation level, in the order they are stacked
        frames = []
        for aggregation in ('ward', 'constituency', 'province'):
            df, geo = cls.extract_from_file(aggregation, '2009')
            frames.append(cls.transform(aggregation, '2009', df, geo))

        # Append the datasets together
        geo_df = pd.concat(frames)

        # simplify the geography
        # tolerance is 360/43200 = 1/120 in the units of the geometry coordinates
        # (presumably degrees, the sample coordinates look like lon/lat - TODO confirm)
        geo_df.geometry = geo_df.geometry.simplify(tolerance = 360/43200)
        # TODO use topo to preserve the topology between shapes
        #topo = tp.Topology(gdf, prequantize=False)
        #gdf_simplified = topo.toposimplify(5).to_gdf()

        return cls(
            geo_df = geo_df
        )

    @classmethod
    def extract_from_file(cls, 
                            aggregation:str, # Indicates the aggregation of the data
                            year:str, # The year of that data, only relevant for census data
                 ) -> (pd.DataFrame, 
                      gpd.GeoDataFrame): # Returns input pandas and geopandas datasets
        '''
        Extract and return input datasets from file. 
        Reads `sol_census_<aggregation>_<year>.csv` and `sol_geo_<aggregation>.json` 
        from the `testData` folder at the root of the enclosing git repository.
        '''
        # Search parent directories so this works from any folder inside the repo
        repo = Repo('.', search_parent_directories=True)
        pw = str(repo.working_tree_dir) + "/testData/"
        census = pd.read_csv(pw + 'sol_census_' + aggregation + '_' + year + '.csv')
        geography = gpd.read_file(pw + 'sol_geo_' + aggregation + '.json')
        return (census, geography)

    @classmethod
    def transform(cls, 
            aggregation:str, # Indicates the aggregation of the data
            year:str, # The year of that data, only relevant for census data
            df:pd.DataFrame, # Uncleaned input census dataset
            geo:gpd.GeoDataFrame, # Uncleaned input geospatial dataset
           )-> gpd.GeoDataFrame: # The geopandas dataset for given aggregation
        '''
        Transform given raw input dataset into a cleaned and combined geopandas dataframe. 
        The result is indexed by `geo_name` and holds `id`, `geometry`, `agg`, `year` 
        and the census columns. Neither input dataframe is modified.
        '''
        # Clean the geospatial dataframe
        # Work on a copy so the renames below never leak back into the caller's dataframe
        geo = geo.copy()
        # Rename columns and keep only necessary ones, Note that id can be province id, constituency id etc.
        geo.columns = geo.columns.str.replace(r'^[a-zA-Z]+name$', 'geo_name', case = False, regex = True)
        # TODO this assume the key column is the first one (which so far it is...)
        geo = geo.rename(columns = {geo.columns[0]:'id'})
        # Dropping geo_name from the geography dataset and relying on census data naming.
        # Add a column that indicates level of aggregation and one for the year;
        # assign returns a new frame, so nothing is written onto a slice
        geo = geo.loc[:, ['id', 'geometry']].assign(agg = aggregation, year = year)
        
        # Clean the census data
        # dropna returns a new dataframe, so the column rename below only affects that copy
        df = df.dropna()
        # Rename columns to be consistent across geography
        df.columns = df.columns.str.replace(r'^[a-zA-Z]+_name$', 'geo_name', case = False, regex = True)
        # id needs to change types twice so that it is a string of an int
        df = df.astype({'id': 'int', 'male_pop':'int', 'female_pop':'int', 'total_pop':'int'})
        df = df.astype({'id': 'str'})
        
        # Merge the data together (default inner join: rows without a matching id on both sides are dropped)
        geo_df = geo.merge(df, on=['id']).set_index("geo_name") # , 'geo_name'
        return geo_df
        

In [None]:
#| hide
# Render the generated API documentation for the SolomonGeo class
show_doc(SolomonGeo)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L19){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo

>      SolomonGeo (geo_df:geopandas.geodataframe.GeoDataFrame)

Load the solomon islands geography data 
Attributes:
    geo_df    Geopandas dataframe containing geographies and census data

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| geo_df | GeoDataFrame | A geopandas dataset containing population and geography boundaries for each aggregation |

In [None]:
#| hide
# Render the generated API documentation for the read_test constructor
show_doc(SolomonGeo.read_test)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L34){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.read_test

>      SolomonGeo.read_test ()

Initialise the object using the local testing data

In [None]:
#| export
@patch
def get_geojson(self:SolomonGeo, 
                agg_filter:str = None, # Filters the geojson to the requested aggregation 
               ) -> dict: # Geo JSON formatted dataset
    '''
    Return the geometries held in `geo_df` as a Geo JSON formatted dictionary, 
    optionally restricted to the rows of a single aggregation
    '''
    if agg_filter is None:
        # No filter requested: use every row
        selected = self.geo_df
    else:
        # Keep only the rows whose `agg` column equals the requested aggregation
        selected = self.geo_df.loc[self.geo_df['agg'] == agg_filter, :]
    # Only the geometry column is serialised, to minimise the html size
    geometry_only = selected.loc[:, ['geometry']]
    return json.loads(geometry_only.to_json())

In [None]:
#| hide
# Render the generated API documentation for the get_geojson getter
show_doc(SolomonGeo.get_geojson)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L115){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.get_geojson

>      SolomonGeo.get_geojson (agg_filter:str=None)

A getter method for the SolomonGeo class that returns a Geo JSON formatted dataset

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| agg_filter | str | None | Filters the geojson to the requested aggregation |
| **Returns** | **dict** |  | **Geo JSON formatted dataset** |

In [None]:
#| export
@patch
def get_df(self:SolomonGeo, 
                agg_filter:str = None, # Filters the dataframe to the requested aggregation 
               ) -> pd.DataFrame: # Pandas Dataframe containing population data
    '''
    Return the `total_pop` column of `geo_df` as a plain pandas dataframe that keeps 
    the index of `geo_df`, optionally restricted to the rows of a single aggregation. 
    This is the minimal data required to display on the map. 
    '''
    # TODO check that filter is valid
    if agg_filter is None:
        # No filter requested: use every row
        selected = self.geo_df
    else:
        # Keep only the rows whose `agg` column equals the requested aggregation
        selected = self.geo_df.loc[self.geo_df['agg'] == agg_filter, :]
    # Only the population column is returned, to minimise the html size
    population_only = selected.loc[:, ['total_pop']]
    return pd.DataFrame(population_only)

In [None]:
#| hide
# Render the generated API documentation for the get_df getter
show_doc(SolomonGeo.get_df)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L129){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.get_df

>      SolomonGeo.get_df (agg_filter:str=None)

A getter method for the SolomonGeo class that returns a pandas dataset containg
the id variable and the total population variable. This is the minimal data required
to display on the map.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| agg_filter | str | None | Filters the dataframe to the requested aggregation |
| **Returns** | **DataFrame** |  | **Pandas Dataframe containing population data** |

# Testing

In [None]:
# Build the object from the local test data; the cells below all inspect this instance
sol_geo = SolomonGeo.read_test()

In [None]:
# Display the combined dataset (ward, constituency and province rows stacked together)
sol_geo.geo_df

Unnamed: 0_level_0,id,geometry,agg,year,male_pop,female_pop,total_pop
geo_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Nggosi,1001,"POLYGON ((159.92475 -9.42335, 159.94071 -9.428...",ward,2009,5240,4822,10062
Mbumburu,1002,"POLYGON ((159.94507 -9.43466, 159.94816 -9.442...",ward,2009,1912,1713,3625
Rove - Lengakiki,1003,"POLYGON ((159.94362 -9.42779, 159.95162 -9.428...",ward,2009,1464,1149,2613
Cruz,1004,"POLYGON ((159.95288 -9.43426, 159.95162 -9.428...",ward,2009,125,107,232
Vavaea,1005,"POLYGON ((159.95970 -9.42874, 159.96065 -9.441...",ward,2009,3788,3208,6996
...,...,...,...,...,...,...,...
Rennell-Bell,5,"MULTIPOLYGON (((159.99852 -11.46234, 160.09535...",province,2009,1549,1492,3041
Guadalcanal,6,"MULTIPOLYGON (((160.85478 -9.83830, 160.85556 ...",province,2009,48283,45330,93613
Malaita,7,"MULTIPOLYGON (((161.39845 -9.57862, 161.39803 ...",province,2009,69232,68364,137596
Makira-Ulawa,8,"MULTIPOLYGON (((162.45571 -10.88208, 162.45811...",province,2009,20789,19630,40419


In [None]:
# Count the non-null values per column for the constituency rows only
# TODO should do a count check for the geojson similar to this
sol_geo.geo_df[sol_geo.geo_df['agg'] == 'constituency'].count()

id            50
geometry      50
agg           50
year          50
male_pop      50
female_pop    50
total_pop     50
dtype: int64

In [None]:
# Smoke test: building the ward-level geojson should run without error
# TODO filter and test equality?
test = sol_geo.get_geojson(agg_filter = 'ward')

The sum after merging and filtering should equal the sum from the raw dataset.

In [None]:
# Compare the merged constituency population total against the raw census file loaded in the setup cell
test_eq(sol_geo.get_df(agg_filter = 'constituency')['total_pop'].sum(), const_df['total_pop'].sum())

In [None]:
# TODO - test that there are enough rows - don't drop any during merging...

In [None]:
# Total population summed across the province-level rows
sol_geo.get_df(agg_filter = 'province').sum()

total_pop    515870
dtype: int64

In [None]:
#| hide
# Run nbdev's export step so the `#| export` cells are written out to the library module
import nbdev
nbdev.nbdev_export()