# Load Data

> Functions that load the data for the map.   

**Contents**

`Solomon Geospatial Data`
- `SolomonGeo`: A class that cleans the Solomon Islands census and geography data
- `SolomonGeo.read_test`: Loads and transforms the test data
- `SolomonGeo.get_geojson`: Returns the geo_df as a geojson dataset

In [None]:
#| default_exp load_data

In [None]:
#| export
from nbdev.showdoc import *
import geopandas as gpd
import pandas as pd
from git import Repo
import json
from fastcore import *
from fastcore.basics import patch
from fastcore.test import *
import sys
import topojson as tp
import pickle

In [None]:
#| hide
# Find the repository root by searching upwards from the current directory
repo = Repo('.', search_parent_directories=True)
fp = str(repo.working_tree_dir) + "/testData/"
# Raw 2009 constituency census table, compared against the merged data in the tests below
const_df = pd.read_csv(''.join([fp, 'sol_census_', 'constituency', '_', '2009', '.csv']))
# TODO: check that the files exist using fastcore (both census and geo)

## Solomon Geospatial Data
> Load the geography and census data
### Geography Data
Solomon Islands geography data is organised at the following levels:
 - adm0 - The country as a whole, Solomon Islands
 - adm1 - Also referred to as the province e.g. Honiara, Malaita
 - adm2 - The Constituency e.g. Central Honiara
 - adm3 - Ward, the smallest geography I am reporting. E.g. Cruz

### Census Data
 Solomon Islands census data has been used from the 2009 and 2019 censuses. For the respective census:
2009
 - We have the total population for each of the administrative regions
2019
 - There is only data available down to the province level

In [None]:
#| export
class SolomonGeo:
    # TODO work out how to format the attributes
    # Look at nbdev docs maybe?
    # TODO change all data to int?
    # TODO - should I make this a dataclass for the auto functionality? potentially should try it out
    '''
    Load the Solomon Islands geography data 
    Attributes:
        geo_df    Geopandas dataframe containing geographies and census data
        census_vars    A list of census variables in the dataset 
    '''
    def __init__(self, 
                geo_df:gpd.GeoDataFrame): # A geopandas dataset containing population and geography boundaries for each aggregation
        self.geo_df = geo_df

        # Save a list of census variables: every column that is not
        # the geometry or one of the bookkeeping columns
        col_ignore = ['geometry', 'id', 'agg', 'year']
        self.census_vars = list(geo_df.drop(columns = col_ignore).columns)

    @classmethod
    def read_test(cls,
                 ): # A SolomonGeo object built from the local test data
        '''
        Initialise the object using the local testing data.

        Reads the 2009 census and geography files for the ward, constituency
        and province aggregations, cleans and merges each pair, stacks the
        results and simplifies the combined geometry.
        '''
        year = '2009'
        # Extract and clean each aggregation in turn; the order here is the
        # row order of the concatenated dataset
        frames = []
        for aggregation in ('ward', 'constituency', 'province'):
            df, geo = cls.extract_from_file(aggregation, year)
            frames.append(cls.transform(aggregation, year, df, geo))

        # Append the datasets together
        geo_df = pd.concat(frames)

        # simplify the geography, use topo to preserve the topology between shapes
        # NOTE(review): 360/43200 (~0.0083) is the simplification tolerance,
        # presumably in degrees - confirm against the topojson documentation
        topo = tp.Topology(geo_df, prequantize=False)
        geo_df = topo.toposimplify(360/43200).to_gdf()

        return cls(
            geo_df = geo_df
        )

    @classmethod
    def extract_from_file(cls, 
                            aggregation:str, # Indicates the aggregation of the data
                            year:str, # The year of that data, only relevant for census data
                 ) -> (pd.DataFrame, 
                      gpd.GeoDataFrame): # Returns input pandas and geopandas datasets
        '''
        Extract and return input datasets from file.

        Both files are read from the `testData` folder at the root of the
        enclosing git repository: `sol_census_<aggregation>_<year>.csv` and
        `sol_geo_<aggregation>.json`.
        '''
        repo = Repo('.', search_parent_directories=True)
        pw = str(repo.working_tree_dir) + "/testData/"
        return (
            pd.read_csv(pw + 'sol_census_' + aggregation + '_' + year + '.csv'), 
            gpd.read_file(pw + 'sol_geo_' + aggregation + '.json')
        )

    @classmethod
    def transform(cls, 
            aggregation:str, # Indicates the aggregation of the data
            year:str, # The year of that data, only relevant for census data
            df:pd.DataFrame, # Uncleaned input census dataset
            geo:gpd.GeoDataFrame, # Uncleaned input geospatial dataset
           )-> gpd.GeoDataFrame: # The geopandas dataset for given aggregation
        '''
        Transform given raw input dataset into a cleaned and combined geopandas dataframe.

        Neither `df` nor `geo` is modified; all cleaning happens on copies.
        The result is indexed by `geo_name` and holds `id`, `geometry`, `agg`,
        `year` followed by the remaining census columns.
        '''
        # Clean the geospatial dataframe
        # Work on a copy so the caller's dataframe keeps its original column names
        geo = geo.copy()
        # Rename columns and keep only necessary ones, Note that id can be province id, constituency id etc.
        geo.columns = geo.columns.str.replace(r'^[a-zA-Z]+name$', 'geo_name', case = False, regex = True)
        # TODO this assumes the key column is the first one (which so far it is...)
        geo = geo.rename(columns = {geo.columns[0]:'id'})
        # Dropping geo_name from the geography dataset and relying on census data naming.
        # assign() returns a new frame, which avoids writing into a slice of `geo`
        # Add a column that indicates level of aggregation and one for the year
        geo = geo.loc[:, ['id', 'geometry']].assign(agg = aggregation, year = year)
        
        # Clean the census data (dropna returns a new frame, so `df` is left untouched)
        df = df.dropna()
        # Rename columns to be consistent across geography
        df.columns = df.columns.str.replace(r'^[a-zA-Z]+_name$', 'geo_name', case = False, regex = True)
        # id needs to change types twice so that it is a string of an int
        df = df.astype({'id': 'int', 'male_pop':'int', 'female_pop':'int', 'total_pop':'int'})
        df = df.astype({'id': 'str'})
        
        # Merge the data together
        geo_df = geo.merge(df, on=['id']).set_index("geo_name")
        return geo_df

    @classmethod
    def load_pickle(cls,
                    folder:str, #file path of the folder to save in
                    file_name:str = 'sol_geo.pickle' # file name of the saved class
                 ):
        '''
        Initialise the object from a saved filepath.

        The path is built as `<repo root> + folder + file_name`, so `folder`
        needs leading and trailing slashes (e.g. '/testData/').
        '''
        repo = Repo('.', search_parent_directories=True)
        pw = str(repo.working_tree_dir) + folder + file_name
        
        # NOTE(security): unpickling can execute arbitrary code, only load
        # pickle files that were written by this project
        with open(pw, 'rb') as f:
            tmp_geo = pickle.load(f)

        # The pickle holds the instance __dict__; rebuilding through the
        # constructor recomputes census_vars from geo_df
        return cls(
            geo_df = gpd.GeoDataFrame(tmp_geo['geo_df'])
        )
        

In [None]:
#| hide
# Render the nbdev documentation for the SolomonGeo class
show_doc(SolomonGeo)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L20){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo

>      SolomonGeo (geo_df:geopandas.geodataframe.GeoDataFrame)

Load the solomon islands geography data 
Attributes:
    geo_df    Geopandas dataframe containing geographies and census data
    census_vars    A list of census variables in the dataset

|    | **Type** | **Details** |
| -- | -------- | ----------- |
| geo_df | GeoDataFrame | A geopandas dataset containing population and geography boundaries for each aggregation |

In [None]:
#| hide
# Render the nbdev documentation for the read_test constructor
show_doc(SolomonGeo.read_test)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L35){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.read_test

>      SolomonGeo.read_test ()

Initialise the object using the local testing data

## Save and Load

### Save Solomon Geo
Saves a SolomonGeo object as a pickle

In [None]:
#| export
@patch
def save_pickle(self:SolomonGeo,
              folder:str, #file path of the folder to save in
                file_name:str = 'sol_geo.pickle' # file name of the saved class
             ):
    '''
    Save a pickle of the SolomonGeo class.

    The instance `__dict__` is pickled to `<repo root> + folder + file_name`,
    so `folder` needs leading and trailing slashes (e.g. '/testData/').
    The ward level geojson is also written to `<repo root>/assets/sol_geo.json`.
    '''
    repo = Repo('.', search_parent_directories=True)
    pw = str(repo.working_tree_dir) + folder + file_name
    
    # The context manager closes the file even if pickling raises
    with open(pw, 'wb') as f:
        pickle.dump(self.__dict__, f, 2)

    # For now I will also save the geography in an assets folder
    # TODO update this process in future - may need to save elsewhere
    # TODO I think I need to save in multiple spots
    pw_asset = str(repo.working_tree_dir) + "/assets/sol_geo.json"
    with open(pw_asset, 'w') as f:
        json.dump(self.get_geojson(agg_filter = 'ward'), f)


In [None]:
# Render the nbdev documentation for save_pickle
show_doc(SolomonGeo.save_pickle)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L137){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.save_pickle

>      SolomonGeo.save_pickle (folder:str, file_name:str='sol_geo.pickle')

Save a pickle of the SolomonGeo class

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| folder | str |  | file path of the folder to save in |
| file_name | str | sol_geo.pickle | file name of the saved class |

In [None]:
# Render the nbdev documentation for load_pickle
show_doc(SolomonGeo.load_pickle)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L113){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.load_pickle

>      SolomonGeo.load_pickle (folder:str, file_name:str='sol_geo.pickle')

Initialise the object from a saved filepath

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| folder | str |  | file path of the folder to save in |
| file_name | str | sol_geo.pickle | file name of the saved class |

## Get Geo JSON
A getter method for the geometry portion of the dataset that returns a GeoJSON formatted geography. 

It only includes the geography and location name as id

In [None]:
#| export
@patch
def get_geojson(self:SolomonGeo, 
                agg_filter:str = None, # Filters the geojson to the requested aggregation 
               ) -> dict: # Geo JSON formatted dataset
    '''
    Return the geometry of the dataset as a Geo JSON formatted dictionary.

    When `agg_filter` is given, only rows whose `agg` column equals it are kept.
    '''
    if agg_filter is None:
        selected = self.geo_df
    else:
        at_level = self.geo_df['agg'] == agg_filter
        selected = self.geo_df.loc[at_level, :]
    # Only the geometry column is serialised, to minimise the html size
    geometry_only = selected.loc[:, ['geometry']]
    return json.loads(geometry_only.to_json())

In [None]:
#| hide
# Render the nbdev documentation for get_geojson
show_doc(SolomonGeo.get_geojson)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L161){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.get_geojson

>      SolomonGeo.get_geojson (agg_filter:str=None)

A getter method for the SolomonGeo class that returns a Geo JSON formatted dataset

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| agg_filter | str | None | Filters the geojson to the requested aggregation |
| **Returns** | **dict** |  | **Geo JSON formatted dataset** |

In [None]:
#| export
@patch
def get_df(self:SolomonGeo, 
                agg_filter:str = None, # Filters the dataframe to the requested aggregation 
               ) -> pd.DataFrame: # Pandas Dataframe containing population data
    '''
    A getter method for the SolomonGeo class that returns a plain pandas dataset
    holding only the census columns: `geometry`, `id`, `agg` and `year` are dropped.
    When `agg_filter` is given, only rows whose `agg` column equals it are kept.
    '''
    # TODO check that filter is valid
    if agg_filter is None:
        selected = self.geo_df
    else:
        selected = self.geo_df.loc[self.geo_df['agg'] == agg_filter, :]
    # Drop everything except the census data to minimise the html size
    non_census = ['geometry', 'id', 'agg', 'year']
    census_only = selected.drop(columns = non_census)
    return pd.DataFrame(census_only)

In [None]:
#| hide
# Render the nbdev documentation for get_df
show_doc(SolomonGeo.get_df)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L175){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.get_df

>      SolomonGeo.get_df (agg_filter:str=None)

A getter method for the SolomonGeo class that returns a pandas dataset containg
the id variable and the total population variable. This is the minimal data required
to display on the map.

|    | **Type** | **Default** | **Details** |
| -- | -------- | ----------- | ----------- |
| agg_filter | str | None | Filters the dataframe to the requested aggregation |
| **Returns** | **DataFrame** |  | **Pandas Dataframe containing population data** |

# Testing

In [None]:
# Build a SolomonGeo object from the local test data; reused by the cells below
sol_geo = SolomonGeo.read_test()

In [None]:
# Display the combined geography and census dataframe
sol_geo.geo_df

Unnamed: 0,geometry,id,agg,year,male_pop,female_pop,total_pop
Nggosi,"MULTIPOLYGON (((159.94071 -9.42825, 159.91806 ...",1001,ward,2009,5240,4822,10062
Mbumburu,"MULTIPOLYGON (((159.95194 -9.43454, 159.93912 ...",1002,ward,2009,1912,1713,3625
Rove - Lengakiki,"MULTIPOLYGON (((159.95162 -9.42894, 159.94071 ...",1003,ward,2009,1464,1149,2613
Cruz,"MULTIPOLYGON (((159.95288 -9.43426, 159.95311 ...",1004,ward,2009,125,107,232
Vavaea,"MULTIPOLYGON (((159.96243 -9.43309, 159.95694 ...",1005,ward,2009,3788,3208,6996
...,...,...,...,...,...,...,...
Rennell-Bell,"MULTIPOLYGON (((160.00120 -11.46231, 159.98697...",5,province,2009,1549,1492,3041
Guadalcanal,"MULTIPOLYGON (((160.85478 -9.83830, 160.85295 ...",6,province,2009,48283,45330,93613
Malaita,"MULTIPOLYGON (((161.39845 -9.57862, 161.39904 ...",7,province,2009,69232,68364,137596
Makira-Ulawa,"MULTIPOLYGON (((162.45571 -10.88208, 162.43361...",8,province,2009,20789,19630,40419


Save and Load

In [None]:
# Pickle the object into <repo root>/testData/ (this also writes assets/sol_geo.json)
sol_geo.save_pickle('/testData/')

In [None]:
# Rebuild a SolomonGeo object from the pickle in <repo root>/testData/
SolomonGeo.load_pickle('/testData/')

<__main__.SolomonGeo>

In [None]:
# TODO work out how to check that the saved and loaded objects are the same, as a test
# Might need to write an equality function

In [None]:
# TODO should do a count check for the geojson similar to this
# Count the non-null values in each column for the constituency rows
sol_geo.geo_df[sol_geo.geo_df['agg'] == 'constituency'].count()

geometry      50
id            50
agg           50
year          50
male_pop      50
female_pop    50
total_pop     50
dtype: int64

In [None]:
# TODO filter and test equality?
# Fetch the ward level geojson; nothing is asserted on the result in this cell
test = sol_geo.get_geojson(agg_filter = 'ward')

The sum after merging and filtering should equal the sum from the raw dataset.

In [None]:
# The constituency total population after merging must equal the total in the raw census file
test_eq(sol_geo.get_df(agg_filter = 'constituency')['total_pop'].sum(), const_df['total_pop'].sum())

In [None]:
# TODO - test that there are enough rows - don't drop any during merging...

In [None]:
# Column totals of the census variables at the province level
sol_geo.get_df(agg_filter = 'province').sum()

male_pop      264455
female_pop    251415
total_pop     515870
dtype: int64

In [None]:
#| hide
# Run nbdev's export step for this project's notebooks
import nbdev; nbdev.nbdev_export()