# Load Data

> Functions that load the data for the map.   

In [None]:
#| default_exp load_data

In [None]:
#| export
from nbdev.showdoc import *
import geopandas as gpd
import pandas as pd
from git import Repo
import json

In [None]:
#| hide
# Find the enclosing git repository (searching upwards from the current
# directory) and build the path of its local test data folder.
repo = Repo('.', search_parent_directories=True)
fp = f"{repo.working_tree_dir}/testData/"

# Check that the files exist using fastcore (both census and geo)

## Load the geography and census data
> Solomon Islands geography data is organised at the following levels:
> - adm0 - The country as a whole, Solomon Islands
> - adm1 - Also referred to as the province, e.g. Honiara, Malaita
> - adm2 - The constituency, e.g. Central Honiara
> - adm3 - Ward, the smallest geography I am reporting, e.g. Cruz
>
> Solomon Islands census data from the 2009 and 2019 censuses has been used. For the respective censuses:
> 2009
> - We have the total population for each of the administration regions
>
> 2019
> - There is only data available down to the province level

In [None]:
#| export
class SolomonGeo:
    # TODO work out how to format the attributes
    # Look at nbdev docs maybe?
    # TODO change all data to int?
    # TODO - should I make this a dataclass for the auto functionaliy? potentially should try it out
    '''
    Load the Solomon Islands geography and census data.

    Attributes:
        geo_df    Geopandas dataframe holding the geometry and census columns
                  for one level of aggregation, indexed by `geo_name`.
    '''
    def __init__(self, 
                geo_df): # A geopandas dataset containing population and geography boundaries for each aggregation
        self.geo_df = geo_df

    @classmethod
    def _test_data_dir(cls,
                      )-> str: # Path of the local test data folder, ending in a slash
        '''
        Locate the `testData` folder at the root of the enclosing git repository
        '''
        # Search upwards from the current directory so this works from any
        # sub-folder of the repository.
        repo = Repo('.', search_parent_directories=True)
        return str(repo.working_tree_dir) + "/testData/"

    @classmethod
    def read_test(cls,
                 )-> 'SolomonGeo': # An object wrapping the 2009 ward level test data
        '''
        Initialise the object using the local testing data
        '''
        # TODO - need to pass a filepath here
        ward_df = cls.elt('ward', '2009')
        #cls.adm3 = cls.elt('constituency', '2009')

        return cls(geo_df = ward_df)

    @classmethod
    def extract_from_file(cls, 
                            aggregation:str, # Indicates the aggregation of the data
                            year:str, # The year of that data, only relevant for census data
                 )-> (pd.DataFrame, 
                      gpd.GeoDataFrame): # Returns input pandas and geopandas datasets
        '''
        Extract and return input datasets from file
        '''
        pw = cls._test_data_dir()
        # The files are returned exactly as read, with no renaming or cleaning.
        census = pd.read_csv(pw + 'sol_census_' + aggregation + '_' + year + '.csv')
        geo = gpd.read_file(pw + 'sol_geo_' + aggregation + '.json')
        return (census, geo)

    @classmethod
    def elt(cls, 
            aggregation:str, # Indicates the aggregation of the data
            year:str, # The year of that data, only relevant for census data
           )-> gpd.GeoDataFrame: # The geopandas dataset for given aggregation
        '''
        Load the geography and census test files for an aggregation and merge
        them into a single geopandas dataframe indexed by `geo_name`
        '''
        pw = cls._test_data_dir()

        geo = cls.load_geo(pw + 'sol_geo_' + aggregation + '.json')
        census = cls.load_census(pw + 'sol_census_' + aggregation + '_' + year + '.csv')
        # Add a column that indicates level of aggregation
        geo['agg'] = aggregation
        # Default inner merge: only rows whose id and name match in both
        # datasets are kept.
        merged = geo.merge(census, on=['id', 'geo_name'])
        return merged.set_index("geo_name")

    @classmethod
    def load_geo(cls, pw:str, # The pathway to the dataset
           )-> gpd.GeoDataFrame: # The GeoJSON dataset for given aggregation
        '''
        Load and transform given filepath into a GeoJSON geopandas dataframe
        '''
        geo = gpd.read_file(pw)
        # Rename columns and keep only necessary ones.
        # Note that id can be province id, contsituency id etc.
        geo.columns = geo.columns.str.replace(r'^[a-zA-Z]+name$', 'geo_name', case = False, regex = True)
        # The first column of the file is treated as the identifier.
        geo = geo.rename(columns = {geo.columns[0]:'id'})
        # Return an explicit copy so that callers adding columns work on an
        # independent frame rather than on a slice of the loaded data.
        return geo[['id', 'geo_name', 'geometry']].copy()

    @classmethod
    def load_census(cls, pw:str, # Pathway of the dataset
           )-> pd.DataFrame: # A pandas dataframe
        '''
        Load and transform data from filepath into pandas dataset
        '''
        # Remove any rows with missing values
        df = pd.read_csv(pw).dropna()
        # Rename columns to be consistent across geography
        df.columns = df.columns.str.replace(r'^[a-zA-Z]+_name$', 'geo_name', case = False, regex = True)
        # Store id as a string of the whole number (e.g. 1001.0 -> '1001')
        df['id'] = df['id'].astype(int).astype(str)
        return df

    # TODO add this as a @patch method to seperate out
    def get_geojson(self,
                   ) -> dict: # Geo JSON formatted dataset
        '''
        A getter method for the GeoDataFrame that returns a Geo JSON
        '''
        # `to_json` gives a string; parse it so callers receive a dictionary.
        return json.loads(self.geo_df.to_json())


In [None]:
#| hide
# Render the documentation entry for the class itself.
show_doc(SolomonGeo)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L14){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo

>      SolomonGeo (geo_df)

Load the solomon islands geography data 
Attributes:
    adm3    Geopandas dataframe containing admin 3 geographies.

|    | **Details** |
| -- | ----------- |
| geo_df | A geopandas dataset containing population and geography boundaries for each aggregation |

In [None]:
#| hide
# Render the documentation entry for the test-data constructor.
show_doc(SolomonGeo.read_test)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L29){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.read_test

>      SolomonGeo.read_test ()

Initialise the object using the local testing data

In [None]:
#| hide
# Render the documentation entry for the Geo JSON getter.
show_doc(SolomonGeo.get_geojson)

---

[source](https://github.com/Gippers/SolomonIslandsDataMap/blob/main/SolomonIslandsDataMap/load_data.py#L90){target="_blank" style="float:right; font-size:smaller"}

### SolomonGeo.get_geojson

>      SolomonGeo.get_geojson ()

A getter method for the GeoDataFrame that returns a Geo JSON

# Testing

In [None]:
# Build the object from the local test data (ward level, 2009 census).
sol_geo = SolomonGeo.read_test()

In [None]:
# Display the merged geography and census dataframe.
sol_geo.geo_df

Unnamed: 0_level_0,id,geometry,agg,male_pop,female_pop,total_pop
geo_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Nggosi,1001,"MULTIPOLYGON (((159.92475 -9.42335, 159.92641 ...",ward,5240,4822,10062
Mbumburu,1002,"MULTIPOLYGON (((159.94507 -9.43466, 159.94519 ...",ward,1912,1713,3625
Rove - Lengakiki,1003,"MULTIPOLYGON (((159.94362 -9.42779, 159.94382 ...",ward,1464,1149,2613
Cruz,1004,"MULTIPOLYGON (((159.95288 -9.43426, 159.95284 ...",ward,125,107,232
Vavaea,1005,"MULTIPOLYGON (((159.95970 -9.42874, 159.95877 ...",ward,3788,3208,6996
...,...,...,...,...,...,...
Duff Islands,913,"MULTIPOLYGON (((167.24674 -9.93260, 167.24704 ...",ward,262,249,511
Utupua,914,"MULTIPOLYGON (((166.50509 -11.30801, 166.50353...",ward,586,582,1168
Vanikoro,915,"MULTIPOLYGON (((166.98326 -11.67945, 166.98327...",ward,625,668,1293
Tikopia,916,"MULTIPOLYGON (((168.84036 -12.28504, 168.84096...",ward,604,681,1285


In [None]:
#| hide
# Export the cells marked `#| export` to the library module.
import nbdev

nbdev.nbdev_export()