# BII project land use and intensity
This notebook contains the code for pre-processing the variables in to the required format and the subsequent extraction of the data.

1. Step 1: A 1km and 8km grid is defined.
2. Step 2: Each variable is imported, filtered and scaled. If area is required, converted to area per catgeory per 1x1 km cell. If mean variable is required, the mean value per variable per 1x1 km block is obtained.
3. Step 3: GEEML is used to extract the 1x1km data for each 1x1km grid cell.
4. Step 4: The Expert decision tree is applied for each 1x1km cell in python.
5. Step 5: The intensity scores are derived by first noralising all variables and thereafter, weighting them equally (1/#vars)

## Setup

In [1]:
import os
from datetime import datetime

import ee
# ee.Authenticate()
ee.Initialize(opt_url='https://earthengine-highvolume.googleapis.com')
import geemap, eerepr
from tqdm.auto import tqdm
from geeml.utils import createGrid, getCountry, eeprint
from geeml.extract import extractor

In [2]:
# get sub-saharan countries
# https://code.earthengine.google.com/443ea31edb2f25f58a3346e7dc8c1064
countries = ee.FeatureCollection("USDOS/LSIB/2017")
c2 = ee.FeatureCollection("USDOS/LSIB_SIMPLE/2017")

africa = c2.filter(ee.Filter.eq('wld_rgn', 'Africa'))
sub = countries.filterBounds(africa).aggregate_array('COUNTRY_NA')\
.removeAll(['Algeria', 'Egypt', 'Libya', 'Morocco', 'Tunisia', 'Spain [Canary Is]', 'Spain [Plazas de Soberania]', 'Portugal [Madeira Is]'])

sub_africa = countries.filter(ee.Filter.inList('COUNTRY_NA', sub))

In [3]:
country_list = sub_africa.aggregate_array('COUNTRY_NA').getInfo()

dd = r"C:\Users\coach\myfiles\miscellenous\hayley\outputs"

config = {}
for country in country_list:
    csavepath = f"{dd}/{country}"
    caoi = sub_africa.filter(ee.Filter.eq('COUNTRY_NA', country))
    config[country] = {'dd':csavepath,
                       'aoi':caoi}
print(config.keys())

dict_keys(['Benin', 'Niger', 'Nigeria', 'Djibouti', 'Equatorial Guinea', 'Eritrea', 'Ethiopia', 'Gabon', 'Gambia, The', 'Mozambique', 'Namibia', 'Gaza Strip (disp)', 'Ghana', 'Guinea', 'Guinea-Bissau', 'Kenya', 'Sierra Leone', 'Koualou (disp)', 'Lesotho', 'Liberia', 'Madagascar', 'Malawi', 'Mali', 'Mauritania', 'Mayotte (Fr)', 'Rwanda', 'Sao Tome & Principe', 'Senegal', 'Somalia', 'South Sudan', 'Sudan', 'Swaziland', 'Tanzania', 'Togo', 'Uganda', 'Western Sahara (disp)', 'Zambia', 'Zimbabwe', 'Israel', 'Abyei (disp)', 'Angola', 'Botswana', 'Comoros', 'Congo, Dem Rep of the', 'Congo, Rep of the', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon', "Cote d'Ivoire", 'Central African Rep', 'Chad', 'South Africa'])


In [5]:
country = 'South Africa'

# Get current date- used for saving files
date = datetime.today().strftime('%d%m%Y')

# Set path for saving data
dd = config.get(country).get('dd')
if not os.path.exists(dd):
    os.makedirs(dd)
    os.chdir(dd)
else:
    os.chdir(dd)
    
print(os.getcwd())

C:\Users\coach\myfiles\miscellenous\hayley\outputs\South Africa


In [6]:
# Sanity check that the correct country is selected
Map = geemap.Map()
aoi = config.get(country).get('aoi')
Map.centerObject(aoi, 3)
Map.addLayer(aoi)
Map

Map(center=[-28.97574975972159, 25.201998135703736], controls=(WidgetControl(options=['position', 'transparent…

## Data extraction

In [7]:
# Define projection
wkt = '\
PROJCS["Africa_Albers_Equal_Area_Conic",\
    GEOGCS["GCS_WGS_1984",\
    DATUM["WGS_1984",\
    SPHEROID["WGS_1984",6378137,298.257223563]],\
    PRIMEM["Greenwich",0],\
    UNIT["Degree",0.017453292519943295]],\
    PROJECTION["Albers_Conic_Equal_Area"],\
    PARAMETER["False_Easting",0],\
    PARAMETER["False_Northing",0],\
    PARAMETER["longitude_of_center",25],\
    PARAMETER["Standard_Parallel_1",20],\
    PARAMETER["Standard_Parallel_2",-23],\
    PARAMETER["latitude_of_center",0],\
    UNIT["Meter",1],\
    AUTHORITY["EPSG","102022"]]'

# ALbers equal area for africa
epsg_102022 = ee.Projection(wkt)

grid1km, _ = createGrid(1000, aoi, crs = epsg_102022)
grid8km, _ = createGrid(8000, aoi, crs = epsg_102022)
grid1kmr, _ = createGrid(1000, aoi, vect= False, crs = epsg_102022)
# Create a 8km raster with unique id
grid8kmr, _ = createGrid(8000, aoi, vect = False, crs = epsg_102022)

In [8]:
# Urban cover (10m>>1km) 2019
area = ee.Image.pixelArea().divide(1e6).clip(aoi.geometry().buffer(5000))
urban_cover = ee.ImageCollection("projects/sat-io/open-datasets/WSF/WSF_2019").filterBounds(aoi)\
.mosaic().eq(255).unmask(0)
urbanAreaImage = area.multiply(urban_cover).rename('areakm2_urban')

# Crop cover (30 m)
crop_cover = ee.ImageCollection("users/potapovpeter/Global_cropland_2019").filterBounds(aoi).mosaic()
cropAreaImage = area.multiply(crop_cover).rename('areakm2_cropCover')

# Protected area
prot_areas = ee.FeatureCollection("projects/ee-geethensingh/assets/WDPA_Africa_strict").filterBounds(aoi)
protectedAreaImage = area.clipToCollection(prot_areas).unmask(0).rename('areakm2_protArea')    

# Plantation and tree crop
result = ee.ImageCollection("users/liuzhujun/SDPT_NEW").mosaic()
plantag = ee.Image("users/duzhenrong/SDPT/sdpt_plantag")
china_plantyear = ee.ImageCollection("users/liuzhujun/SDPT_China").mosaic().rename('plantyear')
DescalesOP = ee.ImageCollection("users/liuzhujun/Descales").mosaic()
op = DescalesOP.updateMask(DescalesOP.gt(1980)).rename('plantyear')
sdpt_name = ee.Image("users/duzhenrong/SDPT/sdpt_name")

china_plantyear=china_plantyear.updateMask(china_plantyear.gt(1980))
result=result.rename('plantyear').updateMask((sdpt_name.lt(120).And(sdpt_name.gt(0))).Or(sdpt_name.gt(129))).toInt32()

plantationForest=ee.ImageCollection([result.updateMask(plantag.eq(1)),china_plantyear]).mosaic().gt(0).unmask(0) 
plantationAreaImage = area.updateMask(plantationForest).rename('areakm2_plantation')
treeCrop=result.multiply(plantag.eq(2))
treeCrop =ee.ImageCollection([treeCrop,op]).mosaic().gt(0)  
treeCropAreaImage = area.multiply(treeCrop).rename('areakm2_treeCrop')

#Soil nutrients
Soil_Lownutrient = ee.FeatureCollection("projects/ee-geethensingh/assets/Bell_1982_nutrient_map")\
.filter(ee.Filter.eq('Nut_status', 'Low'))
Soil_Mednutrient = ee.FeatureCollection("projects/ee-geethensingh/assets/Bell_1982_nutrient_map")\
.filter(ee.Filter.eq('Nut_status', 'Medium'))
Soil_Highnutrient = ee.FeatureCollection("projects/ee-geethensingh/assets/Bell_1982_nutrient_map")\
.filter(ee.Filter.eq('Nut_status', 'High'))

soilLowNutrientAreaImage = area.clipToCollection(Soil_Lownutrient).unmask(0).rename('areakm2_slowNutrArea')
soilMedNutrientAreaImage = area.clipToCollection(Soil_Mednutrient).unmask(0).rename('areakm2_sMedNutriArea')
soilHighNutrientAreaImage = area.clipToCollection(Soil_Highnutrient).unmask(0).rename('areakm2_sHighNutriArea')

# Population Density
popDensity = ee.ImageCollection("CIESIN/GPWv411/GPW_UNWPP-Adjusted_Population_Density")\
.select('unwpp-adjusted_population_density').filterBounds(aoi).mosaic().rename('popDensity').unmask(0)
                                       
# Grazing Intensity
area = ee.Image('projects/ee-geethensingh/assets/GrazingDensity/GI_8_Areakm').unmask(0)
cattleDensity = ee.Image('projects/ee-geethensingh/assets/GrazingDensity/5_Ct_2010_Da').divide(area).rename('cattleDensity').unmask(0)
sheepDensity = ee.Image('projects/ee-geethensingh/assets/GrazingDensity/5_Sh_2010_Da').divide(area).rename('sheepDensity').unmask(0)
goatDensity = ee.Image('projects/ee-geethensingh/assets/GrazingDensity/5_Gt_2010_Da').divide(area).rename('goatDensity').unmask(0)

# Precipitation
years = list(range(1991,2021))
precipitation = ee.ImageCollection("UCSB-CHG/CHIRPS/DAILY").filterBounds(aoi)  
precipitation = ee.Image(ee.ImageCollection(ee.List(years).map(lambda year: precipitation\
.filterDate(ee.Number(year).format(), ee.Number(year).add(1).format()).sum())).mean().rename('mm_precipitation')).unmask(0)

# Nitrogen input
proj = ee.Projection('EPSG:4326').scale(0.25,0.25)
grid = aoi.geometry().coveringGrid(proj)

nInput = ee.FeatureCollection("projects/ee-geethensingh/assets/Nfur_15arcmins_transformed").filterBounds(aoi)\
.map(lambda x: x.set("Nfer_type",ee.Algorithms.ObjectType(x.get("Nfer_kgha_"))))
nInputStr = nInput.filter(ee.Filter.eq('Nfer_type', 'String')).map(lambda x: x.set("Nfer_kgha_",ee.Number.parse(x.get("Nfer_kgha_")).toFloat()))
nInputNumber = nInput.filter(ee.Filter.neq('Nfer_type', 'String')).map(lambda x: x.set("Nfer_kgha_",ee.Number(x.get("Nfer_kgha_")).toFloat()))

nInput = nInputStr.merge(nInputNumber)

def pointToGrid(ft):
    feat = ee.Feature(ft);
    first = nInput.filterBounds(feat.geometry()).aggregate_first('Nfer_kgha_');
    return feat.set('Nfer_kgha_', first);

nInput = grid.filterBounds(nInput).map(pointToGrid)\
.reduceToImage(**{'properties': ee.List(["Nfer_kgha_"]), 'reducer': ee.Reducer.first()})\
.rename('Nfer_kgha').unmask(0)

# FieldSize
fieldSize = ee.FeatureCollection("projects/ee-geethensingh/assets/dominant_field_sizes").filterBounds(aoi)\
.filter(ee.Filter.neq('field_size', 'NA')).map(lambda x: x.set("field_size_",ee.Number.parse(x.get("field_size"))))

def maxSize(ft):
    feat = ee.Feature(ft);
    max_ = fieldSize.filterBounds(feat.geometry()).aggregate_max('field_size_');
    return feat.set('max_fieldSize', max_);

fieldSize = grid1km.filterBounds(fieldSize).map(maxSize).reduceToImage(**{
  'properties': ['max_fieldSize'],
  'reducer': ee.Reducer.first()
}).rename('fieldSize');

In [None]:
# Get current date- used for saving files
date = datetime.today().strftime('%d%m%Y')

# # Extract data
sumCovariates = cropAreaImage.addBands([protectedAreaImage, soilLowNutrientAreaImage,\
                                    soilMedNutrientAreaImage, soilHighNutrientAreaImage,\
                                    plantationAreaImage, treeCropAreaImage])
# extract 30m datasets for 1km grid using sum reducer
extractor(sumCovariates, aoi.geometry(), scale = 30, dd = dd, target = grid1km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.sum(), gridSize = 100000, batchSize = 3500, filename = 'sum1_1km.csv')

# Extract 10m datasets for 1km grid using sum reducer
extractor(urbanAreaImage, aoi.geometry(), scale = 10, dd = dd, target = grid1km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.sum(), gridSize = 100000, batchSize = 3500, filename = 'sum2_1km.csv')

# Extract 30m datatsets for 1km grid using sum reducer
extractor(sumCovariates, aoi.geometry(), scale = 30, dd = dd, target = grid8km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.sum(), gridSize = 100000, batchSize = 3500, filename = 'sum1_8km.csv')

#Extract 10m datasets for 8km grid using sum reducer
extractor(urbanAreaImage, aoi.geometry(), scale = 10, dd = dd, target = grid8km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.sum(), gridSize = 100000, batchSize = 3500, filename = 'sum2_8km.csv')

fieldSize = ee.Image.pixelCoordinates(epsg_102022).addBands([fieldSize, grid1kmr.rename('id1km'), grid8kmr.rename('id8km')])

# extract field size and unique id's at 1km scale using 1km grid using first reducer
dd = os.getcwd()
extractor(fieldSize, aoi.geometry(), scale = 1000, dd = dd, target = grid1km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.first(), gridSize = 100000, batchSize = 3500, filename = 'fieldSize_1km.csv')

# extract field size and unique id's at 1km scale for 8km grid using first reducer
extractor(fieldSize, aoi.geometry(), scale = 1000, dd = dd, target = grid8km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.first(), gridSize = 100000, batchSize = 3500, filename = 'fieldSize_8km.csv')

# Extract precipitation and nitrogen Input
meanCovariates = precipitation.addBands([nInput, popDensity,\
                                        sheepDensity, goatDensity,cattleDensity])

extractor(meanCovariates, aoi.geometry(), scale = 1000, dd = dd, target = grid1km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.mean(), gridSize = 100000, batchSize = 3500, filename = 'mean_1km.csv')

extractor(meanCovariates, aoi.geometry(), scale = 1000, dd = dd, target = grid8km, crs = epsg_102022, num_threads = 25)\
.extractByGrid(reduce = True, reducer = ee.Reducer.mean(), gridSize = 100000, batchSize = 3500, filename = 'mean_8km.csv')

sum2_1km.csv: |                                                                        | [  0.0%] in 00:00 (et…

## Prepare data
## 1km

In [9]:
# For grid cells with crop_cover and no data, assign field size of nearest cell.
import math
import os

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from tqdm.auto import tqdm

from scipy.spatial import cKDTree
from shapely.geometry import Point

In [10]:
def ckdnearest(gdA, gdB):

    nA = np.array(list(gdA.geometry.apply(lambda x: (x.x, x.y))))
    nB = np.array(list(gdB.geometry.apply(lambda x: (x.x, x.y))))
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    gdB_nearest = gdB.iloc[idx].drop(columns="geometry").reset_index(drop=True)
    gdf = pd.concat(
        [
            gdA.reset_index(drop=True),
            gdB_nearest,
            pd.Series(dist, name='dist')
        ], 
        axis=1)

    return gdf

In [11]:
# #Field Size data
df = pd.read_csv(r"fieldSize_1km.csv", low_memory=False, header = None, index_col = False, skiprows=1)
df.columns = ['x', 'y', 'fieldSize', 'id1km', 'id8km', 'gid']

In [13]:
df2 = pd.read_csv(r"sum1_1km.csv", low_memory=False, header = None, index_col = False, skiprows=1)
df2.columns = ['areakm2_cropCover', 'areakm2_protArea',
       'areakm2_slowNutrArea', 'areakm2_sMedNutriArea',
       'areakm2_sHighNutriArea', 'areakm2_plantation', 'areakm2_treeCrop', 'gid']

df0 = pd.read_csv(r"sum2_1km.csv", low_memory=False, header = None, index_col = False, skiprows=1)
df0.columns = ['areakm2_urban', 'gid']

# Merge datasets into single datset based on matching GID column
df1 = pd.merge(df2, df0, on= 'gid', how = 'outer').fillna(0)

In [14]:
# Mean covariate data
dfmean = pd.read_csv(r"mean_1km.csv", low_memory=False, header = None, index_col = False, skiprows=1)
dfmean.columns = ['mean_precip','Nfer_kgha', 'popDensity', 'sheepDensity', 'goatDensity', 'cattleDensity', 'gid']
dfmean= dfmean[dfmean['gid']!= '245,-2948']

In [15]:
# Merge datasets into single datset based on matching GIS column
dfJoin = pd.merge(df1, dfmean, on= 'gid')
dfAll = pd.merge(dfJoin, df, on= 'gid').drop_duplicates().reset_index(drop=True)

In [16]:
# Fill in missing mean precipitation data
# by copying attribute of closest point
missing = dfAll.mean_precip.isna().sum()
print('The number of missing mean_precip values:', missing)
if missing>0:
    # Blocks with mean-precip data
    sdf2 = dfAll.loc[dfAll['mean_precip'].notna(), ['mean_precip', 'x', 'y']]
    gdf2 =  gpd.GeoDataFrame(sdf2.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(sdf2.x, sdf2.y))

    # All columns without join-attributes(s) column
    sdf1 = dfAll.loc[dfAll['mean_precip'].isna(), ['gid','x', 'y']]
    gdf1 =  gpd.GeoDataFrame(sdf1.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(sdf1.x, sdf1.y))

    joinNearest1 = ckdnearest(gdf1, gdf2)

    result1 = pd.DataFrame(joinNearest1)[['gid', 'mean_precip']]
    result1.columns = ['gid', 'mean_precip2']

    dfAll = pd.merge(dfAll, result1, how = 'outer',on = 'gid').drop_duplicates()
    dfAll['mean_precip'].fillna(dfAll['mean_precip2'], inplace = True)
    dfAll.drop('mean_precip2', axis=1, inplace=True)

The number of missing mean_precip values: 0


In [17]:
# select fieldSize column for cells that are cropcover or treeCrop
dfs1 = dfAll.loc[(dfAll['areakm2_cropCover']>0)|(dfAll['areakm2_treeCrop']>0), ['x', 'y', 'fieldSize', 'gid']]

# get rows with fieldsize data
dfs2 = dfs1.dropna().loc[
      (dfs1['fieldSize']!= 3507), ['fieldSize', 'x', 'y']]
# all rows without fieldsize column(including rows with data)
dfs3 = dfs1[['x', 'y', 'gid']]

# with attribute
gdf2 = gpd.GeoDataFrame(dfs2.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(dfs2.x, dfs2.y))

# Missing data
gdf3 = gpd.GeoDataFrame(dfs3.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(dfs3.x, dfs3.y))

In [18]:
joinNearest = ckdnearest(gdf3, gdf2)
result = pd.DataFrame(joinNearest)[['gid', 'fieldSize']]
result.columns = ['gid', 'fieldSize2']

finaldf = pd.merge(dfAll, result, how= 'outer',on = 'gid').drop_duplicates()
finaldf['fieldSize2'] = finaldf['fieldSize2'].fillna(0)
finaldf['fieldSize'] = finaldf['fieldSize2'].astype('int')

dfF = finaldf.drop('fieldSize2', axis=1).reset_index(drop=True)

In [19]:
dfF.loc[:,'id8km'] = dfF.loc[:,'id8km'].fillna(0)
dfF.loc[:,'id1km'] = dfF.loc[:,'id1km'].fillna(0)
dfF = dfF[dfF['areakm2_treeCrop']!= '2726,-3471']

# Format data types
dfF.loc[:,'areakm2_treeCrop'] = dfF.loc[:,'areakm2_treeCrop'].astype('float64')
dfF.loc[:,'gid'] = dfF.loc[:,'gid'].astype('string')
dfF.loc[:,'id8km'] = dfF.loc[:,'id8km'].astype('int64')
dfF.loc[:,'Nfer_kgha'] = dfF.loc[:,'Nfer_kgha'].astype('float64')
dfF.loc[:,'mean_precip'] = dfF.loc[:,'mean_precip'].astype('float32')

# Add scaled fieldSize
dfF['scaled_fieldSize'] = 0
dfF.loc[dfF['fieldSize'] == 3502, 'scaled_fieldSize'] = 1.0
dfF.loc[dfF['fieldSize'] == 3503, 'scaled_fieldSize'] = 0.8
dfF.loc[dfF['fieldSize'] == 3504, 'scaled_fieldSize'] = 0.6
dfF.loc[dfF['fieldSize'] == 3505, 'scaled_fieldSize'] = 0.4
dfF.loc[dfF['fieldSize'] == 3506, 'scaled_fieldSize'] = 0.2

In [20]:
# Fill in missing Nutrient data
# Copy attribute of closest point
df = dfF
df['maxN'] = df[['areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea']].max(axis=1)

missing = df[df.maxN==0].shape[0]
print('The number of missing soil nutrient data values:', missing)
if missing>0:
    # Blocks with Nutrient areas
    sdf2 = df.loc[df['maxN']>0, ['areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea', 'x', 'y']]
    gdf2 =  gpd.GeoDataFrame(sdf2.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(sdf2.x, sdf2.y))

    # All columns without join-attributes(s) column
    sdf1 = df[['gid', 'x', 'y']]
    gdf1 =  gpd.GeoDataFrame(sdf1.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(sdf1.x, sdf1.y))

    joinNearest1 = ckdnearest(gdf1, gdf2)

    result1 = pd.DataFrame(joinNearest1)[['gid', 'areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea']]
    result1.columns = ['gid', 'areakm2_slowNutrArea2','areakm2_sMedNutriArea2', 'areakm2_sHighNutriArea2']

    finaldf = pd.merge(df, result1, how = 'outer',on = 'gid')

The number of missing soil nutrient data values: 346895


In [21]:
# Export data
outfile = f'{country}_POC_{date}.feather'
finaldf.drop_duplicates(subset = ['x', 'y']).reset_index(drop=True).to_feather(outfile)

In [78]:
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

# We restrict to South Africa.
ax = world[world.continent == 'South Africa'].plot(
    color='white', edgecolor='black')

# plot ``GeoDataFrame``.
gdf.plot(ax=ax, color='red')
plt.show()

NameError: name 'gdf' is not defined

Error in callback <function _draw_all_if_interactive at 0x000001BD391E5F70> (for post_execute):


ValueError: cannot convert float NaN to integer

ValueError: cannot convert float NaN to integer

<Figure size 640x480 with 1 Axes>

## Decision tree

In [22]:
infile = f'{country}_POC_{date}.feather'
df = pd.read_feather(infile).dropna().reset_index(drop=True)

In [23]:
# Add small value to zero precip
finaldf.loc[finaldf.mean_precip==0, 'mean_precip'] = 0.1
# Round of to the nearest 400
max_round = math.ceil(max(finaldf['mean_precip']) / 400) * 400
cut_bins = list(range(0, max_round+1, 400))
# discretize precipitation into categories 
cut_labels = [str(item) for item in range(0, len(cut_bins)-1)]
finaldf["precipDiscrete"] = pd.cut(x = finaldf["mean_precip"].astype(float), bins= cut_bins, labels = cut_labels)
print(finaldf.precipDiscrete.value_counts())

# Identify the max nutrient area
finaldf['maxNutr'] = finaldf[['areakm2_slowNutrArea2','areakm2_sMedNutriArea2', 'areakm2_sHighNutriArea2']].idxmax(axis=1)

# Get unique category based on precip bucket and max nutrient category
finaldf['combination_LS_intensity'] = finaldf[['precipDiscrete','maxNutr']].agg(tuple, axis=1)
finaldf['lsIntensityCat'] = finaldf['combination_LS_intensity'].factorize()[0]

1    569599
0    564693
2     89293
3      1581
Name: precipDiscrete, dtype: int64


In [24]:
# Get unique combinations of precipitation and maxNutrient categories
combinations = list(set(finaldf['lsIntensityCat']))
print("Livestock Intensity categories (based on unique combinations of discretised precipitation and max nutrient category):",combinations)

# Compute the sum of sheep, goat and cattle density
finaldf['lsSum'] = finaldf[['sheepDensity', 'goatDensity', 'cattleDensity']].sum(axis=1)

# Scale data
finaldf['scaled_livestockIntensity']= 0
for combination in tqdm(combinations):
    qq99 = finaldf['lsSum'].quantile(0.99)
    print(f'The 99th percentile of sum density is: {qq99}')
    # apply min max scaling to all data less than 99th quantile per unique combination.
    scaler = MinMaxScaler()
    finaldf.loc[(finaldf['lsIntensityCat'] == combination)&(finaldf['lsSum']<=qq99), 'scaled_livestockIntensity']\
    = scaler.fit_transform(pd.DataFrame(finaldf.loc[(finaldf['lsIntensityCat'] == combination)&(finaldf['lsSum']<=qq99)\
                                                    , 'scaled_livestockIntensity']))
    # for data greater than 99th percentile per category, equal to max value 1
    finaldf.loc[(finaldf['lsIntensityCat'] == combination)&(finaldf['lsSum']>qq99), 'scaled_livestockIntensity'] = 1

Livestock Intensity categories (based on unique combinations of discretised precipitation and max nutrient category): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


  0%|          | 0/11 [00:00<?, ?it/s]

The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293
The 99th percentile of sum density is: 102.65250155634293


In [25]:
# Scale variables
colnames = ['Nfer_kgha', 'popDensity']

for col in colnames:
    # create scaled variable
    finaldf['scaled_'+col] = finaldf[col]
    # get 99th percentile
    qq99 = finaldf[col].quantile(0.99)
    print(col + ':', qq99)
    # apply min max scaling to all data less than 99th quantile.
    scaler = MinMaxScaler()
    finaldf.loc[finaldf[col]<=qq99, 'scaled_'+ col] = scaler.fit_transform(pd.DataFrame(finaldf.loc[finaldf[col]<=qq99, col]))
    # for data greater than 99th percentile, equal to max value 1
    finaldf.loc[finaldf[col]>qq99, 'scaled_' + col] = 1

Nfer_kgha: 21.672
popDensity: 894.8745574951172


In [26]:
#1= Urban
#2= Timber plantations
#3= Tree Croplands
#4= CropLands
#5= Protected areas
#6= Rangelands/Near-natural lands

def ExpertDT(df):

    #6= Rangelands/Near-natural lands
    df['Land_Use'] = 6

    #5= Protected areas
    df.loc[df['areakm2_protArea']>0.2, 'Land_Use'] = 5

    #4= CropLands
    df.loc[df['areakm2_cropCover']>0.2, 'Land_Use'] = 4

    #3= Tree Croplands
    df.loc[df['areakm2_treeCrop']>0.2, 'Land_Use'] = 3

    #2= Timber plantations
    df.loc[df['areakm2_plantation']>0.2, 'Land_Use'] = 2

    #1= Urban
    df.loc[(df['areakm2_urban']>0.2)|
           (df['popDensity']>1000) , 'Land_Use'] = 1
    return df
    
result = ExpertDT(finaldf)
print('The number of cells per landcover category')
result['Land_Use'].value_counts()

The number of cells per landcover category


6    1082684
4      74893
5      32481
1      16871
2      15238
3       2999
Name: Land_Use, dtype: int64

## Intensity

In [27]:
result['intensity'] = -1

#1= Urban
if result.loc[(result['Land_Use']==1), 'intensity'].shape[0]>0:
    scaler = MinMaxScaler()
    result.loc[(result['Land_Use']==1), 'intensity'] = scaler.fit_transform(pd.DataFrame(result.loc[(result['Land_Use']==1), 'areakm2_urban']))
    result.loc[(result['Land_Use']==1), 'intensity'] = (result['areakm2_urban']+\
                                                        result['scaled_popDensity'])/2

#3= Tree Croplands
if result.loc[(result['Land_Use']==3), 'intensity'].shape[0]>0:
    scaler = MinMaxScaler()
    result.loc[(result['Land_Use']==3), 'intensity'] = scaler.fit_transform(pd.DataFrame(result.loc[(result['Land_Use']==3), 'areakm2_treeCrop']))
    result.loc[(result['Land_Use']==3), 'intensity'] = (result['areakm2_treeCrop']+\
                                                        result['scaled_fieldSize']+\
                                                        result['scaled_Nfer_kgha']\
                                                       )/3

#4 = Croplands
if result.loc[(result['Land_Use']==4), 'intensity'].shape[0]>0:
    scaler = MinMaxScaler()
    result.loc[(result['Land_Use']==4), 'intensity'] = scaler.fit_transform(pd.DataFrame(result.loc[(result['Land_Use']==4), 'areakm2_cropCover']))
    result.loc[(result['Land_Use']==4), 'intensity'] = (result['areakm2_cropCover']+\
                                                        result['scaled_fieldSize']+\
                                                        result['scaled_Nfer_kgha']\
                                                       )/3

#6 = Rangelands/ Near-naturallands
if result.loc[(result['Land_Use']==6), 'intensity'].shape[0]>0:
    result.loc[(result['Land_Use']==6), 'intensity'] = (result['scaled_livestockIntensity']+\
                                                        result['scaled_Nfer_kgha'])/2

In [28]:
gdf = gpd.GeoDataFrame(result.drop(['x', 'y','combination_LS_intensity'], axis = 1), geometry = gpd.points_from_xy(result.x, result.y), crs= 'EPSG:4326')

In [29]:
gdf.drop(['areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea'], axis=1, inplace=True)

In [30]:
gdf.columns

Index(['areakm2_cropCover', 'areakm2_protArea', 'areakm2_plantation',
       'areakm2_treeCrop', 'gid', 'areakm2_urban', 'mean_precip', 'Nfer_kgha',
       'popDensity', 'sheepDensity', 'goatDensity', 'cattleDensity',
       'fieldSize', 'id1km', 'id8km', 'scaled_fieldSize', 'maxN',
       'areakm2_slowNutrArea2', 'areakm2_sMedNutriArea2',
       'areakm2_sHighNutriArea2', 'precipDiscrete', 'maxNutr',
       'lsIntensityCat', 'lsSum', 'scaled_livestockIntensity',
       'scaled_Nfer_kgha', 'scaled_popDensity', 'Land_Use', 'intensity',
       'geometry'],
      dtype='object')

In [31]:
gdf.columns = [ 'km2_cropCov',
 'km2_protAr',
 'km2_plntn',
 'km2_trCrp',
 'gid',
 'km2_urban',
 'mean_precip',
 'mean_Nfer',
 'sum_popDen',
 'sum_shpDen',
 'sum_gtDen',
 'sum_ctlDen',        
 'fieldSize',
 'id1km',
 'id8km',
 'fieldSzScd',
 'maxSNtrAr',      
 'km2_sLwNtr',
 'km2_sMdNtr',
 'km2_sHgNtr',
 'precipDisc',
 'maxSNtrCat',
 'LSIntenCat',
 'LSSum',
 'LSIntenScd',
 'NferScd',
 'popDenScd',     
 'Land_Use',
 'Intensity',
 'geometry']

In [32]:
gdf.loc[:,'precipDisc'] = gdf.loc[:,'precipDisc'].astype('int')
gdf.loc[:,'maxSNtrCat'] = gdf.loc[:,'maxSNtrCat'].astype('string')

In [33]:
print(os.getcwd())
outfile = f"{country}_1km_{date}.shp"
gdf.to_file(outfile)

C:\Users\coach\myfiles\miscellenous\hayley\outputs\South Africa


# 8km - Aggregation method

In [124]:
# Aggregate 1km data by 8km grid ids
grouped_data = result.groupby('id8km').agg({'Land_Use': pd.Series.mode, 'intensity': ['mean','std']})
# Format into dataframe
grouped_data.columns = ['_'.join(i).rstrip('_') for i in grouped_data.columns.values]
grouped_data = grouped_data.reset_index()
grouped_data

Unnamed: 0,id8km,Land_Use_mode,intensity_mean,intensity_std
0,0,6,0.156839,0.399982
1,25,6,0.514178,0.225300
2,57,6,0.211921,0.043774
3,72,6,0.125059,0.024506
4,96,6,0.075002,0.005644
...,...,...,...,...
19428,999773,4,0.727722,0.004564
19429,999797,6,0.307565,0.035039
19430,999839,4,0.495474,0.090302
19431,999910,6,0.057655,0.000643


In [125]:
# Add coordinates
result8km = pd.merge(result[['id8km', 'x', 'y']], grouped_data, on = 'id8km', how='inner')\
.drop_duplicates(['id8km', 'intensity_mean', 'intensity_std'])
# rename columns
result8km.columns = ['id8km', 'x', 'y', 'Land_Use', 'MuIntnsty', 'StdIntnsty']
result8km

Unnamed: 0,id8km,x,y,Land_Use,MuIntnsty,StdIntnsty
0,33293,-99500,-3799500,6,-0.010609,0.547418
104,3037,-95500,-3799500,6,0.145177,0.379336
196,289838,-87500,-3799500,6,-0.025513,0.527104
294,262196,-79500,-3799500,6,0.245256,0.013126
386,741568,-71500,-3799500,6,0.228525,0.016932
...,...,...,...,...,...,...
1953737,289715,560500,-2607500,6,0.245986,0.428805
1953835,115883,568500,-2607500,6,0.358959,0.287424
1953931,457724,576500,-2607500,5,-0.364208,0.715252
1954021,702136,584500,-2607500,5,-0.476051,0.718904


In [126]:
# When computing the mode, some cells have 2 categories with the same number of cells. The lower category is selected.

# add type attribute- rows with two land uses will have a list or array data type
result8km['type'] = result8km['Land_Use'].apply(type).apply(str)
# overwrite array/lists with first element in list
result8km.loc[result8km['type'] == "<class 'numpy.ndarray'>", 'Land_Use'] = result8km.loc[result8km['type'] == "<class 'numpy.ndarray'>", 'Land_Use'].apply(lambda x: x[0])
# set data type to integer
result8km['Land_Use'] = result8km['Land_Use'].astype(int)

In [127]:
print(os.getcwd())
gdf8km = gpd.GeoDataFrame(result8km.drop(['x', 'y', 'type'], axis=1), geometry = gpd.points_from_xy(result8km.x, result8km.y), crs= 'EPSG:4326')
outfile = f"{country}_8km_v1_{date}.shp"
gdf8km.to_file(outfile)

C:\Users\coach\myfiles\miscellenous\hayley\outputs\SA


## 8km- From extracted data at 8km

In [19]:
# For grid cells with crop_cover and no data, assign field size of nearest cell.
import math
import os

import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from tqdm.auto import tqdm

from scipy.spatial import cKDTree
from shapely.geometry import Point

In [20]:
def ckdnearest(gdA, gdB):

    nA = np.array(list(gdA.geometry.apply(lambda x: (x.x, x.y))))
    nB = np.array(list(gdB.geometry.apply(lambda x: (x.x, x.y))))
    btree = cKDTree(nB)
    dist, idx = btree.query(nA, k=1)
    gdB_nearest = gdB.iloc[idx].drop(columns="geometry").reset_index(drop=True)
    gdf = pd.concat(
        [
            gdA.reset_index(drop=True),
            gdB_nearest,
            pd.Series(dist, name='dist')
        ], 
        axis=1)

    return gdf

In [21]:
# #Field Size data
df = pd.read_csv(r"fieldSize2_8km.csv", low_memory=False, header = None, index_col = False, skiprows=1)
df.columns = ['x', 'y', 'fieldSize', 'id1km', 'id8km', 'gid']

In [22]:
df1 = pd.read_csv(r"sum_8km.csv", low_memory=False, header = None, index_col = False, skiprows=1)
df1.columns = ['areakm2_urban', 'areakm2_cropCover', 'areakm2_protArea',
       'areakm2_slowNutrArea', 'areakm2_sMedNutriArea',
       'areakm2_sHighNutriArea', 'areakm2_plantation', 'areakm2_treeCrop',
       'popDensity', 'sheepDensity', 'goatDensity', 'cattleDensity', 'gid']

In [74]:
# Mean covariate data
dfmean = pd.read_csv(r"mean_8km.csv", low_memory=False, header = None, index_col = False, skiprows=1)
dfmean.columns = ['mean_precip', 'Nfer_kgha', 'gid']

In [75]:
# Merge datasets into single datset based on matching GIS column
dfJoin = pd.merge(df1, dfmean, on= 'gid')
dfAll = pd.merge(dfJoin, df, on= 'gid').drop_duplicates().reset_index(drop=True)

In [76]:
# Fill in missing mean precipitation data
# by copying attribute of closest point
missing = dfAll['mean_precip'].isna().sum()
if missing>0:
    # Blocks with mean-precip data
    sdf2 = dfAll.loc[dfAll['mean_precip'].notna(), ['mean_precip', 'x', 'y']]
    gdf2 =  gpd.GeoDataFrame(sdf2.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(sdf2.x, sdf2.y))

    # All columns without join-attributes(s) column
    sdf1 = dfAll.loc[dfAll['mean_precip'].isna(), ['gid','x', 'y']]
    gdf1 =  gpd.GeoDataFrame(sdf1.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(sdf1.x, sdf1.y))

    joinNearest1 = ckdnearest(gdf1, gdf2)

    result1 = pd.DataFrame(joinNearest1)[['gid', 'mean_precip']]
    result1.columns = ['gid', 'mean_precip2']

    dfAll = pd.merge(dfAll, result1, how = 'outer',on = 'gid').drop_duplicates()
    dfAll['mean_precip'].fillna(dfAll['mean_precip2'], inplace = True)
    dfAll.drop('mean_precip2', axis=1, inplace=True)

In [77]:
# Fill missing field size data
# select fieldSize column for cells that are cropcover
dfs1 = dfAll.loc[(dfAll['areakm2_cropCover']>0)|(dfAll['areakm2_treeCrop']>0), ['x', 'y', 'fieldSize', 'gid']]
# get rows with fieldsize data
dfs2 = dfs1.dropna().loc[
      (dfs1['fieldSize']!= 3507), ['fieldSize', 'x', 'y']]#[['x', 'y', 'fieldSize']]
# all rows without fieldsize column(including rows with data)
dfs3 = dfs1[['x', 'y', 'gid']]

# with attribute
gdf2 = gpd.GeoDataFrame(dfs2.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(dfs2.x, dfs2.y))

# Missing data
gdf3 = gpd.GeoDataFrame(dfs3.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(dfs3.x, dfs3.y))

joinNearest = ckdnearest(gdf3, gdf2)
result = pd.DataFrame(joinNearest)[['gid', 'fieldSize']]
result.columns = ['gid', 'fieldSize2']

finaldf = pd.merge(dfAll, result, how= 'outer',on = 'gid').drop_duplicates()
finaldf['fieldSize2'] = finaldf['fieldSize2'].fillna(0)
finaldf['fieldSize'] = finaldf['fieldSize2'].astype('int')

dfF = finaldf.drop('fieldSize2', axis=1).reset_index(drop=True)

In [78]:
dfF.loc[:,'id8km'] = dfF.loc[:,'id8km'].fillna(0)
dfF.loc[:,'id1km'] = dfF.loc[:,'id1km'].fillna(0)

# Format data types
dfF.loc[:,'areakm2_treeCrop'] = dfF.loc[:,'areakm2_treeCrop'].astype('float64')
dfF.loc[:,'gid'] = dfF.loc[:,'gid'].astype('string')
dfF.loc[:,'id8km'] = dfF.loc[:,'id8km'].astype('int64')
dfF.loc[:,'Nfer_kgha'] = dfF.loc[:,'Nfer_kgha'].astype('float64')
dfF.loc[:,'mean_precip'] = dfF.loc[:,'mean_precip'].astype('float32')

# Add scaled fieldSize
dfF['scaled_fieldSize'] = 0
dfF.loc[dfF['fieldSize'] == 3502, 'scaled_fieldSize'] = 1.0
dfF.loc[dfF['fieldSize'] == 3503, 'scaled_fieldSize'] = 0.8
dfF.loc[dfF['fieldSize'] == 3504, 'scaled_fieldSize'] = 0.6
dfF.loc[dfF['fieldSize'] == 3505, 'scaled_fieldSize'] = 0.4
dfF.loc[dfF['fieldSize'] == 3506, 'scaled_fieldSize'] = 0.2

  dfF.loc[:,'gid'] = dfF.loc[:,'gid'].astype('string')
  dfF.loc[:,'id8km'] = dfF.loc[:,'id8km'].astype('int64')
  dfF.loc[:,'mean_precip'] = dfF.loc[:,'mean_precip'].astype('float32')


In [79]:
# Fill in missing Nutrient data
# Copy attribute of closest point
df = dfF
df['maxN'] = df[['areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea']].max(axis=1)

# Blocks with Nutrient areas
sdf2 = df.loc[df['maxN']>0, ['areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea', 'x', 'y']]
gdf2 =  gpd.GeoDataFrame(sdf2.drop(['x', 'y'],axis=1), geometry = gpd.points_from_xy(sdf2.x, sdf2.y))

# All columns without join-attributes(s) column
sdf1 = df.loc[df['maxN'] == 0, ['gid', 'x', 'y']]
gdf1 =  gpd.GeoDataFrame(sdf1.drop(['x', 'y'], axis=1), geometry = gpd.points_from_xy(sdf1.x, sdf1.y))

joinNearest1 = ckdnearest(gdf1, gdf2)

result1 = pd.DataFrame(joinNearest1)[['gid', 'areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea']]
result1.columns = ['gid', 'areakm2_slowNutrArea2','areakm2_sMedNutriArea2', 'areakm2_sHighNutriArea2']

finaldf = pd.merge(df, result1, how = 'outer',on = 'gid').drop_duplicates()

# df = finaldf.drop('fieldSize2', axis=1).reset_index(drop=True)

In [80]:
finaldf.loc[df['maxN']==0, 'areakm2_slowNutrArea'] = finaldf.loc[df['maxN']==0, 'areakm2_slowNutrArea2']
finaldf.loc[df['maxN']==0, 'areakm2_sMedNutriArea'] = finaldf.loc[df['maxN']==0, 'areakm2_sMedNutriArea2']
finaldf.loc[df['maxN']==0, 'areakm2_sHighNutriArea'] = finaldf.loc[df['maxN']==0, 'areakm2_sHighNutriArea2']
finaldf.drop(['areakm2_slowNutrArea2','areakm2_sMedNutriArea2', 'areakm2_sHighNutriArea2'], axis=1, inplace= True)

In [81]:
# Export data
outfile = f"{country}_POC_8km_{date}.feather"
finaldf.reset_index(drop=True).to_feather(outfile)

## Decision tree

In [82]:
infile = f"{country}_POC_8km_{date}.feather"
finaldf = pd.read_feather(infile).dropna().reset_index(drop=True)

In [83]:
# Add small value to zero precip
finaldf.loc[finaldf.mean_precip==0, 'mean_precip'] = 0.1
#round of to the nearest 400
max_round = math.ceil(max(finaldf['mean_precip']) / 400) * 400
cut_bins = list(range(0, max_round+1, 400))
# discretize precipitation into categories 
cut_labels = [str(item) for item in range(0, len(cut_bins)-1)]
finaldf["precipDiscrete"] = pd.cut(x = finaldf["mean_precip"].astype(float), bins= cut_bins, labels = cut_labels)
print("Categries for 400mm discretised rainfall:", finaldf.precipDiscrete.value_counts())

# Identify the max nutrient area
finaldf['maxNutr'] = finaldf[['areakm2_slowNutrArea','areakm2_sMedNutriArea', 'areakm2_sHighNutriArea']].idxmax(axis=1)

# Get unique category based on precip bucket and max nutrient category
finaldf['combination_LS_intensity'] = finaldf[['precipDiscrete','maxNutr']].agg(tuple, axis=1)
finaldf['lsIntensityCat'] = finaldf['combination_LS_intensity'].factorize()[0]

Categries for 400mm discretised rainfall: 2    8020
3    6031
1    4689
0    1231
Name: precipDiscrete, dtype: int64


In [84]:
combinations = list(set(finaldf['lsIntensityCat']))
print('Livestock Intensity categories (based on unique combinations of discretised precipitation and max nutrient category):',combinations)

finaldf['scaled_livestockIntensity']= 0
for combination in tqdm(combinations):
    scaler = MinMaxScaler()
    colnames = ['sheepDensity', 'goatDensity', 'cattleDensity']
    finaldf.loc[finaldf['lsIntensityCat'] == combination, 'scaled_livestockIntensity'] = scaler.fit_transform(finaldf.loc[finaldf['lsIntensityCat'] == combination, colnames]).sum(axis=1)

Livestock Intensity categories (based on unique combinations of discretised precipitation and max nutrient category): [0, 1, 2, 3, 4]


  0%|          | 0/5 [00:00<?, ?it/s]

In [85]:
# Scale variables
colnames = ['Nfer_kgha', 'popDensity']

for col in colnames:
    # create scaled variable
    finaldf['scaled_'+col] = finaldf[col]
    # get 99th percentile
    qq99 = finaldf[col].quantile(0.99)
    print(col + ':', qq99)
    # apply min max scaling to all data less than 99th quantile.
    scaler = MinMaxScaler()
    finaldf.loc[finaldf[col]<=qq99, 'scaled_'+ col] = scaler.fit_transform(pd.DataFrame(finaldf.loc[finaldf[col]<=qq99, col]))
    # for data greater than 99th percentile, equal to max value 1
    finaldf.loc[finaldf[col]>qq99, 'scaled_' + col] = 1

Nfer_kgha: 2.03008
popDensity: 10488.552641296386


In [86]:
#1= Urban
#2= Timber plantations
#3= Tree Croplands
#4= CropLands
#5= Protected areas
#6= Rangelands/Near-natural lands

def ExpertDT(df):

    #6= Rangelands/Near-natural lands
    df['Land_Use'] = 6

    #5= Protected areas
    df.loc[df['areakm2_protArea']>0.2, 'Land_Use'] = 5

    #4= CropLands
    df.loc[df['areakm2_cropCover']>0.2, 'Land_Use'] = 4

    #3= Tree Croplands
    df.loc[df['areakm2_treeCrop']>0.2, 'Land_Use'] = 3

    #2= Timber plantations
    df.loc[df['areakm2_plantation']>0.2, 'Land_Use'] = 2

    #1= Urban
    df.loc[(df['areakm2_urban']>0.2)|
           (df['popDensity']>1000) , 'Land_Use'] = 1
    return df
    
result = ExpertDT(finaldf)

In [87]:
result['Land_Use'].value_counts()

6    12743
1     4887
4     1295
5     1045
3        1
Name: Land_Use, dtype: int64

## Intensity

In [88]:
result['intensity'] = -1

#1= Urban
result.loc[(result['Land_Use']==1), 'intensity'] = ((result['areakm2_urban']/64)+\
                                                    result['scaled_popDensity'])/2

#3= Tree Croplands
result.loc[(result['Land_Use']==3), 'intensity'] = ((result['areakm2_treeCrop']/64)+\
                                                    result['scaled_fieldSize']+\
                                                    result['scaled_Nfer_kgha']\
                                                   )/3

#4 = Croplands
result.loc[(result['Land_Use']==4), 'intensity'] = ((result['areakm2_cropCover']/64)+\
                                                    result['scaled_fieldSize']+\
                                                    result['scaled_Nfer_kgha']\
                                                   )/3

#6 = Rangelands/ Near-naturallands
result.loc[(result['Land_Use']==6), 'intensity'] = (result['scaled_livestockIntensity']+\
                                                    result['scaled_Nfer_kgha'])/2

In [89]:
gdf = gpd.GeoDataFrame(result.drop(['x', 'y','combination_LS_intensity'], axis = 1), geometry = gpd.points_from_xy(result.x, result.y), crs= 'EPSG:4326')

In [90]:
gdf.columns

Index(['areakm2_urban', 'areakm2_cropCover', 'areakm2_protArea',
       'areakm2_slowNutrArea', 'areakm2_sMedNutriArea',
       'areakm2_sHighNutriArea', 'areakm2_plantation', 'areakm2_treeCrop',
       'popDensity', 'sheepDensity', 'goatDensity', 'cattleDensity', 'gid',
       'mean_precip', 'Nfer_kgha', 'fieldSize', 'id1km', 'id8km',
       'scaled_fieldSize', 'maxN', 'precipDiscrete', 'maxNutr',
       'lsIntensityCat', 'scaled_livestockIntensity', 'scaled_Nfer_kgha',
       'scaled_popDensity', 'Land_Use', 'intensity', 'geometry'],
      dtype='object')

In [91]:
gdf.columns = ['km2_urban',
 'km2_cropCov',
 'km2_protAr',
 'km2_sLwNtr',
 'km2_sMdNtr',
 'km2_sHgNtr',              
 'km2_plntn',
 'km2_trCrp',
 'sum_popDen',
 'sum_shpDen',
 'sum_gtDen',
 'sum_ctlDen',
 'gid',
 'mean_precip',
 'mean_Nfer',
 'fieldSize',
 'id1km',
 'id8km',
 'fieldSzScd',
 'maxSNtrAr',      
 'precipDisc',
 'maxSNtrCat',
 'LSIntenCat',
 'LSIntenScd',
 'NferScd',
 'popDenScd',
 'Land_Use',
 'Intensity',
 'geometry']

In [92]:
gdf.Intensity.describe()

count    19971.000000
mean         0.043381
std          0.268040
min         -1.000000
25%          0.032292
50%          0.061536
75%          0.111992
max          0.883670
Name: Intensity, dtype: float64

In [93]:
gdf.isna().sum()

km2_urban      0
km2_cropCov    0
km2_protAr     0
km2_sLwNtr     0
km2_sMdNtr     0
km2_sHgNtr     0
km2_plntn      0
km2_trCrp      0
sum_popDen     0
sum_shpDen     0
sum_gtDen      0
sum_ctlDen     0
gid            0
mean_precip    0
mean_Nfer      0
fieldSize      0
id1km          0
id8km          0
fieldSzScd     0
maxSNtrAr      0
precipDisc     0
maxSNtrCat     0
LSIntenCat     0
LSIntenScd     0
NferScd        0
popDenScd      0
Land_Use       0
Intensity      0
geometry       0
dtype: int64

In [94]:
gdf.loc[:,'precipDisc'] = gdf.loc[:,'precipDisc'].astype('int')
gdf.loc[:,'maxSNtrCat'] = gdf.loc[:,'maxSNtrCat'].astype('string')

  gdf.loc[:,'precipDisc'] = gdf.loc[:,'precipDisc'].astype('int')
  gdf.loc[:,'maxSNtrCat'] = gdf.loc[:,'maxSNtrCat'].astype('string')


In [95]:
print(os.getcwd())
outfile = f"{country}_8km_v2_{date}.shp"
gdf.to_file(outfile)

C:\Users\coach\myfiles\miscellenous\hayley\outputs\Angola


  gdf.to_file(outfile)


In [69]:
# create config file
# iterate over countries
# Download data
# preprocess data
# Expert DT
# Intensity scores