In [1]:
import os

# Set the proxy environment variables
os.environ['http_proxy'] = 'http://127.0.0.1:7890'
os.environ['https_proxy'] = 'http://127.0.0.1:7890'

In [12]:
import ee
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import json
import random
from tqdm.auto import tqdm

In [3]:
ee.Initialize()

### Define basic parameters

In [4]:
# define the number of non-built control points
num_non_built = 5500

# define the data for NDVI creationg
Date = ee.DateRange('2020-01-01','2022-12-31')


In [5]:
# import the NDVI img
NDVI = ee.ImageCollection("MODIS/006/MOD13Q1")\
         .filterDate(Date)\
         .select("NDVI")\
         .median()

In [6]:
# define the region to analysis
region    = ['华东','东北','中南','华北','西北','西南']
region_en = ['huadong','dongbei','zhongnan','huabei','xibei','xinan']

### Calculate the sample_point_num for each NDVI value

##### Step_1: Compute the histogram of NDVI for each Landsat image

In [15]:

for i,names in tqdm(enumerate(zip(region,region_en)),total=len(region)):
    
    # unpack parameters
    name_cn = names[0]
    name_en = names[1]
    
    # import some spatial constrains
    Target_area = ee.FeatureCollection("users/wangjinzhulala/China_built_up/01_Boundary_shp/China_zone")\
                    .filterMetadata('NAME1','equals',name_cn)
    
        
    # ____________________Step_1:Calculate the area percentage of each NDVI value____________________________
    
    NDVI_frequency = NDVI.reduceRegion(reducer   = ee.Reducer.histogram(200),
                                       geometry  = Target_area.geometry(), 
                                       scale     = 500, 
                                       maxPixels = int(1e13)).getInfo()

    # _______________________________Step_2:unpack the value from histogram_________________________________
    
    count    = [round(i) for i in NDVI_frequency['NDVI']['histogram']]
    nd_value = [round(i) for i in NDVI_frequency['NDVI']['bucketMeans']]  
    
    # store the ndvi histogram to a datafram, 
    # NOTE here we divide ndvi by 100 to ensure that the actual NDVI level is at the 0.01 scale
    NDVI_hist_df = pd.DataFrame({'NDVI':[int(i/100) for i in nd_value],'Freq':count})

    # calculate how many points we shold collect at each ndvi level
    NDVI_hist_df['Select_num'] = NDVI_hist_df['Freq'].apply(lambda x: round(x/NDVI_hist_df['Freq'].sum() * num_non_built))

    # _______________________________Step_3:save the NDVI_hist_df to local disk_________________________________
    NDVI_hist_df.to_csv(f'./Result_df/NDVI_area_propotion_{i+1}_{name_en}.csv',index=False)

    # print out the process
    print(f'NDVI hist computation of {name_en} completed!')

##### Step_2: Create 10K random sample point and extract NDVI value to it

In [6]:
for i,names in tqdm(enumerate(zip(region,region_en)),total=len(region)):
    
    # unpack parameters
    name_cn = names[0]
    name_en = names[1]
    
    export_name = f'NDVI random {name_en}'
    
    # import target area
    Target_area = ee.FeatureCollection("users/wangjinzhulala/China_built_up/01_Boundary_shp/China_zone")\
                    .filterMetadata('NAME1','equals',name_cn)
    

    # create 50K random sample points
    Random_pt_ndvi = NDVI.sample(region     = Target_area,
                                 scale      = 250,
                                 numPixels  = 50000,
                                 geometries = True)

    # export the sample points to GEE asset, because on-the-fly computation required a lot of time and memeory
    # we can impot the result later for better efficienty

    task = ee.batch.Export.table.toAsset(
                            collection  = Random_pt_ndvi,
                            description = export_name,
                            assetId     = f'users/wangjinzhulala/China_built_up/02_control_sample/01_Random_pt_ndvi_0{i+1}_{name_en}')
    task.start()
    
    # print out the process
    print(f'NDVI random points of {name_en} are exported!')

NDVI random points of huadong are exported!
NDVI random points of dongbei are exported!
NDVI random points of zhongnan are exported!
NDVI random points of huabei are exported!
NDVI random points of xibei are exported!
NDVI random points of xinan are exported!


##### Step_3_Random select sample from the 50K points

In [16]:
# create a function for ramdom sampling

def sample_list(x):
    
    L = x['.geo']
    n = int(x['Select_num'])
    
    select = random.sample(L,n)
    
    return select

In [17]:
sample_df_list = []

for i,names in enumerate(zip(region,region_en)):
    
    # unpack parameters
    name_cn = names[0]
    name_en = names[1]

    #________________________Step_1: Preprocessing for 50K sample_______________________

    # read the random points
    random_df = pd.read_csv(f'./Random_pt/Random_pt_ndvi_0{i+1}_{name_en}.csv')

    # convert the NDVI to integers, here divide by 100 to ensure a 0.01 scale 
    # in actual NDVI scale
    random_df['NDVI'] = random_df['NDVI'].apply(lambda x: int(x/100))

    # transform .geo to json
    random_df.drop('system:index',1,inplace=True)
    random_df['.geo'] = random_df['.geo'].apply(lambda x: json.loads(x))

    # collapse all json points with the same NDVI value into one list
    # and store in the df_50K dataframe
    df_50K = pd.DataFrame(random_df.groupby('NDVI')['.geo'].apply(lambda x: list(x)))
    


    #_________________________Step_2: Join df_histgrame with df_50K______________________

    # Select the df_hist that are in the same year with df_50K
    df_hist = pd.read_csv(f'./Result_df/NDVI_area_propotion_{i+1}_{name_en}.csv')

    # Join df_hist and df_50K, remove the rows with a 0 select_num
    df_join = df_hist.join(df_50K, on='NDVI',how='inner')
    df_join = df_join[df_join['Select_num'] > 0]



    #_________________________Step_3: Perform the random sampling

    # Apply the function to collapse all json points of the same nd value into one list
    df_join['Sample'] = df_join.apply(sample_list,1)

    # Extract only necessay data
    df_join_sample = df_join[['NDVI','Sample']]

    # Explode the sample column, so we get the random point at each row
    df_join_sample = df_join_sample.explode('Sample')
    
    # add the region name to df
    df_join_sample['region'] = name_en
    
    # add the sample_df to list
    sample_df_list.append(df_join_sample)

  random_df.drop('system:index',1,inplace=True)
  random_df.drop('system:index',1,inplace=True)
  random_df.drop('system:index',1,inplace=True)
  random_df.drop('system:index',1,inplace=True)
  random_df.drop('system:index',1,inplace=True)
  random_df.drop('system:index',1,inplace=True)


In [7]:
# # concat all random-stratified sample into one datafram
# sample_df = pd.concat(sample_df_list).reset_index(drop=True)

# # unravel the sample to get lon/lat
# sample_df['lon'] = sample_df['Sample'].apply(lambda x: x['coordinates'][0])
# sample_df['lat'] = sample_df['Sample'].apply(lambda x: x['coordinates'][1])
# sample_df.drop('Sample',1,inplace=True)

# # Save the sample_df to disk
# sample_df.to_csv('./Result_df/Sample_point.csv',index=False)

# Load the sample_df
sample_df = pd.read_csv('./Result_df/Sample_point.csv')

In [8]:
sample_df

Unnamed: 0,NDVI,region,lon,lat
0,-14,huadong,117.825121,38.230031
1,-13,huadong,118.862632,37.992384
2,-13,huadong,118.254436,38.035157
3,-12,huadong,118.154338,37.905279
4,-12,huadong,118.804646,37.232732
...,...,...,...,...
32994,87,xinan,95.025083,29.144323
32995,87,xinan,94.751412,27.920226
32996,87,xinan,95.747002,28.551668
32997,88,xinan,95.944912,28.373304
