In [150]:
from pathlib import Path

import numpy as np
import pandas as pd
import xarray as xr
from tqdm.autonotebook import tqdm

from utils import get_plot_df

# Automatically prints execution time for the individual cells
%load_ext autotime

# Automatically reloads functions defined in external files
%load_ext autoreload
%autoreload 2

# Set xarray to use html as display_style
xr.set_options(display_style="html")

# The path to the project (so absolute file paths can be used throughout the notebook).
# NOTE(review): derived from the current working directory, so this presumably assumes the
# notebook is started from a direct subfolder of the project root -- confirm.
PROJ_PATH = Path.cwd().parent

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
time: 28.1 ms


In [6]:
# Path to the NetCDF file holding the per-field statistics
netcdf_path = (PROJ_PATH / 'data' / 'processed' / 'FieldPolygons2019_stats').with_suffix('.nc')

# load_dataset reads the whole dataset into memory and closes the file immediately.
# `ds` therefore stays usable in later cells (e.g. for the dataframe conversion) and no
# open handle blocks the NetCDF file from being rewritten.
ds = xr.load_dataset(netcdf_path, engine="h5netcdf")
ds

time: 209 ms


In [7]:
# Close the dataset so the underlying NetCDF file can be rewritten
ds.close()

time: 24.7 ms


In [62]:
# Columns that are dropped right after the conversion
unused_columns = ['cvr', 'gb', 'gbanmeldt', 'journalnr', 'marknr', 'pass_mode', 'relative_orbit']

# Convert the dataset to a flat dataframe: reset_index() removes the MultiIndex,
# after which the columns listed above are discarded
df = ds.to_dataframe().reset_index().drop(columns=unused_columns)

time: 7.84 s


In [108]:
# Preview the statistics without the 'polarization' column.
# The row limit is scoped to this cell with option_context, so the global pandas display
# options are not changed as a side effect of running the preview.
with pd.option_context('display.max_rows', 100):
    display(df.drop(columns=['polarization']).head(1000000))

Unnamed: 0,date,field_id,afgkode,afgroede,imk_areal,satellite,stats_max,stats_mean,stats_median,stats_min,stats_std
0,2018-07-08,61853445,151,"Kartofler, stivelses-",1.83,S1B,-17.0,-23.705883,-23.0,-31.0,3.019831
1,2018-07-08,61853445,151,"Kartofler, stivelses-",1.83,S1B,-9.0,-12.705882,-12.0,-18.0,1.898597
2,2018-07-08,61853445,151,"Kartofler, stivelses-",1.83,S1B,17.0,10.470589,10.0,4.0,3.046071
3,2018-07-08,61952339,151,"Kartofler, stivelses-",6.02,S1B,-15.0,-20.635212,-20.0,-30.0,2.294672
4,2018-07-08,61952339,151,"Kartofler, stivelses-",6.02,S1B,-10.0,-14.428169,-15.0,-21.0,2.151264
...,...,...,...,...,...,...,...,...,...,...,...
999995,2018-08-01,62201333,252,"Permanent græs, normalt udbytte",0.97,S1B,,,,,
999996,2018-08-01,62356008,252,"Permanent græs, normalt udbytte",1.74,S1B,-16.0,-21.921053,-21.0,-29.0,3.489696
999997,2018-08-01,62356008,252,"Permanent græs, normalt udbytte",1.74,S1B,-12.0,-14.552631,-14.0,-17.0,1.389763
999998,2018-08-01,62356008,252,"Permanent græs, normalt udbytte",1.74,S1B,15.0,6.789474,6.5,2.0,3.188449


time: 665 ms


In [109]:
# Show every row recorded on 2018-08-01
is_selected_date = df['date'] == '2018-08-01'
df.loc[is_selected_date]

Unnamed: 0,date,field_id,polarization,afgkode,afgroede,imk_areal,satellite,stats_max,stats_mean,stats_median,stats_min,stats_std
807744,2018-08-01,61853445,VH,151,"Kartofler, stivelses-",1.83,S1B,-14.0,-17.715328,-17.0,-25.0,2.064468
807745,2018-08-01,61853445,VV,151,"Kartofler, stivelses-",1.83,S1B,-9.0,-11.656935,-12.0,-15.0,1.421616
807746,2018-08-01,61853445,VV-VH,151,"Kartofler, stivelses-",1.83,S1B,13.0,5.525548,5.0,0.0,2.470498
807747,2018-08-01,61952339,VH,151,"Kartofler, stivelses-",6.02,S1B,-14.0,-21.742937,-22.0,-30.0,2.564759
807748,2018-08-01,61952339,VV,151,"Kartofler, stivelses-",6.02,S1B,-9.0,-14.744350,-15.0,-22.0,2.210251
...,...,...,...,...,...,...,...,...,...,...,...,...
1009675,2018-08-01,62131272,VV,252,"Permanent græs, normalt udbytte",2.54,S1B,,,,,
1009676,2018-08-01,62131272,VV-VH,252,"Permanent græs, normalt udbytte",2.54,S1B,,,,,
1009677,2018-08-01,63199619,VH,252,"Permanent græs, normalt udbytte",3.29,S1B,-18.0,-24.351725,-24.0,-34.0,2.785507
1009678,2018-08-01,63199619,VV,252,"Permanent græs, normalt udbytte",3.29,S1B,-12.0,-15.479310,-15.0,-20.0,1.589161


time: 87.4 ms


In [None]:
# Start by finding number of fields, dates, and polarizations
unique_dates = df['date'].unique()
num_fields = df['field_id'].nunique()
num_dates = len(unique_dates)
num_polarizations = df['polarization'].nunique()

# Get the labels (and ensure that our dataframe is formatted as it is supposed to be)
for i, date in enumerate(unique_dates):  # Loop over all dates
    # Get arrays with afgkode and field_ids for all fields for a single date
    df_date = df[df['date'] == date]  # Extract a df with all values for that date
    y = df_date['afgkode'].iloc[::num_polarizations].values  # 'afgkode' from every N'th row
    y_field_id = df_date['field_id'].iloc[::num_polarizations].values  # 'field_id' from every N'th row

    # Store the arrays from the first date
    if i == 0:
        y_initial = y
        y_field_id_initial = y_field_id

    # Check that the arrays for every date match the first date
    assert np.array_equal(y, y_initial), f"Labels on {date} differ from the first date"
    assert np.array_equal(y_field_id, y_field_id_initial), f"Field order on {date} differs from the first date"

# The feature array should have all features (all polarizations for all dates) as a single row per field.
# Filtering the full dataframe once per field scans every row for every field, which is far too slow
# for tens of thousands of fields. Instead the 'stats_mean' column is reshaped directly. This is only
# valid if the rows are laid out as date -> field -> polarization, which is verified before reshaping.
print("Converting dataframe to NumPy feature array")
cube_shape = (num_dates, num_fields, num_polarizations)
assert len(df) == num_dates * num_fields * num_polarizations, \
    "Dataframe does not hold exactly one row per (date, field, polarization) combination"
assert y_field_id.shape == (num_fields,), "Expected exactly one label entry per field"

# Every (date, polarization) slot of field i must belong to y_field_id[i]; together with the
# row-count check this guarantees the reshape picks exactly the rows a per-field filter would pick.
field_id_cube = df['field_id'].values.reshape(cube_shape)
assert (field_id_cube == y_field_id[np.newaxis, :, np.newaxis]).all(), \
    "Dataframe rows are not ordered as date -> field -> polarization"

# (date, field, polarization) -> (field, date, polarization) -> one flat feature row per field
X = (
    df['stats_mean'].values
    .astype(np.float64)
    .reshape(cube_shape)
    .transpose(1, 0, 2)
    .reshape(num_fields, num_dates * num_polarizations)
)
# NOTE: Test with using the stats_std also for classification (would be appended as extra columns)

# Print numbers and shapes to give impression of dataset size
print(f"Number of fields: {num_fields}")
print(f"Number of dates: {num_dates}")
print(f"Number of polarizations: {num_polarizations}")
print(f"Shape of feature array: {np.shape(X)}")
print(f"Shape of label array: {np.shape(y)}")

HBox(children=(FloatProgress(value=0.0, max=67312.0), HTML(value='')))

In [None]:
# Build the feature array with one row per field, holding 'stats_mean' for every
# (date, polarization) combination in dataframe order.
# A per-field boolean filter would scan the whole dataframe once for every field, so the column
# is reshaped instead. The reshape requires rows ordered as date -> field -> polarization,
# which is checked first.
cube_shape = (num_dates, num_fields, num_polarizations)
assert len(df) == num_dates * num_fields * num_polarizations, \
    "Dataframe does not hold exactly one row per (date, field, polarization) combination"
assert np.shape(y_field_id) == (num_fields,), "Expected exactly one field id per field"

field_id_cube = df['field_id'].values.reshape(cube_shape)
assert (field_id_cube == np.asarray(y_field_id)[np.newaxis, :, np.newaxis]).all(), \
    "Dataframe rows are not ordered as date -> field -> polarization"

# (date, field, polarization) -> (field, date, polarization) -> one flat feature row per field
X = (
    df['stats_mean'].values
    .astype(np.float64)
    .reshape(cube_shape)
    .transpose(1, 0, 2)
    .reshape(num_fields, num_dates * num_polarizations)
)

In [123]:
# Jump one full block of (field, polarization) rows ahead of row 0 and show that row
rows_per_date = num_fields * num_polarizations
df.iloc[rows_per_date]

date              2018-07-14 00:00:00
field_id                     61853445
polarization                       VH
afgkode                           151
afgroede        Kartofler, stivelses-
imk_areal                        1.83
satellite                         S1A
stats_max                         -17
stats_mean                   -22.3667
stats_median                      -23
stats_min                         -31
stats_std                     2.97191
Name: 201936, dtype: object

time: 35.4 ms


In [124]:
# Jump two full blocks of (field, polarization) rows ahead of row 0 and show that row
rows_per_date = num_fields * num_polarizations
df.iloc[2 * rows_per_date]

date              2018-07-20 00:00:00
field_id                     61853445
polarization                       VH
afgkode                           151
afgroede        Kartofler, stivelses-
imk_areal                        1.83
satellite                         S1B
stats_max                         -17
stats_mean                    -25.313
stats_median                      -26
stats_min                         -33
stats_std                     3.23665
Name: 403872, dtype: object

time: 39.5 ms
