In [1]:
###########
# IMPORTS #
###########

%pylab inline --no-import-all

import os
from pathlib import Path

import pandas as pd
import numpy as np 

# Root folder of the downloaded data -- edit this to match your local copy
DATA_PATH = Path("./geolifeclef-2022-lifeclef-2022-fgvc9")

# Folder that receives the submission files; created here if it is missing
SUBMISSION_PATH = Path("./submissions")
SUBMISSION_PATH.mkdir(parents=True, exist_ok=True)

# Clone the GitHub repository
# !rm -rf GLC
# !git clone https://github.com/maximiliense/GLC
    
    
# For evaluation and submission
from GLC.metrics import top_30_error_rate, top_k_error_rate_from_sets, predict_top_30_set
from GLC.submission import generate_submission_file

# For data loading and visualization
from GLC.data_loading.common import load_patch
from GLC.plotting import visualize_observation_patch
from GLC.data_loading.environmental_raster import PatchExtractor



################
# DATA LOADING #
################
print("Observations loading")

# All observation files live in the same folder, are ';'-separated and are
# keyed by their observation_id column
obs_dir = DATA_PATH / "observations"
csv_options = dict(sep=";", index_col="observation_id")

# Training observations: France, then USA, stacked into a single frame
df_obs_fr = pd.read_csv(obs_dir / "observations_fr_train.csv", **csv_options)
df_obs_us = pd.read_csv(obs_dir / "observations_us_train.csv", **csv_options)
df_obs = pd.concat([df_obs_fr, df_obs_us])

# Test observations, handled the same way
df_obs_fr_test = pd.read_csv(obs_dir / "observations_fr_test.csv", **csv_options)
df_obs_us_test = pd.read_csv(obs_dir / "observations_us_test.csv", **csv_options)
df_obs_test = pd.concat([df_obs_fr_test, df_obs_us_test])

# Extract the test observation ids as a NumPy array
obs_id_test = df_obs_test.index.values

# Size of the test set
n_test = len(df_obs_test)
print("Number of observations for testing: {}".format(n_test))

# Preview both frames: shape first, then the first three rows
print("Train df shape: ", df_obs.shape)
display(df_obs.head(3))
print("Test df shape: ", df_obs_test.shape)
display(df_obs_test.head(3))

Populating the interactive namespace from numpy and matplotlib
Observations loading
Number of observations for testing: 36421
Train df shape:  (1627475, 4)


Unnamed: 0_level_0,latitude,longitude,species_id,subset
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10561949,45.705116,1.424622,241,train
10131188,45.146973,6.416794,101,train
10799362,46.783695,-2.072855,700,train


Test df shape:  (36421, 2)


Unnamed: 0_level_0,latitude,longitude
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10782781,43.601788,6.940195
10364138,46.241711,0.683586
10692017,45.181095,1.533459


In [8]:
# Load the environmental vectors (one row per observation_id)
df_features = pd.read_csv(DATA_PATH / "pre-extracted" / "environmental_vectors.csv", sep=";", index_col="observation_id")

# Placeholder coordinate columns. They are created as floats (0.0, not 0)
# because float coordinates are written into them right below; starting from
# an integer column would rely on silent dtype upcasting, which recent pandas
# versions deprecate.
df_features['latitude'] = 0.0
df_features['longitude'] = 0.0

# Copy the latitude and longitude columns of the observations in the df
df_features.loc[df_obs.index,["latitude","longitude"]] = df_obs.loc[df_obs.index,["latitude","longitude"]]
df_features.loc[df_obs_test.index,["latitude","longitude"]] = df_obs_test.loc[df_obs_test.index,["latitude","longitude"]]

# Create zero columns for the new patch features. Float for the same reason
# as above: the values stored in them later are means computed with numpy.
for c in ['mean_red','mean_green','mean_blue','mean_nir','mean_alt','mean_land']:
    df_features[c] = 0.0

# Replace every remaining NaN with the most negative finite float32 value,
# used here as a "missing" sentinel
df_features.fillna(np.finfo(np.float32).min, inplace=True)

# Display the result
display(df_features.head(3))


# Landcover metadata: the original label table and the suggested alignment
# table, both ';'-separated files in the metadata folder
metadata_dir = DATA_PATH / "metadata"
df_landcover_labels = pd.read_csv(metadata_dir / "landcover_original_labels.csv", sep=";")
df_suggested_landcover_alignment = pd.read_csv(metadata_dir / "landcover_suggested_alignment.csv", sep=";")

# Column of suggested codes as a plain array; it is handed to load_patch
# when the patches are read
landcover_mapping = df_suggested_landcover_alignment["suggested_landcover_code"].values

# Preview the first two rows of each metadata table
for metadata_preview in (df_landcover_labels, df_suggested_landcover_alignment):
    display(metadata_preview.head(2))

  mask |= (ar1 == a)


Unnamed: 0_level_0,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,bio_9,bio_10,...,sltppt,sndppt,latitude,longitude,mean_red,mean_green,mean_blue,mean_nir,mean_alt,mean_land
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000000,1.420833,6.908333,29.272598,614.1493,15.1,-8.5,23.6,-1.0,9.183333,9.466667,...,34.0,53.0,44.964449,6.734335,0,0,0,0,0,0
10000001,8.8375,9.858334,37.771393,586.8139,23.8,-2.3,26.099998,6.016667,16.383333,16.383333,...,41.0,36.0,42.949856,0.226932,0,0,0,0,0,0
10000002,6.241667,8.35,32.239384,632.8609,21.0,-4.9,25.9,3.033333,14.2,14.2,...,40.0,38.0,45.031666,5.548889,0,0,0,0,0,0


Unnamed: 0,landcover_code,original_landcover_code,landcover_label
0,0,0,Missing Data
1,1,11,Annual Summer Crops


Unnamed: 0,landcover_code,suggested_landcover_code,suggested_landcover_label
0,0,0,Missing Data
1,1,11,Cultivated Crops


In [9]:
from time import sleep
from tqdm import tqdm

def add_patch_info(df, landcover_mapping, DATA_PATH):
    """Add per-patch summary statistics to ``df`` in place.

    For every index label of ``df`` the matching patch is loaded with
    ``load_patch`` and six values are computed: the mean of each of the three
    channels of the first patch layer, the mean of the second and third
    layers, and the rounded mean of the fourth layer. They are stored in the
    columns ``mean_red``, ``mean_green``, ``mean_blue``, ``mean_nir``,
    ``mean_alt`` and ``mean_land``.

    The statistics are gathered in a single array and assigned to the frame
    once, after the loop. This avoids one ``df.loc[obs] = row`` write per
    observation and never touches the other columns. As a consequence the
    frame is left unchanged if the loop is interrupted or raises.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index labels are passed to ``load_patch`` as observation
        ids. The six columns above are overwritten (created if missing) as
        float64.
    landcover_mapping : array-like
        Forwarded to ``load_patch`` as its ``landcover_mapping`` argument.
    DATA_PATH : path-like
        Forwarded to ``load_patch`` as the data location.

    Returns
    -------
    None
    """
    patch_columns = ['mean_red', 'mean_green', 'mean_blue', 'mean_nir', 'mean_alt', 'mean_land']
    stats = np.empty((len(df), len(patch_columns)), dtype=np.float64)

    for row, obs in enumerate(tqdm(df.index, total=df.shape[0])):
        patch = load_patch(obs, DATA_PATH, landcover_mapping=landcover_mapping)
        rgb, nir, alt, land = patch[0], patch[1], patch[2], patch[3]

        stats[row] = (
            np.mean(rgb[:, :, 0]),
            np.mean(rgb[:, :, 1]),
            np.mean(rgb[:, :, 2]),
            np.mean(nir),
            np.mean(alt),
            # NOTE(review): this averages landcover codes and rounds the
            # result; if the codes are categorical a mode may be more
            # meaningful -- kept as-is to preserve the feature definition.
            round(np.mean(land)),
        )

    # Single column-wise assignment; rows of `stats` follow the order of df.index
    df[patch_columns] = stats

# Create the output folder before starting the long-running extraction, so a
# missing directory fails immediately instead of after hours of computation.
enriched_dir = Path("./enriched_df")
enriched_dir.mkdir(parents=True, exist_ok=True)

# Fill the mean_* patch columns of df_features in place
add_patch_info(df_features, landcover_mapping, DATA_PATH)

# Persist the enriched feature table (the observation_id index is written too)
df_features.to_csv(enriched_dir / "df_features_mean_patches.csv")

  8%|▊         | 140619/1663896 [31:20<5:39:30, 74.78it/s]


KeyboardInterrupt: 

In [11]:
import multiprocessing as mp
# Show how many CPUs the multiprocessing module reports on this machine.
# NOTE(review): presumably checked in order to size a worker pool for
# parallelising the patch extraction -- no such code is visible here.
mp.cpu_count()

20