In [1]:
###########
# IMPORTS #
###########

%pylab inline --no-import-all

import os
from pathlib import Path

import pandas as pd
import numpy as np 
from time import sleep, time
from tqdm import tqdm

# Change this path to adapt to where you downloaded the data
DATA_PATH = Path("./geolifeclef-2022-lifeclef-2022-fgvc9")

# Create the path to save submission files
SUBMISSION_PATH = Path("./submissions")
os.makedirs(SUBMISSION_PATH, exist_ok=True)

# Clone the GitHub repository
# !rm -rf GLC
# !git clone https://github.com/maximiliense/GLC
    
    
# For evaluation and submission
from GLC.metrics import top_30_error_rate, top_k_error_rate_from_sets, predict_top_30_set
from GLC.submission import generate_submission_file

# For data loading and visualization
from GLC.data_loading.common import load_patch
from GLC.plotting import visualize_observation_patch
from GLC.data_loading.environmental_raster import PatchExtractor



################
# DATA LOADING #
################
print("Observations loading")

# Load train set of observations from France and USA and merge
df_obs_fr = pd.read_csv(DATA_PATH / "observations" / "observations_fr_train.csv", sep=";", index_col="observation_id")
df_obs_us = pd.read_csv(DATA_PATH / "observations" / "observations_us_train.csv", sep=";", index_col="observation_id")
df_obs = pd.concat((df_obs_fr, df_obs_us))

# Same with test set of observations
df_obs_fr_test = pd.read_csv(DATA_PATH / "observations" / "observations_fr_test.csv", sep=";", index_col="observation_id")
df_obs_us_test = pd.read_csv(DATA_PATH / "observations" / "observations_us_test.csv", sep=";", index_col="observation_id")
df_obs_test = pd.concat((df_obs_fr_test, df_obs_us_test))

# Extract observaions as np array
obs_id_test = df_obs_test.index.values

# Test set size
print("Number of observations for testing: {}".format(len(df_obs_test)))

# Display head of the df
print("Train df shape: ", df_obs.shape)
display(df_obs.head(3))
print("Test df shape: ", df_obs_test.shape)
display(df_obs_test.head(3))

Populating the interactive namespace from numpy and matplotlib
Observations loading
Number of observations for testing: 36421
Train df shape:  (1627475, 4)


Unnamed: 0_level_0,latitude,longitude,species_id,subset
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10561949,45.705116,1.424622,241,train
10131188,45.146973,6.416794,101,train
10799362,46.783695,-2.072855,700,train


Test df shape:  (36421, 2)


Unnamed: 0_level_0,latitude,longitude
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10782781,43.601788,6.940195
10364138,46.241711,0.683586
10692017,45.181095,1.533459


In [2]:
# Load the environmental vectors
df_features = pd.read_csv(DATA_PATH / "pre-extracted" / "environmental_vectors.csv", sep=";", index_col="observation_id")
df_features['latitude'] = 0
df_features['longitude'] = 0

# Copy the latitude and longitude columns of the observations in the df
df_features.loc[df_obs.index,["latitude","longitude"]] = df_obs.loc[df_obs.index,["latitude","longitude"]]
df_features.loc[df_obs_test.index,["latitude","longitude"]] = df_obs_test.loc[df_obs_test.index,["latitude","longitude"]]

# Create zero columns for the new features
# for c in ['mean_red','mean_green','mean_blue','mean_nir','mean_alt','mean_land']:
#     df_features[c] = 0

for c in ['mean_alt','mean_land']:
    df_features[c] = 0

# Fill nan values
df_features.fillna(np.finfo(np.float32).min, inplace=True)

# Display the result
display(df_features.head(3))


# Load landcover metadata to use the patches
df_landcover_labels = pd.read_csv(DATA_PATH / "metadata" / "landcover_original_labels.csv", sep=";")
df_suggested_landcover_alignment = pd.read_csv(DATA_PATH / "metadata" / "landcover_suggested_alignment.csv", sep=";")
landcover_mapping = df_suggested_landcover_alignment["suggested_landcover_code"].values

display(df_landcover_labels.head(2))
display(df_suggested_landcover_alignment.head(2))

  mask |= (ar1 == a)


Unnamed: 0_level_0,bio_1,bio_2,bio_3,bio_4,bio_5,bio_6,bio_7,bio_8,bio_9,bio_10,...,cecsol,clyppt,orcdrc,phihox,sltppt,sndppt,latitude,longitude,mean_alt,mean_land
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000000,1.420833,6.908333,29.272598,614.1493,15.1,-8.5,23.6,-1.0,9.183333,9.466667,...,29.0,13.0,63.0,62.0,34.0,53.0,44.964449,6.734335,0,0
10000001,8.8375,9.858334,37.771393,586.8139,23.8,-2.3,26.099998,6.016667,16.383333,16.383333,...,20.0,22.0,39.0,58.0,41.0,36.0,42.949856,0.226932,0,0
10000002,6.241667,8.35,32.239384,632.8609,21.0,-4.9,25.9,3.033333,14.2,14.2,...,29.0,22.0,54.0,59.0,40.0,38.0,45.031666,5.548889,0,0


Unnamed: 0,landcover_code,original_landcover_code,landcover_label
0,0,0,Missing Data
1,1,11,Annual Summer Crops


Unnamed: 0,landcover_code,suggested_landcover_code,suggested_landcover_label
0,0,0,Missing Data
1,1,11,Cultivated Crops


Enrich with pd.apply and monitor with tqdm

In [5]:
from __future__ import division
from time import sleep
from tqdm import tqdm

tqdm.pandas()

display(df_features.shape)

# def extract_means(row, data_path=DATA_PATH, landcover_mapping=landcover_mapping):
#     obs = row.name
#     patch = load_patch(obs, data_path, landcover_mapping=landcover_mapping)
#     rgb, nir, alt, land = tuple(patch)
#     row["mean_red"] = np.mean(rgb[:,:,0])
#     row['mean_green'] = np.mean(rgb[:,:,1])
#     row['mean_blue'] = np.mean(rgb[:,:,2])
#     row['mean_nir'] = np.mean(nir)
#     row['mean_alt'] = np.mean(alt)
#     row['mean_land'] = round(np.mean(land))    
#     return row

def extract_means(row, data_path=DATA_PATH, landcover_mapping=landcover_mapping):
    obs = row.name
    patch = load_patch(obs, data_path, landcover_mapping=landcover_mapping)
    rgb, nir, alt, land = tuple(patch)
    row['mean_alt'] = np.mean(alt)
    row['mean_land'] = round(np.mean(land))    
    return row

df_features = df_features.progress_apply(lambda x:extract_means(x), axis=1)
df_features.to_csv("./enriched_df/df_features_alt_land_2.csv")


(1663896, 31)

 63%|██████▎   | 1055670/1663896 [3:45:56<2:25:06, 69.86it/s] 

In [4]:
n_estimators = 50
max_depth = 12

import os
from pathlib import Path

import pandas as pd
import numpy as np 

# Change this path to adapt to where you downloaded the data
DATA_PATH = Path("./geolifeclef-2022-lifeclef-2022-fgvc9")

# Create the path to save submission files
SUBMISSION_PATH = Path("./submissions")
os.makedirs(SUBMISSION_PATH, exist_ok=True)

# Clone the GitHub repository
# !rm -rf GLC
# !git clone https://github.com/maximiliense/GLC
    
    
# For evaluation and submission
from GLC.metrics import top_30_error_rate, top_k_error_rate_from_sets, predict_top_30_set
from GLC.submission import generate_submission_file

# For data loading and visualization
from GLC.data_loading.common import load_patch
from GLC.plotting import visualize_observation_patch
from GLC.data_loading.environmental_raster import PatchExtractor


df_env = pd.read_csv("./enriched_df/df_features_alt_land.csv", index_col="observation_id")

# We can finally compute the top 30 error rate on the val set
def predict_func(est, X):
    y_score = est.predict_proba(X)
    s_pred = predict_top_30_set(y_score)
    return s_pred            


# We define a batch predictor to take care of the memory
# as there are more than 17k classes
def batch_predict(predict_func, est, X_df, obs_id, batch_size=1024):
    res = predict_func(est, X_df.head(1).values)
    n_samples, n_outputs, dtype = X_df.shape[0], res.shape[1], res.dtype
    
    preds = np.empty((n_samples, n_outputs), dtype=dtype)
    print(preds.shape)
    
    for i in range(0, len(X_df), batch_size):
        obs_id_batch = obs_id[i:i+batch_size]
        X_batch = X_df.loc[obs_id_batch, :]
        
        # add_patch_info(X_batch, DATA_PATH=DATA_PATH, landcover_mapping=landcover_mapping)
        
        X_batch = X_batch.values
        
        preds[i:i+batch_size] = predict_func(est, X_batch)

        if (i/batch_size)%10 == 0:
            print("Prediction : " + str(100*(i+1)/len(X_df)) + "% completed")
            
    return preds


# Load train set of observations from France and USA and merge
df_obs_fr = pd.read_csv(DATA_PATH / "observations" / "observations_fr_train.csv", sep=";", index_col="observation_id")
df_obs_us = pd.read_csv(DATA_PATH / "observations" / "observations_us_train.csv", sep=";", index_col="observation_id")
df_obs = pd.concat((df_obs_fr, df_obs_us))

# Extract training and validation subsets as np arrays
obs_id_train = df_obs.index[df_obs["subset"] == "train"].values
obs_id_val = df_obs.index[df_obs["subset"] == "val"].values

# Separate values to predict
y_train = df_obs.loc[obs_id_train]["species_id"].values
y_val = df_obs.loc[obs_id_val]["species_id"].values

# Validation proportion
n_val = len(obs_id_val)
print("Validation set size: {} ({:.1%} of train observations)".format(n_val, n_val / len(df_obs)))


# Same with test set of observations
df_obs_fr_test = pd.read_csv(DATA_PATH / "observations" / "observations_fr_test.csv", sep=";", index_col="observation_id")
df_obs_us_test = pd.read_csv(DATA_PATH / "observations" / "observations_us_test.csv", sep=";", index_col="observation_id")
df_obs_test = pd.concat((df_obs_fr_test, df_obs_us_test))

# Extract observaions as np array
obs_id_test = df_obs_test.index.values

# Define the train, val and test set as np arrays
X_train = df_env.loc[obs_id_train].values
X_val = df_env.loc[obs_id_val].values
X_test = df_env.loc[obs_id_test].values

y_train_df = df_obs.loc[obs_id_train]["species_id"]
X_train_df = df_env.loc[obs_id_train]

y_val_df = df_obs.loc[obs_id_val]["species_id"]
X_val_df = df_env.loc[obs_id_val]

X_test_df = df_env.loc[obs_id_test]


# Replace nan values with np.min
X_train_df.fillna(np.finfo(np.float32).min, inplace=True)
X_val_df.fillna(np.finfo(np.float32).min, inplace=True)
X_test_df.fillna(np.finfo(np.float32).min, inplace=True)


# Call a RF classifier, fit it on trainin set
from sklearn.ensemble import RandomForestClassifier
est = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, n_jobs=-1, verbose=1)   
            
print("***** Fitting started *****")
est.fit(X_train, y_train)
print("***** Fitting successful *****\n")


# Validation
print("***** Batch predict started *****")
s_val = batch_predict(predict_func, est, X_val_df, obs_id_val)
print("***** Batch predict successful *****\n")

score_val = top_k_error_rate_from_sets(y_val, s_val)
print("Top-30 error rate: {:.1%}".format(score_val))


# Compute baseline on the test set
print("***** Batch predict test started *****")
s_pred = batch_predict(predict_func, est, X_test_df, obs_id_test)
print("***** Batch predict test successful *****\n")

# Generate the submission file
file = "./submissions/rf_enriched_vect_" + str(n_estimators)+ "_est_" + str(max_depth) + "_max_dp"+ str(round(100*score_val)) +"_score.csv"
generate_submission_file(file, df_obs_test.index, s_pred)

Validation set size: 40080 (2.5% of train observations)
***** Fitting started *****


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  8.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 25.4min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.


***** Fitting successful *****

***** Batch predict started *****
(40080, 30)


[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.


Prediction : 0.00249500998003992% completed


[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parall

Prediction : 25.551397205588824% completed


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0

Prediction : 51.1002994011976% completed


[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parall

Prediction : 76.64920159680639% completed


[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parall

***** Batch predict successful *****

Top-30 error rate: 77.5%
***** Batch predict test started *****
(36421, 30)


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    0.0s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.


Prediction : 0.0027456687076137395% completed


[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.1s finished
[Parall

Prediction : 28.118393234672304% completed


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.1s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0

Prediction : 56.234040800637% completed


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.9s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.4s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0

Prediction : 84.34968836660168% completed


[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.4s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.9s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.3s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.8s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    2.2s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0

***** Batch predict test successful *****



#### Multiprocessing

Enrich with multiprocessing

In [None]:
# import multiprocessing as mp
# from time import time

# def func( arg ):
#     obs, values = arg

#     patch = load_patch(obs, DATA_PATH, landcover_mapping=landcover_mapping)
#     rgb, nir, alt, land = patch[0], patch[1], patch[2], patch[3]

#     values['mean_red'] = np.mean(rgb[:,:,0])
#     values['mean_green'] = np.mean(rgb[:,:,1])
#     values['mean_blue'] = np.mean(rgb[:,:,2])
#     values['mean_nir'] = np.mean(nir)
#     values['mean_alt'] = np.mean(alt)
#     values['mean_land'] = round(np.mean(land))

#     df_features.loc[obs] = values
    
#     return None

# if __name__ == "__main__":
#     start = time()

#     pool = mp.Pool(processes=mp.cpu_count())
#     for _ in tqdm(pool.imap_unordered( func, df_features.iterrows())):
#         pass
#     pool.close()
#     pool.join()
    
#     print("Enrichment duration: ", time()-start)

#     df_features.to_csv("./enriched_df/df_features_mean_patches.csv")

# main()


0it [00:00, ?it/s]

or

In [None]:
# import multiprocessing as mp
# from time import time

# def func( arg ):
#     obs, values = arg

#     patch = load_patch(obs, DATA_PATH, landcover_mapping=landcover_mapping)
#     rgb, nir, alt, land = patch[0], patch[1], patch[2], patch[3]

#     values['mean_red'] = np.mean(rgb[:,:,0])
#     values['mean_green'] = np.mean(rgb[:,:,1])
#     values['mean_blue'] = np.mean(rgb[:,:,2])
#     values['mean_nir'] = np.mean(nir)
#     values['mean_alt'] = np.mean(alt)
#     values['mean_land'] = round(np.mean(land))

#     df_features.loc[obs] = values
#     sleep(1)

#     return None

# if __name__ == "__main__":
#     start = time()

#     pool = mp.Pool(processes=mp.cpu_count())
#     r= list(tqdm(pool.imap_unordered( func, df_features.iterrows())))
    
#     print("Enrichment duration: ", time()-start)
    
    
#     df_features.to_csv("./enriched_df/df_features_mean_patches.csv")

# main()


0it [00:00, ?it/s]