In [1]:
# Standard library
import glob
import os
import pickle
import sys
import time
import warnings
from pathlib import Path

# Third-party: numerics, plotting, geospatial, persistence
import geopandas as gpd
import joblib
import matplotlib
import matplotlib.pyplot as plt
import multiprocess as mp
import numpy as np
import pandas as pd
import pyproj
import rasterio
from eumap.mapper import LandMapper
from eumap.mapper import SpaceOverlay
from eumap.misc import find_files
from eumap.raster import read_rasters, save_rasters
from shapely.geometry import Point
from tqdm import tqdm

# scikit-learn. The experimental flag has to be imported before HalvingGridSearchCV.
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.feature_selection import RFECV
# The class is called LinearRegression; `LinearRegressor` does not exist and raised ImportError.
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score, HalvingGridSearchCV, KFold
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

# Suppress every warning raised from this point on.
# NOTE(review): this also hides scikit-learn deprecation and convergence warnings —
# consider restricting the filter to specific categories.
warnings.filterwarnings('ignore')

### feature elimination

In [None]:
# # Set up cross-validation
# kf = KFold(n_splits=num_iterations, shuffle=True, random_state=42)
# score = pd.DataFrame()
# score['feat_num'] = np.arange(10,df.shape[1]+1,10)

# for fold_idx, (train_idx, _) in enumerate(kf.split(x)):  # Replace X with your data

#     x_subset = x[train_idx]
#     y_subset = y[train_idx]  
    
#     # Initialize RFECV
#     rfecv = RFECV(estimator=RandomForestRegressor(), step=10, cv=5, min_features_to_select=10, scoring='neg_mean_squared_error')
#     rfecv.fit(x_subset, y_subset)

#     cname = 'score_fold' + f'{str(fold_idx+1)}'
#     score[cname] = rfecv.grid_scores_
#     plt.plot(score['feat_num'], score[cname], marker='o')
#     plt.xlabel('Number of Features')
#     plt.ylabel('Cross-Validation Negative Mean Squared Error')
#     plt.title(f'Scree Plot for Fold {fold_idx + 1}')
#     plt.show()

# # find the best number-of-feature
# best_feature_numbers = []
# for fold_idx in range(num_iterations):
#     cname = 'score_fold' + f'{str(fold_idx+1)}'
#     best_feature_index = score[cname].idxmax()
#     best_feature_number = score.iloc[best_feature_index]['feat_num']
#     best_feature_numbers.append(best_feature_number)

# nof = round(np.mean(best_feature_numbers))
# print("Best feature numbers for each fold:", best_feature_numbers)
# print(f"Average: {np.mean(best_feature_numbers)}")


# Recursive feature elimination with cross-validation (RFECV): remove RFE_STEP features
# per round and record the mean CV score for every feature count that was evaluated.
# NOTE(review): `x_subset` / `y_subset` are not created by any active cell above (they only
# appear inside the commented-out K-fold loop) — confirm which data this fit should use.
RFE_STEP = 10
RFE_MIN_FEATURES = 10

rfecv = RFECV(estimator=RandomForestRegressor(), step=RFE_STEP, cv=5,
              min_features_to_select=RFE_MIN_FEATURES, scoring='neg_mean_squared_error')
rfecv.fit(x_subset, y_subset)

# `grid_scores_` no longer exists on RFECV; the mean CV score per step is in `cv_results_`,
# ordered from the smallest feature subset to the full feature set.
mean_scores = np.asarray(rfecv.cv_results_['mean_test_score'])

# Feature counts RFECV actually evaluated, in the same ascending order:
# n_total, n_total - step, ... and finally the minimum. (A fixed arange(10, n, 10) only
# lines up with the scores when the number of features is a multiple of the step.)
n_total = rfecv.n_features_in_
smallest = min(n_total, RFE_MIN_FEATURES)
feat_num = np.append(np.arange(n_total, smallest, -RFE_STEP), smallest)[::-1]
assert len(feat_num) == len(mean_scores), 'feature counts do not line up with the RFECV scores'

score = pd.DataFrame({'feat_num': feat_num, 'score': mean_scores})

fig, ax = plt.subplots()
ax.plot(score['feat_num'], score['score'], marker='o')
ax.set_xlabel('Number of Features')
ax.set_ylabel('Cross-Validation Negative Mean Squared Error')
ax.set_title('Scree Plot')
plt.show()

# Feature count with the highest (least negative) mean CV score. Cast to a plain int
# because it is later passed to RFECV(min_features_to_select=...).
best_feature_index = score['score'].idxmax()
best = int(score.loc[best_feature_index, 'feat_num'])

In [None]:
# Final feature selection on the full data, never going below the feature count `best`
# chosen from the scree plot.
# The scoring is neg_mean_squared_error and every other model in this notebook is a
# regressor, so the ranking estimator is a RandomForestRegressor (the previously used
# RandomForestClassifier was also never imported).
estimator = RandomForestRegressor(random_state=11)
rfecv = RFECV(estimator=estimator, step=10, cv=5, min_features_to_select=int(best),
              scoring='neg_mean_squared_error')

# fit_transform already fits the selector; a separate fit() beforehand only doubled the runtime.
x_reduced = rfecv.fit_transform(x, y)

# The transformer output is not guaranteed to be a DataFrame, so wrap it before calling
# to_csv. When `x` is a DataFrame, keep the names of the surviving columns and the row index.
if isinstance(x, pd.DataFrame):
    x_selected = pd.DataFrame(x_reduced, columns=x.columns[rfecv.get_support()], index=x.index)
else:
    x_selected = pd.DataFrame(x_reduced)

# Make sure the output directory exists before the first write into it.
os.makedirs('pipeline', exist_ok=True)
x_selected.to_csv('pipeline/oc_selected_v1.csv')
score.to_csv('pipeline/eval_mat_score.vs.featnum.csv')

### hyperparameter tuning for base learners

In [None]:
# Step 3: Hyperparameter Tuning for Base Learners
models = ['rf','gbr','lr','ann','svm','cubist']

# Data split: 60 % of the samples tune and fit the base learners. The remaining 40 %
# (x2, y2) is split again 60/40 into the meta-learner training set and the final test set.
# The second split is done here because the base-learner evaluation already needs
# x_test / y_test; the fixed random_state makes it reproducible.
x_train, x2, y_train, y2 = train_test_split(x_selected, y, test_size=0.4, random_state=42)
x_meta, x_test, y_meta, y_test = train_test_split(x2, y2, test_size=0.4, random_state=42)

# Result accumulators, one entry per model, filled by the tuning loop.
mdls = pd.DataFrame()
mdls['model'] = models
para = []
score = []
r2_tra = []
rmse_tra = []
r2_val = []
rmse_val = []
timel = []

N_JOBS = 40
RANDOM_STATE = 11

# Every value in a search grid must be a list of candidates. Settings that are not tuned
# (bootstrap, random_state, n_jobs, early_stopping) are therefore set on the estimator
# instances below. `n_jobs` is not a constructor parameter of GradientBoostingRegressor,
# Ridge, MLPRegressor or SVR, so it cannot appear in their grids at all.
grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'max_features': [0.2, 0.5],
}

grid_gbr = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 4],
    'subsample': [0.6, 0.8],
}

grid_lr = {
    'alpha': [1, 10],
}

grid_ann = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (50, 50, 25), (25, 100, 25)],
    'activation': ['relu', 'tanh'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant', 'adaptive'],
    'max_iter': [1000, 3000, 6000],
}

grid_svm = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
    'degree': [2, 3, 4],
    'coef0': [0, 1, 2],
    'gamma': [0.01, 0.1, 1, 'scale', 'auto'],
}

# NOTE(review): `CubistRegressor` is not imported anywhere in the visible notebook and
# these parameter names could not be checked against its API — confirm both.
grid_cubist = {
    'committees': [5, 10, 20],
    'neighbors': [5, 10, 20],
    'rules': [50, 100, 200],
    'trials': [5, 10, 20],
    'model': ['tree', 'rules'],
}

grids = [grid_rf, grid_gbr, grid_lr, grid_ann, grid_svm, grid_cubist]

# Same order as `models` and `grids`.
estimators = [
    RandomForestRegressor(bootstrap=True, random_state=RANDOM_STATE, n_jobs=N_JOBS),
    GradientBoostingRegressor(),
    Ridge(),
    MLPRegressor(early_stopping=True),
    SVR(),
    CubistRegressor(),
]

# Halving factor per model. It has to be greater than 1: with a factor of 1 no candidate
# is ever eliminated and the number of halving rounds (a logarithm to base `factor`)
# is undefined, so Ridge uses 2 like the others.
factor = [2,2,2,2,2,2]

In [None]:
def pipeline(x_train,y_train,name,estimator,param_grid,havling_factor,x_val=None,y_val=None,n_jobs=None):
    """Tune one base learner with successive halving, save it and score it.

    Parameters
    ----------
    x_train, y_train : training features and target used for the search.
    name : str
        Model label; the tuned model is written to ``pipeline/{name}.joblib``.
    estimator : unfitted scikit-learn regressor to tune.
    param_grid : dict
        Maps parameter names to lists of candidate values.
    havling_factor : int or float
        ``factor`` passed to ``HalvingGridSearchCV``.
    x_val, y_val : optional validation features and target. When either is omitted the
        notebook-level ``x_test`` / ``y_test`` are used, as in the original implementation.
    n_jobs : int or None
        Number of parallel jobs for the search itself.

    Returns
    -------
    list
        ``[best CV score, best params, r2 train, rmse train, r2 validation,
        rmse validation, search time in minutes]``. The metrics are plain numbers;
        the function does not touch any notebook-level result list.
    """
    if x_val is None or y_val is None:
        # Backward-compatible default: fall back to the notebook-level hold-out split.
        x_val, y_val = x_test, y_test

    sta = time.time()
    grid = HalvingGridSearchCV(estimator, param_grid, cv=5, factor=havling_factor, n_jobs=n_jobs).fit(x_train, y_train)
    ed = time.time()

    # With the default refit=True the search has already re-trained the winning
    # configuration on all of x_train, so no further fit() is needed.
    best = grid.best_estimator_

    os.makedirs('pipeline', exist_ok=True)
    joblib.dump(best, f'pipeline/{name}.joblib')

    def _r2_rmse(features, target):
        """Return (R², RMSE) of the tuned model on the given data."""
        predicted = best.predict(features)
        # sqrt(MSE) instead of the `squared=False` keyword, which recent scikit-learn dropped.
        return r2_score(target, predicted), float(np.sqrt(mean_squared_error(target, predicted)))

    r2_train, rmse_train = _r2_rmse(x_train, y_train)
    r2_valid, rmse_valid = _r2_rmse(x_val, y_val)

    return [grid.best_score_,grid.best_params_,r2_train,rmse_train,r2_valid,rmse_valid,round((ed-sta)/60,2)]

In [None]:
# Tune, save and evaluate every base learner, then assemble the comparison table.
assert len(models) == len(estimators) == len(grids) == len(factor), 'model setup lists differ in length'

# Start from empty accumulators so that re-running this cell cannot stack duplicate entries.
for acc in (score, para, r2_tra, rmse_tra, r2_val, rmse_val, timel):
    acc.clear()

for model_name, estimator, param_grid, halving_factor in zip(models, estimators, grids, factor):
    print(model_name)
    # pipeline() takes six positional arguments and returns
    # [best CV score, best params, r2 train, rmse train, r2 val, rmse val, minutes].
    templ = pipeline(x_train, y_train, model_name, estimator, param_grid, halving_factor)
    score.append(templ[0])
    para.append(templ[1])
    r2_tra.append(templ[2])
    rmse_tra.append(templ[3])
    r2_val.append(templ[4])
    rmse_val.append(templ[5])
    timel.append(templ[6])

mdls['model'] = models
mdls['best_para'] = para
mdls['score_hyper_tuning'] = score
mdls['r2_training'] = r2_tra
mdls['rmse_training'] = rmse_tra
mdls['r2_validation'] = r2_val
mdls['rmse_validation'] = rmse_val
mdls['time'] = timel

### build meta learner on top of base learners

In [None]:
# Split the data held back from base-learner tuning: 60 % trains the meta learner,
# 40 % is the final test set.
x_meta, x_test, y_meta, y_test = train_test_split(x2, y2, test_size=0.4, random_state=42)

# Reload the tuned base learners written to pipeline/{name}.joblib.
base_learners = {name: joblib.load(f'pipeline/{name}.joblib') for name in models}
meta_learner = LinearRegression()

sta = time.time()
# cv='prefit': the base learners are used as loaded; only the final estimator is
# trained, on their predictions for x_meta.
meta = StackingRegressor(estimators=list(base_learners.items()), final_estimator=meta_learner, cv = 'prefit', n_jobs=40)
meta.set_params(svm='drop')  # leave the SVM out of the stack
meta.fit(x_meta, y_meta)
ed = time.time()

joblib.dump(meta, 'pipeline/meta.joblib')

# sqrt(MSE) instead of the `squared=False` keyword, which recent scikit-learn dropped.
y_meta_pred = meta.predict(x_meta)
meta_r2_train = r2_score(y_meta, y_meta_pred)
meta_rmse_train = float(np.sqrt(mean_squared_error(y_meta, y_meta_pred)))

# Evaluate the stacked model itself on the test set.
y_test_pred = meta.predict(x_test)
meta_r2_test = r2_score(y_test, y_test_pred)
meta_rmse_test = float(np.sqrt(mean_squared_error(y_test, y_test_pred)))

# The meta learner is not tuned, so it has no best parameters or search score.
# Keys follow the column names of `mdls`, so the values cannot land in the wrong column.
meta_row = {
    'model': 'meta',
    'best_para': None,
    'score_hyper_tuning': np.nan,
    'r2_training': meta_r2_train,
    'rmse_training': meta_rmse_train,
    'r2_validation': meta_r2_test,
    'rmse_validation': meta_rmse_test,
    'time': round((ed-sta)/60,2),
}

# Drop a 'meta' row left by an earlier run of this cell, then append the fresh one.
mdls = mdls[mdls['model'] != 'meta'].reset_index(drop=True)
mdls = pd.concat([mdls, pd.DataFrame([meta_row])], ignore_index=True)

In [None]:
# Display the summary table: one row per base learner plus the stacked meta learner.
mdls