In [1]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from eumap.misc import find_files, nan_percentile, GoogleSheet, ttprint

from sklearn.metrics import r2_score, mean_squared_error
from tool_kit import calc_ccc, accuracy_plot

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# # model parameters
# import json
# with open('/mnt/inca/soc_eu_model/data/006_params_annual.json', 'r') as file:
#     params = json.load(file)

folder = '/mnt/inca/soc_eu_model'
df = pd.read_csv(f'{folder}/data/005.0_train.pnts_soc.csv',low_memory=False)

# covariates
with open(f'{folder}/SOC-EU/features/002_selected.covar_rank.freq.txt', 'r') as file:
    lines = file.readlines()
covs = [line.strip() for line in lines]

# target
train = df.loc[df['oc'].notna()]
train = train.loc[train['oc']>5]
train = train.loc[train['ref']!='nl.bis'] # show weird patterns
train.loc[:,'oc_log1p'] = np.log1p(train['oc'])
tgt = 'oc_log1p'

spatial_cv_column = 'tile_id'
cv = GroupKFold(n_splits=5)



NameError: name 'GroupKFold' is not defined

### Conduct hyperparameter tuning for different base models

#### Random forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GroupKFold
from joblib

# https://zillow.github.io/quantile-forest/user_guide/fit_predict.html#random-forest-predictions

param_grid = {
    'n_estimators': [200, 500, 800, 1000],
    'max_depth': [10, 20, 30],
    'max_features': [0.3, 0.5, 0.7, 'log2', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tune_rf = HalvingGridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=90, 
    cv=cv,
    verbose=1
)

ttprint(f'start parameter fine tuning for rf, training size: {len(train)}')
tune_rf.fit(train[covs], train[tgt], groups=train[spatial_cv_column])
ttprint("Finish fine tuning\nBest parameters found: ", tune_rf.best_params_)
joblib.dump(tune_rf.best_params_, f'{folder}/SOC-EU/model/001_best.params_rf.joblib')
joblib.dump(tune_rf.best_estimator_, f'{folder}/SOC-EU/model/002_model_rf.joblib')


#### lightGBM

In [None]:
# https://lightgbm.readthedocs.io/en/latest/index.html
import lightgbm as lgb
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GroupKFold
import joblib
import re

# [LightGBM] [Fatal] Do not support special JSON characters in feature name.
def clean_feature_names(df, covs):
    df.columns = df.columns.str.replace(r'[^\w]', '_', regex=True)
    clean_covs = [re.sub(r'[^\w]', '_', i) for i in covs]
    return df, clean_covs

param_grid = {
    'boosting_type': ['gbdt', 'dart'], # traditional Gradient Boosting Decision Tree VS. dropouts meet Multiple Additive Regression Trees (prevent overfitting)
    'num_leaves': [31, 50, 80], # number of leaved in trees
    'max_depth': [-1, 10, 20], # depth of a tree
    'learning_rate': [0.01, 0.1], # shrinkage or step size, this parameter controls the impact of each tree on the final outcome
    'n_estimators': [100, 500, 800], # number of boosting rounds
    'subsample': [0.6, 0.8, 1.0], # fraction of samples to be used for each tree
    'min_child_samples': [10,20,30], # minimum number of data points needed in a leaf
    'verbose': [-1]
}

# HalvingGridSearchCV for tuning
tune_lgbm = HalvingGridSearchCV(
    estimator=lgb.LGBMRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=cv,
    verbose=1
)

# Clean feature names
train_clean, covs_clean = clean_feature_names(train, covs)

# Parameter tuning
ttprint(f'start parameter fine tuning for lightGBM, training size: {len(train_clean)}')
tune_lgbm.fit(train_clean[covs_clean], train_clean[tgt], groups=train[spatial_cv_column])
ttprint("Finish fine tuning\nBest parameters found: ", tune_lgbm.best_params_)
joblib.dump(tune_lgbm.best_params_, f'{folder}/SOC-EU/model/003_best.params_lgbm.joblib')
joblib.dump(tune_lgbm.best_estimator_, f'{folder}/SOC-EU/model/004_model_lgbm.joblib')



#### ANN with torch

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from skorch import NeuralNetRegressor
from sklearn.model_selection import GridSearchCV, GroupKFold
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# Convert DataFrame to PyTorch tensors
scaler = StandardScaler()
X_scaled = scaler.fit_transform(train[covs])

target_scaler = StandardScaler()
y_scaled = target_scaler.fit_transform(train[[tgt]]).reshape(-1)

X = torch.tensor(X_scaled, dtype=torch.float32)
y = torch.tensor(y_scaled, dtype=torch.float32).reshape(-1, 1)
groups = train[spatial_cv_column].values

# Define PyTorch model class
class RegressionModel(nn.Module):
    def __init__(self, units, layers, dropout_rate):
        super().__init__()
        layers_list = [nn.Linear(X.shape[1], units), nn.ReLU(), nn.Dropout(dropout_rate)]
        for _ in range(1, layers):
            layers_list += [nn.Linear(units, units), nn.ReLU(), nn.Dropout(dropout_rate)]
        layers_list.append(nn.Linear(units, 1))
        self.net = nn.Sequential(*layers_list)
    
    def forward(self, x):
        return self.net(x)

# Skorch wrapper
def skorch_model(units=64, layers=1, dropout_rate=0.2, learning_rate=0.001):
    return NeuralNetRegressor(
        RegressionModel,
        module__units=units,
        module__layers=layers,
        module__dropout_rate=dropout_rate,
        max_epochs=10,  # This value will be overwritten by GridSearchCV
        lr=learning_rate,
        optimizer=optim.Adam,
        criterion=nn.MSELoss,
        batch_size=64,  # This value will be overwritten by GridSearchCV
        verbose=0
    )

param_grid = {
    'module__units': [64, 128, 256],
    'module__layers': [2,4,6],
    'module__dropout_rate': [0.2, 0.3, 0.4],
    'lr': [0.0005, 0.001, 0.01, 0.02],
    'max_epochs': [10, 20],
    'batch_size': [64, 128]
}

ttprint('start grid search')
cv = GroupKFold(n_splits=3)
grid = GridSearchCV(estimator=skorch_model(), param_grid=param_grid, n_jobs=-1, cv=cv, scoring='neg_mean_squared_error')
grid_result = grid.fit(X, y, groups=groups)
ttprint('finish tuning')

print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")
joblib.dump(grid_result.best_params_, f'{folder}/SOC-EU/model/005_best.params_ann.joblib')
joblib.dump(grid_result.best_estimator_,  f'{folder}/SOC-EU/model/006_model_ann.joblib')


#### SVM

In [None]:
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.svm import SVR
import joblib

# find the best boundary (hyperplane) that separates data points of different classes in the feature space
param_grid_svm = {
    'C': [0.1, 1, 10, 100],  # regularization parameter, lower values of C lead to a smaller margin in the separating hyperplane
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # type of hyperplane used to separate the data
    'gamma': ['scale', 'auto', 0.1, 1, 10],  # the influence of a single training example
    'degree': [2, 3, 4]  # degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
}

# Set up the HalvingGridSearchCV for SVM
tune_svm = HalvingGridSearchCV(
    estimator=SVR(),  
    param_grid=param_grid_svm,
    scoring='neg_mean_squared_error',
    n_jobs=90,  
    cv=cv, 
    verbose=1
)

# Assuming you have defined train, covs, tgt, and spatial_cv_column appropriately
print(f'start parameter fine tuning for SVM, training size: {len(train)}')
tune_svm.fit(train[covs], train[tgt], groups=train[spatial_cv_column])
print("Finish fine tuning\nBest parameters found: ", tune_svm.best_params_)

# Save the best parameters and the best estimator
joblib.dump(tune_svm.best_params_, f'{folder}/SOC-EU/model/009_best.params_svm.joblib')
joblib.dump(tune_svm.best_estimator_, f'{folder}/SOC-EU/model/010_model_svm.joblib')


### lrboost

In [None]:
# https://pypi.org/project/lrboost/

from lrboost import LRBoostRegressor

ttprint(f'fit linear regression Boost regressor')
lrb = LRBoostRegressor(primary_model=RidgeCV(), secondary_model=RandomForestRegressor())
lrb = LRBoostRegressor.fit(train[covs], train[tgt])
ttprint(f'finish fitting linear regression Boost regressor')

joblib.dump(lrb, f'{folder}/SOC-EU/model/011_model_lrb.joblib')

#### Cubist

In [None]:
from cubist import Cubist
# https://pypi.org/project/cubist/
# rule-based predictive model
param_cubist = {
    'n_rules': [100, 300, 500], # number of rules to be generated
    'n_committees': [1, 5, 10], # committee: ensembles of models
    'neighbors': [None, 3, 6, 9], # number of nearest neighbors to use when making a prediction
    'unbiased': [False, True], # whether or not to use an unbiased method of rule generation
    'extrapolation': [0.02, 0.05], # limits the extent to which predictions can extrapolate beyond the range of the training data, a fraction of the total range of the target variable
    'sample': [None, 0.1, 0.5], # fraction of the training data used in building each model
    'cv': [10]
}

tune_cubist = HalvingGridSearchCV(
    estimator=Cubist(),
    param_grid=param_cubist,
    scoring='neg_mean_squared_error',
    n_jobs=90,
    cv=3
)

# Start fine-tuning process
ttprint('start fine tuning cubist')
tune_cubist.fit(train[covs], train[tgt], groups=train[spatial_cv_column])
ttprint('finish fitting')

print("Best parameters:", tune_cubist.best_params_)
joblib.dump(tune_cubist.best_params_, f'{folder}/SOC-EU/model/007_best.params_ann.joblib')
joblib.dump(tune_cubist.best_estimator_,  f'{folder}/SOC-EU/model/008_model_cubist.joblib')