In [1]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from eumap.misc import find_files, nan_percentile, GoogleSheet, ttprint

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, GroupKFold
import joblib

from sklearn.metrics import r2_score, mean_squared_error
from tool_kit import calc_ccc, accuracy_plot, uncertainty_plot
from sklearn.model_selection import train_test_split, cross_val_score, HalvingGridSearchCV, KFold, GroupKFold

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# read in necessary material
folder = '/mnt/inca/soc_eu_model'
test = pd.read_csv(f'{folder}/data/004.0_validate.pnts_oc.csv',low_memory=False)
train = pd.read_csv(f'{folder}/data/005.0_train.pnts_oc.csv',low_memory=False)

# covariates
with open(f'{folder}/SOC-EU/features/002_selected.covar_rank.freq.txt', 'r') as file:
    lines = file.readlines()
covs = [line.strip() for line in lines]

# dataset
train = train.dropna(subset=covs,how='any')
test = test.dropna(subset=covs,how='any')

# target variable
tgt = 'oc_log1p'

# spatial cross validation
spatial_cv_column = 'tile_id'
cv = GroupKFold(n_splits=5)



### Parameter fine tuning

In [2]:

# random forest
param_grid = {
    'n_estimators': [200, 500, 800, 1000],
    'max_depth': [10, 20, 30],
    'max_features': [0.3, 0.5, 0.7, 'log2', 'sqrt'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

tune_rf = HalvingGridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=90, 
    cv=cv,
    verbose=1
)

ttprint(f'start parameter fine tuning for rf, training size: {len(train)}')
tune_rf.fit(train[covs], train[tgt], groups=train[spatial_cv_column])
ttprint("Finish fine tuning\nBest parameters found: ", tune_rf.best_params_)
joblib.dump(tune_rf.best_params_, f'{folder}/SOC-EU/model/001_best.params_rf.joblib')
joblib.dump(tune_rf.best_estimator_, f'{folder}/SOC-EU/model/002_model_rf.joblib')


[09:16:45] start parameter fine tuning for rf, training size: 99126
n_iterations: 5
n_required_iterations: 6
n_possible_iterations: 5
min_resources_: 407
max_resources_: 99126
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 540
n_resources: 407
Fitting 5 folds for each of 540 candidates, totalling 2700 fits
----------
iter: 1
n_candidates: 180
n_resources: 1221
Fitting 5 folds for each of 180 candidates, totalling 900 fits
----------
iter: 2
n_candidates: 60
n_resources: 3663
Fitting 5 folds for each of 60 candidates, totalling 300 fits
----------
iter: 3
n_candidates: 20
n_resources: 10989
Fitting 5 folds for each of 20 candidates, totalling 100 fits
----------
iter: 4
n_candidates: 7
n_resources: 32967
Fitting 5 folds for each of 7 candidates, totalling 35 fits
[10:46:36] Finish fine tuning
Best parameters found:  {'max_depth': 30, 'max_features': 0.3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}


['/mnt/inca/soc_eu_model/SOC-EU/model/002_model_rf.joblib']

In [3]:
#lasso linear regression

from sklearn.linear_model import Lasso

param_grid_lasso = {
    'alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}

tune_lasso = HalvingGridSearchCV(
    estimator=Lasso(),
    param_grid=param_grid_lasso,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=cv,
    verbose=1
)

ttprint(f'start parameter fine tuning for Lasso, training size: {len(train)}')
tune_lasso.fit(train[covs], train[tgt], groups=train[spatial_cv_column])
ttprint("Finish fine tuning\nBest parameters found: ", tune_lasso.best_params_)
joblib.dump(tune_lasso.best_params_, f'{folder}/SOC-EU/model/003_best.params_lasso.joblib')
joblib.dump(tune_lasso.best_estimator_, f'{folder}/SOC-EU/model/004_model_lasso.joblib')

[10:46:53] start parameter fine tuning for Lasso, training size: 99126
n_iterations: 2
n_required_iterations: 2
n_possible_iterations: 2
min_resources_: 33042
max_resources_: 99126
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 6
n_resources: 33042
Fitting 5 folds for each of 6 candidates, totalling 30 fits
----------
iter: 1
n_candidates: 2
n_resources: 99126
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[10:50:03] Finish fine tuning
Best parameters found:  {'alpha': 0.001}


['/mnt/inca/soc_eu_model/SOC-EU/model/004_model_lasso.joblib']

In [2]:
# simple ANN
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
import joblib

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPRegressor(max_iter=5000, early_stopping=True, random_state=42))
])

param_grid_ann = {
    'mlp__hidden_layer_sizes': [(50,), (100,), (100, 50), (100, 100)],  # NN structure
    'mlp__activation': ['tanh', 'relu'],  # commonly used activation functions in NN
    'mlp__solver': ['adam', 'sgd'],  # optimizer
    'mlp__alpha': [0.0001, 0.001, 0.01],  # regularization to prevent overfitting
    'mlp__learning_rate': ['constant', 'adaptive'],  # how aggressive the weights update
    'mlp__learning_rate_init': [0.001, 0.01]  # initial learning rate
}

# Define the HalvingGridSearchCV with the pipeline
tune_ann = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_grid_ann,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=3,
    verbose=1
)


ttprint(f'start parameter fine tuning for ANN, training size: {len(train)}')
tune_ann.fit(train[covs], train[tgt], groups=train[spatial_cv_column])
ttprint("Finish fine tuning\nBest parameters found: ", tune_ann.best_params_)

joblib.dump(tune_ann.best_params_, f'{folder}/SOC-EU/model/005_best.params_ann.joblib')
joblib.dump(tune_ann.best_estimator_, f'{folder}/SOC-EU/model/006_model_ann.joblib')


[21:40:28] start parameter fine tuning for ANN, training size: 99126
n_iterations: 5
n_required_iterations: 5
n_possible_iterations: 5
min_resources_: 1223
max_resources_: 99126
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 192
n_resources: 1223
Fitting 3 folds for each of 192 candidates, totalling 576 fits
----------
iter: 1
n_candidates: 64
n_resources: 3669
Fitting 3 folds for each of 64 candidates, totalling 192 fits
----------
iter: 2
n_candidates: 22
n_resources: 11007
Fitting 3 folds for each of 22 candidates, totalling 66 fits
----------
iter: 3
n_candidates: 8
n_resources: 33021
Fitting 3 folds for each of 8 candidates, totalling 24 fits
----------
iter: 4
n_candidates: 3
n_resources: 99063
Fitting 3 folds for each of 3 candidates, totalling 9 fits
[21:47:31] Finish fine tuning
Best parameters found:  {'mlp__activation': 'tanh', 'mlp__alpha': 0.001, 'mlp__hidden_layer_sizes': (100, 100), 'mlp__learning_rate': 'constant', 'mlp__learning_rate_init': 0.

['/mnt/inca/soc_eu_model/SOC-EU/model/006_model_ann.joblib']

In [5]:
from cubist import Cubist
# https://pypi.org/project/cubist/
# rule-based predictive model
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import HalvingGridSearchCV
import joblib
from cubist import Cubist

# Define a pipeline that includes scaling and the Cubist model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('cubist', Cubist())
])

# Define the parameter grid for Cubist within the pipeline
param_cubist = {
    'cubist__n_rules': [100, 300, 500],  # number of rules to be generated
    'cubist__n_committees': [1, 5, 10],  # committee: ensembles of models
    'cubist__neighbors': [None, 3, 6, 9],  # number of nearest neighbors to use when making a prediction
    'cubist__unbiased': [False, True],  # whether or not to use an unbiased method of rule generation
    'cubist__extrapolation': [0.02, 0.05],  # limits the extent to which predictions can extrapolate beyond the range of the training data, a fraction of the total range of the target variable
    'cubist__sample': [None, 0.1, 0.5]  # fraction of the training data used in building each model
}

# Define the HalvingGridSearchCV with the pipeline
tune_cubist = HalvingGridSearchCV(
    estimator=pipeline,
    param_grid=param_cubist,
    scoring='neg_mean_squared_error',
    n_jobs=90,
    cv=cv
)

# Ensure the data retains feature names
X_train = pd.DataFrame(train[covs].values, columns=covs)
y_train = train[tgt]

# Start fine-tuning process
ttprint('start fine tuning cubist')
tune_cubist.fit(X_train, y_train, groups=train[spatial_cv_column])
ttprint("Finish fine tuning\nBest parameters found: ", tune_cubist.best_params_)

# Save the best parameters and model
joblib.dump(tune_cubist.best_params_, f'{folder}/SOC-EU/model/007_best.params_cubist.joblib')
joblib.dump(tune_cubist.best_estimator_, f'{folder}/SOC-EU/model/008_model_cubist.joblib')


### enxemble machine learning
- loop through each possible combination
- record the metrics
- select the optimal combination of model stacking

In [None]:
import itertools
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from tool_kit import calc_ccc, accuracy_plot, uncertainty_plot
from sklearn.model_selection import cross_val_predict

# Load models
model_list = find_files(f'{folder}/SOC-EU/model/','0*model*.joblib')
model_list = [str(i) for i in model_list]
models = [joblib.load(path) for path in model_list]
model_names = ["rf", "lasso", "ann", "cubist"]
print(model_list)

# Generate all combinations of models (2, 3, and 4)
combinations = []
for r in range(2, 5):
    combinations.extend(itertools.combinations(zip(models, model_names), r))
    
# training dataset
sampled_train = train.groupby(spatial_cv_column, group_keys=False).apply(lambda x: x.sample(min(len(x), 10))) # 44% data

results = []
# Loop through each combination of models
for combination in combinations:
    estimators = [(name, model) for model, name in combination]
    combi_name = ''
    for _, name in combination:
        combi_name = combi_name+' + '+name
    combi_name = combi_name[3::]
    if 'rf' not in combi_name:
        continue
    
    ttprint(f'fitting {combi_name}')
    # Define the Stacking Regressor
    stacking_regressor = StackingRegressor(
        estimators=estimators,
        final_estimator=LinearRegression()
    )
    
    # Fit the stacking regressor
#     y_pred = cross_val_predict(stacking_regressor, sampled_train[covs], sampled_train[tgt], cv=cv, groups=sampled_train[spatial_cv_column], n_jobs=90)  
    stacking_regressor.fit(sampled_train[covs], sampled_train[tgt])
    ttprint('finish fitting')
    y_pred = stacking_regressor.predict(test[covs])
    r2, rmse, ccc = accuracy_plot(test[tgt], y_pred, combi_name) # visuliazation
    error_spatial_plot(test[tgt], y_pred, test['lat'], test['lon'], combi_name)
    sorted_plot(test[tgt],y_pred,combi_name)
    
    # Store the results
    results.append({
        "Models": combi_name,
        "R2_CV": r2,
        "RMSE_CV": rmse,
        "CCC_CV": ccc
    })

# Create a DataFrame to store the results
results_df = pd.DataFrame(results)

results_df.to_csv(f'{folder}/SOC-EU/model/011_metrics_cv.eml.csv', index=False)
results_df

['/mnt/inca/soc_eu_model/SOC-EU/model/002_model_rf.joblib', '/mnt/inca/soc_eu_model/SOC-EU/model/004_model_lasso.joblib', '/mnt/inca/soc_eu_model/SOC-EU/model/006_model_ann.joblib', '/mnt/inca/soc_eu_model/SOC-EU/model/008_model_cubist.joblib']
[07:17:21] fitting rf + lasso


### mapie build

In [None]:
mapie = MapieRegressor(model, method="minmax", cv=5, n_jobs=90) # this cv is to compute the conformal scores, and spatial cross validation
mapie.fit(X[covs], X[tgt], groups=X[spatial_cv_column])

In [3]:
sampled_train = train.groupby(spatial_cv_column, group_keys=False).apply(lambda x: x.sample(frac=0.4))

In [7]:
print(len(sampled_train[spatial_cv_column].unique())) 
print(len(train[spatial_cv_column].unique()))

5481
5481


In [13]:
len(sampled_train)/len(train)

0.4473296612392309