In [1]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
    
# covariates: the list file holds one column name per line
with open('/mnt/inca/soc_eu_model/data/005_covar_annual.static.txt', 'r') as file:
    lines = file.readlines()
covs = list(map(str.strip, lines))

# training data
df = pd.read_csv(
    '/mnt/inca/soc_eu_model/data/002_covar_overlayed.csv',
    low_memory=False,
)
def _drop_and_report(df, bad, tot, reason):
    """Print how many rows ``bad`` flags (with their ``ref`` values) and return the other rows."""
    num = int(bad.sum())
    # Shares are relative to the original row count; an empty table reports 0 %.
    pct = num / tot * 100 if tot else 0.0
    ccol = df.loc[bad, 'ref'].unique()
    print(f'{num} ({pct:.2f}%) rows with {reason}, from {ccol}')
    return df[~bad]


def clean_prop(df, prop, limit):
    """Drop rows whose ``prop`` value is missing, non-numeric, negative or above ``limit``.

    Every filtering step prints the number of rows it removes (as a share of
    the original row count) and the ``ref`` values those rows carry.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain the columns ``prop`` and ``ref``.
        It is not modified.
    prop : str
        Name of the column to clean.
    limit : number or None
        Largest plausible value. Rows with ``prop > limit`` are dropped, rows
        exactly at the limit are kept. A falsy value (``None`` or ``0``)
        disables this check.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with ``prop`` converted to numeric values.
        Index labels of the input are preserved.
    """
    print(f'\033[1mCleaning {prop}\033[0m')
    tot = len(df)
    print(f'originally with {tot} rows')

    # Clean NaN
    df = _drop_and_report(df, df[prop].isna(), tot, 'NaN')

    # check if there are string values that cannot be converted to numerical values,
    # usually it's <LOD (limit of detection), such as '<6', '<LOD', etc
    # Work on an explicit copy so the conversion never writes into a slice of the caller's frame.
    df = df.copy()
    df[prop] = pd.to_numeric(df[prop], errors='coerce')
    df = _drop_and_report(df, df[prop].isna(), tot, 'invalid strings')

    # Check for values below 0, which are invalid for all properties
    df = _drop_and_report(df, df[prop] < 0, tot, f'{prop} < 0')

    # check for values higher than plausible limit; the same mask is used for
    # the report and for the filter, so the printed count matches what is removed
    if limit:
        df = _drop_and_report(df, df[prop] > limit, tot, f'{prop} > limit values')

    print(f'{len(df)} valid data records left')
    return df

# Clean the target column first, then keep only rows where every covariate is present.
tgt = 'oc'
dff = clean_prop(df, tgt, 1000)
print()
dff = dff.dropna(how='any', subset=covs)
print(f'{len(dff)} valid data records left after cleaning covariates')
dff = dff.reset_index(drop=True)

[1mCleaning oc[0m
originally with 177792 rows
18009 (10.13%) rows with NaN, from ['portugal.infosolo' 'swiss.nabo' 'foregs' 'nl.bis' 'estonia.kese' 'ukceh'
 'SoDaH' 'gemas' 'croatia.multione' 'netherland.BHR-P' 'Castilla.y.Leon'
 'geocradle' 'MarSOC' 'basque' 'LUCAS']
266 (0.15%) rows with invalid strings, from ['LUCAS']
1405 (0.79%) rows with oc < 0, from ['nl.bis' 'Czech' 'LUCAS' 'Wales.GMEP']
25 (0.01%) rows with oc > limit values, from ['estonia.kese']
158087 valid data records left

152550 valid data records left after cleaning covariates


In [None]:
import numpy as np
import scipy
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from mapie._typing import NDArray
from mapie.metrics import regression_coverage_score
from mapie.regression import MapieQuantileRegressor, MapieRegressor

# The conformal wrapper needs a base regressor. Load the stored random forest
# here so this cell does not depend on a later cell having been run first.
from joblib import load
rf = load('/mnt/inca/soc_eu_model/data/008_model_rf.joblib')

# 60 % training rows; the remaining 40 % is halved into validation and calibration rows
train_set, test_set = train_test_split(dff, test_size=0.4, random_state=42)
vld_set, clb_set = train_test_split(test_set, test_size=0.5, random_state=42)

mapie = MapieRegressor(rf, method='plus', cv=10)

mapie.fit(clb_set[covs], clb_set[tgt])

# Evaluate prediction and coverage level on the validation rows only:
# test_set also contains clb_set, the rows mapie was fitted on.
y_pred, y_pis = mapie.predict(vld_set[covs], alpha=0.1)
coverage = regression_coverage_score(vld_set[tgt], y_pis[:, 0, 0], y_pis[:, 1, 0])


### Random forest

In [3]:
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(dff, test_size=0.4, random_state=42)
cal_set, val_set = train_test_split(test_set, test_size=0.5, random_state=42)

from joblib import dump, load
rf = load('/mnt/inca/soc_eu_model/data/008_model_rf.joblib')
# Point predictions of the loaded forest; the evaluation cell plots them as 'RF'.
y_pred_rf = rf.predict(test_set[covs])

# MapieRegressor is exposed by the mapie.regression module
from mapie.regression import MapieRegressor
from sklearn.model_selection import KFold
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# The CV+ strategy is requested with method='plus' combined with a CV splitter
mapie = MapieRegressor(estimator=rf, method='plus', cv=cv)
mapie.fit(cal_set[covs], cal_set[tgt])

# Predict intervals for the rows that were not used to fit mapie
y_pred, y_pred_intervals = mapie.predict(val_set[covs], alpha=0.05)

# Display or use the prediction intervals.
# The array is indexed as [sample, lower/upper, alpha]; only a short preview is
# printed to keep the cell output readable.
for i, interval in enumerate(y_pred_intervals[:10]):
    print(f"Prediction Interval for observation {i}: {interval[0, 0]:.2f} to {interval[1, 0]:.2f}")



[18:50:06] fit random forest
[18:50:19] finish fitting


### Multi-layer perceptron

In [None]:
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# NOTE(review): this cell uses `dump` (imported in the random-forest cell) and
# `ttprint`, which is not imported anywhere in the visible notebook.

# Search space; the 'mlp__' prefix routes each entry to the 'mlp' step of the pipeline
param_ann = {
    'mlp__hidden_layer_sizes': [(100,), (150,), (100, 50)],
    'mlp__activation': ['tanh', 'relu'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.0001, 0.001],  # Regularization term
    'mlp__learning_rate_init': [0.001, 0.01],
}

pipe = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('mlp', MLPRegressor(max_iter=300, random_state=42))
])

tune_ann = HalvingGridSearchCV(
    estimator=pipe,
    param_grid=param_ann,
    scoring='neg_mean_squared_error',
    n_jobs=90,
    cv=3,
    random_state=42,  # successive halving subsamples the data; seed it like the other steps
)

ttprint('start fine tuning ann')
tune_ann.fit(train_set[covs], train_set[tgt])
ttprint('finish fine tuning')
ann = tune_ann.best_estimator_
dump(ann, '/mnt/inca/soc_eu_model/data/009_model_ann.joblib')

# The fitted search exposes the winning combination as `best_params_`
print("Best parameters:", tune_ann.best_params_)
dump(tune_ann.best_params_, '/mnt/inca/soc_eu_model/data/010_param_ann.joblib')

y_pred_ann = ann.predict(test_set[covs])

[18:50:46] start fine tuning ann


### Cubist

In [None]:
# NOTE(review): `Cubist` is not imported anywhere in the visible notebook, and
# `ttprint` / `dump` come from other cells - confirm they are in scope.

# Search space. The estimator is a bare Cubist() (not a Pipeline), so parameter
# names carry no step prefix, and every entry must be a list of candidates.
param_cubist = {
    'n_rules': [100, 300, 500],
    'n_committees': [1, 5, 10],
    'neighbors': [None, 5, 9],
    'unbiased': [False, True],
    'auto': [True, False],
    'extrapolation': [0.02, 0.05],
    'sample': [None, 0.1, 0.5],
    'cv': [10],  # fixed value, wrapped in a list as the grid requires
}

tune_cubist = HalvingGridSearchCV(
    estimator=Cubist(),
    param_grid=param_cubist,
    scoring='neg_mean_squared_error',
    n_jobs=90,
    cv=3,
    random_state=42,  # successive halving subsamples the data; seed it like the other steps
)

# Start fine-tuning process
ttprint('start fine tuning cubist')
tune_cubist.fit(train_set[covs], train_set[tgt])
ttprint('finish fitting')

cubist = tune_cubist.best_estimator_
dump(cubist, '/mnt/inca/soc_eu_model/data/011_model_cubist.joblib')

print("Best parameters:", tune_cubist.best_params_)
dump(tune_cubist.best_params_, '/mnt/inca/soc_eu_model/data/012_param_cubist.joblib')

y_pred_cubist = cubist.predict(test_set[covs])

### lrb

In [None]:
# Fit LRBoost with its default settings on the training rows, then predict the test rows.
ttprint('fit linear regression Boost regressor')
from lrboost import LRBoostRegressor

lrb = LRBoostRegressor().fit(
    train_set[covs],
    train_set[tgt],
)
# The detailed result is indexed by 'final_prediction' in the evaluation cell.
y_pred_lrb = lrb.predict(
    test_set[covs],
    detail=True,
)
ttprint('finish fitting linear regression Boost regressor')

dump(lrb, '/mnt/inca/soc_eu_model/data/011_model_lrb.joblib')

### evaluation

In [None]:
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
from sklearn.metrics import r2_score, mean_squared_error
from scipy.stats import pearsonr

def calc_ccc(y_true, y_pred):
    """Return the concordance correlation coefficient between two samples.

    Combines the Pearson correlation with the variances and the squared
    difference of the means of ``y_true`` and ``y_pred``.
    """
    rho, _ = pearsonr(y_true, y_pred)
    v_true, v_pred = np.var(y_true), np.var(y_pred)
    bias = np.mean(y_true) - np.mean(y_pred)

    numerator = 2 * rho * np.sqrt(v_true) * np.sqrt(v_pred)
    denominator = v_true + v_pred + bias**2
    return numerator / denominator
    
def accuracy_plot(y_test, y_pred, title_text):
    """Show a hexbin density plot of predicted against observed values.

    ``title_text`` becomes the bold figure title; the line below it reports
    R2, RMSE and CCC computed from ``y_test`` and ``y_pred``.
    Side effect: sets matplotlib's global font size to 16.
    """
    # Metrics shown in the axes title
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    ccc = calc_ccc(y_test, y_pred)

    matplotlib.rcParams.update({'font.size': 16})
    fig, ax = plt.subplots(figsize=(8, 8))
    fig.suptitle(title_text, fontsize=20, fontweight='bold')
    ax.set_title(f'R2={r2:.2f}, rmse={rmse:.4f}, ccc={ccc:.2f}')

    # Density of (observed, predicted) pairs; empty cells are not drawn (mincnt=1)
    hb = ax.hexbin(y_test, y_pred, gridsize=(300, 300), cmap='plasma_r', mincnt=1, vmax=200)
    ax.set_xlabel('SOC - test')
    ax.set_ylabel('SOC - pred')

    ax.set_aspect('auto', adjustable='box')
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

    # 1:1 reference line, drawn wider than the visible window
    ax.plot([-10, 300], [-10, 300], "-k", alpha=.5)
    ax.set_xlim(-5, 200)
    ax.set_ylim(-5, 200)

    # Narrow colorbar axes placed to the right of the main axes, same height
    pos = ax.get_position()
    cax = fig.add_axes([pos.x1 + 0.05, pos.y0, 0.02, pos.height])
    fig.colorbar(hb, cax=cax)

    plt.show()
    
# One accuracy plot per model, each against the same held-out target values
for title, pred in (
    ('RF', y_pred_rf),
    ('LRB', y_pred_lrb['final_prediction']),
    ('Cubist', y_pred_cubist),
    ('MLP (NN)', y_pred_ann),
):
    accuracy_plot(test_set[tgt], pred, title_text=title)

### ensemble

In [None]:
# Placeholder for the ensemble step: the cell only references the three model objects.
# As bare expressions, only the last one (rf) is rendered as the cell output.
cubist
ann
rf