In [None]:
# Import required packages
import sys
# !{sys.executable} -m pip install tqdm seaborn 
from glob import glob
import numpy as np
import joblib
import os
import pandas as pd
import json

import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from tqdm.notebook import tqdm
import seaborn as sns
from collections import Counter

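# NOTE: the train/test splits must already exist on disk before running this
# notebook (no splitting code is present in the cells shown here); create them
# first if they have not been made.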

In [None]:
# Training-set ratios to evaluate, one full model sweep per value.
trainsize = [0.1, 0.3, 0.5, 0.7, 0.8]

In [None]:


# Fixed seed for the estimators that draw random numbers while fitting, so a
# Restart & Run All reproduces the same fitted models and scores.
RANDOM_STATE = 42

gaussian_kernel = RBF()

# Candidate regressors, keyed by the short name used to look them up.
# GridSearchCV clones these instances, so they are never fitted in place.
models = {
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elastic': ElasticNet(),
    'knn': KNeighborsRegressor(),
    'rfr': RandomForestRegressor(random_state=RANDOM_STATE),
    'grad': GradientBoostingRegressor(random_state=RANDOM_STATE),
    'svr': SVR(),
    'krr': KernelRidge(),
    'gpr': GaussianProcessRegressor()
}

# Hyper-parameter grids, keyed by the same names as `models`.
param_grid = {
    'ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 50, 100, 1000]
    },
    'knn': {
        'n_neighbors': range(1, 10),
        'weights': ['uniform', 'distance']
    },
    'rfr': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    'grad': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.01, 0.001],
        'max_depth': [3, 5, 7]
    },
    'svr': {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'epsilon': [0.1, 0.01, 0.001]
    },
    # NOTE: KernelRidge ignores `gamma` for the linear kernel, so the linear
    # rows of this grid are repeated fits with identical results.
    'krr': {
        'kernel': ['linear', 'poly', 'rbf'],
        'alpha': [0.001, 0.01, 0.1, 1],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    'gpr': {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'kernel': [gaussian_kernel]
    },
    'lasso': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'max_iter': [1000, 2000, 3000]
    },
    'elastic': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.2, 0.5, 0.8],
        'max_iter': [1000, 2000, 3000]
    }
}


In [None]:
def gridsearch(model,X_tr, y_tr,X_tst,y_tst):
    """
    Tune one model with GridSearchCV and score the best estimator.

    The estimator and its hyper-parameter grid are looked up by name in the
    module-level ``models`` and ``param_grid`` dicts. The grid is searched
    with 5-fold cross-validation on the training data using all available
    cores; the best estimator is then used to predict both the training and
    the test set. A predicted-vs-true scatter plot is shown as a side effect.

    parameters
    ----------
    model: str
        Name of model (a key of both ``models`` and ``param_grid``)

    X_tr: np.array
        X training

    y_tr: np.array
        Y Training

    X_tst: np.array
        X test

    y_tst: np.array
        y test

    Returns
    -------
    scores: dict
        Maps 'MSE_train', 'R2_train', 'MAE_train', 'MSE_test', 'R2_test' and
        'MAE_test' to one number each.

    Raises
    ------
    KeyError
        If `model` is missing from ``models`` or ``param_grid``.
    """
    st = time.time()
    # Grab model and model parameters to perform gridsearchcv
    current_model = models[model]
    current_param_grid = param_grid[model]

    grid_search = GridSearchCV(current_model, current_param_grid, cv=5,n_jobs=-1)

    print(f'Now fitting {model}... ')
    grid_search.fit(X_tr, y_tr)
    print(f'Completed fitting {model} in {time.time() - st} seconds. ')

    # Keep the fitted estimator under its own name so `model` stays the
    # string label (it is reused for the plot title below).
    best_model = grid_search.best_estimator_

    # Take the best model and evaluate using known metrics
    print('Now scoring model... ')
    y_tr_pred = best_model.predict(X_tr)
    y_tst_pred = best_model.predict(X_tst)

    # Predicted vs. true values; the y-vs-y series mark the ideal diagonal.
    fig, ax = plt.subplots()
    ax.scatter(y_tr, y_tr_pred, label='train: predicted')
    ax.scatter(y_tst, y_tst_pred, label='test: predicted')
    ax.scatter(y_tr, y_tr, label='train: ideal')
    ax.scatter(y_tst, y_tst, label='test: ideal')
    ax.set(xlabel='True value', ylabel='Predicted value', title=str(model))
    ax.legend()
    plt.show()

    # Every entry is a plain scalar. The original had a trailing comma after
    # the MSE_train value, which silently stored a 1-tuple instead.
    scores = {
        'MSE_train': mean_squared_error(y_tr, y_tr_pred),
        'R2_train': r2_score(y_tr, y_tr_pred),
        'MAE_train': mean_absolute_error(y_tr, y_tr_pred),
        'MSE_test': mean_squared_error(y_tst, y_tst_pred),
        'R2_test': r2_score(y_tst, y_tst_pred),
        'MAE_test': mean_absolute_error(y_tst, y_tst_pred),
    }

    return scores



In [None]:
# Loop over the training set sizes and, for each one, over every model in
# `models`. Each model's scores are written to JSON as soon as they exist.
# NOTE(review): this builds performance[ts][model][metric]; `find_best` and
# `save_bar` further down iterate one nesting level deeper than that —
# confirm which layout is intended.

performance={}
for ts in tqdm(trainsize):

    # Load the pre-split data for this ratio.
    # NOTE: joblib.load unpickles the file; only use it on trusted data files.
    with open(f"./{ts}/{ts}_5_DDCC_train.bin",'rb') as train_file:
        traindata=joblib.load(train_file)
    with open(f"./{ts}/{ts}_5_DDCC_test.bin",'rb') as test_file:
        testdata=joblib.load(test_file)

    # Features are used as stored; targets are flattened to 1-D.
    X_train, y_train = traindata['X'], traindata['y'].reshape(-1)
    X_test, y_test = testdata['X'], testdata['y'].reshape(-1)

    performance[ts]={}
    for model in models:
        scores=gridsearch(model,X_train, y_train,X_test,y_test)
        performance[ts][model]=scores

        # Save scores to json
        scores_path = f"{ts}_{model}_scores.json"
        with open(scores_path, 'w') as outfile:
            json.dump(scores, outfile)
            print(f'Scores saved as {outfile.name}. ')

In [None]:
# Print one table per top-level key of `performance`; the inner mapping is
# transposed so its keys become the rows.
for ratio, model_scores in performance.items():
    table = pd.DataFrame(model_scores).T.reset_index()
    print(ratio, table)

In [None]:
def find_best(function):
    """
    Pick the model with the highest aggregated R2 for one `performance` entry.

    Reads the module-level `performance` dict. For each first-level label of
    the reshaped table, the R2_train and R2_test values are summarised with
    mean, min and max, each statistic is averaged over the two metrics, and
    the label with the largest value is selected per statistic.

    NOTE(review): the row selection `.loc[['R2_train','R2_test']]` requires the
    metric names to be the index of `pd.DataFrame.from_dict(v)`, so each value
    of `performance[function]` is presumably a {model: {metric: score}}
    mapping. The training loop visible in this notebook stores
    `performance[ts][model] = scores`, which is one level shallower — confirm
    the intended layout.

    Parameters
    ----------
    function: str
        Key into `performance` (documented as str; the lookup accepts any
        key present in the dict).

    returns
    -------
    best: pd.Series
        Result of `idxmax`: indexed by 'mean', 'min' and 'max', each value
        being the label whose averaged R2 statistic is largest.
    """
    dfmelt=[]
    for k,v in performance[function].items():
        # Keep only the two R2 rows, then transpose so they become columns.
        df=pd.DataFrame.from_dict(v).loc[['R2_train','R2_test']].T
        # Two-level column index: (metric name, current outer key k).
        df.columns=[df.columns,[k,k]]
        # Long format; the two column levels come out as variable_0/variable_1
        # and the original index is kept (ignore_index=False).
        dfmelt.append(df.melt(value_vars=df.columns.tolist(),ignore_index=False))
    dfmelt=pd.concat(dfmelt)
    
    # Pivot back: after the final .T the columns are (index, variable_1) pairs
    # and the rows are ('value', metric) pairs; values are cast to float.
    pivottable=dfmelt.reset_index().pivot_table(index=['index','variable_1'], columns='variable_0').astype(float).T
    
    stackedstats=[]
    # Iterate over the first column level, i.e. the labels that formed the
    # index of the per-key frames above.
    for c in pivottable.columns.levels[0]:
        # Summary statistics for this label; only mean, min and max are kept.
        dsc=pivottable[c].T.describe().loc[['mean','min','max']]
        dsc=dsc.rename(columns={'value':c})
        # Average each statistic over the remaining columns (the metrics) and
        # store it as a one-column frame named after the label.
        stackedstats.append(dsc.T.mean().to_frame().rename(columns={0:c}))
    stackedstats=pd.concat(stackedstats,axis=1)
    
    # For each statistic, the label with the highest value.
    best=stackedstats.T.idxmax()

    return best

In [None]:
# One column per `performance` key, holding that key's find_best() result.
best_by_key = {key: find_best(key) for key in performance}
bestdf = pd.DataFrame.from_dict(best_by_key).sort_index()

In [None]:
# Write the best-model table to 'best.xlsx', a path relative to the current
# working directory.
bestdf.to_excel('best.xlsx')

In [None]:
def save_bar(name):
    """
    Plot and save a faceted bar chart of the R2 scores in ``performance[name]``.

    ``performance[name]`` is walked as three nested dicts. The outermost key
    fills the 'Train' column (x axis), the middle key the 'Model' column (one
    facet each) and the innermost key is the metric name. Only metrics whose
    name contains "R2" are plotted. The figure is written to ``{name}.png``
    at 300 dpi and then shown.

    Side effects: changes the global matplotlib/seaborn theme and font sizes.

    Parameters
    ----------
    name: hashable
        Key into the module-level ``performance`` dict; also used for the
        figure title and the output file name.

    Returns
    -------
    None
    """
    # Flatten the nested dicts into long-format rows, keeping only R2 metrics.
    records = [
        (train, model_name, metric.replace('R2','R$^{2}$').replace('_',' '), score)
        for train, by_model in performance[name].items()
        for model_name, metrics in by_model.items()
        for metric, score in metrics.items()
        if "R2" in metric
    ]
    df=pd.DataFrame(records,columns=['Train','Model','Metric','Score'])

    BIGGER_SIZE = 12

    # Apply the seaborn theme FIRST: set_theme resets the font-size rcParams,
    # so calling it after the plt.rc lines (as before) discarded them.
    sns.set_theme(style='white')

    plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
    plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
    plt.rc('axes', labelsize=BIGGER_SIZE)     # fontsize of the x and y labels
    plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
    plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
    plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
    plt.rc('figure', titlesize=BIGGER_SIZE)   # fontsize of the figure title

    g=sns.catplot(
        df, kind="bar",
        x="Train", y="Score", col="Model", hue='Metric',
        height=3.5, aspect=1, col_wrap=5,palette=sns.color_palette("Paired",2)
    )

    # Annotate every bar with its value, facet by facet.
    for ax in g.axes.ravel():
        for container in ax.containers:
            ax.bar_label(container, fmt='{:.2f}',fontsize=10)

    g.set_axis_labels("Training Set Ratio", "R$^{2}$")
    g.set_titles("{col_var}={col_name}",y=1,pad=20)
    sns.move_legend(g, "upper left", bbox_to_anchor=(1, 1))
    g.set(ylim=(0, 1))
    # `figure` is the current accessor for the grid's Figure (`fig` is the
    # deprecated spelling).
    g.figure.suptitle(f"{name}".capitalize())
    plt.tight_layout()
    plt.savefig(f'{name}.png',dpi=300,bbox_inches='tight')
    plt.show()

In [None]:
# Render and save one R2 bar chart per entry of `dirs`.
# NOTE(review): `dirs` is not defined in any cell visible here — presumably
# the list of keys of `performance`; confirm it is defined before this cell
# runs, otherwise this raises NameError on a fresh kernel.
for name in dirs:
    print(name)
    save_bar(name)