In [1]:
import sys
# !{sys.executable} -m pip install tqdm seaborn 
from glob import glob
import numpy as np
import joblib
import os
import pandas as pd
import json

import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from tqdm.notebook import tqdm
import seaborn as sns
from collections import Counter

Fix the training and test sets!

In [2]:


# Kernel instance offered to every Gaussian-process candidate in the grid below.
gaussian_kernel = RBF()

# Candidate regressors, keyed by the short name that is also used as the
# param_grid key, the score-file prefix and the plot facet label.
models = {
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elastic': ElasticNet(),
    'knn': KNeighborsRegressor(),
    # The tree ensembles draw random numbers while fitting; pin the seed
    # (the same value the notebook uses for its data splits) so that a
    # Restart & Run All reproduces the same scores.
    'rfr': RandomForestRegressor(random_state=42),
    'grad': GradientBoostingRegressor(random_state=42),
    'svr': SVR(),
    'krr': KernelRidge(),
    'gpr': GaussianProcessRegressor()
}

# Hyper-parameter grids searched by GridSearchCV; keys must match `models`.
param_grid = {
    'ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 50, 100, 1000]
    },
    'knn': {
        'n_neighbors': range(1, 10),
        'weights': ['uniform', 'distance']
    },
    'rfr': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    'grad': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.01, 0.001],
        'max_depth': [3, 5, 7]
    },
    'svr': {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'epsilon': [0.1, 0.01, 0.001]
    },
    'krr': {
        'kernel': ['linear', 'poly', 'rbf'],
        'alpha': [0.001, 0.01, 0.1, 1],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    'gpr': {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'kernel': [gaussian_kernel]
    },
    'lasso': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'max_iter': [1000, 2000, 3000]
    },
    'elastic': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.2, 0.5, 0.8],
        'max_iter': [1000, 2000, 3000]
    }
}


In [3]:
def gridsearch(model,X_tr, y_tr,X_tst,y_tst):
    """
    Tune one registered model with GridSearchCV and score the best estimator.

    The estimator and its hyper-parameter grid are looked up by name in the
    module-level ``models`` and ``param_grid`` dictionaries. The search uses
    5-fold cross-validation on the training data with all available cores
    (``n_jobs=-1``) and the estimator's default scorer.

    parameters
    ----------
    model: str
        Name of model (a key of ``models`` and ``param_grid``)

    X_tr: np.array
        X training

    y_tr: np.array
        Y Training

    X_tst: np.array
        X test

    y_tst: np.array
        y test

    Returns
    -------
    scores: dict
        Plain-float metrics of the best estimator, keyed 'MSE_train',
        'R2_train', 'MAE_train', 'MSE_test', 'R2_test' and 'MAE_test'.

    Raises
    ------
    KeyError
        If ``model`` is not a key of ``models`` / ``param_grid``.
    """
    st = time.time()
    grid_search = GridSearchCV(models[model], param_grid[model], cv=5, n_jobs=-1)

    print(f'Now fitting {model}... ')
    grid_search.fit(X_tr, y_tr)
    print(f'Completed fitting {model} in {time.time() - st} seconds. ')

    # Keep the fitted estimator under its own name instead of rebinding the
    # `model` argument, which is the string key.
    best_model = grid_search.best_estimator_

    print('Now scoring model... ')
    y_tr_pred = best_model.predict(X_tr)
    y_tst_pred = best_model.predict(X_tst)

    # Every value is a scalar float. A stray trailing comma previously turned
    # 'MSE_train' into a 1-tuple, which was written to JSON as a list.
    scores = {
        'MSE_train': float(mean_squared_error(y_tr, y_tr_pred)),
        'R2_train': float(r2_score(y_tr, y_tr_pred)),
        'MAE_train': float(mean_absolute_error(y_tr, y_tr_pred)),
        'MSE_test': float(mean_squared_error(y_tst, y_tst_pred)),
        'R2_test': float(r2_score(y_tst, y_tst_pred)),
        'MAE_test': float(mean_absolute_error(y_tst, y_tst_pred)),
    }

    return scores



In [4]:
# data=pd.read_csv(f'/Users/grierjones/qregress/function-calc-test/linear/linear.csv')
# x=data['x'].to_numpy().reshape(-1,1)
# y=data['y'].to_numpy().reshape(-1,1)
# X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.8,test_size=20)


In [5]:
# Function families to benchmark; each name is used both as a data
# sub-directory and as the CSV file stem.
dirs = ["quadratic", "linear", "sine"]

# Fractions of the training pool that are subsampled for each experiment.
trainsize = [0.3, 0.5, 0.7, 0.8, 0.1]

In [14]:
# Load the previously saved train / test / validation bundles for the linear
# function. In the cells visible here only `valdata` is inspected afterwards.
# NOTE(review): joblib.load unpickles its input, so only point it at trusted
# files; the absolute, user-specific paths also tie this cell to one machine.
traindata=joblib.load(f'/Users/grierjones/qregress/function-calc-test/linear/linear_train.bin')
testdata=joblib.load(f'/Users/grierjones/qregress/function-calc-test/linear/linear_test.bin')
valdata=joblib.load(f'/Users/grierjones/qregress/function-calc-test/linear/linear_validate.bin')

In [15]:
# Sanity check: shape of the feature array stored under 'X' in the validation bundle.
valdata['X'].shape

(10, 1)

In [None]:
# performance[function][train fraction][model name] -> dict of scores
performance={}

for d in tqdm(dirs):
    data=pd.read_csv(f'/Users/grierjones/qregress/function-calc-test/{d}/{d}.csv')
    # scikit-learn expects 2-D feature arrays, hence the single-column reshape.
    x=data['x'].to_numpy().reshape(-1,1)
    y=data['y'].to_numpy().reshape(-1,1)
    performance[d]={}
    # Fixed split: 80 samples form the training pool and 20 the test set, which
    # is shared by every training fraction below (needs at least 100 rows).
    X_train_O, X_test, y_train_O, y_test = train_test_split(x, y, train_size=80,test_size=20,random_state=42)
    for ts in tqdm(trainsize):

        # Draw a reproducible subsample of the training pool by index, under a
        # name of its own so the indices are not shadowed by the dict below.
        train_idx, _=train_test_split(range(len(X_train_O)), train_size=ts,random_state=42)
        X_train=X_train_O[train_idx]
        y_train=y_train_O[train_idx]

        # All artefacts for this (function, fraction) pair live in one folder.
        # Create it up front, otherwise the dumps below fail on a fresh checkout.
        out_dir = os.path.join(d,'All',f"{ts}")
        os.makedirs(out_dir, exist_ok=True)

        train_name = os.path.join(out_dir,f"{d}_train.bin")
        test_name = os.path.join(out_dir,f"{d}_test.bin")
        print(train_name,test_name)
        train = {
            'X': X_train,
            'y': y_train
        }

        test = {
            'X': X_test,
            'y': y_test
        }

        # Persist the exact arrays used so the experiment can be repeated elsewhere.
        joblib.dump(train, train_name)
        joblib.dump(test, test_name)
        performance[d][ts]={}
        for model in models.keys():
            scores=gridsearch(model,X_train, y_train,X_test,y_test)
            performance[d][ts][model]=scores

            with open(os.path.join(out_dir,f'{model}_scores.json'), 'w') as outfile:
                json.dump(scores, outfile)
                print(f'Scores saved as {outfile.name}. ')

In [None]:
def find_best(function):
    """
    Pick the most consistently well-ranked model for one function family.

    The train and test R2 of every model are averaged over all training
    fractions recorded in ``performance[function]``. Each model then gets one
    vote for being in the top three by mean train R2, one for the top three by
    mean test R2, and one for the three smallest absolute train/test gaps.
    The model with the most votes wins; ties go to the first model that
    reached that count.

    Parameters
    ----------
    function: str
        Key of the module-level ``performance`` dictionary.

    returns
    -------
    best: tuple
        (model, count)

    raises
    ------
    KeyError
        If ``function`` is not a key of ``performance``.
    ValueError
        If no training fractions were recorded for ``function``.
    """
    runs = performance[function]
    if not runs:
        raise ValueError(f'No results recorded for {function!r}')

    r2_rows = ['R2_train', 'R2_test']
    per_split = [
        pd.DataFrame.from_dict(model_scores).loc[r2_rows].astype(float)
        for model_scores in runs.values()
    ]
    # Average over this function's own splits. The divisor used to be the
    # number of splits recorded for 'linear', regardless of `function`.
    scoredf = sum(per_split) / len(runs)
    scoredf.loc['absdiff'] = (scoredf.loc['R2_train'] - scoredf.loc['R2_test']).abs()

    by_model = scoredf.T
    votes = Counter(
        list(by_model['R2_train'].nlargest(3).index)
        + list(by_model['R2_test'].nlargest(3).index)
        + list(by_model['absdiff'].nsmallest(3).index)
    )
    best = max(votes.items(), key=lambda x: x[1])

    return best

In [None]:
# One column per function family; find_best returns (model, count), so row 0
# holds the winning model and row 1 its vote count.
bestdf = pd.DataFrame.from_dict(
    {function_name: find_best(function_name) for function_name in performance}
).sort_index()

In [None]:
# Display the summary of winning models.
bestdf

In [None]:
# Persist the summary table as an Excel workbook in the current working directory.
bestdf.to_excel('best.xlsx')

In [None]:
def save_bar(name):
    """
    Draw and save a faceted bar chart of the R2 scores for one function family.

    Reads ``performance[name]``, keeps only the metrics whose key contains
    "R2", and draws one facet per model with the training fraction on the
    x axis and the metric as the hue. The figure is written to '<name>.png'
    at 300 dpi and then shown.

    Parameters
    ----------
    name: str
        Key of the module-level ``performance`` dictionary; also the stem of
        the saved image file.
    """
    # Flatten fraction -> model -> metric -> value into long-form records.
    records = [
        (fraction, model_name, metric.replace('R2','R$^{2}$').replace('_',' '), value)
        for fraction, per_model in performance[name].items()
        for model_name, metric_scores in per_model.items()
        for metric, value in metric_scores.items()
        if "R2" in metric
    ]
    score_table = pd.DataFrame(records, columns=['Train','Model','Metric','Score'])

    two_tone = sns.color_palette("Paired",2)
    grid = sns.catplot(
        score_table,
        kind="bar",
        x="Train",
        y="Score",
        col="Model",
        hue='Metric',
        height=4,
        aspect=1,
        col_wrap=5,
        palette=two_tone,
    )
    grid.set_axis_labels("Training Set Ratio", "R$^{2}$")
    grid.set_titles("{col_var}={col_name}")
    # Park the legend outside the top-right corner so it does not cover bars.
    sns.move_legend(grid, "upper left", bbox_to_anchor=(1, 1))
    plt.tight_layout()
    plt.savefig(f'{name}.png',dpi=300,bbox_inches='tight')
    plt.show()

In [None]:
# Render and save one figure per function family, announcing each by name.
for function_name in dirs:
    print(function_name)
    save_bar(function_name)