In [None]:
import sys
!{sys.executable} -m pip install openpyxl
from glob import glob
import numpy as np
import joblib
import os
import pandas as pd
import json

import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

from sklearn.svm import SVR
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from tqdm.notebook import tqdm
import seaborn as sns
from collections import Counter

# SHAP
import shap

In [None]:
# Show the working directory: the data-loading cell further down resolves its file paths relative to it.
os.getcwd()

In [None]:


# Kernel offered to the Gaussian-process grid (the only 'kernel' candidate for 'gpr').
gaussian_kernel = RBF()

# Seed for the stochastic estimators so repeated runs give the same scores;
# the same value is used as random_state for the train/test split.
SEED = 42

# NOTE(review): an earlier note here said to remove the basic linear models
# (ridge, lasso, elastic) and the expensive gpr model, but all of them are
# still registered below -- confirm whether they should be dropped.

# Registry of estimators, keyed by the short name used throughout the notebook.
models = {
    'ridge': Ridge(),
    'lasso': Lasso(),
    'elastic': ElasticNet(),
    'knn': KNeighborsRegressor(),
    'rfr': RandomForestRegressor(random_state=SEED),
    'grad': GradientBoostingRegressor(random_state=SEED),
    'svr': SVR(),
    'krr': KernelRidge(),
    'gpr': GaussianProcessRegressor()
}

# Hyper-parameter grids; every key of `models` must have an entry here because
# gridsearch() looks both dictionaries up with the same name.
param_grid = {
    'ridge': {
        'alpha': [0.001, 0.01, 0.1, 1, 10, 50, 100, 1000]
    },
    'knn': {
        'n_neighbors': [1,5,10],
        'weights': ['uniform', 'distance']
    },
    'rfr': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    },
    'grad': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.1, 0.01, 0.001],
        'max_depth': [3, 5, 7]
    },
    'svr': {
        'kernel': ['linear', 'rbf'],
        'C': [0.1, 1, 10],
        'epsilon': [0.1, 0.01, 0.001]
    },
    'krr': {
        'kernel': ['linear', 'laplacian', 'rbf'],
        'alpha': [0.001, 0.01, 0.1, 1],
        'gamma': [1, 0.1, 0.01, 0.001]
    },
    'gpr': {
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'kernel': [gaussian_kernel]
    },
    'lasso': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'max_iter': [1000, 2000, 3000]
    },
    'elastic': {
        'alpha': [0.001, 0.01, 0.1, 1],
        'l1_ratio': [0.2, 0.5, 0.8],
        'max_iter': [1000, 2000, 3000]
    }
}


In [None]:
def gridsearch(model,X_tr, y_tr,X_tst,y_tst):
    """
    Perform GridSearchCV for a given model

    parameters
    ----------
    model: str
        Name of model

    X_tr: np.array
        X training

    y_tr: np.array
        Y Training

    X_tst: np.array
        X test

    y_tst: np.array
        y test

    Returns
    -------
    scores: dict
        scored models
    """
    st = time.time()
    # Grab model and model parameters to perform gridsearchcv
    current_model = models[model]
    current_param_grid = param_grid[model]

    grid_search = GridSearchCV(current_model, current_param_grid, cv=5,n_jobs=-1)

    print(f'Now fitting {model}... ')

    grid_search.fit(X_tr, y_tr)

    best_model = grid_search.best_estimator_

    print(f'Completed fitting {model} in {time.time() - st:.4f} seconds. ')

    # Take the best model and evaluate using known metrics
    model=best_model
    scores = {}
    st = time.time()
    print('Now scoring model... ')
    y_tr_pred = model.predict(X_tr)
    y_tst_pred = model.predict(X_tst)

    # plt.scatter(y_tr,y_tr_pred)
    # plt.scatter(y_tst,y_tst_pred)
    # plt.scatter(y_tr, y_tr)
    # plt.scatter(y_tst,y_tst)
    # plt.show()
            
    scores['MSE_train'] = mean_squared_error(y_tr, y_tr_pred),
    scores['R2_train'] = r2_score(y_tr, y_tr_pred)
    scores['MAE_train'] = mean_absolute_error(y_tr, y_tr_pred)
    
    scores['MSE_test'] = mean_squared_error(y_tst, y_tst_pred)
    scores['R2_test'] = r2_score(y_tst, y_tst_pred)
    scores['MAE_test'] = mean_absolute_error(y_tst, y_tst_pred)
    print(f"Train R2 {scores['R2_train']:.4f}")
    print(f"Test R2 {scores['R2_test']:.4f}")
    print(f"Train MAE {scores['MAE_train']:.4f}")
    print(f"Test MAE {scores['MAE_test']:.4f}")
    print()
    return scores, best_model



In [None]:

X = pd.read_csv("../../database/Morgan_sub.csv.gz", compression='gzip',index_col=0)
scaler = MinMaxScaler
x_scaler = scaler((-1, 1))
y_scaler = scaler((-1, 1))
Y=pd.read_csv(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())),'database','y.csv.gz'), compression='gzip',index_col=0)

In [None]:
data_split = {}

# Scalers
scaler = MinMaxScaler
x_scaler = scaler((-1, 1))
y_scaler = scaler((-1, 1))

# PCA
c=16
pca = PCA(n_components=c)
for ts in [0.1,0.3,0.5,0.7,0.8]:
    # Split data
    train,test=train_test_split(list(X.index), train_size=ts,test_size=0.2,random_state=42)
        
    X_train, X_test, y_train, y_test = x_scaler.fit_transform(pca.fit_transform(X.loc[train])), x_scaler.transform(pca.transform(X.loc[test])), y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten(), y_scaler.transform(Y.loc[test].to_numpy()).flatten()
    
    print("Scaled X:",X_train.shape,X_test.shape)
    print(np.min(X_train),np.max(X_train),np.mean(X_train))
    print("Scaled y:",y_train.shape,y_test.shape)
    print(np.min(y_train),np.max(y_train),np.mean(y_train))  
       
    
    scoring={}
    t0_init=time.perf_counter()
    for m in models.keys():
        t0=time.perf_counter()
        scores,model=gridsearch(m,X_train, y_train,X_test,y_test)
        tf=time.perf_counter()-t0
        scores['timing']=tf
        scoring[m]=scores
        print(m,tf,scoring)
    print(f"Overall {time.perf_counter()-t0_init:.2f}")
    stat_df=pd.concat([pd.DataFrame.from_dict(v).rename(index={0:k}) for k,v in scoring.items()])    
    data_split[ts]=scoring

# Turn the nested {train_size: {model: scores}} results into one table:
# each per-split frame has the metrics as rows and the models as columns,
# plus an extra 'size' row recording the training fraction.
split_frames = []
for train_fraction, split_scores in data_split.items():
    frame = pd.DataFrame.from_dict(split_scores)
    frame.loc['size'] = [train_fraction] * frame.shape[1]
    split_frames.append(frame)

# After transposing, every row is one (model, split) pair; the model name ends up in 'index'.
wide_scores = pd.concat(split_frames, axis=1).T
wide_scores = wide_scores.drop(columns=['MSE_train', 'MSE_test']).reset_index()
wide_scores.to_excel(f"PCA{c}_classical.xlsx")

# Long format for plotting: one row per (model, split, R2 kind).
stackedf = wide_scores.melt(id_vars=['index', 'size'], value_vars=['R2_train', 'R2_test'])

In [None]:
# R2 versus training fraction ('size'): one colour per model ('index'),
# line style distinguishes R2_train from R2_test ('variable').
sns.lineplot(data=stackedf,x='size',y='value',hue='index',style='variable')

In [None]:
# # Scalers
# scaler = MinMaxScaler
# x_scaler = scaler((-1, 1))
# y_scaler = scaler((-1, 1))

# # PCA
# components = [5 , 16]
# for c in components:
#     pca = PCA(n_components=c)
#     for ts in [0.1,0.3,0.5,0.7,0.8]:
#         # Split data
#         train,test=train_test_split(list(X.index), train_size=ts,test_size=0.2,random_state=42)
#         X_train, X_test, y_train, y_test = x_scaler.fit_transform(pca.fit_transform(X.loc[train])), x_scaler.transform(pca.transform(X.loc[test])), y_scaler.fit_transform(Y.loc[train].to_numpy()).flatten(), y_scaler.transform(Y.loc[test].to_numpy()).flatten()
        

        
#         print("Scaled X:",X_train.shape,X_test.shape)
#         print(np.min(X_train),np.max(X_train),np.mean(X_train))
#         print("Scaled y:",y_train.shape,y_test.shape)
#         print(np.min(y_train),np.max(y_train),np.mean(y_train))        
#         with open(f'PCA{c}_{ts:.1f}_Morgan_train.bin','wb') as f:
#             joblib.dump({'X':X_train,'y':y_train},f)
#         with open(f'PCA{c}_{ts:.1f}_Morgan_test.bin','wb') as f:
#             joblib.dump({'X':X_test,'y':y_test},f)
#         with open(f'PCA{c}_{ts:.1f}_Morgan_scaler.bin','wb') as f:
#             joblib.dump(y_scaler,f)    
#         print()
#     print()