In [None]:
import numpy as np
import pandas as pd
import sklearn

In [None]:
# Read the Data
# Load the training features, training labels and test features from CSV.
df_train = pd.read_csv('X_train.csv')
df_labels = pd.read_csv('y_train.csv')
df_test = pd.read_csv('X_test.csv')

# Keep the first test column as a one-column frame; the submission cell
# attaches the predictions to it.
df_id = df_test.iloc[:, :1]

# Remove first column from every table (presumably a row id -- TODO confirm).
df_X, df_y, df_test = (
    frame.iloc[:, 1:] for frame in (df_train, df_labels, df_test)
)

In [None]:
# Shuffle training data
# Features and labels are glued side by side first so that one permutation
# reorders both and the rows stay aligned.
df_all = pd.concat([df_X, df_y], axis=1)
shuffled_all = df_all.sample(frac=1, random_state=0)  # fixed seed: same order every run

# Split back apart: the last column is taken as the label, the rest as features.
df_X, df_y = shuffled_all.iloc[:, :-1], shuffled_all.iloc[:, -1:]

In [None]:
from feature_engine.selection import DropCorrelatedFeatures, SmartCorrelatedSelection

def remove_X_correlated_features(X_train, alpha=0.99):
    """Fit ``DropCorrelatedFeatures`` and return its support mask.

    Parameters
    ----------
    X_train
        Training feature matrix passed to the selector's ``fit``.
    alpha : float, default 0.99
        Forwarded to the selector as ``threshold``.

    Returns
    -------
    numpy.ndarray
        ``get_support()`` of the fitted selector wrapped in an array; callers
        in this notebook use it to index columns.
    """
    selector = DropCorrelatedFeatures(threshold=alpha)
    selector.fit(X_train)
    return np.array(selector.get_support())

def fs_x_correlation(X_train, X_test, alpha=0.99):
    """Drop correlated feature columns from the train and test matrices.

    The column mask is computed from ``X_train`` only and then applied to
    both inputs, so train and test keep the same columns.

    Parameters
    ----------
    X_train, X_test
        Feature matrices supporting ``[:, mask]`` indexing.
    alpha : float, default 0.99
        Threshold forwarded to ``remove_X_correlated_features``.

    Returns
    -------
    tuple
        ``(X_train_decor, X_test_decor)`` restricted to the masked columns.
    """
    keep = remove_X_correlated_features(X_train, alpha=alpha)
    return X_train[:, keep], X_test[:, keep]

In [None]:
from feature_engine.selection import DropConstantFeatures

def drop_constant_features(X_train, X_test):
    """Remove constant feature columns using ``DropConstantFeatures``.

    The transformer is fitted on ``X_train`` (with ``missing_values='ignore'``)
    and the fitted transformer is then applied to ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_dedup, X_test_dedup)`` as produced by the transformer.
    """
    dropper = DropConstantFeatures(missing_values='ignore')
    train_out = dropper.fit_transform(X_train)
    test_out = dropper.transform(X_test)
    return train_out, test_out

In [None]:
def with_nan_feature_selection(X_train, y_train, X_test, alpha_X=0.99, alpha_y=0.1):
    """Run the feature-selection steps that are applied before imputation.

    Two steps are chained: correlated-feature removal (``fs_x_correlation``)
    followed by constant-feature removal (``drop_constant_features``).

    Parameters
    ----------
    X_train, X_test
        Feature matrices.
    y_train
        Accepted for interface symmetry; not used by the current body.
    alpha_X : float, default 0.99
        Correlation threshold forwarded to ``fs_x_correlation``.
    alpha_y : float, default 0.1
        Not used by the current body.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` after both selection steps.
    """
    decorrelated_train, decorrelated_test = fs_x_correlation(X_train, X_test, alpha=alpha_X)
    return drop_constant_features(decorrelated_train, decorrelated_test)

In [None]:
from sklearn.impute import KNNImputer, SimpleImputer

def impute_knn(X_train, X_test, n=20):
    """Impute both matrices with a ``KNNImputer`` fitted on ``X_train``.

    Parameters
    ----------
    X_train, X_test
        Feature matrices to impute.
    n : int, default 20
        Passed to the imputer as ``n_neighbors``.

    Returns
    -------
    tuple
        ``(X_train_imputed, X_test_imputed)``.
    """
    knn = KNNImputer(n_neighbors=n)
    train_filled = knn.fit_transform(X_train)
    test_filled = knn.transform(X_test)
    return train_filled, test_filled

def impute_median(X_train, X_test):
    """Impute both matrices with a median ``SimpleImputer`` fitted on ``X_train``.

    Returns
    -------
    tuple
        ``(X_train_imputed, X_test_imputed)``.
    """
    median_filler = SimpleImputer(strategy='median')
    train_filled = median_filler.fit_transform(X_train)
    test_filled = median_filler.transform(X_test)
    return train_filled, test_filled

In [None]:
from sklearn.preprocessing import StandardScaler

def scale(X_train, X_val):
    """Standardise two matrices with a ``StandardScaler`` fitted on ``X_train``.

    Parameters
    ----------
    X_train
        Matrix the scaler is fitted on (and transformed).
    X_val
        Second matrix transformed with the already-fitted scaler.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_val_scaled)``.
    """
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(X_train)
    val_scaled = standardizer.transform(X_val)
    return train_scaled, val_scaled

In [None]:
from sklearn.feature_selection import SelectKBest

def select_k_best(X_train, y_train, X_test, k, score_func):
    """Keep the ``k`` best-scoring features according to ``score_func``.

    A ``SelectKBest`` selector is fitted on ``(X_train, y_train)`` and the
    fitted selector is then applied to ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected)``.
    """
    selector = SelectKBest(k=k, score_func=score_func)
    train_top = selector.fit_transform(X_train, y_train)
    test_top = selector.transform(X_test)
    return train_top, test_top

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer, SimpleImputer

def scale_and_impute(X_train, X_test):
    """Standardise and then median-impute, as a single fitted pipeline.

    The pipeline (``StandardScaler`` followed by a median ``SimpleImputer``)
    is fitted on ``X_train`` and reused to transform ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_imputed, X_test_imputed)``.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='median')),
    ]
    pipe = Pipeline(steps)
    train_out = pipe.fit_transform(X_train)
    test_out = pipe.transform(X_test)
    return train_out, test_out

In [None]:
from pyod.models.ecod import ECOD

def outlier_detection(X_train, y_train, contamination=0.01):
    """Drop the training rows that ``ECOD_outlier_detection`` does not keep.

    Parameters
    ----------
    X_train, y_train
        Training features and targets; both are indexed with the same
        boolean row mask.
    contamination : float, default 0.01
        Forwarded to ``ECOD_outlier_detection``.

    Returns
    -------
    tuple
        ``(X_return, y_return)`` containing only the kept rows. The shapes of
        the input and the filtered features are printed as a side effect.
    """
    inlier_flags = ECOD_outlier_detection(X_train, y_train, contamination)
    keep = inlier_flags.astype(int) == 1

    X_return, y_return = X_train[keep], y_train[keep]

    # Show how many rows were removed.
    print(X_train.shape, X_return.shape)

    return X_return, y_return
    
def ECOD_outlier_detection(X_train, y_train, contamination=0.01):
    """Fit an ECOD detector on ``X_train`` and return a mask of rows to keep.

    Parameters
    ----------
    X_train
        Data the detector is fitted on and then labelled.
    y_train
        Passed through to the detector's ``fit``.
    contamination : float, default 0.01
        Forwarded to the ``ECOD`` constructor.

    Returns
    -------
    Boolean array that is True wherever the detector's ``predict`` label is
    not 1 (the caller treats those rows as the ones to keep).
    """
    detector = ECOD(contamination=contamination)
    detector.fit(X_train, y_train)

    labels = detector.predict(X_train)
    return labels != 1

In [None]:
# Feature selection before nan value imputation
# Work on plain arrays from here on; the label column is flattened to 1-D.
X_train_raw, X_test_raw = df_X.to_numpy(), df_test.to_numpy()
y_train_raw = df_y.to_numpy().ravel()

X_train_selected_nan, X_test_selected_nan = with_nan_feature_selection(
    X_train=X_train_raw,
    y_train=y_train_raw,
    X_test=X_test_raw,
    alpha_X=0.9999,
)
# Shape after selection next to the raw shape, to see how many columns were dropped.
print(X_train_selected_nan.shape, X_train_raw.shape)

In [None]:
# Nan value imputation
# Median imputation, fitted on the training matrix and reused for the test matrix.
X_train_selected, X_test_selected = impute_median(
    X_train=X_train_selected_nan,
    X_test=X_test_selected_nan,
)

In [None]:
# Feature selection after nan value imputation
from scipy.stats import spearmanr, f, pearsonr
from sklearn.feature_selection import f_regression, mutual_info_regression, chi2, f_classif

def f_spearman(X, y):
    """Score every column of ``X`` by its absolute Spearman correlation with ``y``.

    Parameters
    ----------
    X
        2-D array; columns are accessed as ``X[:, i]``.
    y
        Target values passed to ``spearmanr`` together with each column.

    Returns
    -------
    tuple of list
        ``(corr_array, p_array)``: the absolute correlation and the p-value
        for each column, in column order.
    """
    per_column = [spearmanr(X[:, col], y) for col in range(X.shape[1])]
    corr_array = [abs(rho) for rho, _ in per_column]
    p_array = [pval for _, pval in per_column]
    return corr_array, p_array


# Keep the 175 best features as ranked by f_regression on the imputed training data.
X_train_kselected, X_test_kselected = select_k_best(
    X_train=X_train_selected,
    y_train=y_train_raw,
    X_test=X_test_selected,
    k=175,
    score_func=f_regression,
)

In [None]:
# Outlier detection
# Only training rows are filtered; the test matrix is left untouched.
X_train_no_outliers, y_train_no_outliers = outlier_detection(
    X_train=X_train_kselected,
    y_train=y_train_raw,
    contamination=0.01,
)

In [None]:
# Scaling
# The scaler is fitted on the outlier-free training rows and reused for the test rows.
X_train_scaled, X_test_scaled = scale(
    X_train=X_train_no_outliers,
    X_val=X_test_kselected,
)

In [None]:
# Short aliases for the fully preprocessed arrays used by every model cell below.
X_train, X_test, y_train = X_train_scaled, X_test_scaled, y_train_no_outliers

In [None]:
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import r2_score

def get_best_parameters(estimator, parameters, X=None, y=None, cv=5, scoring='r2'):
    """Grid-search ``parameters`` for ``estimator`` and return the best refit model.

    Parameters
    ----------
    estimator
        Unfitted estimator handed to ``GridSearchCV``.
    parameters : dict
        Parameter grid (``param_grid``) to search.
    X, y : optional
        Training data for the search. When left as ``None`` the notebook-level
        ``X_train`` / ``y_train`` are used, so existing two-argument calls
        behave exactly as before.
    cv : default 5
        Cross-validation setting forwarded to ``GridSearchCV``.
    scoring : default 'r2'
        Scoring forwarded to ``GridSearchCV``.

    Returns
    -------
    The ``best_estimator_`` of the fitted search. The best parameters, best
    score and best estimator are printed as a side effect.
    """
    # Fall back to the notebook globals only when the caller did not supply
    # data, which keeps the function usable on other arrays.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    search = GridSearchCV(
        estimator=estimator,
        param_grid=parameters,
        scoring=scoring,
        n_jobs=-1,
        cv=cv,
        verbose=1,
    )
    search.fit(X, y)

    print('Best params:', search.best_params_)
    print('score:', search.best_score_)
    print('best:', search.best_estimator_)

    return search.best_estimator_

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RationalQuadratic

# Gaussian process regressor with a fixed rational-quadratic kernel;
# the grid varies `alpha` and `normalize_y`.
rational_kernel = RationalQuadratic(alpha=0.6, length_scale=8)
gpr = GaussianProcessRegressor(random_state=0)
gpr_parameters = {
    'kernel': [rational_kernel],
    'alpha': np.logspace(-10, -1, 20),
    'normalize_y': [True, False],
}
gpr_final = get_best_parameters(estimator=gpr, parameters=gpr_parameters)

In [None]:
from cubist import Cubist

# Cubist model; the grid varies the committee count (1-9) and neighbor count (3-6).
cub = Cubist(n_rules=500, composite=True, random_state=0)
cub_parameters = {
    'n_committees': list(range(1, 10)),
    'neighbors': list(range(3, 7)),
}
cub_final = get_best_parameters(estimator=cub, parameters=cub_parameters)

In [None]:
from sklearn.svm import SVR
from sklearn.gaussian_process.kernels import RationalQuadratic

# Support vector regression; the grid compares the built-in 'rbf' kernel with
# the rational-quadratic kernel object and varies `epsilon` and `C`.
rational_kernel = RationalQuadratic(alpha=0.6, length_scale=8)
svr = SVR()
svr_parameters = {
    'kernel': ['rbf', rational_kernel],
    'epsilon': np.logspace(-8, -1, 8),
    'C': np.linspace(50, 80, 10),
}
svr_final = get_best_parameters(estimator=svr, parameters=svr_parameters)

In [None]:
from lightgbm import LGBMRegressor
# LightGBM with a fixed 2000 estimators; the grid varies the learning rate
# and `num_leaves` (powers of two from 2**3 to 2**5).
lgb = LGBMRegressor(random_state=0)
lgb_parameters = {
    'boosting_type': ['gbdt'],
    'n_estimators': [2000],
    'learning_rate': np.logspace(-3, 0, 6),
    'num_leaves': np.logspace(3, 5, 3, base=2).astype(int),
}
lgb_final = get_best_parameters(estimator=lgb, parameters=lgb_parameters)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with a fixed 2000 estimators; the grid varies the
# learning rate, `min_samples_split` and `max_depth`.
gbr = GradientBoostingRegressor(random_state=0)
gbr_parameters = {
    'n_estimators': [2000],
    'learning_rate': np.logspace(-5, 0, 10),
    'min_samples_split': list(range(2, 7)),
    'max_depth': [2, 3, 4],
}
gbr_final = get_best_parameters(estimator=gbr, parameters=gbr_parameters)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
# Extra-trees with a fixed 2000 estimators; only `min_samples_split` is searched.
trees = ExtraTreesRegressor(random_state=0)
trees_parameters = {
    'n_estimators': [2000],
    'min_samples_split': list(range(2, 7)),
}
trees_final = get_best_parameters(estimator=trees, parameters=trees_parameters)

In [None]:
from catboost import CatBoostRegressor
# CatBoost with its own logging silenced; only the learning rate is searched.
cat = CatBoostRegressor(verbose=False)
cat_parameters = {
    'learning_rate': np.logspace(-5, 0, 10),
}
cat_final = get_best_parameters(estimator=cat, parameters=cat_parameters)

In [None]:
#from sklearn_rvm import EMRVR
from skrvm import RVR
from sklearn.gaussian_process.kernels import RationalQuadratic

# Relevance vector regression; the grid compares 'rbf' with the
# rational-quadratic kernel object.
rational_kernel = RationalQuadratic(alpha=0.6, length_scale=8)
rvr = RVR()
rvr_parameters = {
    'kernel': ['rbf', rational_kernel],
}
rvr_final = get_best_parameters(estimator=rvr, parameters=rvr_parameters)

In [None]:
# Base learners for the per-model CV report and the stacking ensemble.
# Note: gbr_final is tuned above but is not part of this list.
estimators = [
    ('svr', svr_final),
    ('lgb', lgb_final),
    ('trees', trees_final),
    ('cat', cat_final),
    ('rvr', rvr_final),
    ('cub', cub_final),
    ('gpr', gpr_final),
]

In [None]:
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, cross_val_score

# Number of folds; also interpolated into the score messages printed below.
K = 5
# Build the splitter from K so the fold count that is reported can never
# drift from the fold count that is actually used. No shuffling here: the
# rows were already shuffled once with a fixed seed when the data was loaded.
cv_splitter = KFold(n_splits=K, shuffle=False)

In [None]:
# Report the K-fold R^2 of every tuned base learner on its own.
# NOTE(review): X_train was imputed, k-best selected (using the targets),
# outlier-filtered and scaled on all training rows in earlier cells, i.e.
# before these folds are split, so the scores may be optimistic.
for name, regressor in estimators:
    score = cross_val_score(
        estimator=regressor,
        X=X_train,
        y=y_train,
        cv=cv_splitter,
        scoring='r2',
        n_jobs=-1,
    )
    mean_score = np.mean(score)
    print(f"{name}: {K} fold CV score is {mean_score} and the list is \n{score}")

In [None]:
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import KFold, cross_val_score

# Stack the tuned base learners (final estimator left at the library default)
# and score the ensemble with the same splitter as the individual models.
stacking_regressor = StackingRegressor(estimators=estimators, n_jobs=-1)
score = cross_val_score(
    estimator=stacking_regressor,
    X=X_train,
    y=y_train,
    cv=cv_splitter,
    scoring='r2',
    verbose=3,
)
mean_score = np.mean(score)

print(f"Stacking: {K} fold CV score is {mean_score} and the list is \n{score}")

In [None]:
# Fit to all training data
# cross_val_score above only fitted clones, so the ensemble itself is fitted here.
stacking_regressor.fit(X=X_train, y=y_train)

In [None]:
# Create Submission
# Predict the preprocessed test rows with the fitted stacking ensemble.
y_predict = stacking_regressor.predict(X_test)
# Attach the predictions as column `y` to the id column kept from the raw test file.
df_submission = df_id.assign(y=y_predict)
print(df_submission)
# Write the submission without the pandas index.
df_submission.to_csv('current.csv', index=False)