**ML**

Here we perform classical Machine Learning models fitting, tunability and testing on test_dataset.

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

Datasets

In [17]:
import pandas as pd

In [33]:
train = pd.read_csv('../data/data/model_data/train.csv')

All features are nummeric and there are no NAs. However for future use on other datasets, we will include imputer with median as imputed value.

In [None]:
pipe = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='median')),
           ('scaler', StandardScaler())])

We will analyse 4 popular regression models:
- Lasso
- Ridge
- KNN Regressor
- XGB Regressor

In [14]:
# models 
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

In [15]:
lasso = Pipeline(steps=[('preprocessor', pipe),('lasso', Lasso())])
ridge = Pipeline(steps=[('preprocessor', pipe),('ridge', Ridge())])
knn = Pipeline(steps=[('preprocessor', pipe),('knn', KNeighborsRegressor())])
xgb = Pipeline(steps=[('preprocessor', pipe),('xgb', XGBRegressor())])

### Feature selection

Here we construct the Maximum Relevance Minimum Redundancy algorithm for regression problem. 

In [None]:
import numpy as np
from scipy.stats import pearsonr

def calculate_relevance_pearson(df, target):
    y = df[target]
    X = df.drop(columns=[target])
    relevance = []

    for feature in X.columns:
        correlation, _ = pearsonr(X[feature], y)
        relevance.append(abs(correlation))  
    return np.array(relevance)

def calculate_redundancy(feature1, feature2):
    return abs(pearsonr(feature1, feature2)[0])

def mrmr_feature_selection_regression(df, target, K, relevance_func):
    features = df.columns.drop(target)
    relevance_scores = relevance_func(df, target)
    feature_relevance = dict(zip(features, relevance_scores))

    selected_features = []
    remaining_features = list(features)

    first_feature = max(feature_relevance, key=feature_relevance.get)
    selected_features.append(first_feature)
    remaining_features.remove(first_feature)

    while len(selected_features) < K:
        scores = {}
        for feature in remaining_features:
            redundancy_sum = np.sum([
                calculate_redundancy(df[feature], df[selected])
                for selected in selected_features
            ])
            relevance = feature_relevance[feature]
            redundancy = redundancy_sum / (len(selected_features) ** 2)
            scores[feature] = relevance - redundancy

        next_feature = max(scores, key=scores.get)
        selected_features.append(next_feature)
        remaining_features.remove(next_feature)

    return selected_features

In [30]:
train = pipe.fit(train.drop(columns=['Mean_Radiation']), train['Mean_Radiation'])


In [None]:
K = train.shape[1] - 1
mrmr_feature_selection_regression(train, 'Mean_Radiation', K, calculate_relevance_pearson)


ValueError: array must not contain infs or NaNs

In [40]:

from scipy.stats import uniform, loguniform, randint
from skopt.space import Real, Integer, Categorical

In [None]:
LASSO_PARAMS_R = {'lasso__alpha': loguniform(1e-5, 1e5),
                  'lasso__max_iter': randint(1000,7000),
                  'lasso__fit_intercept': [True, False],}
RIDGE_PARAMS_R = {'ridge__alpha': loguniform(1e-5, 1e5),
                    'ridge__max_iter': randint(1000,7000),
                    'ridge__fit_intercept': [True, False],
                    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
KNN_PARAMS_R = {'knn__n_neighbors': randint(1, 100),
                'knn__weights': ['uniform', 'distance'],
                'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}
XGB_PARAMS_R = {'xgb__n_estimators': randint(100, 1000),
                'xgb__max_depth': randint(3, 10),
                'xgb__learning_rate': loguniform(0.01, 0.3),
                'xgb__subsample': uniform(0.6, 0.4),
                'xgb__colsample_bytree': uniform(0.6, 0.4),
                'xgb__gamma': uniform(0, 0.5),
                'xgb__reg_alpha': loguniform(1e-5, 1e5),
                'xgb__reg_lambda': loguniform(1e-5, 1e5)}
LASSO_PARAMS_B = {'lasso__alpha': Real(1e-5, 1e5, prior='log-uniform', transform='identity'),}