In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from catboost import CatBoostRegressor
data = pd.read_csv('Combined.csv')

X = data[[f'Feature_{i+1}' for i in range(50)]]
y = data[[f'Output_{i+1}' for i in range(5)]]  # Assuming 5 outputs

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Confirm split sizes
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Test set:", X_test.shape, y_test.shape)

model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)
val_mse = mean_squared_error(y_val, y_val_pred, multioutput='raw_values')
print("Validation MSE for each output:", val_mse)
overall_val_mse = mean_squared_error(y_val, y_val_pred)
print("Overall Validation MSE:", overall_val_mse)



xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
xgb_model.fit(X_train, y_train)
xgb_val_pred = xgb_model.predict(X_val)
xgb_val_mse = mean_squared_error(y_val, xgb_val_pred, multioutput='raw_values')

print("XGBoost Validation MSE:", xgb_val_mse)
overall_val_xgb = mean_squared_error(y_val, xgb_val_pred)
print("Overall Validation MSE:", overall_val_xgb)


# CatBoost model
catboost_model = MultiOutputRegressor(CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='MultiRMSE', random_seed=42, verbose=0))
catboost_model.fit(X_train, y_train)
catboost_val_pred = catboost_model.predict(X_val)
catboost_val_mse = mean_squared_error(y_val, catboost_val_pred, multioutput='raw_values')

print("CatBoost Validation MSE:", catboost_val_mse)
overall_val_cat = mean_squared_error(y_val, catboost_val_pred)
print("Overall Validation MSE:", overall_val_cat)



Training set: (640, 50) (640, 5)
Validation set: (80, 50) (80, 5)
Test set: (80, 50) (80, 5)
Validation MSE for each output: [0.00095248 0.00029309 0.00271742 0.00103777 0.00143985]
Overall Validation MSE: 0.0012881226362468185
XGBoost Validation MSE: [0.0009246  0.00030657 0.00237065 0.00074768 0.00166186]
Overall Validation MSE: 0.0012022716022176694
CatBoost Validation MSE: [0.00060637 0.00013584 0.00234464 0.00101009 0.00127334]
Overall Validation MSE: 0.0010740552124411714


In [2]:
from minepy import MINE
import numpy as np

mine = MINE()

mic_scores = np.zeros((X.shape[1], y.shape[1]))  

for i, feature in enumerate(X.columns):
    for j, output in enumerate(y.columns):
        mine.compute_score(X[feature].values, y[output].values)
        mic_scores[i, j] = mine.mic()

average_mic_scores = mic_scores.mean(axis=1)

feature_mic_df = pd.DataFrame({
    'Feature': X.columns,
    'Average_MIC': average_mic_scores
}).sort_values(by='Average_MIC', ascending=False)

print(feature_mic_df)

import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_regression

# Create an array to store mutual information scores for each feature-output pair
mi_scores = np.zeros((X.shape[1], y.shape[1]))

# Calculate mutual information for each feature against each output
for i, output in enumerate(y.columns):
    mi_scores[:, i] = mutual_info_regression(X, y[output].values, random_state=42)

# Compute average mutual information for each feature across all outputs
average_mi_scores = mi_scores.mean(axis=1)

# Create a DataFrame to display features and their average mutual information scores
feature_mi_df = pd.DataFrame({
    'Feature': X.columns,
    'Average_MI': average_mi_scores
}).sort_values(by='Average_MI', ascending=False)

# Display the DataFrame
print(feature_mi_df)


       Feature  Average_MIC
17  Feature_18     0.606275
1    Feature_2     0.577277
18  Feature_19     0.565833
24  Feature_25     0.559473
16  Feature_17     0.550203
14  Feature_15     0.544664
3    Feature_4     0.538556
11  Feature_12     0.534757
6    Feature_7     0.530309
2    Feature_3     0.524645
15  Feature_16     0.514047
12  Feature_13     0.513364
21  Feature_22     0.511601
5    Feature_6     0.463500
19  Feature_20     0.461163
8    Feature_9     0.458065
20  Feature_21     0.455153
7    Feature_8     0.430996
4    Feature_5     0.430190
10  Feature_11     0.409021
23  Feature_24     0.400841
22  Feature_23     0.396367
41  Feature_42     0.389559
33  Feature_34     0.380219
26  Feature_27     0.371027
42  Feature_43     0.351353
31  Feature_32     0.344761
13  Feature_14     0.333625
27  Feature_28     0.331494
30  Feature_31     0.329151
45  Feature_46     0.321862
37  Feature_38     0.320492
35  Feature_36     0.311074
47  Feature_48     0.293028
32  Feature_33     0

In [3]:
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data.sort_values(by="VIF", ascending=False))


       Feature         VIF
3    Feature_4  542.727007
14  Feature_15  435.237497
24  Feature_25  380.905803
18  Feature_19  345.627178
11  Feature_12  199.864219
15  Feature_16  124.632794
17  Feature_18  100.608891
21  Feature_22  100.274209
1    Feature_2   89.696554
16  Feature_17   32.108044
2    Feature_3   31.222708
20  Feature_21   29.374241
13  Feature_14   26.674050
19  Feature_20   25.932147
5    Feature_6   23.105097
4    Feature_5   22.829768
12  Feature_13   21.328415
6    Feature_7   21.187889
8    Feature_9   21.083165
7    Feature_8   20.286845
9   Feature_10   19.362424
23  Feature_24   15.799666
0    Feature_1   13.682924
10  Feature_11   13.529012
22  Feature_23   13.323548
30  Feature_31    9.244250
32  Feature_33    8.942437
37  Feature_38    8.867152
48  Feature_49    8.773365
38  Feature_39    8.739385
25  Feature_26    8.605336
31  Feature_32    8.335603
27  Feature_28    8.210832
47  Feature_48    8.103190
45  Feature_46    8.038583
46  Feature_47    8.016604
3

In [4]:

# Function to calculate VIF and identify high VIF features
def calculate_vif(data, threshold=20):
    vif_data = pd.DataFrame()
    vif_data["Feature"] = data.columns
    vif_data["VIF"] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
    
    # Identify features to remove
    high_vif_features = vif_data[vif_data["VIF"] >= threshold]["Feature"].tolist()
    
    print("\nFeatures to remove due to high VIF (>= 10):", high_vif_features)
    
    return high_vif_features
X_t = X
# Initial calculation
features_to_remove = calculate_vif(X_t)

# Iteratively remove features with high VIF
iteration = 1
while features_to_remove:
    print(f"\nIteration {iteration}: Removing features {features_to_remove}")
    X_t = X_t.drop(columns=features_to_remove)
    features_to_remove = calculate_vif(X_t)
    iteration += 1

print("\nFinal feature set after VIF filtering:")
print(X_t.columns.tolist())



Features to remove due to high VIF (>= 10): ['Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17', 'Feature_18', 'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22', 'Feature_25']

Iteration 1: Removing features ['Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17', 'Feature_18', 'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22', 'Feature_25']

Features to remove due to high VIF (>= 10): []

Final feature set after VIF filtering:
['Feature_1', 'Feature_10', 'Feature_11', 'Feature_23', 'Feature_24', 'Feature_26', 'Feature_27', 'Feature_28', 'Feature_29', 'Feature_30', 'Feature_31', 'Feature_32', 'Feature_33', 'Feature_34', 'Feature_35', 'Feature_36', 'Feature_37', 'Feature_38', 'Feature_39', 'Feature_40', 'Feature_

In [5]:
X_train = X_train[X_t.columns]
X_val = X_val[X_t.columns]
X_test = X_test[X_t.columns]



In [6]:
model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
model.fit(X_train, y_train)

# Predict on the validation set
y_val_pred = model.predict(X_val)
rval_mse = mean_squared_error(y_val, y_val_pred, multioutput='raw_values')
print("Validation MSE for each output:", val_mse)
xgb_model = MultiOutputRegressor(xgb.XGBRegressor(objective='reg:squarederror', random_state=42))
xgb_model.fit(X_train, y_train)
xgb_val_pred = xgb_model.predict(X_val)
rxgb_val_mse = mean_squared_error(y_val, xgb_val_pred, multioutput='raw_values')

print("XGBoost Validation MSE:", xgb_val_mse)

# CatBoost model
catboost_model = MultiOutputRegressor(CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='MultiRMSE', random_seed=42, verbose=0))
catboost_model.fit(X_train, y_train)
catboost_val_pred = catboost_model.predict(X_val)
rcatboost_val_mse = mean_squared_error(y_val, catboost_val_pred, multioutput='raw_values')

print("CatBoost Validation MSE:", catboost_val_mse)

Validation MSE for each output: [0.00095248 0.00029309 0.00271742 0.00103777 0.00143985]
XGBoost Validation MSE: [0.0009246  0.00030657 0.00237065 0.00074768 0.00166186]
CatBoost Validation MSE: [0.00060637 0.00013584 0.00234464 0.00101009 0.00127334]


In [7]:
print("Original Random Forest MSE:", val_mse)
print("Reduced Random Forest MSE:", rval_mse)

print("Original XGBoost MSE:", xgb_val_mse)
print("Reduced XGBoost MSE:", rxgb_val_mse)

print("Original CatBoost MSE:", catboost_val_mse)
print("Reduced CatBoost MSE:", rcatboost_val_mse)



# Percentage change calculations
rf_percentage_change = (rval_mse - val_mse) / val_mse * 100
xgb_percentage_change = (rxgb_val_mse  - xgb_val_mse ) / xgb_val_mse  * 100
catboost_percentage_change = (rcatboost_val_mse - catboost_val_mse) / catboost_val_mse * 100

# Display the results
print("Random Forest MSE Percentage Change:", rf_percentage_change)
print("XGBoost MSE Percentage Change:", xgb_percentage_change)
print("CatBoost MSE Percentage Change:", catboost_percentage_change)


Original Random Forest MSE: [0.00095248 0.00029309 0.00271742 0.00103777 0.00143985]
Reduced Random Forest MSE: [0.00586379 0.00076431 0.01577159 0.01817172 0.00391532]
Original XGBoost MSE: [0.0009246  0.00030657 0.00237065 0.00074768 0.00166186]
Reduced XGBoost MSE: [0.00538631 0.00064996 0.0136334  0.01355023 0.00359942]
Original CatBoost MSE: [0.00060637 0.00013584 0.00234464 0.00101009 0.00127334]
Reduced CatBoost MSE: [0.00485484 0.00057191 0.01386341 0.01742942 0.0031749 ]
Random Forest MSE Percentage Change: [ 515.63364866  160.77266968  480.38921364 1651.0293614   171.92559532]
XGBoost MSE Percentage Change: [ 482.55736215  112.01257144  475.0911623  1712.29559959  116.58977708]
CatBoost MSE Percentage Change: [ 700.63334143  321.032833    491.28092806 1625.5385369   149.33621415]


In [8]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Use RFECV with a non-linear model (e.g., RandomForest)
model = RandomForestRegressor(random_state=42)
selector = RFECV(estimator=model, step=1, cv=5, scoring='neg_mean_squared_error')
selector.fit(X, y)

# Get the selected features
selected_features = X.columns[selector.support_]
print("Selected features:", selected_features)


Selected features: Index(['Feature_12', 'Feature_13', 'Feature_18', 'Feature_25'], dtype='object')


In [9]:
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Assuming X and y are already defined
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Linear model example
linear_model = LinearRegression()
rfecv_linear = RFECV(estimator=linear_model, step=1, cv=5, scoring='neg_mean_squared_error')
rfecv_linear.fit(X_train, y_train)
selected_features_linear = X.columns[rfecv_linear.support_]
print("Selected features with Linear Regression:", selected_features_linear)

# Non-linear model example (e.g., Random Forest)
non_linear_model = RandomForestRegressor(random_state=42)
rfecv_non_linear = RFECV(estimator=non_linear_model, step=1, cv=5, scoring='neg_mean_squared_error')
rfecv_non_linear.fit(X_train, y_train)
selected_features_non_linear = X.columns[rfecv_non_linear.support_]
print("Selected features with Random Forest:", selected_features_non_linear)

# Evaluate each model with the selected features
X_train_linear = X_train[selected_features_linear]
X_val_linear = X_val[selected_features_linear]
linear_model.fit(X_train_linear, y_train)
linear_val_pred = linear_model.predict(X_val_linear)
linear_val_mse = mean_squared_error(y_val, linear_val_pred)
print("Linear Regression Validation MSE with selected features:", linear_val_mse)

X_train_rf = X_train[selected_features_non_linear]
X_val_rf = X_val[selected_features_non_linear]
non_linear_model.fit(X_train_rf, y_train)
rf_val_pred = non_linear_model.predict(X_val_rf)
rf_val_mse = mean_squared_error(y_val, rf_val_pred)
print("Random Forest Validation MSE with selected features:", rf_val_mse)




Selected features with Linear Regression: Index(['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5',
       'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10',
       'Feature_11', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15',
       'Feature_17', 'Feature_18', 'Feature_19', 'Feature_20', 'Feature_22',
       'Feature_23', 'Feature_25', 'Feature_26', 'Feature_27', 'Feature_28',
       'Feature_29', 'Feature_30', 'Feature_31', 'Feature_32', 'Feature_33',
       'Feature_34', 'Feature_35', 'Feature_36', 'Feature_37', 'Feature_39',
       'Feature_40', 'Feature_41', 'Feature_42', 'Feature_44', 'Feature_45',
       'Feature_46', 'Feature_47', 'Feature_48', 'Feature_49', 'Feature_50'],
      dtype='object')
Selected features with Random Forest: Index(['Feature_2', 'Feature_3', 'Feature_4', 'Feature_6', 'Feature_7',
       'Feature_12', 'Feature_13', 'Feature_15', 'Feature_17', 'Feature_18',
       'Feature_19', 'Feature_20', 'Feature_22', 'Feature_25', '

In [34]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
# List of selected features based on provided MIC scores
selected_features_mic = [
    'Feature_18', 'Feature_2', 'Feature_19', 'Feature_25', 'Feature_17',
    'Feature_15', 'Feature_4', 'Feature_12', 'Feature_7', 'Feature_3',
    'Feature_16', 'Feature_13', 'Feature_22', 'Feature_6', 'Feature_20', 'Feature_9', 'Feature_21'
]

# Subset X_train and X_val using the selected features
X_train_mic = X_train[selected_features_mic]
X_val_mic = X_val[selected_features_mic]
# Fit and evaluate a Linear Regression model
linear_model = LinearRegression()
linear_model.fit(X_train_mic, y_train)
linear_val_pred_mic = linear_model.predict(X_val_mic)
linear_val_mse_mic = mean_squared_error(y_val, linear_val_pred_mic)
print("Linear Regression Validation MSE with selected MIC features:", linear_val_mse_mic)

# Fit and evaluate a Polynomial Regression model (degree 2 as an example)
poly_pipeline = Pipeline([
    ('poly_features', PolynomialFeatures(degree=2)),
    ('linear_regression', LinearRegression())
])
poly_pipeline.fit(X_train_mic, y_train)
poly_val_pred_mic = poly_pipeline.predict(X_val_mic)
poly_val_mse_mic = mean_squared_error(y_val, poly_val_pred_mic)
print("Polynomial Regression Validation MSE with selected MIC features:", poly_val_mse_mic)

# Fit and evaluate a multi-output Gradient Boosting model
gb_model = MultiOutputRegressor(GradientBoostingRegressor())
gb_model.fit(X_train_mic, y_train)
gb_val_pred_mic = gb_model.predict(X_val_mic)
gb_val_mse_mic = mean_squared_error(y_val, gb_val_pred_mic, multioutput='uniform_average')  # Calculate MSE for multi-output
print("Gradient Boosting Validation MSE with selected MIC features:", gb_val_mse_mic)

 # Fit and evaluate a multi-output Random Forest model
rf_model = MultiOutputRegressor(RandomForestRegressor(random_state=42))
rf_model.fit(X_train_mic, y_train)
rf_val_pred_mic = rf_model.predict(X_val_mic)
rf_val_mse_mic = mean_squared_error(y_val, rf_val_pred_mic, multioutput='uniform_average')
print("Random Forest Validation MSE with selected MIC features:", rf_val_mse_mic)

Linear Regression Validation MSE with selected MIC features: 0.0063204933168285295
Polynomial Regression Validation MSE with selected MIC features: 0.0024478580096430618
Gradient Boosting Validation MSE with selected MIC features: 0.0011755406941782855
Random Forest Validation MSE with selected MIC features: 0.0012124787268030092


In [29]:
(linear_val_mse - linear_val_mse_mic)/linear_val_mse*100

-22.38461916252641

In [11]:
(rf_val_mse - overall_val_mse)/rf_val_mse *100

1.3770130724894543

In [12]:
(rf_val_mse_mic - overall_val_mse)/rf_val_mse_mic *100

5.291455594997154

In [13]:
# XGBoost model for feature selection
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
rfecv_xgb = RFECV(estimator=xgb_model, step=1, cv=5, scoring='neg_mean_squared_error')
rfecv_xgb.fit(X_train, y_train)

# Get the selected features
selected_features_xgb = X.columns[rfecv_xgb.support_]
print("Selected features with XGBoost:", selected_features_xgb)

# Evaluate XGBoost with selected features
X_train_xgb = X_train[selected_features_xgb]
X_val_xgb = X_val[selected_features_xgb]
xgb_model.fit(X_train_xgb, y_train)
xgb_val_pred = xgb_model.predict(X_val_xgb)
xgb_val_mse = mean_squared_error(y_val, xgb_val_pred)
print("XGBoost Validation MSE with selected features:", xgb_val_mse)

Selected features with XGBoost: Index(['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5',
       'Feature_6', 'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10',
       'Feature_11', 'Feature_12', 'Feature_13', 'Feature_14', 'Feature_15',
       'Feature_16', 'Feature_17', 'Feature_18', 'Feature_19', 'Feature_20',
       'Feature_21', 'Feature_22', 'Feature_23', 'Feature_24', 'Feature_25',
       'Feature_27', 'Feature_28', 'Feature_30', 'Feature_31', 'Feature_32',
       'Feature_33', 'Feature_35', 'Feature_37', 'Feature_38', 'Feature_39',
       'Feature_41', 'Feature_42', 'Feature_43', 'Feature_44', 'Feature_45',
       'Feature_46', 'Feature_47', 'Feature_48', 'Feature_49', 'Feature_50'],
      dtype='object')
XGBoost Validation MSE with selected features: 0.0010523892377625604


In [14]:
(xgb_val_mse - overall_val_xgb)/xgb_val_mse *100

-14.242103499059663

In [17]:
from catboost import CatBoostRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error

# Use CatBoostRegressor directly for feature selection
catboost_model = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='RMSE', random_seed=42, verbose=0)

# Perform RFECV on a single target column (e.g., the first output)
rfecv_catboost = RFECV(estimator=catboost_model, step=1, cv=5, scoring='neg_mean_squared_error')
rfecv_catboost.fit(X_train, y_train.iloc[:, 0])  # You can loop over y_train columns if needed

# Get the selected features
selected_features_catboost = X_train.columns[rfecv_catboost.support_]
print("Selected features with CatBoost for the first output:", selected_features_catboost)

# Train MultiOutputRegressor with the selected features
from sklearn.multioutput import MultiOutputRegressor

multi_catboost_model = MultiOutputRegressor(CatBoostRegressor(iterations=500, depth=6, learning_rate=0.1, loss_function='RMSE', random_seed=42, verbose=0))
X_train_selected = X_train[selected_features_catboost]
X_val_selected = X_val[selected_features_catboost]

multi_catboost_model.fit(X_train_selected, y_train)
catboost_val_pred = multi_catboost_model.predict(X_val_selected)

# Calculate MSE
catboost_val_mse = mean_squared_error(y_val, catboost_val_pred)
print("CatBoost Validation MSE with selected features:", catboost_val_mse)


Selected features with CatBoost for the first output: Index(['Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6',
       'Feature_7', 'Feature_8', 'Feature_9', 'Feature_10', 'Feature_12',
       'Feature_13', 'Feature_14', 'Feature_15', 'Feature_16', 'Feature_17',
       'Feature_18', 'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22',
       'Feature_23', 'Feature_24', 'Feature_25', 'Feature_26', 'Feature_27',
       'Feature_28', 'Feature_30', 'Feature_31', 'Feature_34', 'Feature_36',
       'Feature_38', 'Feature_39', 'Feature_40', 'Feature_41', 'Feature_43',
       'Feature_44', 'Feature_45', 'Feature_46', 'Feature_48', 'Feature_49'],
      dtype='object')
CatBoost Validation MSE with selected features: 0.0008618230780961363


In [19]:
from sklearn.neural_network import MLPRegressor
# MLP model
mlp_model = MultiOutputRegressor(MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42))
mlp_model.fit(X_train, y_train)

# Predict on the validation set
mlp_val_pred = mlp_model.predict(X_val)
mlp_val_mse = mean_squared_error(y_val, mlp_val_pred, multioutput='raw_values')
print("MLP Regressor Validation MSE:", mlp_val_mse)
overall_val_mlp = mean_squared_error(y_val, mlp_val_pred)
print("Overall Validation MSE:", overall_val_mlp)

MLP Regressor Validation MSE: [0.00113875 0.00046317 0.00285362 0.00195846 0.00351331]
Overall Validation MSE: 0.0019854614211898885


In [25]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error

# Gradient Boosting model
gbr_model = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
gbr_model.fit(X_train, y_train)

# Predict on the validation set
gbr_val_pred = gbr_model.predict(X_val)
gbr_val_mse = mean_squared_error(y_val, gbr_val_pred, multioutput='raw_values')
print("Gradient Boosting Regressor Validation MSE:", gbr_val_mse)
overall_val_gbr = mean_squared_error(y_val, gbr_val_pred)
print("Overall Validation MSE:", overall_val_gbr)


Gradient Boosting Regressor Validation MSE: [0.00049871 0.00014605 0.0019072  0.00089673 0.00097978]
Overall Validation MSE: 0.0008856931577206666


In [26]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFECV
from sklearn.metrics import mean_squared_error
import numpy as np

# Store selected features for each output
selected_features_per_output = []

# Perform RFECV for each output individually
for i in range(y_train.shape[1]):
    print(f"Performing RFECV for output {i + 1}...")
    gbr_model = GradientBoostingRegressor(random_state=42)
    rfecv_gbr = RFECV(estimator=gbr_model, step=1, cv=5, scoring='neg_mean_squared_error')
    rfecv_gbr.fit(X_train, y_train.iloc[:, i])
    
    selected_features = X_train.columns[rfecv_gbr.support_]
    selected_features_per_output.append(selected_features)
    print(f"Selected features for output {i + 1}: {selected_features}")

# Find the union of selected features across all outputs
common_selected_features = set().union(*selected_features_per_output)
print("Union of selected features across all outputs:", common_selected_features)

# Train a MultiOutputRegressor with Gradient Boosting on the selected features
from sklearn.multioutput import MultiOutputRegressor

X_train_selected = X_train[common_selected_features]
X_val_selected = X_val[common_selected_features]

multi_gbr_model = MultiOutputRegressor(GradientBoostingRegressor(random_state=42))
multi_gbr_model.fit(X_train_selected, y_train)
gbr_val_pred = multi_gbr_model.predict(X_val_selected)

# Calculate the MSE
gbr_val_mse = mean_squared_error(y_val, gbr_val_pred, multioutput='raw_values')
print("Gradient Boosting Regressor Validation MSE with selected features:", gbr_val_mse)
overall_val_gbr = mean_squared_error(y_val, gbr_val_pred)
print("Overall Validation MSE:", overall_val_gbr)


Performing RFECV for output 1...
Selected features for output 1: Index(['Feature_2', 'Feature_4', 'Feature_5', 'Feature_7', 'Feature_8',
       'Feature_12', 'Feature_13', 'Feature_15', 'Feature_16', 'Feature_17',
       'Feature_18', 'Feature_19', 'Feature_20', 'Feature_21', 'Feature_22',
       'Feature_24', 'Feature_25', 'Feature_26', 'Feature_27', 'Feature_30',
       'Feature_39', 'Feature_49'],
      dtype='object')
Performing RFECV for output 2...
Selected features for output 2: Index(['Feature_2', 'Feature_3', 'Feature_4', 'Feature_5', 'Feature_6',
       'Feature_7', 'Feature_8', 'Feature_9', 'Feature_11', 'Feature_12',
       'Feature_13', 'Feature_15', 'Feature_16', 'Feature_17', 'Feature_18',
       'Feature_20', 'Feature_21', 'Feature_22', 'Feature_23', 'Feature_24',
       'Feature_25', 'Feature_28', 'Feature_30', 'Feature_31', 'Feature_32',
       'Feature_33', 'Feature_34'],
      dtype='object')
Performing RFECV for output 3...
Selected features for output 3: Index(['F

  X_train_selected = X_train[common_selected_features]
  X_val_selected = X_val[common_selected_features]


Gradient Boosting Regressor Validation MSE with selected features: [0.00048577 0.000146   0.00185107 0.0009392  0.00103691]
Overall Validation MSE: 0.0008917877079670684


In [30]:
from sklearn.multioutput import MultiOutputRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

# KNN model
knn_model = MultiOutputRegressor(KNeighborsRegressor(n_neighbors=5))
knn_model.fit(X_train, y_train)

# Predict on the validation set with KNN
knn_val_pred = knn_model.predict(X_val)
knn_val_mse = mean_squared_error(y_val, knn_val_pred, multioutput='raw_values')
print("KNN Regressor Validation MSE:", knn_val_mse)
overall_val_knn = mean_squared_error(y_val, knn_val_pred)
print("Overall KNN Validation MSE:", overall_val_knn)

# SVR model
svr_model = MultiOutputRegressor(SVR(kernel='rbf'))
svr_model.fit(X_train, y_train)

# Predict on the validation set with SVR
svr_val_pred = svr_model.predict(X_val)
svr_val_mse = mean_squared_error(y_val, svr_val_pred, multioutput='raw_values')
print("SVR Regressor Validation MSE:", svr_val_mse)
overall_val_svr = mean_squared_error(y_val, svr_val_pred)
print("Overall SVR Validation MSE:", overall_val_svr)


KNN Regressor Validation MSE: [0.00067552 0.00043519 0.00157265 0.00141907 0.00097944]
Overall KNN Validation MSE: 0.0010163747809966719
SVR Regressor Validation MSE: [0.00249404 0.00280899 0.00542869 0.00389307 0.0043833 ]
Overall SVR Validation MSE: 0.0038016219406753838


In [33]:
# Subset the training and validation sets with the selected features
X_train_selected = X_train[selected_features_catboost]
X_val_selected = X_val[selected_features_catboost]

# Train SVR with the selected features
svr_model = MultiOutputRegressor(SVR(kernel='rbf'))
svr_model.fit(X_train_selected, y_train)
svr_val_pred = svr_model.predict(X_val_selected)
svr_val_mse = mean_squared_error(y_val, svr_val_pred)
print("SVR Validation MSE with selected features:", svr_val_mse)

SVR Validation MSE with selected features: 0.003769825845962759
