In [1]:
!pip install Optuna

[0m

In [28]:
import matplotlib.pyplot as plt
import numpy as np
import optuna
import logging

from pathlib import Path
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Ridge, Lasso, LassoCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level=logging.ERROR)

In [3]:
!ls '/kaggle/input/data-regression'

X_test.npy  X_train.npy  y_test.npy  y_train.npy


In [4]:
dataset_folder = Path('/kaggle/input/data-regression')
dataset_folder.resolve()

PosixPath('/kaggle/input/data-regression')

In [21]:
x_train_file = 'X_train.npy'
y_train_file = 'y_train.npy'

X_train_given = np.load(dataset_folder.joinpath(x_train_file))
y_train_given = np.load(dataset_folder.joinpath(y_train_file)).ravel()

print('Inputs_train', X_train_given)
print('Labels_train', y_train_given)

Inputs_train [[0.82921188 0.33620837 0.11307146 ... 0.26911057 0.39826737 0.69398368]
 [0.49605241 0.23338204 0.25629519 ... 0.64696802 0.03035201 0.76803069]
 [0.02905734 0.4937862  0.46724899 ... 0.44549826 0.41709255 0.47755807]
 ...
 [0.63059727 0.43887553 0.81276115 ... 0.95515152 0.86576297 0.43993964]
 [0.74742343 0.11276802 0.194641   ... 0.85089992 0.46785343 0.91046313]
 [0.01405356 0.65306567 0.59671311 ... 0.07302376 0.07829235 0.45123912]]
Labels_train [2.03744078 3.86866045 1.82416592 2.50305137 0.63476318 3.64501907
 2.10959603 2.53736946 2.36396361 1.8493183  0.26528054 3.29868999
 0.68686673 1.97761143 2.33217634 2.35332258 2.13428688 2.8661636
 2.60676028 1.84903027 2.39740233 1.84141611 1.66002837 2.32656456
 2.63244469 2.92972212 1.90925142 2.19895187 2.88423105 3.04904381
 2.40316574 3.23375526 0.66999314 2.37406771 1.90287596 3.87498277
 1.81649276 2.48875646 2.67020008 2.25751607 2.82685912 2.89131867
 2.33134054 1.3473857  1.44341091 2.62260362 1.80267415 1.4456

In [7]:
x_test_file = 'X_test.npy'
y_test_file = 'y_test.npy'

X_test_given = np.load(dataset_folder.joinpath(x_test_file))
y_test_given = np.load(dataset_folder.joinpath(y_test_file)).ravel()

# RandomForestRegressor
Launching RandomForestRegressor without any optimisations or processing data will output only 0.51 accurancy.

In [11]:
clf = RandomForestRegressor(max_depth=10, random_state=0).fit(X_train_given, y_train_given.ravel())
y_pred = clf.predict(X_test_given)
print(r2_score(y_test_given, y_pred))

0.512366477739371


With some improvment we got more than 0.90 of accurancy

In [26]:
# Split the data into features and target variable
X = X_train_given
y = y_train_given

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Perform feature selection
selector = SelectFromModel(RandomForestRegressor())
selector.fit(X_scaled, y)
X_selected = selector.transform(X_scaled)

# Split the data into train and test sets
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)

# Define the objective function for Optuna
def objective(trial):
    alpha = trial.suggest_loguniform('alpha', 1e-6, 1e6)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])

    ridge = Ridge(alpha=alpha, solver=solver)
    ridge.fit(X_train, y_train)
    y_pred = ridge.predict(X_val)

    return -r2_score(y_val, y_pred)  # Minimize the negative R2 score

# Run the hyperparameter optimization with Optuna
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

# Best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Retrain the model with the best hyperparameters
best_ridge = Ridge(alpha=best_params['alpha'], solver=best_params['solver'])
best_ridge.fit(X_train, y_train)

# Evaluate the model
y_pred = best_ridge.predict(X_val)
r2 = r2_score(y_val, y_pred)
print("R2 Score:", r2)

Avec GridSearchCV:  -0.00023463964353065414


In [12]:
X_scaled_test = scaler.fit_transform(X_test_given)

X_selected_test = selector.transform(X_scaled_test)

# Evaluate the model
y_pred = best_ridge.predict(X_selected_test)
r2 = r2_score(y_test_given, y_pred)
print("R2 Score:", r2)

R2 Score: 0.901419757978102


# LassoCV
With LassoCV we got 0.88 accurancy without any improvment.

In [37]:
clf_lassoCV = LassoCV(cv=5, random_state=42).fit(X_train_given, y_train_given)
y_pred = clf_lassoCV.predict(X_test_given)
print("LassoCV:", r2_score(y_test_given, y_pred))

LassoCV: 0.8843280643759462


The result are a little better but not very significant.

In [40]:
# Create a Lasso model
lassoCV = LassoCV(cv=5, random_state=42)

# Define the hyperparameter grid
param_grid = {'eps': [0.001, 0.01, 0.1], 'cv': [3, 5, 10, 15]}

# Create a GridSearchCV object with Lasso and the hyperparameter grid
grid_search = GridSearchCV(lassoCV, param_grid)

# Fit the GridSearchCV object on the training data
grid_search.fit(X_train_given, y_train_given)

# Predict on the validation data using the best model
y_pred = grid_search.predict(X_test_given)

# Calculate the R2 score using the predicted labels
r2 = r2_score(y_test_given, y_pred)

# Print the best hyperparameters and the R2 score
print("Best Hyperparameters:", grid_search.best_params_)
print("R2 Score:", r2)

Best Hyperparameters: {'cv': 10, 'eps': 0.01}
R2 Score: 0.8844685543653195
