In [1]:
import pandas as pd
from xgboost import XGBRegressor
import numpy as np
import sklearn.metrics as metrics

In [2]:
def regression_report(y_true, y_pred, number_of_features):
    """Print a summary of regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted target values, same length as ``y_true``.
    number_of_features : int
        Number of predictors used by the model; only needed for adjusted R2.

    Returns
    -------
    None
        The metrics are printed, rounded to 4 decimal places.

    Notes
    -----
    If the smallest value across both inputs is zero or negative, both arrays
    are shifted by the same constant so that the minimum becomes 1. This lets
    ``mean_squared_log_error`` be evaluated. The other metrics depend only on
    differences and are therefore unaffected by the shift, but the reported
    mean_squared_log_error is then the value for the *shifted* data.

    Adjusted R2 is reported as NaN when ``n - k - 1 <= 0`` (not enough
    samples for the given number of features) instead of dividing by zero.
    """
    # Accept lists as well as arrays/Series: the shift below needs
    # element-wise addition, which plain lists do not support.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Ensure non-negative values for mean_squared_log_error
    min_y = min(y_true.min(), y_pred.min())
    shift = 0 if min_y > 0 else -min_y + 1
    y_true_adj = y_true + shift
    y_pred_adj = y_pred + shift

    k = number_of_features
    # Regression metrics
    n = len(y_true)
    explained_variance = metrics.explained_variance_score(y_true_adj, y_pred_adj)
    mean_absolute_error = metrics.mean_absolute_error(y_true_adj, y_pred_adj)
    mse = metrics.mean_squared_error(y_true_adj, y_pred_adj)
    mean_squared_log_error = metrics.mean_squared_log_error(y_true_adj, y_pred_adj)
    median_absolute_error = metrics.median_absolute_error(y_true_adj, y_pred_adj)
    r2 = metrics.r2_score(y_true_adj, y_pred_adj)

    # Adjusted R2 is undefined without positive residual degrees of freedom.
    residual_dof = n - k - 1
    if residual_dof > 0:
        adjusted_r2 = 1 - (1-r2)*(n-1)/residual_dof
    else:
        adjusted_r2 = float('nan')

    print('explained_variance: ', round(explained_variance,4))
    print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('median_absolute_error: ', round(median_absolute_error,4))
    print('R2: ', round(r2,4))
    print('Adjusted R2: ', round(adjusted_r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [3]:
# Load the dataset from a CSV file located in the notebook's working directory.
df = pd.read_csv("трещ.csv")
# Bare expression as the last line: the frame is rendered as the cell output.
df

Unnamed: 0,пролет,Al,C,Cr,Cu,Mn,Mo,Ni,P,S,Si,Mn/S,S+P,% брака от предъявленного
0,8,0.045,0.215,0.174,0.063,1.335,0.013,0.151,0.0113,0.0174,0.371,76.724138,0.0287,12.50
1,8,0.037,0.205,0.214,0.083,1.216,0.021,0.157,0.0136,0.0145,0.362,83.862069,0.0281,5.56
2,8,0.042,0.218,0.185,0.077,1.303,0.005,0.138,0.0132,0.0183,0.415,71.202186,0.0315,11.76
3,7,0.027,0.195,0.159,0.083,1.249,0.011,0.108,0.0116,0.0138,0.314,90.507246,0.0254,10.00
4,8,0.056,0.208,0.203,0.065,1.334,0.026,0.149,0.0131,0.0166,0.411,80.361446,0.0297,5.56
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,8,0.048,0.240,0.220,0.070,1.360,0.020,0.180,0.0130,0.0200,0.410,68.000000,0.0330,62.50
120,8,0.039,0.211,0.214,0.073,1.274,0.019,0.169,0.0153,0.0216,0.359,58.981481,0.0369,8.33
121,8,0.047,0.248,0.197,0.073,1.281,0.037,0.217,0.0150,0.0189,0.353,67.777778,0.0339,56.25
122,8,0.041,0.210,0.200,0.090,1.300,0.030,0.210,0.0130,0.0220,0.350,59.090909,0.0350,8.33


In [4]:
# Features: every column of df except the last one.
X=df.iloc[:,:-1]
# Target: the last column of df (shown as "% брака от предъявленного" in the
# frame preview).
y=df.iloc[:,-1]

In [11]:
from random import randint
from sklearn.model_selection import RandomizedSearchCV


# Base estimator with default settings; the search overrides the
# hyperparameters listed in `params` for each sampled candidate.
xgb = XGBRegressor()
# Candidate values for each hyperparameter; RandomizedSearchCV samples
# combinations from these lists.
params = {
    'max_depth':np.arange(1,5),
    'max_leaves':[15,20,25,30,40,50],
    'n_estimators':[2,5,10,15],
    'learning_rate':[0.1,0.3,0.5,0.8],
    'subsample':[0.1,0.2,0.5], 
    'colsample_bytree':[0.1,0.3,0.5,0.8,0.95]
}
# The search seed is drawn anew on every run (0..100 inclusive) and printed,
# so a given run can only be repeated by reusing the printed value by hand.
state = randint(0,100)
print(state)
# 200 sampled candidates, each evaluated with 5-fold cross-validation.
# NOTE(review): no `scoring` is passed, so candidates are presumably ranked by
# the estimator's own `score` method (R^2 for scikit-learn style regressors),
# not by MAE -- confirm this is the intended selection criterion.
search = RandomizedSearchCV(xgb,params,n_iter=200,cv=5,random_state=state)
search.fit(X,y)
# Best candidate found by the search. The name `best_xbg` (sic) is the one
# the cross-validation cell further down refers to.
best_xbg = search.best_estimator_
# Bare expression as the last line: displays the selected estimator.
best_xbg

90


cv values

None: By default, if cv is None, a 5-fold cross-validation is performed.

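Integers: An integer specifies the number of folds in a (Stratified)KFold. For example, cv=10 means 10-fold cross-validation. StratifiedKFold is used only when the estimator is a classifier and y is binary or multiclass; in all other cases (including the regressor used here) plain KFold is used.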

CV Splitters:

KFold: Splits data into k consecutive folds.

StratifiedKFold: Splits data into k folds, making sure each fold has the same proportion of class labels.

GroupKFold: Ensures that the same group is not represented in both testing and training sets.

ShuffleSplit: Generates a user-defined number of independent train/test dataset splits.

StratifiedShuffleSplit: Like ShuffleSplit, but preserves the class distribution within each iteration.

LeaveOneOut (LOO): Each sample is used once as a test set while the rest form the training set.

LeavePOut (LPO): Leaves P samples out for testing and the rest for training.

LeaveOneGroupOut (LOGO): Holds out all samples belonging to one group in each split; the groups are defined by a group array passed alongside the data.

LeavePGroupsOut (LPGO): Leaves P groups out for testing.

TimeSeriesSplit: Provides train/test indices to split time-series data samples.

In [10]:
# Imported here so the cell does not depend on another cell having imported
# `randint` first.
from random import randint

from sklearn.metrics import make_scorer
from sklearn.model_selection import RepeatedKFold, cross_val_score,cross_val_predict


# Draw the seed for the fold shuffling and print it, so the exact splits
# (and therefore the scores below) can be reproduced from the cell output.
cv_state = randint(0,1000)
print(cv_state)
# 10 folds repeated 3 times with different shuffles -> 30 scores in total.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=cv_state)
# make_scorer with its defaults returns the metric value as-is, so each score
# is a plain (positive) mean absolute error: lower is better.
scoring = make_scorer(metrics.mean_absolute_error)
# cross_val_score refits a clone of the estimator on every training split.
# NOTE(review): the same X, y were used to select the hyperparameters, so
# these scores may be optimistic.
scores = cross_val_score(best_xbg, X, y, scoring=scoring, cv=cv)
print(scores)
print(np.mean(scores))
print(np.std(scores))
# regression_report(y,best_xbg.predict(X),len(X.columns))

[16.29081865 15.81936024  9.60757846 14.60162322 13.95615306 11.23868109
 11.28877014 16.17389029 19.00671357  9.28253397 14.08546605 16.64608072
 16.18843781 12.66748523 10.83719348  7.34755116 11.98996019 16.48644317
 13.66508162 11.64709862 15.0083518  10.31702908 13.00467067 11.4699228
  9.942519   14.87301839 13.21902424 18.13277306 13.45662877 16.49450597]
13.491512151367642
2.796370624027611
