In [2]:
import sys
import os
# Project-relative directory that is put on sys.path so that `model_funs`
# can be imported (presumably the module lives there — verify the layout).
MODELS_DIR = os.path.join('py', 'models')

# The original cell unconditionally ran os.chdir('..'), so every re-run of the
# cell moved one more level up and broke all relative paths. Only step up when
# the helper directory is not reachable from the current working directory.
if not os.path.isdir(MODELS_DIR):
    os.chdir('..')

# Register the helper directory once; re-running must not grow sys.path.
if MODELS_DIR not in sys.path:
    sys.path.append(MODELS_DIR)

# NOTE(review): the star import is what brings get_x_y, get_model_pipeline,
# get_PCA_groups, get_MAE, get_MAPE and apparently np / the estimator classes
# into scope for the cells below; nothing else visible here imports them.
from model_funs import *

# Training data and the separate hold-out set, each split into (X, y).
# Presumably X is the feature table and y the target — confirm in model_funs.
X, y = get_x_y('data_for_modelling')
X_hold, y_hold = get_x_y('holdout')

In [3]:
# Best KNN
# The estimator and PCA grouping are built first (same evaluation order as
# the nested call) and then handed to the shared pipeline factory.
knn_regressor = KNeighborsRegressor(algorithm='auto', n_neighbors=17, weights='uniform')

# One PCA group labelled 'all' with 10 components.
knn_pca_groups = get_PCA_groups(X, ('all', 10))

custom_knn_model = get_model_pipeline(
    X,
    knn_regressor,
    pca_groups=knn_pca_groups,
    feature_selection=('k_best', 10),
    standartize=True,
)
            

# Best LASSO
# Each entry of patterns_components is [group name, number of PCA components].
# min(1, cap) always evaluates to 1, i.e. one component per group; the second
# number is kept as written in the notebook (presumably an upper bound on the
# components available for that group — TODO confirm).
lasso_group_caps = {
    'parks': 9,
    'edu': 6,
    'shops': 5,
    'moscow_stations': 5,
    'bus_stops': 5,
    'eco': 5,
    'hospitals': 2,
}
patterns_components = [
    [pattern, min(1, cap)] for pattern, cap in lasso_group_caps.items()
]

# Build the estimator before the PCA groups, matching the original argument
# evaluation order.
lasso_regressor = Lasso(alpha=640, tol=0.001)
lasso_pca_groups = get_PCA_groups(X, patterns_components)

lasso_model = get_model_pipeline(
    X=X,
    model=lasso_regressor,
    poly_features=2,
    pca_groups=lasso_pca_groups,
    standartize=True,
)

# Best XGBoost
# Hyper-parameters are collected in one mapping and unpacked into the
# constructor, so they can be inspected or logged separately from the model.
xgb_params = {
    'eval_metric': 'mae',
    'tree_method': 'hist',
    'device': 'cuda',  # GPU training: this cell needs a CUDA-capable device
    'n_estimators': 182,
    'max_depth': 6,
    'learning_rate': 0.11,
}
xgb_model = XGBRegressor(**xgb_params)

all: Explained Variance Ratio per Component: [0.26122786 0.05374642 0.04691406 0.03920592 0.0345376  0.03109392
 0.02974972 0.0291486  0.02773239 0.02509565]
all: Cumulative Explained Variance: 0.5785
parks: Explained Variance Ratio per Component: [0.32376045]
parks: Cumulative Explained Variance: 0.3238
edu: Explained Variance Ratio per Component: [0.52977431]
edu: Cumulative Explained Variance: 0.5298
shops: Explained Variance Ratio per Component: [0.395567]
shops: Cumulative Explained Variance: 0.3956
moscow_stations: Explained Variance Ratio per Component: [0.50439374]
moscow_stations: Cumulative Explained Variance: 0.5044
bus_stops: Explained Variance Ratio per Component: [0.49780078]
bus_stops: Cumulative Explained Variance: 0.4978
eco: Explained Variance Ratio per Component: [0.49611701]
eco: Cumulative Explained Variance: 0.4961
hospitals: Explained Variance Ratio per Component: [0.76014283]
hospitals: Cumulative Explained Variance: 0.7601


In [15]:
step = 0.01


def _inclusive_grid(start, stop, step):
    """Return the values start, start + step, ..., stop as plain floats.

    np.arange with a fractional step has a length that depends on
    floating-point rounding: np.arange(0.18, 0.19, 0.01) is nominally
    half-open, yet the recorded run searched 4 candidates, i.e. the stop
    values were included by accident. Deriving the number of points
    explicitly and using np.linspace makes the inclusive grid deterministic.

    Values are rounded to 2 decimals, matching the 0.01 resolution used here.
    """
    n_points = int(round((stop - start) / step)) + 1
    return [round(float(value), 2) for value in np.linspace(start, stop, n_points)]


# Candidate ensemble weights [w1, w2, w3]; w3 is whatever remains to reach 1.
# The order must match the estimator order given to the VotingRegressor.
weights = []
for w1 in _inclusive_grid(0.18, 0.19, step):
    for w2 in _inclusive_grid(0.12, 0.13, step):
        # Round before the range check so float noise cannot flip the result
        # for values sitting exactly on a bound.
        w3 = round(1.0 - (w1 + w2), 2)
        if 0.65 <= w3 <= 0.75:
            weights.append([w1, w2, w3])



# Named members of the ensemble. The order matters: each candidate in
# `weights` is applied to the estimators positionally.
ensemble_members = [
    ('lasso', lasso_model),
    ('knn', custom_knn_model),
    ('xgb', xgb_model),
]
voting_model = VotingRegressor(estimators=ensemble_members)


from sklearn.metrics import make_scorer, mean_absolute_error
# Search over the candidate weight vectors only; everything else in the
# ensemble stays fixed.
weight_grid = {'weights': weights}

# 'neg_mean_absolute_error' is scikit-learn's built-in scorer for negated MAE,
# the same scorer that make_scorer(mean_absolute_error,
# greater_is_better=False) constructs.
grid_search = GridSearchCV(
    estimator=voting_model,
    param_grid=weight_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=5,
    n_jobs=5,
)

# Run the cross-validated search on the training data.
grid_search.fit(X, y)

# Report the winning weights; the score is negated MAE, so flip the sign back.
best_weights = grid_search.best_params_['weights']
print(f"Best Weights: {best_weights}")
print("Best MAE: {:.2f}".format(-grid_search.best_score_))

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Weights: [0.18, 0.12, 0.7]
Best MAE: 20175.86


In [16]:
# GridSearchCV was created without `refit`, so the default refit=True applies:
# best_estimator_ has already been refit on the full (X, y) with the winning
# weights. The extra best_model.fit(X, y) only repeated that training of the
# whole ensemble, so it is dropped.
best_model = grid_search.best_estimator_

# Signed hold-out errors: prediction minus actual value.
errors = best_model.predict(X_hold) - y_hold

# Distribution of the absolute errors, then the project's summary metrics.
print(np.abs(errors).describe())
get_MAE(errors)
# Kept as the last expression so its return value is still shown as cell output.
get_MAPE(errors, y_hold)

count     1000.0000
mean     19960.8525
std      23540.6275
min         15.7762
25%       5294.9232
50%      12209.6823
75%      24231.1079
max     196808.3427
Name: price, dtype: float64
MAE 19960.852503179474
MAPE 17.8072440296633%


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




0.17807244029663297

In [17]:
# Upper and lower tails of the signed hold-out errors, computed in one call.
q95, q05 = np.quantile(errors, [0.95, 0.05])
print(f"errors 95%: {q95}")
print(f"errors 5%: {q05}")

errors 95%: 42336.042595382234
errors 5%: -57784.4710328726
