In [1]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold
import xgboost as xgb
import data_cleaning

In [None]:
def _fit_early_stopped(params, X_fit, y_fit, eval_set, rounds, init_model=None):
    """Fit an ``XGBRegressor`` with early stopping monitored on ``eval_set``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``xgb.XGBRegressor``.
    X_fit, y_fit : array-like
        Training features and targets.
    eval_set : list of (X, y) tuples
        Data the early-stopping callback is evaluated on.
    rounds : int
        ``rounds`` argument of ``xgb.callback.EarlyStopping``.
    init_model : optional
        When given, passed to ``fit`` as ``xgb_model`` so training continues
        from that model instead of starting from scratch.

    Returns
    -------
    The fitted ``XGBRegressor``.
    """
    fit_kwargs = {}
    if init_model is not None:
        fit_kwargs["xgb_model"] = init_model

    model = xgb.XGBRegressor(**params)
    # NOTE(review): newer xgboost releases may expect `callbacks` on the
    # estimator constructor rather than on fit() -- confirm against the
    # installed version before upgrading.
    model.fit(X_fit, y_fit,
              eval_set=eval_set,
              verbose=False,
              callbacks=[xgb.callback.EarlyStopping(rounds=rounds, save_best=True)],
              **fit_kwargs)
    return model


def pseudolabel(X, y, X_test, test_preds, xgb_params, pseudo_xgb_params, early_fit, n_splits=7, n_repeats=2):
    """Two-stage pseudo-label training with repeated K-fold validation.

    For every fold, a first model is fitted on the training rows plus the
    smaller share of the pseudo-labelled test rows. A second model then
    continues training from the first one on the training rows plus the larger
    share of the pseudo-labelled test rows. Both stages early-stop on the
    held-out training rows only, never on pseudo-labels.

    Parameters
    ----------
    X, y : array-like
        Labelled features and targets. Converted with ``np.asarray`` so that
        rows can be selected by integer position.
    X_test : array-like
        Unlabelled features to predict.
    test_preds : array-like
        Existing predictions for ``X_test``; used as pseudo-labels.
    xgb_params : dict or None
        ``XGBRegressor`` parameters for the second (fine-tuning) stage.
        ``None`` falls back to ``pseudo_xgb_params``.
    pseudo_xgb_params : dict
        ``XGBRegressor`` parameters for the first stage.
    early_fit : int
        Early-stopping rounds for the first stage; the second stage uses
        twice this value.
    n_splits, n_repeats : int
        Forwarded to ``RepeatedKFold`` for both the labelled and the
        pseudo-labelled data.

    Returns
    -------
    numpy.ndarray
        Mean of the ``X_test`` predictions from the ``n_splits`` folds with
        the lowest validation RMSE.
    """
    # Positional indexing below requires plain arrays (a DataFrame or a Series
    # with a non-default index would be indexed by label instead).
    X = np.asarray(X)
    y = np.asarray(y)
    X_test = np.asarray(X_test)
    test_preds = np.asarray(test_preds)

    # The fine-tuning stage is meant to use its own parameter set.
    post_params = pseudo_xgb_params if xgb_params is None else xgb_params

    rmse = []
    preds = []

    # Identical settings so both splitters yield the same number of folds.
    kf = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)
    kf2 = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=0)

    fold_pairs = zip(kf.split(X, y), kf2.split(X_test, test_preds))
    for fold, ((train_idx, valid_idx), (pseudo_idx, pseudo2_idx)) in enumerate(fold_pairs, 1):
        X_train, y_train = X[train_idx], y[train_idx]
        # The evaluation dataset must be the ground-truth data.
        eval_set = [(X[valid_idx], y[valid_idx])]

        # Stage 1: run the model on the smaller pseudo-label subset
        # (the "test" side of the pseudo split).
        pre_xgb_model = _fit_early_stopped(
            pseudo_xgb_params,
            np.concatenate([X_train, X_test[pseudo2_idx]]),
            np.concatenate([y_train, test_preds[pseudo2_idx]]),
            eval_set,
            early_fit,
        )

        # Stage 2: fine-tune on the larger pseudo-label subset with the more
        # complex parameter set, starting from the stage-1 model.
        post_xgb_model = _fit_early_stopped(
            post_params,
            np.concatenate([X_train, X_test[pseudo_idx]]),
            np.concatenate([y_train, test_preds[pseudo_idx]]),
            eval_set,
            early_fit * 2,
            init_model=pre_xgb_model,
        )

        preds.append(post_xgb_model.predict(X_test))

        # sqrt(MSE) instead of `squared=False`, which newer scikit-learn
        # releases no longer accept.
        y_valid = eval_set[0][1]
        xgb_rmse = float(np.sqrt(mean_squared_error(y_valid, post_xgb_model.predict(eval_set[0][0]))))
        rmse.append(xgb_rmse)

        print(f'Fold {fold}\n\txgb: {xgb_rmse}')

    # Keep only the n_splits folds with the lowest validation RMSE.
    best_folds = sorted(range(len(rmse)), key=rmse.__getitem__)[:n_splits]

    test_preds2 = np.zeros(len(X_test))
    best_rmse = 0
    for n in best_folds:
        test_preds2 += preds[n] / len(best_folds)
        best_rmse += rmse[n] / len(best_folds)

    print(f'\nAverage total rmse: {np.array(rmse).mean()}')
    print(f'\nAverage best rmse: {best_rmse}')

    return test_preds2