In [None]:
import warnings
import pandas as pd
import numpy as np

from sklearn.model_selection import (GridSearchCV, cross_validate, train_test_split)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.decomposition import PCA
from sklearn.ensemble import (AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor,
                              RandomForestRegressor)
from sklearn.feature_selection import (f_regression, mutual_info_regression,
                                       r_regression)
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so warnings emitted by the libraries
# used below (e.g. about failed or non-converged fits) will not be shown either.
warnings.filterwarnings('ignore')

In [None]:
# Locations of the input workbooks and of the result file. Other cells build
# paths by plain string concatenation with these values.
feature_path = "/"
result_path = "/"

# Candidate feature counts and feature-scoring functions.
# NOTE(review): neither list is referenced by the cells visible in this notebook.
n_feats = [5, 10, 50]
fss = [
    r_regression,
    f_regression,
    mutual_info_regression,
]

def append_row(df, row):
    """Return a new DataFrame made of `df` followed by `row`.

    Parameters
    ----------
    df : pandas.DataFrame
        Existing rows; it is not modified.
    row : pandas.Series
        Record to add; its index supplies the column labels.

    Returns
    -------
    pandas.DataFrame
        The concatenated frame with a fresh 0..n-1 index.
    """
    single_row_frame = pd.DataFrame([row], columns=row.index)
    stacked = pd.concat([df, single_row_frame])
    return stacked.reset_index(drop=True)

In [None]:
#@title Regressors & Params

# Hyper-parameter grids, one per regressor. Each maps a constructor argument
# name to the list of candidate values to try.
knn_param_space = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
)

abr_param_space = dict(
    n_estimators=[10],
    learning_rate=[0.01, 0.1],
    loss=['linear', 'square'],
)

lr_param_space = dict(
    fit_intercept=[True, False],
    positive=[True, False],
)

svr_param_space = dict(
    kernel=['linear', 'poly', 'rbf'],
    C=[0.01, 1],
    degree=[2, 3, 4],
)

# Grid for DecisionTreeRegressor.
# The criterion names 'mse' and 'mae' were renamed to 'squared_error' and
# 'absolute_error' in scikit-learn 1.0 and the old spellings were removed in
# 1.2. This notebook already relies on 1.2+ (BaggingRegressor is configured
# through `estimator`), so the current names are used; with the old ones
# those grid candidates fail to fit and are effectively dropped from the search.
dtr_param_space = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'max_depth': [None, 5, 10]
}

# Grid for RandomForestRegressor: forest size and tree depth limit.
rfr_param_space = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 5, 10],
)

# Grid for MLPRegressor: layer layout, activation and learning-rate schedule.
mlpr_param_space = dict(
    hidden_layer_sizes=[(50,), (50, 50)],
    activation=['relu', 'tanh'],
    learning_rate=['constant', 'adaptive'],
)
# Grid for BaggingRegressor: a single decision-tree base estimator, 10 members.
br_param_space = dict(
    estimator=[DecisionTreeRegressor()],
    n_estimators=[10],
)

# Grid for GradientBoostingRegressor: fixed ensemble size, two learning rates.
gb_param_space = dict(
    n_estimators=[10],
    learning_rate=[0.01, 0.1],
)

# Grid for Lasso: regularization strength plus intercept / sign constraints.
ls_param_space = dict(
    alpha=[0.01, 1, 10],
    fit_intercept=[True, False],
    positive=[True, False],
    max_iter=[1000],
)

# Registry of candidate models: display name -> (unfitted estimator, grid).
# Insertion order is the order in which the models are iterated.
regressors = {}
regressors['GradientBoostingRegressor'] = (GradientBoostingRegressor(), gb_param_space)
regressors['BaggingRegressor'] = (BaggingRegressor(), br_param_space)
regressors['KNeighborsRegressor'] = (KNeighborsRegressor(), knn_param_space)
regressors['AdaBoostRegressor'] = (AdaBoostRegressor(), abr_param_space)
regressors['LinearRegression'] = (LinearRegression(), lr_param_space)
regressors['SVR'] = (SVR(), svr_param_space)
regressors['DecisionTreeRegressor'] = (DecisionTreeRegressor(), dtr_param_space)
regressors['RandomForestRegressor'] = (RandomForestRegressor(), rfr_param_space)
regressors['MLPRegressor'] = (MLPRegressor(), mlpr_param_space)
regressors['Lasso'] = (Lasso(), ls_param_space)

In [None]:
import glob
import os

# Discover the dataset workbooks under `feature_path`.
# Only *.xlsx files are matched because the training loop re-opens each dataset
# as feature_path + name + ".xlsx"; matching everything would also list
# directories and unrelated files that can never be loaded. The list is sorted
# because glob returns entries in an unspecified order.
files = sorted(glob.glob(feature_path + "*.xlsx"))

# Dataset name = file name without directory and without the final extension.
# basename/splitext (instead of splitting on "/" and the first ".") keeps names
# that contain dots intact and works with either path separator.
datasets = [os.path.splitext(os.path.basename(workbook))[0] for workbook in files]
datasets

In [None]:
#@title Semi-Supervised
runs_df = pd.DataFrame()

# Total number of (dataset, regressor) runs; only used for the progress
# percentage printed after each run (previously a hard-coded 170).
total_runs = len(datasets) * len(regressors)
n = 0

for dataset in datasets:
  # Step 1: Load the workbook. Everything up to the pseudo-labels depends only
  # on the dataset, so it is computed once here instead of once per regressor.
  # As a consequence every regressor of a dataset is trained and scored on the
  # same pseudo-labels (the pseudo-labelling MLP is not seeded).
  path = feature_path + dataset + ".xlsx"
  Y = pd.read_excel(path, sheet_name='Output', engine='openpyxl', header=0)
  X = pd.read_excel(path, sheet_name='Data', engine='openpyxl', header=0)
  # Column labels are cast to str before the frames are handed to scikit-learn.
  X.columns = X.columns.astype(str)

  # Rows whose Duration is 0 are treated as unlabeled; all other rows are
  # labeled and their target is Duration / 365.
  is_labeled = Y['Duration'] != 0
  is_unlabeled = Y['Duration'] == 0
  XLabeled = X.loc[Y[is_labeled].index]
  YLabeled = Y.loc[is_labeled, 'Duration'] / 365
  XUnlabeld = X.loc[Y[is_unlabeled].index]

  # Step 2: Hold out 20% of the labeled rows as the test set.
  X_train, X_test, y_train, y_test = train_test_split(XLabeled, YLabeled, test_size=0.2, random_state=101)

  # Step 3: Standardize the features; the scaler is fitted on the training rows only.
  scaler = StandardScaler()
  X_train = scaler.fit_transform(X_train)
  X_test = scaler.transform(X_test)
  XUnlabeld = scaler.transform(XUnlabeld)

  # Step 4: Project onto 5 principal components (PCA fitted on the training rows only).
  pca = PCA(n_components=5)
  X_train = pca.fit_transform(X_train)
  X_test = pca.transform(X_test)
  XUnlabeld = pca.transform(XUnlabeld)

  # Step 5: Tune an MLP on the labeled training rows and use the best model to
  # pseudo-label the unlabeled rows.
  pseudo_grid_search = GridSearchCV(estimator=MLPRegressor(), param_grid=mlpr_param_space, cv=5)
  pseudo_grid_search.fit(X_train, y_train)
  pseudo_best_regressor = pseudo_grid_search.best_estimator_
  pseudo_labels = pseudo_best_regressor.predict(XUnlabeld)

  # Step 6: Combine the labeled training rows with the pseudo-labeled rows.
  X_combined = np.concatenate((X_train, XUnlabeld), axis=0)
  y_combined = np.concatenate((y_train, pseudo_labels), axis=0)

  for main_name, (main_reg, main_param_space) in regressors.items():
    n = n + 1

    # Step 7: Tune the main regressor on the combined data. GridSearchCV uses
    # refit=True by default, so best_estimator_ is already trained on all of
    # X_combined and does not need a second fit.
    main_grid_search = GridSearchCV(estimator=main_reg, param_grid=main_param_space, cv=5)
    main_grid_search.fit(X_combined, y_combined)
    main_best_regressor = main_grid_search.best_estimator_

    # Step 8: Cross-validate the selected configuration on the combined data.
    # The scorer returns negated MAE values, hence the sign flip for the mean.
    scores = cross_validate(main_best_regressor, X_combined, y_combined, cv=5, scoring='neg_mean_absolute_error')
    mae_cv = -np.mean(scores['test_score'])
    std_cv = np.std(scores['test_score'])

    # Step 9: Score the tuned model on the held-out labeled test rows.
    y_pred = main_best_regressor.predict(X_test)
    mae_tst = mean_absolute_error(y_test, y_pred)

    # Progress line: percent done, model, dataset, CV MAE, CV std, test MAE.
    print(str(n*100/total_runs)+','+main_name+','+dataset+","+str(mae_cv)+","+str(std_cv)+","+str(mae_tst))

    # "vs" keeps the raw per-fold scores (negated MAE, as returned by the scorer).
    run_new_row = pd.Series({"pseudo_ALG" : "MLP" ,"main_ALG" : main_name ,"DATASET" : dataset
                    ,"method":"SSL"
                    ,"vMAE":str(mae_cv)
                    ,"vSTD":str(std_cv)
                    ,"eMAE":str(mae_tst)
                    ,"vs":(scores['test_score'])})
    runs_df = append_row(runs_df, run_new_row)

runs_df.to_csv(result_path+"/results.csv")