In [2]:
import json
import os

import numpy as np
from flaml import AutoML
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import RobustScaler, StandardScaler


ImportError: cannot import name 'StratifiedGroupKFold' from 'sklearn.model_selection' (/home/max_zaim/anaconda3/envs/zaim/lib/python3.9/site-packages/sklearn/model_selection/__init__.py)

# Helper functions

In [2]:
def calculate_metrics(y_true, y_pred):
    """Print MAE, MSE, RMSE and R2 for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        None. The four metrics are written to stdout with four decimals.
    """
    abs_error = mean_absolute_error(y_true, y_pred)
    sq_error = mean_squared_error(y_true, y_pred)
    root_sq_error = np.sqrt(sq_error)
    determination = r2_score(y_true, y_pred)

    report_lines = (
        f"MAE (Mean absolute error): {abs_error:.4f}",
        f"MSE (Mean squared error): {sq_error:.4f}",
        f"RMSE (Root Mean Squared Error): {root_sq_error:.4f}",
        f"R2: {determination:.4f}",
    )
    for line in report_lines:
        print(line)

def is_valid_entry(entry, features):
    """Tell whether ``entry`` has a non-``None`` value for every feature.

    Args:
        entry: Mapping of field name to value for one data point.
        features: Iterable of field names that must be present.

    Returns:
        True when each name in ``features`` is a key of ``entry`` whose value
        is not ``None``; False otherwise. An empty ``features`` yields True.
    """
    return all(
        name in entry and entry[name] is not None
        for name in features
    )

def train_evaluate_flaml(X_train, y_train, X_test, y_test, time_budget):
    """Fit a fresh FLAML ``AutoML`` regressor and score it on the test split.

    Args:
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Test feature matrix.
        y_test: Test targets.
        time_budget: Search time budget handed to FLAML, in seconds.

    Returns:
        Tuple ``(mae, mse, rmse, r2, automl)``: the four test-set metrics
        followed by the fitted ``AutoML`` instance.
    """
    search_config = dict(
        time_budget=time_budget,  # in seconds
        metric='r2',
        task='regression',
        log_file_name='flaml_regression.log',
    )

    automl = AutoML()
    automl.fit(X_train=X_train, y_train=y_train, **search_config)

    predictions = automl.predict(X_test)
    squared_error = mean_squared_error(y_test, predictions)

    return (
        mean_absolute_error(y_test, predictions),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_test, predictions),
        automl,
    )

def log_transform_features(X):
    """Return ``log(1 + |X|)`` computed element-wise.

    Args:
        X: Array-like of numeric values.

    Returns:
        Array with the same shape as ``X``.
    """
    magnitudes = np.abs(X)
    return np.log1p(magnitudes)



# Generate data

In [3]:
with open('experiment/data.json', 'r') as f:
    data = json.load(f)

# Keep only valid data points
features = [
    "plain",
    "params",
    "flops",
    "synflow",
    "snip",
    "grad_norm",
    "epe_nas",
    "grasp",
    "fisher",
    "l2_norm",
    "jacov",
    "zen",
    "nwot",
    "grad_sign",
]
valid_data = [entry for entry in data if is_valid_entry(entry, features + ["val_acc"])]

# Extract the features (zero-cost proxies) and target (validation accuracy)
X_raw = np.array([[entry[feature] for feature in features] for entry in valid_data])
y = np.array([entry["val_acc"] for entry in valid_data])

# Split BEFORE scaling: fitting the scaler on all rows would let the mean and
# variance of the held-out rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training-split statistics. Splitting X again
# with the same test_size / random_state selects the same rows as above, so
# later cells that split X themselves stay leakage-free as well.
X = scaler.transform(X_raw)



In [4]:
def get_results():
    """Load previously stored experiment results.

    Reads ``supervised_experiments/results.json`` relative to the current
    working directory.

    Returns:
        The decoded JSON object as a dict. An empty dict is returned when the
        file is missing or unreadable, does not contain valid JSON, or its
        top-level value is not a JSON object.
    """
    try:
        with open('supervised_experiments/results.json', 'r') as f:
            results = json.load(f)
    except (OSError, ValueError):
        # OSError covers a missing/unreadable file; ValueError covers
        # json.JSONDecodeError and UnicodeDecodeError. Unlike a bare except,
        # KeyboardInterrupt and SystemExit still propagate.
        return {}

    # Callers index the result by experiment key, so anything but a dict is
    # treated like "no results yet".
    return results if isinstance(results, dict) else {}

# Evaluate FLAML

# Standard method with all proxies for 60 seconds

The features are also standardized with a StandardScaler before the search.

In [5]:
# Fresh AutoML instance plus the search configuration for the 60-second run.
automl = AutoML()
settings = dict(
    time_budget=60,  # in seconds
    metric='r2',
    task='regression',
    log_file_name='flaml_regression.log',
)

In [None]:
results = get_results()
results_key = "all_zero_cost_proxies_60_seconds"

# Skip the search when this experiment is already stored in
# supervised_experiments/results.json.
if results_key not in results:
    # Create the output directory up front: a missing folder is then caught
    # before the time budget is spent instead of when saving afterwards.
    os.makedirs('supervised_experiments', exist_ok=True)

    automl.fit(X_train=X_train, y_train=y_train, **settings)

    y_pred = automl.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Store plain Python floats so json.dump accepts the record whatever
    # numeric scalar type the metrics come back as.
    results[results_key] = {
        "mae": float(mae),
        "mse": float(mse),
        "rmse": float(rmse),
        "r2": float(r2),
    }

    with open('supervised_experiments/results.json', 'w') as f:
        json.dump(results, f)
else:
    print("Results already exist for", results_key)


# Increase time budget to 120 seconds

In [None]:
# Fresh AutoML instance plus the search configuration for the 120-second run.
automl = AutoML()
settings = dict(
    time_budget=120,  # in seconds
    metric='r2',
    task='regression',
    log_file_name='flaml_regression.log',
)

In [None]:
results = get_results()
results_key = "all_zero_cost_proxies_120_seconds"

# Skip the search when this experiment is already stored in
# supervised_experiments/results.json.
if results_key not in results:
    # Create the output directory up front: a missing folder is then caught
    # before the time budget is spent instead of when saving afterwards.
    os.makedirs('supervised_experiments', exist_ok=True)

    automl.fit(X_train=X_train, y_train=y_train, **settings)

    y_pred = automl.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Store plain Python floats so json.dump accepts the record whatever
    # numeric scalar type the metrics come back as.
    results[results_key] = {
        "mae": float(mae),
        "mse": float(mse),
        "rmse": float(rmse),
        "r2": float(r2),
    }

    with open('supervised_experiments/results.json', 'w') as f:
        json.dump(results, f)
else:
    print("Results already exist for", results_key)


# Time budget of 300s

In [None]:
# Fresh AutoML instance plus the search configuration for the 300-second run.
automl = AutoML()
settings = dict(
    time_budget=300,  # in seconds
    metric='r2',
    task='regression',
    log_file_name='flaml_regression.log',
)

In [None]:
results = get_results()
results_key = "all_zero_cost_proxies_300_seconds"

# Skip the search when this experiment is already stored in
# supervised_experiments/results.json.
if results_key not in results:
    # Create the output directory up front: a missing folder is then caught
    # before the time budget is spent instead of when saving afterwards.
    os.makedirs('supervised_experiments', exist_ok=True)

    automl.fit(X_train=X_train, y_train=y_train, **settings)

    y_pred = automl.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Store plain Python floats so json.dump accepts the record whatever
    # numeric scalar type the metrics come back as.
    results[results_key] = {
        "mae": float(mae),
        "mse": float(mse),
        "rmse": float(rmse),
        "r2": float(r2),
    }

    with open('supervised_experiments/results.json', 'w') as f:
        json.dump(results, f)
else:
    print("Results already exist for", results_key)


# Use early stopping with high time budget

In [None]:
# Fresh AutoML instance plus the search configuration for the long run that
# enables FLAML's early_stop option.
automl = AutoML()
settings = dict(
    time_budget=3000,  # in seconds
    metric='r2',
    task='regression',
    log_file_name='flaml_regression.log',
    early_stop=True,
)

In [None]:
results = get_results()
results_key = "all_zero_cost_proxies_with_early_stopping_max_3000_seconds"

# Skip the search when this experiment is already stored in
# supervised_experiments/results.json.
if results_key not in results:
    # Create the output directory up front: a missing folder is then caught
    # before the (up to 3000 s) search runs instead of when saving afterwards.
    os.makedirs('supervised_experiments', exist_ok=True)

    automl.fit(X_train=X_train, y_train=y_train, **settings)

    y_pred = automl.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    # Store plain Python floats so json.dump accepts the record whatever
    # numeric scalar type the metrics come back as.
    results[results_key] = {
        "mae": float(mae),
        "mse": float(mse),
        "rmse": float(rmse),
        "r2": float(r2),
    }

    with open('supervised_experiments/results.json', 'w') as f:
        json.dump(results, f)
else:
    print("Results already exist for", results_key)


## Check how the R2-score is progressing 

### Is it worth it to have a higher time budget?

In [None]:
# Print out the results from the automl.fit() call

from flaml.automl.data import get_output_from_log

# Read the search history back from the FLAML log file; nothing is printed
# here, the five lists are only bound for later inspection.
log_file_name = 'flaml_regression.log'
(
    search_time_list,
    best_error_list,
    error_list,
    config_list,
    logged_metric_list,
) = get_output_from_log(log_file_name, time_budget=3000)



# Now we try with Recursive Feature Elimination with Cross Validation

In [None]:
with open('experiment/data.json', 'r') as f:
    data = json.load(f)

# Keep only valid data points
features = [
    "plain",
    "params",
    "flops",
    "synflow",
    "snip",
    "grad_norm",
    "epe_nas",
    "grasp",
    "fisher",
    "l2_norm",
    "jacov",
    "zen",
    "nwot",
    "grad_sign",
]
valid_data = [entry for entry in data if is_valid_entry(entry, features + ["val_acc"])]

# Extract the features (zero-cost proxies) and target (validation accuracy)
X_raw = np.array([[entry[feature] for feature in features] for entry in valid_data])
y = np.array([entry["val_acc"] for entry in valid_data])

# Split BEFORE scaling: fitting the scaler on all rows would let the mean and
# variance of the held-out rows leak into the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X_raw, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full matrix scaled with the training-split statistics. Splitting X again
# with the same test_size / random_state selects the same rows as above, so
# later cells that split X themselves stay leakage-free as well.
X = scaler.transform(X_raw)



In [None]:

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
class FlamlWrapper(BaseEstimator, RegressorMixin):
    """Adapter that exposes a FLAML ``AutoML`` object as a regressor.

    It forwards ``fit`` / ``predict`` / ``score`` to the wrapped object and
    adds a ``feature_importances_`` property read from the model FLAML
    selected.
    """

    def __init__(self, automl):
        # Stored under the constructor argument's own name.
        self.automl = automl

    def fit(self, X, y):
        """Run the wrapped AutoML search as a regression task; returns self."""
        self.automl.fit(X, y, task="regression")
        return self

    def predict(self, X):
        """Delegate prediction to the wrapped AutoML object."""
        return self.automl.predict(X)

    def score(self, X, y):
        """Delegate scoring to the wrapped AutoML object."""
        return self.automl.score(X, y)

    @property
    def feature_importances_(self):
        """Importances of the underlying fitted estimator.

        Returns the estimator's ``feature_importances_`` when it has one,
        otherwise the absolute value of its ``coef_``.

        Raises:
            ValueError: If the estimator exposes neither attribute.
        """
        fitted = self.automl.model.estimator
        if hasattr(fitted, "feature_importances_"):
            return fitted.feature_importances_
        if hasattr(fitted, "coef_"):
            return np.abs(fitted.coef_)
        raise ValueError("Model does not have feature_importances_ or coef_ attribute")

In [None]:
# Recursive feature elimination around a FLAML-backed estimator: one feature
# is removed per step and candidates are scored by R2 on shuffled 5-fold CV.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
automl = AutoML()
flaml_wrapper = FlamlWrapper(automl)
rfecv = RFECV(
    estimator=flaml_wrapper,
    step=1,
    cv=cv,
    scoring="r2",
)


In [None]:
# rfecv.fit(X_train, y_train)


In [None]:
# Disabled: the code below is wrapped in a string literal, so it is not
# executed. It reads attributes of `rfecv`, whose fit call is commented out
# in the preceding cell.
""" print("Optimal number of features : %d" % rfecv.n_features_)


selected_features_mask = rfecv.support_

# Get the names of the selected features
selected_features = [feature for feature, selected in zip(features, selected_features_mask) if selected]

print("Selected features:", selected_features) """

In [None]:
# Disabled: the code below is wrapped in a string literal, so it is not
# executed. It calls `rfecv.transform`, and the `rfecv.fit` call is commented
# out earlier in the notebook.
""" # Transform the training and test data to keep only the selected features
X_train_selected = rfecv.transform(X_train)
X_test_selected = rfecv.transform(X_test)

# Train your FLAML model on the transformed training data
automl.fit(X_train_selected, y_train, task="regression")

# Evaluate the performance on the transformed test data
y_pred = automl.predict(X_test_selected)
r2 = automl.score(X_test_selected, y_test)
print("Test R2 score:", r2) """

# Look at ranking instead of pure regression model

In [None]:

# Fresh AutoML instance plus a 60-second search configuration.
automl = AutoML()
settings = dict(
    time_budget=60,  # in seconds
    metric='r2',
    task='regression',
    log_file_name='flaml_regression.log',
)

In [None]:
from autosklearn.metrics import mean_squared_error as autosklearn_mse
from autosklearn.regression import AutoSklearnRegressor
results = get_results()
results_key = "rank"

# Check if results_key exists in supervised_experiments/results.json
if results_key not in results:
    # Convert validation accuracy to ranking (rank 1 = highest accuracy)
    y_rank = (-y).argsort().argsort() + 1

    # Split the data for ranking
    X_train_rank, X_test_rank, y_train_rank, y_test_rank = train_test_split(X, y_rank, test_size=0.2, random_state=42)
    # Initialize the AutoML regressor.
    # `metric` must be an auto-sklearn scorer. The notebook's `mse` variable
    # is only the float left over from an earlier experiment cell, and is
    # undefined when those cells were skipped because results were cached.
    automl_rank = AutoSklearnRegressor(
        time_left_for_this_task=3600,  # Adjust the time limit based on your needs
        per_run_time_limit=300,
        n_jobs=-1,
        metric=autosklearn_mse
    )

    # Train the AutoML regressor on the ranking dataset
    automl_rank.fit(X_train_rank, y_train_rank)

    # Evaluate the model on the test set
    y_pred_rank = automl_rank.predict(X_test_rank)
