In [7]:
import os

import hopsworks

# Authenticate with an API key taken from the environment so the secret is
# never stored in the notebook. Export HOPSWORKS_API_KEY before starting Jupyter.
# If the variable is unset, None is passed and hopsworks.login() applies its own
# key resolution (NOTE(review): cached key file / interactive prompt per the
# Hopsworks docs - confirm for the installed version).
project = hopsworks.login(api_key_value=os.environ.get("HOPSWORKS_API_KEY"))
fs = project.get_feature_store()

print("Connected to Hopsworks Feature Store")

# Get the feature group
feature_group = fs.get_feature_group(name="aqi_features", version=1)

# Fetch data as a Pandas DataFrame
data = feature_group.read()

# Separate features and target.
# drop() returns a new frame, so `data` itself is left untouched.
features = data.drop(columns=["aqi"])  # Replace 'aqi' with your actual target column
target = data["aqi"]

print("Data fetched successfully from the feature store!")


2025-01-23 21:37:47,018 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-23 21:37:47,035 INFO: Initializing external client
2025-01-23 21:37:47,037 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-23 21:37:50,007 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211546
Connected to Hopsworks Feature Store
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.77s) 
Data fetched successfully from the feature store!


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 1: Transform the 'date' column into numeric features.
# The guard makes this cell idempotent: after the first run 'date' has been
# dropped, so re-running the cell skips the transform instead of raising KeyError.
if "date" in features.columns:
    # Unparsable values become NaT (errors="coerce"); every derived column is
    # NaN for those rows.
    date_parsed = pd.to_datetime(features["date"], errors="coerce")

    # Whole seconds since the Unix epoch. Timedelta arithmetic is used instead
    # of the deprecated Series.view('int64') // 10**9 so the result does not
    # depend on the datetime64 resolution (ns/us/ms). The epoch carries the same
    # timezone as the data so tz-aware columns subtract cleanly.
    epoch = pd.Timestamp(0, tz=date_parsed.dt.tz)
    one_second = pd.Timedelta(seconds=1)

    # assign() appends the new columns in the order given and returns a new
    # frame; the original 'date' column is dropped afterwards.
    features = features.assign(
        date_year=date_parsed.dt.year,
        date_month=date_parsed.dt.month,
        date_day=date_parsed.dt.day,
        date_hour=date_parsed.dt.hour,
        date_minute=date_parsed.dt.minute,
        date_timestamp=(date_parsed - epoch) // one_second,
    ).drop(columns=["date"])

# Step 2: Train-Test Split
# NOTE(review): this is a random split; if the rows are time-ordered
# observations, a chronological split may be a fairer evaluation - confirm.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)





In [11]:
# Import necessary libraries
import os

import joblib
import pandas as pd
import hopsworks
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split  # used below; import it here so the cell does not rely on an earlier cell
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR

# Candidate regressors, keyed by the name used for reporting and for the
# model registry entry.
# NOTE(review): SVR and MLPRegressor are sensitive to feature scale and are
# fit on unscaled inputs here; consider wrapping them in a scaling pipeline.
models = {
    "Random_Forest": RandomForestRegressor(
        n_estimators=100,
        max_depth=10,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42
    ),
    "Gradient_Boosting": GradientBoostingRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=3,
        subsample=0.8,
        random_state=42
    ),
    "Linear_Regression": LinearRegression(
        fit_intercept=True
    ),
    "Ridge_Regression": Ridge(
        alpha=1.0,
        solver="auto"
    ),
    "Support_Vector_Regressor": SVR(
        kernel="rbf",
        C=1.0,
        epsilon=0.1
    ),
    "Neural_Network": MLPRegressor(
        hidden_layer_sizes=(100,),
        activation="relu",
        solver="adam",
        learning_rate="adaptive",
        max_iter=200,
        random_state=42
    )
}

# Ensure the dataset is not empty
if features.empty or target.empty:
    raise ValueError("Features or target dataset is empty. Please load valid data.")

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Train and evaluate each model, remembering the one with the lowest RMSE
# together with ITS OWN metrics.
results = []
best_model = None
best_model_name = None
best_rmse = float("inf")
best_metrics = None

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the model. RMSE is computed as sqrt(MSE) by hand because the
    # `squared=False` keyword of mean_squared_error is deprecated in newer
    # scikit-learn releases.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Store results
    results.append({
        "Model": model_name,
        "RMSE": rmse,
        "MAE": mae,
        "R²": r2
    })

    # Check if this is the best model; snapshot all three metrics now, since
    # `mae` and `r2` are overwritten by later iterations.
    if rmse < best_rmse:
        best_rmse = rmse
        best_model = model
        best_model_name = model_name
        best_metrics = {"rmse": rmse, "mae": mae, "r2": r2}

    print(f"Model: {model_name}")
    print(f"  RMSE: {rmse}")
    print(f"  MAE: {mae}")
    print(f"  R²: {r2}\n")

# Save the Best Model to Hopsworks Model Registry.
# `is not None` is used instead of truthiness because ensemble estimators
# define __len__, which makes a bare `if best_model:` unreliable.
# NOTE(review): a near-zero RMSE with R² = 1.0 for plain linear regression
# (as seen in the recorded run) usually indicates that a feature encodes the
# target; verify the feature set before relying on the registered model.
if best_model is not None:
    print(f"The best model is: {best_model_name} with RMSE: {best_rmse}")

    # Save the best model locally
    joblib.dump(best_model, "best_model.pkl")

    # Connect to the Hopsworks Model Registry. The API key comes from the
    # HOPSWORKS_API_KEY environment variable so no secret lives in the notebook;
    # when it is unset, None is passed and hopsworks.login() falls back to its
    # own key resolution (NOTE(review): confirm for the installed version).
    project = hopsworks.login(api_key_value=os.environ.get("HOPSWORKS_API_KEY"))
    mr = project.get_model_registry()

    # Register the model metadata with the metrics of the selected model
    model_registry = mr.python.create_model(
        name=best_model_name,
        metrics=best_metrics,
        description=(
            f"Best model for prediction: {best_model_name}. "
            f"Hyperparameters: {best_model.get_params()}. "
            f"Task: Regression."
        ),
        input_example={"features": list(X_train.columns)}  # Save feature names
    )

    # Upload the saved model file to the registry
    model_registry.save("best_model.pkl")
    print(f"Model registered successfully in Hopsworks Model Registry: {best_model_name}")


# Display Results
results_df = pd.DataFrame(results)
print("Summary of Model Performance:")
print(results_df)


Model: Random_Forest
  RMSE: 0.039411907861542535
  MAE: 0.006178570340459672
  R²: 0.9983509512167169

Model: Gradient_Boosting
  RMSE: 0.03614421496745977
  MAE: 0.006776275495255327
  R²: 0.9986130647733639

Model: Linear_Regression
  RMSE: 9.674801162219957e-12
  MAE: 7.249560815068621e-12
  R²: 1.0

Model: Ridge_Regression
  RMSE: 0.008999796041045022
  MAE: 0.005969509625451406
  R²: 0.9999140107976446





Model: Support_Vector_Regressor
  RMSE: 1.1216669627973712
  MAE: 0.6909205057930925
  R²: -0.33569236288031834

Model: Neural_Network
  RMSE: 85.1736499839043
  MAE: 69.84228617764784
  R²: -7700.744429276885

The best model is: Linear_Regression with RMSE: 9.674801162219957e-12
2025-01-23 21:43:14,811 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-23 21:43:14,819 INFO: Initializing external client
2025-01-23 21:43:14,820 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-23 21:43:18,075 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211546


Uploading: 100.000%|██████████| 2968/2968 elapsed<00:02 remaining<00:00:05,  1.19s/it]
Uploading: 100.000%|██████████| 990/990 elapsed<00:02 remaining<00:00<00:08,  2.25s/it]
Model export complete: 100%|██████████| 6/6 [00:12<00:00,  2.14s/it]                   

Model created, explore it at https://c.app.hopsworks.ai:443/p/1211546/models/Linear_Regression/2
Model registered successfully in Hopsworks Model Registry: Linear_Regression
Summary of Model Performance:
                      Model          RMSE           MAE           R²
0             Random_Forest  3.941191e-02  6.178570e-03     0.998351
1         Gradient_Boosting  3.614421e-02  6.776275e-03     0.998613
2         Linear_Regression  9.674801e-12  7.249561e-12     1.000000
3          Ridge_Regression  8.999796e-03  5.969510e-03     0.999914
4  Support_Vector_Regressor  1.121667e+00  6.909205e-01    -0.335692
5            Neural_Network  8.517365e+01  6.984229e+01 -7700.744429



