In [5]:
import hopsworks
import joblib
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import numpy as np


# --- Step 1: Connect to Hopsworks and Fetch Data from Feature Store ---
project = hopsworks.login(api_key_value="KpnqasHfb4WsGo1Z.jp7TSl48y51FcnnVvfgVgRk7cot4me3LXYlde0JOWpnzB0clm5x9Fre7tijqumxU")  # Replace with your API key
fs = project.get_feature_store()

print("Connected to Hopsworks Feature Store")

# Get the feature group
feature_group = fs.get_feature_group(name="aqi_features", version=1)

# Fetch data as a Pandas DataFrame
data = feature_group.read()

# Separate features and target
features = data.drop(columns=["aqi"])  # Replace 'aqi' with your actual target column
target = data["aqi"]

print("Data fetched successfully from the feature store!")

# --- Step 2: Preprocess Features ---
# Transform 'date' column into numeric features
features["date"] = pd.to_datetime(features["date"], errors="coerce")

# Extract useful numeric features from 'date'
features["date_year"] = features["date"].dt.year
features["date_month"] = features["date"].dt.month
features["date_day"] = features["date"].dt.day
features["date_hour"] = features["date"].dt.hour
features["date_minute"] = features["date"].dt.minute

# Convert 'date' to a numeric timestamp
features["date_timestamp"] = features["date"].view('int64') // 10**9

# Drop the original 'date' column
features = features.drop(columns=["date"], errors="ignore")

# --- Step 3: Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# --- Step 4: Apply Feature Scaling ---
scaler = MinMaxScaler()  # Use MinMaxScaler for scaling between 0 and 1
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Step 5: Connect to Model Registry and Fetch the Best Model ---
mr = project.get_model_registry()
model_name = "Linear_Regression"  # Replace with the name of your model
model_version = 1  # Use the appropriate version

model_registry = mr.get_model(model_name, version=model_version)
model_dir = model_registry.download()
model_file = f"{model_dir}/best_model.pkl"

# Load the model
model = joblib.load(model_file)
print(f"Loaded model: {model_name}, version: {model_version}")

# --- Step 6: Retrain the Model ---
print(f"Retraining {model_name} on the training dataset...")
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Model performance on test dataset:")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R²: {r2}")

# --- Step 7: Predict AQI for the Next 3 Days ---
# Get today's date and generate the next 3 days
#start_date = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)  # Today's date, time set to midnight
#start date is tomorrow
start_date = start_date + timedelta(days=1)
days_to_predict = 3
future_dates = [start_date + timedelta(days=i) for i in range(days_to_predict)]

# Create a DataFrame for the future dates with all required features
future_features = pd.DataFrame({
    "date_year": [date.year for date in future_dates],
    "date_month": [date.month for date in future_dates],
    "date_day": [date.day for date in future_dates],
    "date_hour": [12] * days_to_predict,  # Assuming predictions are made for midday
    "date_minute": [0] * days_to_predict,  # Assuming predictions are made for the 0th minute
    "date_timestamp": [int(date.timestamp()) for date in future_dates]  # Numeric timestamp
})

# Add missing columns with default values
required_columns = features.columns.tolist()  # Get all columns used in training
for col in required_columns:
    if col not in future_features.columns:
        future_features[col] = 0  # Fill missing columns with default value (e.g., 0)

# Ensure column order matches the training data
future_features = future_features[required_columns]

# Scale the future features using the same scaler
future_features_scaled = scaler.transform(future_features)

# Predict AQI using the retrained model
predicted_aqi = model.predict(future_features_scaled)

# Combine the future dates with their predictions
prediction_results = pd.DataFrame({
    "Date": [date.strftime("%Y-%m-%d") for date in future_dates],
    "Predicted_AQI": np.round(predicted_aqi, 2)  # Round AQI to 2 decimal places
})

# Display the results
print("Predicted AQI for the next 3 days:")
print(prediction_results)


2025-01-24 09:25:24,024 INFO: Closing external client and cleaning up certificates.
Connection closed.
2025-01-24 09:25:24,323 INFO: Initializing external client
2025-01-24 09:25:24,328 INFO: Base URL: https://c.app.hopsworks.ai:443
2025-01-24 09:25:27,386 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1211546
Connected to Hopsworks Feature Store
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.36s) 
Data fetched successfully from the feature store!
Loaded model: Linear_Regression, version: 1)... DONE
Retraining Linear_Regression on the training dataset...
Model performance on test dataset:
  RMSE: 1.5000375566646494e-15
  MAE: 1.1818757162853616e-15
  R²: 1.0
Predicted AQI for the next 3 days:
         Date  Predicted_AQI
0  2025-01-24          55.98
1  2025-01-25          55.98
2  2025-01-26          55.98


In [6]:
# Save `prediction_results` to a file
prediction_results.to_pickle("prediction_results.pkl")
