In [7]:
import hopsworks
import pandas as pd
import numpy as np
import joblib
import os
from dotenv import load_dotenv
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [8]:
# Load variables from a .env file into the process environment; the login
# cell below reads its project name and API key through os.getenv.
load_dotenv()

True

In [9]:
# Open a Hopsworks session using credentials taken from the environment.
# os.getenv returns None for an unset variable, so a missing entry is passed
# to hopsworks.login as None rather than failing on this line.
project = hopsworks.login(
    project=os.getenv("HOPSWORKS_PROJECT_NAME"),
    api_key_value=os.getenv("HOPSWORKS_API_KEY")
)

2026-02-01 14:49:23,190 INFO: Closing external client and cleaning up certificates.
Connection closed.
2026-02-01 14:49:23,516 INFO: Initializing external client
2026-02-01 14:49:23,518 INFO: Base URL: https://c.app.hopsworks.ai:443
2026-02-01 14:49:32,595 INFO: Python Engine initialized.

Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1357978


In [10]:
# Feature-store handle for the logged-in project; the feature group and the
# feature view are both looked up through it.
fs = project.get_feature_store()

In [11]:
# Version 1 of the "karachi_aqi_weather" feature group (the training data source).
aqi_fg = fs.get_feature_group(name="karachi_aqi_weather", version=1)

In [12]:
# Query over every column of the feature group; it is only consumed by
# create_feature_view when the view does not exist yet.
query = aqi_fg.select_all()

In [13]:
# Look up the existing feature view; the next cell creates it when this is None.
# NOTE(review): that flow assumes a missing view comes back as None. Confirm the
# installed hsfs version does not raise instead (fs.get_or_create_feature_view
# would cover both cases).
feature_view = fs.get_feature_view(name="karachi_aqi_view", version=1)

In [14]:
# The lookup in the previous cell decides whether the view has to be built:
# an existing view is reused, otherwise one is created from `query` with
# pm2_5 declared as the label column.
if feature_view is not None:
    print("Feature View already exists.")
else:
    feature_view = fs.create_feature_view(
        name="karachi_aqi_view",
        version=1,
        query=query,
        labels=["pm2_5"],
    )
    print("Feature View created successfully!")

Feature View already exists.


In [15]:
# Same lookup as the earlier cell. `feature_view` is already bound by the
# create-if-missing cell, so this only re-fetches the handle by name.
feature_view = fs.get_feature_view(name="karachi_aqi_view", version=1)

In [16]:
# Pull the feature view's batch data into a DataFrame.
# NOTE(review): `df` is reassigned from the feature group in the next cell, so
# this frame is never the one passed to prepare_data.
df = feature_view.get_batch_data()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (3.64s) 


In [17]:
# Read the full feature group into `df`. This overwrites the frame loaded from
# the feature view, and it is this frame that prepare_data receives.
aqi_fg = fs.get_feature_group(name="karachi_aqi_weather", version=1)  # same lookup as the earlier cell
df = aqi_fg.read()

Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.00s) 


In [18]:
# Sanity check: list the raw column names before feature engineering, since
# prepare_data refers to 'time', 'pm2_5', 'city' and 'pm10' by name.
print("--- COLUMNS FOUND ---")
print(df.columns.tolist())

--- COLUMNS FOUND ---
['time', 'pm2_5', 'pm10', 'temperature_2m', 'relative_humidity_2m', 'wind_speed_10m', 'city']


In [19]:
def prepare_data(data):
    """Turn the raw feature-group frame into a feature matrix and a target.

    Rows are ordered by ``time`` and re-indexed from 0, then four columns are
    added: ``datetime`` (``time`` parsed as epoch milliseconds), ``hour``
    (hour of that timestamp), and two lags of ``pm2_5`` taken 1 and 24 rows
    back. The lags are row shifts, so the ``1h`` / ``24h`` names only hold if
    there is exactly one row per hour with no gaps; that is not checked here.

    Any row with a missing value in any column is dropped, which always
    removes the first 24 rows because their 24-row lag is undefined.

    Args:
        data: DataFrame with at least ``time`` and ``pm2_5`` columns.
            The caller's frame is not modified.

    Returns:
        ``(features, target)``: ``features`` is the remaining frame without
        ``pm2_5``, ``time``, ``datetime``, ``city`` and ``pm10`` (any of these
        that are absent are ignored); ``target`` is the ``pm2_5`` Series for
        the same rows.
    """
    target_col = 'pm2_5'
    # Columns the model must not see: the label, raw/parsed timestamps, the
    # city column, and pm10.
    hidden_cols = [target_col, 'time', 'datetime', 'city', 'pm10']

    ordered = data.sort_values('time').reset_index(drop=True)
    timestamps = pd.to_datetime(ordered['time'], unit='ms')

    enriched = ordered.assign(
        datetime=timestamps,
        hour=timestamps.dt.hour,
        pm2_5_lag_1h=ordered[target_col].shift(1),
        pm2_5_lag_24h=ordered[target_col].shift(24),
    )

    complete = enriched.dropna()

    features = complete.drop(columns=hidden_cols, errors='ignore')
    target = complete[target_col]
    return features, target

# Build the feature matrix X and the pm2_5 target y from the feature-group frame.
X, y = prepare_data(df)

In [20]:
# Chronological hold-out: prepare_data returns rows sorted by time, so the
# first 80% of rows train the models and the most recent 20% test them.
split_idx = int(len(X) * 0.8)
X_train_raw, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test_raw, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

In [21]:
# --- 2. Scaling (Vital for Linear Regression) ---
# The scaler learns its statistics from the training split only; the test
# split is transformed with those same statistics so nothing leaks from it.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [22]:
# Candidate regressors for the tournament, keyed by the display name used in
# the printed report and in `results`. The estimators are fitted in place by
# the training loop, so this dict also holds the trained models afterwards.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42)
}

In [23]:
# Tournament bookkeeping.
# results: display name -> {"RMSE": ..., "R2": ...}, filled by the training loop.
results = {}
# Winner so far; nothing has been evaluated yet.
best_model, best_model_name = None, ""
# Score to beat. Despite the name, the selection cell stores an RMSE here.
best_mse = float('inf')

In [24]:
print("\n--- üèÉ Starting Tournament üèÉ ---")


--- üèÉ Starting Tournament üèÉ ---


In [32]:
# Debug aid: the models are fitted on scaled arrays without column names, so
# inference code has to supply features in exactly this order.
# NOTE(review): this cell's execution count (32) is out of sequence with its
# neighbours; it was run after the rest of the notebook.
print("DEBUG: Exact column order for model training:")
print(X.columns.tolist())

DEBUG: Exact column order for model training:
['temperature_2m', 'relative_humidity_2m', 'wind_speed_10m', 'hour', 'pm2_5_lag_1h', 'pm2_5_lag_24h']


In [25]:
# Fit every candidate on the scaled training split and score it on the
# held-out tail. `name`, `model` and `rmse` remain bound to the last candidate
# after the loop ends.
for name, model in models.items():
    # fit() returns the estimator itself, so training and prediction chain.
    preds = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, preds)
    rmse = np.sqrt(mse)  # root of the MSE, back in PM2.5 units
    r2 = r2_score(y_test, preds)

    results[name] = dict(RMSE=rmse, R2=r2)

    print(f"--- {name} ---")
    print(f"   RMSE: {rmse:.2f} (Average error in PM2.5 units)")
    print(f"   R2 Score: {r2:.2f}")

--- Linear Regression ---
   RMSE: 4.47 (Average error in PM2.5 units)
   R2 Score: 0.91
--- Decision Tree ---
   RMSE: 4.95 (Average error in PM2.5 units)
   R2 Score: 0.89
--- Random Forest ---
   RMSE: 4.19 (Average error in PM2.5 units)
   R2 Score: 0.92


In [26]:
# Pick the tournament winner: the candidate with the lowest test RMSE.
#
# This cell runs after the training loop has finished, so `rmse`, `model` and
# `name` only describe the LAST candidate trained. Comparing those against the
# threshold would crown whichever model happens to be last in `models`, so the
# winner is taken from the per-model scores recorded in `results` instead.
if not results:
    raise RuntimeError("No models have been evaluated yet; run the training loop first.")

# min() keeps the first entry on ties, i.e. the earliest model in `models`.
best_model_name = min(results, key=lambda candidate: results[candidate]["RMSE"])
# The estimators in `models` were fitted in place by the training loop.
best_model = models[best_model_name]
# The name is historical: this holds an RMSE (later cells read `best_mse`).
best_mse = results[best_model_name]["RMSE"]

In [27]:
print(f"\nüèÜ The Winner is: {best_model_name} with RMSE {best_mse:.2f}")


üèÜ The Winner is: Random Forest with RMSE 4.19


In [28]:
# --- 4. Save and Register the Winner ---
model_dir = "aqi_model"
if not os.path.exists(model_dir): os.mkdir(model_dir)

In [29]:
# Save the model AND the scaler (we'll need the scaler for predictions tomorrow!)
# The models were trained on scaler-transformed inputs, so inference must apply
# this same fitted scaler before calling predict.
joblib.dump(best_model, f"{model_dir}/model.pkl")
joblib.dump(scaler, f"{model_dir}/scaler.pkl")

['aqi_model/scaler.pkl']

In [30]:
# Register the winning model and upload the staged artifact directory.
mr = project.get_model_registry()

# `best_mse` actually holds the winner's RMSE (the selection cell assigns an
# RMSE to it). Report it under its real name, and keep the "mse" key populated
# with the true mean squared error so the metric matches its label.
best_rmse = float(best_mse)
karachi_model = mr.python.create_model(
    name="karachi_aqi_model",
    metrics={"rmse": best_rmse, "mse": best_rmse ** 2},
    description=f"Best model ({best_model_name}) found during Day 4 tournament."
)
# Uploads everything in model_dir (model.pkl and scaler.pkl) as a new model version.
karachi_model.save(model_dir)

Uploading c:\Users\HP\OneDrive\Documents\Desktop\10-Pearls-AQI\AQI-Predictor-Karachi\src\aqi_model/model.pkl: 100.000%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 120081681/120081681 elapsed<03:27 remaining<00:00
Uploading c:\Users\HP\OneDrive\Documents\Desktop\10-Pearls-AQI\AQI-Predictor-Karachi\src\aqi_model/scaler.pkl: 100.000%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1095/1095 elapsed<00:02 remaining<00:00
Model export complete: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 6/6 [03:38<00:00, 36.36s/it]                    

Model created, explore it at https://c.app.hopsworks.ai:443/p/1357978/models/karachi_aqi_model/2





Model(name: 'karachi_aqi_model', version: 2)

In [31]:
# Final confirmation, printed once the registry upload cell has completed.
# NOTE(review): the symbols in this string look like a mis-decoded emoji; confirm the notebook's file encoding.
print(f"‚úÖ {best_model_name} successfully uploaded to Hopsworks!")

‚úÖ Random Forest successfully uploaded to Hopsworks!
