In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor
import mlflow
from mlflow.tracking import MlflowClient
import mlflow.sklearn
from mlflow.models.signature import infer_signature
import mlflow.exceptions



## Data Loading and Inspection

In [None]:
# Load the dataset
df = pd.read_csv('../data/data.csv')

In [None]:
# Display first few rows of the DataFrame
df.head()

Unnamed: 0,UNIXTime,Data,Time,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed,TimeSunRise,TimeSunSet,datetime
0,1472724008,9/1/2016 12:00:00 AM,00:00:08,2.58,51,30.43,103,77.27,11.25,06:07:00,18:38:00,2016-09-01 10:00:08
1,1472724310,9/1/2016 12:00:00 AM,00:05:10,2.83,51,30.43,103,153.44,9.0,06:07:00,18:38:00,2016-09-01 10:05:10
2,1472725206,9/1/2016 12:00:00 AM,00:20:06,2.16,51,30.43,103,142.04,7.87,06:07:00,18:38:00,2016-09-01 10:20:06
3,1472725505,9/1/2016 12:00:00 AM,00:25:05,2.21,51,30.43,103,144.12,18.0,06:07:00,18:38:00,2016-09-01 10:25:05
4,1472725809,9/1/2016 12:00:00 AM,00:30:09,2.25,51,30.43,103,67.42,11.25,06:07:00,18:38:00,2016-09-01 10:30:09


In [None]:
# Display basic information about the DataFrame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26148 entries, 0 to 26147
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   UNIXTime                26148 non-null  int64  
 1   Data                    26148 non-null  object 
 2   Time                    26148 non-null  object 
 3   Radiation               26148 non-null  float64
 4   Temperature             26148 non-null  int64  
 5   Pressure                26148 non-null  float64
 6   Humidity                26148 non-null  int64  
 7   WindDirection(Degrees)  26148 non-null  float64
 8   Speed                   26148 non-null  float64
 9   TimeSunRise             26148 non-null  object 
 10  TimeSunSet              26148 non-null  object 
 11  datetime                26148 non-null  object 
dtypes: float64(4), int64(3), object(5)
memory usage: 2.4+ MB


In [7]:
# Basic statistics of the DataFrame
df.describe()

Unnamed: 0,UNIXTime,Radiation,Temperature,Pressure,Humidity,WindDirection(Degrees),Speed
count,26148.0,26148.0,26148.0,26148.0,26148.0,26148.0,26148.0
mean,1476988000.0,218.00095,51.861749,30.434092,74.614349,134.404504,6.042688
std,2367042.0,326.410173,6.177191,0.037621,26.24973,79.359142,3.049319
min,1472724000.0,1.13,38.0,30.28,8.0,0.09,0.0
25%,1474960000.0,1.23,47.0,30.41,55.0,74.3075,3.37
50%,1477042000.0,2.9,51.0,30.44,85.0,139.88,5.62
75%,1479008000.0,380.5675,56.0,30.46,97.0,173.54,7.87
max,1481299000.0,1601.26,71.0,30.54,103.0,359.95,27.0


### Data Cleanng

In [8]:
# check duplicate count
df.duplicated().sum()

np.int64(0)

In [9]:
# Check for missing values
print(df.isna().sum())

UNIXTime                  0
Data                      0
Time                      0
Radiation                 0
Temperature               0
Pressure                  0
Humidity                  0
WindDirection(Degrees)    0
Speed                     0
TimeSunRise               0
TimeSunSet                0
datetime                  0
dtype: int64


In [10]:
# Drop duplicates 
df.drop_duplicates(inplace=True)

In [11]:
# Drop rows with all nulls 
df.dropna(how='all', inplace=True)

### Feature Engineering 

In [None]:
# shape of the DataFrame
df.shape

(26148, 12)

##### cyclical encoding

In [13]:
# Convert UNIXTime to datetime
df['DateTime'] = pd.to_datetime(df['UNIXTime'], unit='s')

In [14]:
# Extract datetime features
df['Hour'] = df['DateTime'].dt.hour
df['Minute'] = df['DateTime'].dt.minute
df['Day'] = df['DateTime'].dt.day
df['Month'] = df['DateTime'].dt.month
df['Weekday'] = df['DateTime'].dt.weekday

In [15]:
# Hour of day (0–23)
df['Hour_sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
df['Hour_cos'] = np.cos(2 * np.pi * df['Hour'] / 24)

# Minute (0–59)
df['Minute_sin'] = np.sin(2 * np.pi * df['Minute'] / 60)
df['Minute_cos'] = np.cos(2 * np.pi * df['Minute'] / 60)

# Day of month (1–31)
df['Day_sin'] = np.sin(2 * np.pi * df['Day'] / 31)
df['Day_cos'] = np.cos(2 * np.pi * df['Day'] / 31)

# Month (1–12)
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

# Weekday (0=Monday)
df['Weekday_sin'] = np.sin(2 * np.pi * df['Weekday'] / 7)
df['Weekday_cos'] = np.cos(2 * np.pi * df['Weekday'] / 7)

#### Process Sunrise/Sunset Time

In [16]:
# Convert sunrise/sunset to timedelta
df['TimeSunRise_obj'] = pd.to_timedelta(df['TimeSunRise'])
df['TimeSunSet_obj'] = pd.to_timedelta(df['TimeSunSet'])

# Create actual sunrise/sunset datetime based on date of DateTime
df['SunriseDateTime'] = df['DateTime'].dt.normalize() + df['TimeSunRise_obj']
df['SunsetDateTime'] = df['DateTime'].dt.normalize() + df['TimeSunSet_obj']

# Calculate minutes since sunrise / until sunset
df['MinutesSinceSunrise'] = (df['DateTime'] - df['SunriseDateTime']).dt.total_seconds() / 60
df['MinutesUntilSunset'] = (df['SunsetDateTime'] - df['DateTime']).dt.total_seconds() / 60

##### Drop Columns that are not needed

In [17]:
df.drop(columns=[
    'UNIXTime', 'Data', 'Time',
    'TimeSunRise', 'TimeSunSet',
    'TimeSunRise_obj', 'TimeSunSet_obj',
    'Hour', 'Minute', 'Day', 
    'Month', 'Weekday', 'DateTime'
], inplace=True)


In [None]:
# shape after dropping columns
df.shape

(26148, 21)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26148 entries, 0 to 26147
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Radiation               26148 non-null  float64       
 1   Temperature             26148 non-null  int64         
 2   Pressure                26148 non-null  float64       
 3   Humidity                26148 non-null  int64         
 4   WindDirection(Degrees)  26148 non-null  float64       
 5   Speed                   26148 non-null  float64       
 6   datetime                26148 non-null  object        
 7   Hour_sin                26148 non-null  float64       
 8   Hour_cos                26148 non-null  float64       
 9   Minute_sin              26148 non-null  float64       
 10  Minute_cos              26148 non-null  float64       
 11  Day_sin                 26148 non-null  float64       
 12  Day_cos                 26148 non-null  float6

In [20]:
df.nunique().sort_values()

Month_sin                     4
Month_cos                     4
Weekday_sin                   7
Weekday_cos                   7
Hour_cos                     22
Hour_sin                     22
Speed                        24
Minute_cos                   26
Day_cos                      26
Minute_sin                   26
Pressure                     27
Day_sin                      31
Temperature                  34
Humidity                     94
SunriseDateTime             136
SunsetDateTime              154
Radiation                 11994
WindDirection(Degrees)    15178
MinutesUntilSunset        16092
MinutesSinceSunrise       16096
datetime                  26148
dtype: int64

#### Feature Selection

In [23]:
df.columns

Index(['Radiation', 'Temperature', 'Pressure', 'Humidity',
       'WindDirection(Degrees)', 'Speed', 'datetime', 'Hour_sin', 'Hour_cos',
       'Minute_sin', 'Minute_cos', 'Day_sin', 'Day_cos', 'Month_sin',
       'Month_cos', 'Weekday_sin', 'Weekday_cos', 'SunriseDateTime',
       'SunsetDateTime', 'MinutesSinceSunrise', 'MinutesUntilSunset'],
      dtype='object')

#### Base Models and Feature Selection

In [None]:
features = ['Temperature', 'Pressure', 'Humidity',
       'WindDirection(Degrees)', 'Speed', 'Hour_sin', 
       'Hour_cos', 'Minute_sin', 'Minute_cos', 'Day_sin', 
       'Day_cos', 'Month_sin', 'Month_cos', 'Weekday_sin', 
       'Weekday_cos', 'MinutesSinceSunrise', 'MinutesUntilSunset']

In [None]:
# Sample Data
X = df[features]
y = df['Radiation']

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models
models = {
    'RandomForest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'CatBoost': CatBoostRegressor(verbose=0)
}

# Feature selection threshold
feature_threshold = 0.005 
selected_features_per_model = {}  # Dictionary to store selected features per model
metrics_per_model = {} # Dictionary to store metrics per model


# Train and evaluate each model
for name, model in models.items():
    print(f"\nTraining {name} with all features...")
    model.fit(X_train, y_train)
    
    # Extract feature importances
    if name == 'XGBoost':
        importances = pd.Series(model.get_booster().get_score(importance_type='weight'))
        importances = importances / importances.sum()
        importances = importances.reindex(X.columns, fill_value=0)
    else:
        importances = pd.Series(model.feature_importances_, index=X.columns)
    
    selected_features = importances[importances >= feature_threshold].index.tolist()
    selected_features_per_model[name] = selected_features
    print(f"Selected features for {name}: {selected_features}")
    
    # Evaluate base model
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    metrics_per_model[name] = {'RMSE': rmse, 'R2': r2}
    print(f"{name} Base Model -> RMSE: {rmse:.4f}, R2: {r2:.4f}")

# Final dictionaries to reuse
print("\nSelected Features Per Model:\n", selected_features_per_model)
print("\nBase Model Metrics:\n", metrics_per_model)


Training RandomForest with all features...
Selected features for RandomForest: ['Temperature', 'Pressure', 'Humidity', 'WindDirection(Degrees)', 'Speed', 'Day_sin', 'Day_cos', 'Month_sin', 'MinutesSinceSunrise', 'MinutesUntilSunset']
RandomForest Base Model -> RMSE: 76.5182, R2: 0.9454

Training XGBoost with all features...
Selected features for XGBoost: ['Temperature', 'Pressure', 'Humidity', 'WindDirection(Degrees)', 'Speed', 'Hour_sin', 'Hour_cos', 'Minute_sin', 'Minute_cos', 'Day_sin', 'Day_cos', 'Month_sin', 'Weekday_sin', 'Weekday_cos', 'MinutesSinceSunrise', 'MinutesUntilSunset']
XGBoost Base Model -> RMSE: 80.3027, R2: 0.9399

Training CatBoost with all features...
Selected features for CatBoost: ['Temperature', 'Pressure', 'Humidity', 'WindDirection(Degrees)', 'Speed', 'Hour_sin', 'Hour_cos', 'Minute_sin', 'Minute_cos', 'Day_sin', 'Day_cos', 'Month_sin', 'Month_cos', 'Weekday_sin', 'Weekday_cos', 'MinutesSinceSunrise', 'MinutesUntilSunset']
CatBoost Base Model -> RMSE: 77.976

#### Tuning Models

In [None]:
# Train / Val / Test Split
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.15, random_state=42)

# Define parameter grids for tuning
param_grids = {
    'RandomForest': [
        {}, 
        {'n_estimators': 150, 'max_depth': None},
        {'n_estimators': 300, 'max_depth': 15},
    ],
    'XGBoost': [
        {},
        {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.05},
        {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.03},
    ],
    'CatBoost': [
        {}, 
        {'iterations': 200, 'depth': 5, 'learning_rate': 0.05, 'verbose': 0},
        {'iterations': 300, 'depth': 6, 'learning_rate': 0.03, 'verbose': 0},
    ]
}

# Initialize a list to store all runs
all_runs = []

# Iterate through each model and its parameter grid
for model_name, param_list in param_grids.items():
    features = selected_features_per_model[model_name]

    print(f"\nTuning {model_name}...")

    # Iterate through each set of parameters
    for i, params in enumerate(param_list, 1):
        if model_name == 'RandomForest':
            model = RandomForestRegressor(**params)
        elif model_name == 'XGBoost':
            model = XGBRegressor(**params)
        elif model_name == 'CatBoost':
            params['verbose'] = 0
            model = CatBoostRegressor(**params)

        # Fit the model
        model.fit(X_train[features], y_train)
        # Validate the model
        val_preds = model.predict(X_val[features])
        val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
        val_r2 = r2_score(y_val, val_preds)

        # Store the run details
        all_runs.append({
            'model_name': model_name,
            'params': params,
            'features': features,
            'val_rmse': val_rmse,
            'val_r2': val_r2,
            'model': model
        })

        print(f"Run {i} | Params: {params} | Val RMSE: {val_rmse:.4f}, Val R2: {val_r2:.4f}")

print(f"\nTotal runs completed: {len(all_runs)}")


#### Log to MLFlow

In [None]:
# set up MLflow tracking and experiment
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("My_Model_Experiment")

# Initialize a list to store logged runs
logged_runs = []

print("Starting test evaluation...\n")

# Iterate through all runs and log to MLflow
for idx, run in enumerate(all_runs, 1):
    try:
        with mlflow.start_run(run_name=f"{run['model_name']}_run_{idx}") as mlflow_run:
            # Log parameters
            mlflow.log_params(run['params'])

            # Log validation metrics
            mlflow.log_metrics({'val_rmse': run['val_rmse'], 'val_r2': run['val_r2']})

            # Infer signature from validation features and predictions
            input_data = X_val[run['features']]
            predictions = run['model'].predict(input_data)
            signature = infer_signature(input_data, predictions)

            # Log model with signature and example input
            mlflow.sklearn.log_model(
                sk_model=run['model'],
                name="model",
                signature=signature,
                input_example=input_data.iloc[:5]
            )

            # Track run info
            logged_runs.append({
                'run_id': mlflow_run.info.run_id,
                **run
            })
    except mlflow.exceptions.MlflowException as e:
        print(f"Failed to log run {idx} for model {run['model_name']}: {e}")

print(f"Logged {len(logged_runs)} runs to MLflow.")


2025/07/14 12:59:50 INFO mlflow.tracking.fluent: Experiment with name 'My_Model_Experiment' does not exist. Creating a new experiment.


🏃 View run RandomForest_run_1 at: http://localhost:5000/#/experiments/1/runs/d3314c244bf64e4389f8516c2df9f69e
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run RandomForest_run_2 at: http://localhost:5000/#/experiments/1/runs/fd828a3cde2b48e19a719dc43578f35e
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run RandomForest_run_3 at: http://localhost:5000/#/experiments/1/runs/67a8c9761ceb4ce297fe49264c15029d
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run XGBoost_run_4 at: http://localhost:5000/#/experiments/1/runs/bd8c4cd4221c45daac70e996c44c2735
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run XGBoost_run_5 at: http://localhost:5000/#/experiments/1/runs/ebbf444252d3403787a8e24ffcc35cd6
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run XGBoost_run_6 at: http://localhost:5000/#/experiments/1/runs/de2e7e0998ab429eb0b40382b402f941
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run CatBoost_run_7 at: http://localhost:5000/#/experiments/1/runs/19e0b7023c034338a542ef94e9502abb
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run CatBoost_run_8 at: http://localhost:5000/#/experiments/1/runs/feb4f2e67e6d4d6e99f9ac1b6c18d2aa
🧪 View experiment at: http://localhost:5000/#/experiments/1




🏃 View run CatBoost_run_9 at: http://localhost:5000/#/experiments/1/runs/e65d168526824ae8899a0137a17a55a1
🧪 View experiment at: http://localhost:5000/#/experiments/1
Logged 9 runs to MLflow.


In [82]:
# Sort the logged runs by 'val_rmse' ascending (lower is better)
top_3_runs = sorted(logged_runs, key=lambda x: x['val_rmse'])[:3]

print("Top 3 Models by Val RMSE:")
for i, run in enumerate(top_3_runs, 1):
    print(f"{i}. Model: {run['model_name']}, Val RMSE: {run['val_rmse']:.4f}")

Top 3 Models by Val RMSE:
1. Model: RandomForest, Val RMSE: 82.0335
2. Model: RandomForest, Val RMSE: 82.0494
3. Model: RandomForest, Val RMSE: 82.2011


In [None]:
# Load the MLflow client
client = MlflowClient()
# Prepare to evaluate the top 3 runs on the test set
test_results = []

print("Starting test evaluation...\n")

# Iterate through the top 3 runs and evaluate on the test set
for run in top_3_runs:
    run_id = run['run_id']
    model_name = run['model_name']
    features = run['features']

    # Load model from MLflow
    model_uri = f"runs:/{run_id}/model"
    model = mlflow.sklearn.load_model(model_uri)

    # Predict on test data using training-time features
    X_test_subset = X_test[features]
    predictions = model.predict(X_test_subset)

    # Calculate metrics
    test_rmse = np.sqrt(mean_squared_error(y_test, predictions))
    test_r2 = r2_score(y_test, predictions)

    # Save result locally
    test_results.append({
        'run_id': run_id,
        'model_name': model_name,
        'test_rmse': test_rmse,
        'test_r2': test_r2
    })

    # 🚀 Log metrics to existing MLflow run
    with mlflow.start_run(run_id=run_id):
        mlflow.log_metrics({
            "test_rmse": test_rmse,
            "test_r2": test_r2
        })
        mlflow.set_tag("evaluation_status", "tested_with_separate_script")

    print(f"✅ Run {run_id} | Model: {model_name} | Test RMSE: {test_rmse:.4f}, Test R²: {test_r2:.4f}")

# Sort and display final results
test_results_sorted = sorted(test_results, key=lambda x: x['test_rmse'])

print("\nSorted Test Results:")
for res in test_results_sorted:
    print(f"Model: {res['model_name']}, Test RMSE: {res['test_rmse']:.4f}, Test R²: {res['test_r2']:.4f}")

Starting test evaluation...

🏃 View run RandomForest_run_2 at: http://localhost:5000/#/experiments/1/runs/fd828a3cde2b48e19a719dc43578f35e
🧪 View experiment at: http://localhost:5000/#/experiments/1
✅ Run fd828a3cde2b48e19a719dc43578f35e | Model: RandomForest | Test RMSE: 75.6833, Test R²: 0.9468
🏃 View run RandomForest_run_3 at: http://localhost:5000/#/experiments/1/runs/67a8c9761ceb4ce297fe49264c15029d
🧪 View experiment at: http://localhost:5000/#/experiments/1
✅ Run 67a8c9761ceb4ce297fe49264c15029d | Model: RandomForest | Test RMSE: 75.6127, Test R²: 0.9469
🏃 View run RandomForest_run_1 at: http://localhost:5000/#/experiments/1/runs/d3314c244bf64e4389f8516c2df9f69e
🧪 View experiment at: http://localhost:5000/#/experiments/1
✅ Run d3314c244bf64e4389f8516c2df9f69e | Model: RandomForest | Test RMSE: 76.9956, Test R²: 0.9450

Sorted Test Results:
Model: RandomForest, Test RMSE: 75.6127, Test R²: 0.9469
Model: RandomForest, Test RMSE: 75.6833, Test R²: 0.9468
Model: RandomForest, Test RM

In [None]:
# Register the best model based on validation RMSE
client = MlflowClient()
registry_model_name = "MyTopModel"
model_uri = f"runs:/{best_run_id}/model"

# Register the model
registered_model = mlflow.register_model(model_uri, registry_model_name)

print(f"\n✅ Model registered: '{registry_model_name}', Version: {registered_model.version}")

Registered model 'MyTopModel' already exists. Creating a new version of this model...
2025/07/14 15:36:03 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MyTopModel, version 2



✅ Model registered: 'MyTopModel', Version: 2


Created version '2' of model 'MyTopModel'.


In [None]:
# Add metadata tags to the registered model
client = MlflowClient()
registry_model_name = "MyTopModel"

# Add tags to registered model
client.set_registered_model_tag(
    name=registry_model_name,
    key="model_type",
    value=best_run['model_name']
)

client.set_registered_model_tag(
    name=registry_model_name,
    key="test_rmse",
    value=str(best_run['test_rmse'])
)

client.set_registered_model_tag(
    name=registry_model_name,
    key="features_used",
    value=",".join(best_run['features'])  # Convert list to string
)

print("✅ Added metadata tags to the registered model")

✅ Added metadata tags to the registered model


In [None]:
# Set an alias for the registered model
version = registered_model.version
client.set_registered_model_alias(registry_model_name, "champion", version)

In [None]:
# Load the champion model from the registry
model = mlflow.sklearn.load_model(f"models:/{registry_model_name}@champion")