In [2]:
import pandas as pd
import numpy as np 
import json
import joblib
import sagemaker
import boto3
import os
from time import gmtime, strftime, sleep
from sklearn.metrics import roc_auc_score
from sagemaker.experiments.run import Run, load_run

sagemaker.__version__

'2.165.0'

In [3]:
target_col = "DC_POWER"

In [4]:
session = sagemaker.Session()
sm = session.sagemaker_client

In [5]:
#read data and save it in pandas dataframe
df_gen1 = pd.read_csv("data/Plant_1_Generation_Data.csv")
df_gen2 = pd.read_csv("data/Plant_2_Generation_Data.csv")

df_weather1 = pd.read_csv("data/Plant_1_Weather_Sensor_Data.csv")
df_weather2 = pd.read_csv("data//Plant_2_Weather_Sensor_Data.csv")

## Create an experiment

In [6]:
experiment_name = f"Solar-energy-experiment-{strftime('%d-%H-%M-%S', gmtime())}"

## Feature engineering

In [7]:
# Adjust datetime format
df_gen1['DATE_TIME'] = pd.to_datetime(df_gen1['DATE_TIME'], format='%d-%m-%Y %H:%M')
df_weather1['DATE_TIME'] = pd.to_datetime(df_weather1['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
df_gen2['DATE_TIME'] = pd.to_datetime(df_gen2['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')  # Updated format
df_weather2['DATE_TIME'] = pd.to_datetime(df_weather2['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

# Drop unnecessary columns and merge dataframes
df_plant1 = pd.merge(
    df_gen1.drop(columns=['PLANT_ID','AC_POWER','TOTAL_YIELD']),
    df_weather1.drop(columns=['PLANT_ID', 'SOURCE_KEY']),
    on='DATE_TIME'
)

df_plant2 = pd.merge(
    df_gen2.drop(columns=['PLANT_ID','AC_POWER','TOTAL_YIELD']),
    df_weather2.drop(columns=['PLANT_ID', 'SOURCE_KEY']),
    on='DATE_TIME'
)

combined_plant = pd.concat([df_plant1, df_plant2])
combined_plant.drop(['SOURCE_KEY', 'DATE_TIME'], axis=1)

Unnamed: 0,DC_POWER,DAILY_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,0.0,0.0,25.184316,22.857507,0.0
1,0.0,0.0,25.184316,22.857507,0.0
2,0.0,0.0,25.184316,22.857507,0.0
3,0.0,0.0,25.184316,22.857507,0.0
4,0.0,0.0,25.184316,22.857507,0.0
...,...,...,...,...,...
67693,0.0,4157.0,23.202871,22.535908,0.0
67694,0.0,3931.0,23.202871,22.535908,0.0
67695,0.0,4322.0,23.202871,22.535908,0.0
67696,0.0,4218.0,23.202871,22.535908,0.0


In [8]:
# Shuffle and split the dataset
train_data, validation_data, test_data = np.split(
    combined_plant.sample(frac=1, random_state=1729),
    [int(0.7 * len(combined_plant)), int(0.9 * len(combined_plant))],
)

print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

Data split > train:(95530, 7) | validation:(27294, 7) | test:(13648, 7)


## Model training and validation

In [9]:
train_features = train_data.drop(target_col, axis=1)
train_label = pd.DataFrame(train_data[target_col])

In [14]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Drop non-numeric columns from train_features
train_features_numeric = train_features.drop(['DATE_TIME', 'SOURCE_KEY'], axis=1)


hyperparams = {"fit_intercept" : True, 
               "n_jobs" : None, 
               "copy_X" : True,
               "n_features_in_" :6
              }

model = LinearRegression(hyperparams)
model.fit(train_features_numeric, train_label)

# Evaluate the model
validation_features = validation_data.drop(target_col, axis=1)
validation_label = pd.DataFrame(validation_data[target_col])
validation_features_numeric = validation_features.drop(['DATE_TIME', 'SOURCE_KEY'], axis=1)
predictions = model.predict(validation_features_numeric)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(validation_label, predictions))

# Calculate R2 score
r2 = r2_score(validation_label, predictions)

report_dict = {
    "regression_metrics": {
        "rmse": {
            "value": rmse,
        },
        "r2_score": {
            "value": r2,
        },
    },
}

report_dict

{'regression_metrics': {'rmse': {'value': 1921.2928485641596},
  'r2_score': {'value': 0.6398703920783813}}}

In [15]:
test_features = validation_data.drop(target_col, axis=1)
test_label = pd.DataFrame(test_data[target_col])

# Drop non-numeric columns from train_features
test_features_numeric = test_features.drop(['DATE_TIME', 'SOURCE_KEY'], axis=1)

test_features = test_data.drop(target_col, axis=1)
test_label = pd.DataFrame(test_data[target_col])
test_features_numeric = test_features.drop(['DATE_TIME', 'SOURCE_KEY'], axis=1)
predictions = model.predict(test_features_numeric)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test_label, predictions))

# Calculate R2 score
r2 = r2_score(test_label, predictions)

report_dict = {
    "regression_metrics": {
        "rmse": {
            "value": rmse,
        },
        "r2_score": {
            "value": r2,
        },
    },
}

report_dict

{'regression_metrics': {'rmse': {'value': 1929.4053898332206},
  'r2_score': {'value': 0.6390534550616151}}}

### Create a run
Create a new run using the [`Run`](https://sagemaker.readthedocs.io/en/stable/experiments/sagemaker.experiments.html#run) class and call the [`log_parameters()`](https://sagemaker.readthedocs.io/en/stable/experiments/sagemaker.experiments.html#sagemaker.experiments.Run.log_parameters) and [`log_artifact()`](https://sagemaker.readthedocs.io/en/stable/experiments/sagemaker.experiments.html#sagemaker.experiments.Run.log_artifact) methods to record information to the run.

You can use [`log_file()`](https://sagemaker.readthedocs.io/en/stable/experiments/sagemaker.experiments.html#sagemaker.experiments.Run.log_file) method to upload local files to S3 to persistently store all data for the run.

In [11]:
output_path = 'data/combined_plant.csv'
combined_plant.to_csv(output_path, index=False)

In [12]:
run_suffix = strftime('%Y-%m-%M-%S', gmtime())

with Run(experiment_name=experiment_name,
         run_name=f"feature-engineering-{run_suffix}",
         run_display_name="feature-engineering",
         sagemaker_session=session) as run:
    run.log_parameters(
        {
            "train": 0.7,
            "validate": 0.2,
            "test": 0.1
        }
    )
    # Log input dataset metadata and output
    run.log_artifact(name="combined_plant", value="./data/combined_plant.csv", media_type="text/csv", is_output=False)
    run.log_artifact(name="train-csv", value="./data/train.csv", media_type="text/csv")
    run.log_artifact(name="validation-csv", value="./data/validation.csv", media_type="text/csv")
    run.log_artifact(name="test-csv", value="./data/test.csv", media_type="text/csv")

In [20]:
# Generate a run suffix for unique IDs
run_suffix = strftime('%Y-%m-%M-%S', gmtime())

n_jobs_values = [0, 2, 5, 10, 15, 20]
# Train the model for different hyperparameter values
for i, n_jobs in enumerate(n_jobs_values):
    hyperparams["n_jobs"] = n_jobs

    print(f"Fit m model n_jobs={n_jobs}")
    run_name = f"training-{i}-{run_suffix}"

    with Run(experiment_name=experiment_name,
             run_name=run_name,
             run_display_name=f"max-depth-{n_jobs}",
             sagemaker_session=session) as run:
        # Train the model
        model = LinearRegression(hyperparams)
        model.fit(train_features_numeric, train_label)
        predictions = model.predict(validation_features_numeric)

        # Calculate RMSE
        rmse_train = np.sqrt(mean_squared_error(validation_label, predictions))

        predictions = model.predict(test_features_numeric)

        # Calculate RMSE
        rmse_test = np.sqrt(mean_squared_error(test_label, predictions))

        # Log metrics to the run
        run.log_parameters({"n_jobs": n_jobs})  # Log n_jobs as a separate parameter
        run.log_metric(name="train_rmse", value=rmse_train, step=n_jobs)
        run.log_metric(name="test_rmse", value=rmse_test, step=n_jobs)

        print(f"train_rmse: {rmse_train:.4f} |test_rmse: {rmse_test:.4f}")


Fit m model n_jobs=0
train_rmse: 1921.2928 |test_rmse: 1929.4054
Fit m model n_jobs=2
train_rmse: 1921.2928 |test_rmse: 1929.4054
Fit m model n_jobs=5
train_rmse: 1921.2928 |test_rmse: 1929.4054
Fit m model n_jobs=10
train_rmse: 1921.2928 |test_rmse: 1929.4054
Fit m model n_jobs=15
train_rmse: 1921.2928 |test_rmse: 1929.4054
Fit m model n_jobs=20
train_rmse: 1921.2928 |test_rmse: 1929.4054
