In [7]:
import pandas as pd
import numpy as np 
import json
import joblib
import sagemaker
import boto3
import os
from time import gmtime, strftime, sleep
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sagemaker.experiments.run import Run, load_run

# Show the installed SageMaker SDK version so it is recorded in the notebook output.
sagemaker.__version__

'2.132.0'

In [8]:
!pip install xgboost

[0m

In [9]:
# Name of the column used as the regression label when the XGBoost matrices are built.
target_col = "DC_POWER"

In [10]:
# SageMaker session that is handed to every experiment Run created in this notebook.
session = sagemaker.Session()
# SageMaker client taken from the session (not referenced by the cells visible here).
sm = session.sagemaker_client

In [11]:
# Load the combined plant readings into a pandas DataFrame.
# The path is relative to the notebook's working directory.
df = pd.read_csv("data/combined_plant.csv")

In [12]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,DAILY_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,2020-05-15 00:00:00,1BY6WEcLGh8j5v7,0.0,0.0,25.184316,22.857507,0.0
1,2020-05-15 00:00:00,1IF53ai7Xc0U56Y,0.0,0.0,25.184316,22.857507,0.0
2,2020-05-15 00:00:00,3PZuoBAID5Wc2HD,0.0,0.0,25.184316,22.857507,0.0
3,2020-05-15 00:00:00,7JYdWkrLSPkdwr4,0.0,0.0,25.184316,22.857507,0.0
4,2020-05-15 00:00:00,McdE0feGgRqW7Ca,0.0,0.0,25.184316,22.857507,0.0


## Create an experiment

In [13]:
# Experiment name with a day-hour-minute-second (UTC) suffix; it is computed once here
# and reused by every Run created further down.
experiment_name = f"Solar-energy-experiment-model2-{strftime('%d-%H-%M-%S', gmtime())}"

## Feature engineering

In [14]:
# Preview the frame without the identifier and timestamp columns.
# The result is only displayed, not assigned: `df` still contains SOURCE_KEY and
# DATE_TIME afterwards, and later cells drop them again from the split frames.
df.drop(columns=['SOURCE_KEY', 'DATE_TIME'])

Unnamed: 0,DC_POWER,DAILY_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
0,0.0,0.0,25.184316,22.857507,0.0
1,0.0,0.0,25.184316,22.857507,0.0
2,0.0,0.0,25.184316,22.857507,0.0
3,0.0,0.0,25.184316,22.857507,0.0
4,0.0,0.0,25.184316,22.857507,0.0
...,...,...,...,...,...
136467,0.0,4157.0,23.202871,22.535908,0.0
136468,0.0,3931.0,23.202871,22.535908,0.0
136469,0.0,4322.0,23.202871,22.535908,0.0
136470,0.0,4218.0,23.202871,22.535908,0.0


In [15]:
# Shuffle once with a fixed seed, then cut the rows 70% / 20% / 10% into
# train / validation / test.
shuffled_df = df.sample(frac=1, random_state=1729)
train_end = int(0.7 * len(df))
validation_end = int(0.9 * len(df))

# Positional slicing replaces np.split(DataFrame, ...), which goes through the
# deprecated DataFrame.swapaxes and emits a FutureWarning on recent pandas releases.
# The three slices cover the same rows, in the same order, as the np.split call did.
train_data = shuffled_df.iloc[:train_end]
validation_data = shuffled_df.iloc[train_end:validation_end]
test_data = shuffled_df.iloc[validation_end:]

# Every row must land in exactly one split.
assert len(train_data) + len(validation_data) + len(test_data) == len(df)

print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

Data split > train:(95530, 7) | validation:(27294, 7) | test:(13648, 7)


In [17]:
# File-location settings. Of these, only output_path is referenced by the cells
# visible in this notebook (as the directory the split CSVs are written to).
file_source = "EFS"
file_name = "combined_plant.csv"
input_path = "./data" 
output_path = "./data"

In [19]:
# Write each split next to the source data as a header-less, index-less CSV.
for csv_name, split_frame in (
    ("train.csv", train_data),
    ("validation.csv", validation_data),
    ("test.csv", test_data),
):
    split_frame.to_csv(os.path.join(output_path, csv_name), index=False, header=False)
    

## Model training and validation

In [35]:
# Separate the label column from the training split, then keep only the
# columns that remain once the timestamp and source identifier are removed.
train_label = train_data[target_col].to_frame()
train_features = train_data.drop(columns=target_col)
train_features_numeric = train_features.drop(columns=['DATE_TIME', 'SOURCE_KEY'])

In [44]:
# Wrap the training features and their label in an XGBoost DMatrix; this object is
# reused by the cross-validation call and by every training run below.
dtrain = xgb.DMatrix(train_features_numeric, label=train_label)

In [79]:
# Boosting-loop settings shared by xgb.cv and xgb.train.
early_stopping_rounds = 10
nfold = 3
num_boost_round = 300

# Baseline booster parameters; the sweep cells below vary "eta" and "base_score".
hyperparams = {
    "eta": 0.01,
    "booster": "gblinear",
    "objective": "reg:squarederror",
    "base_score": 0.5,
    "eval_metric": "rmse",
    "random_state": 567,
}

In [56]:
# Cross-validate the baseline hyperparameters on the training matrix using nfold folds,
# at most num_boost_round rounds and the configured early-stopping patience.
# RMSE is requested both through `metrics` here and through "eval_metric" in hyperparams.
cv_results = xgb.cv(
    params=hyperparams,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    nfold=nfold,
    early_stopping_rounds=early_stopping_rounds,
    metrics=["rmse"],
    seed=10,
)

In [57]:
# Summarise the last row of the cross-validation table: mean RMSE and its
# standard deviation for the held-out folds ("validation") and the training folds.
last_round = cv_results.iloc[-1]

metrics_data = {
    "Regressor_metrics": {
        metric_name: {
            "value": last_round[mean_column],
            "standard_deviation": last_round[std_column],
        }
        for metric_name, mean_column, std_column in (
            ("validation:rmse", "test-rmse-mean", "test-rmse-std"),
            ("train:rmse", "train-rmse-mean", "train-rmse-std"),
        )
    }
}

In [58]:
# Display the per-round mean/std RMSE table returned by xgb.cv.
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,3599.320656,4.019947,3599.313984,8.457768
1,3546.256156,3.814369,3546.250040,8.530967
2,3496.405194,3.619088,3496.399750,8.583738
3,3449.586998,3.433729,3449.582332,8.617419
4,3405.628122,3.257882,3405.624338,8.633361
...,...,...,...,...
145,2529.313635,1.629841,2529.401547,1.462686
146,2528.095928,1.635867,2528.184079,1.454139
147,2526.890216,1.641819,2526.978606,1.445884
148,2525.696385,1.647697,2525.785012,1.437911


In [80]:
# Split the held-out test frame into label and features, remove the timestamp and
# source-identifier columns, and wrap the result in a DMatrix for evaluation.
test_label = test_data[target_col].to_frame()
test_features = test_data.drop(columns=target_col)
test_features_numeric = test_features.drop(columns=['DATE_TIME', 'SOURCE_KEY'])
dtest = xgb.DMatrix(data=test_features_numeric, label=test_label)

In [81]:
# Full UTC date-time stamp (year-month-day-hour-minute-second) for the run name.
# The previous pattern '%Y-%m-%M-%S' used the minute directive in place of the day
# and omitted the hour, so the same run name could recur every hour of the month.
run_suffix = strftime('%Y-%m-%d-%H-%M-%S', gmtime())

# Record the data-preparation step as its own run inside the experiment.
with Run(experiment_name=experiment_name,
         run_name=f"feature-engineering-{run_suffix}",
         run_display_name="feature-engineering",
         sagemaker_session=session) as run:
    # Fractions used when the dataset was cut into train / validation / test.
    run.log_parameters(
        {
            "train": 0.7,
            "validate": 0.2,
            "test": 0.1
        }
    )
    # The source CSV is logged as an input artifact (is_output=False) ...
    run.log_artifact(name="combined-plant-dataset", value="./data/combined_plant.csv", media_type="text/csv", is_output=False)
    # ... and the three split files are logged as artifacts with the default direction.
    run.log_artifact(name="train-csv", value="./data/train.csv", media_type="text/csv")
    run.log_artifact(name="validation-csv", value="./data/validation.csv", media_type="text/csv")
    run.log_artifact(name="test-csv", value="./data/test.csv", media_type="text/csv")

In [85]:
# Full UTC date-time stamp so run names stay unique across hours and days.
# The previous pattern '%Y-%m-%M-%S' used the minute directive in place of the day
# and omitted the hour. Production code should still use truly unique ids.
run_suffix = strftime('%Y-%m-%d-%H-%M-%S', gmtime())

# Train one model per learning rate (eta) and record each as its own experiment run.
for i, d in enumerate([0.01, .05, .1, .5, 0.7, 0.8, 0.9]):
    # Work on a copy so the shared `hyperparams` baseline is not mutated; otherwise
    # the last eta tried here would silently carry over into later cells.
    run_params = {**hyperparams, "eta": d}

    print(f"Fit estimator with eta={d}")
    run_name = f"training-{i}-{run_suffix}"

    with Run(experiment_name=experiment_name,
             run_name=run_name,
             run_display_name=f"eta-{i+1}",
             sagemaker_session=session) as run:
        # NOTE(review): dtest is the last entry in `evals`, so it also drives early
        # stopping; the test RMSE reported below is therefore optimistic. A separate
        # validation split exists (`validation_data`) but is not used here - consider
        # building a DMatrix from it for `evals`.
        model = xgb.train(
            params=run_params,
            dtrain=dtrain,
            evals=[(dtrain, 'train'), (dtest, 'eval')],
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=0,
        )

        # Root-mean-squared error on the test and training matrices.
        test_rmse = np.sqrt(mean_squared_error(test_label, model.predict(dtest)))
        train_rmse = np.sqrt(mean_squared_error(train_label, model.predict(dtrain)))

        # Log the exact parameters used for this fit together with its metrics.
        run.log_parameters(run_params)
        run.log_metric(name="test_rmse", value=test_rmse, step=i+1)
        run.log_metric(name="train_rmse", value=train_rmse, step=i+1)

        print(f"Test rmse: {test_rmse:.4f} | Train rmse: {train_rmse:.4f}")

Fit estimator with eta=0.01
Test rmse: 2415.4540 | Train rmse: 2430.8659
Fit estimator with eta=0.05
Test rmse: 2358.3558 | Train rmse: 2373.6647
Fit estimator with eta=0.1
Test rmse: 2330.6882 | Train rmse: 2345.8816
Fit estimator with eta=0.5
Test rmse: 2140.2927 | Train rmse: 2155.0008
Fit estimator with eta=0.7
Test rmse: 2063.3656 | Train rmse: 2077.9571
Fit estimator with eta=0.8
Test rmse: 2029.9390 | Train rmse: 2044.4560
Fit estimator with eta=0.9
Test rmse: 2000.2098 | Train rmse: 2014.6174


In [91]:
# Full UTC date-time stamp so run names stay unique across hours and days.
# The previous pattern '%Y-%m-%M-%S' used the minute directive in place of the day
# and omitted the hour. Production code should still use truly unique ids.
run_suffix = strftime('%Y-%m-%d-%H-%M-%S', gmtime())

# NOTE(review): the saved output of this cell shows ResourceLimitExceeded (a limit of
# 50 trial components per trial). Presumably earlier executions filled the experiment
# with runs; old runs may need deleting or a fresh experiment name - confirm.

# Train one model per base_score value and record each as its own experiment run.
for i, d in enumerate([.05, .1, .5, 0.6, 0.7]):
    # Work on a copy so the shared `hyperparams` baseline is not mutated by the sweep.
    run_params = {**hyperparams, "base_score": d}

    print(f"Fit estimator with base_score={d}")
    run_name = f"training-{i}-{run_suffix}"

    with Run(experiment_name=experiment_name,
             run_name=run_name,
             # Was "eta-{i+1}", a copy-paste leftover; this sweep varies base_score.
             run_display_name=f"base-score-{i+1}",
             sagemaker_session=session) as run:
        # NOTE(review): dtest is the last entry in `evals`, so it also drives early
        # stopping; the test RMSE reported below is therefore optimistic. A separate
        # validation split exists (`validation_data`) but is not used here - consider
        # building a DMatrix from it for `evals`.
        model = xgb.train(
            params=run_params,
            dtrain=dtrain,
            evals=[(dtrain, 'train'), (dtest, 'eval')],
            num_boost_round=num_boost_round,
            early_stopping_rounds=early_stopping_rounds,
            verbose_eval=0,
        )

        # Root-mean-squared error on the test and training matrices.
        test_rmse = np.sqrt(mean_squared_error(test_label, model.predict(dtest)))
        train_rmse = np.sqrt(mean_squared_error(train_label, model.predict(dtrain)))

        # Log the exact parameters used for this fit together with its metrics.
        run.log_parameters(run_params)
        run.log_metric(name="test_rmse", value=test_rmse, step=i+1)
        run.log_metric(name="train_rmse", value=train_rmse, step=i+1)

        print(f"Test rmse: {test_rmse:.4f} | Train rmse: {train_rmse:.4f}")

Fit estimator with base_score=0.05


ResourceLimitExceeded: An error occurred (ResourceLimitExceeded) when calling the AssociateTrialComponent operation: The account-level service limit 'Total number of trial components allowed in a single trial, excluding those automatically created by SageMaker' is 50 Trial Components, with current utilization of 0 Trial Components and a request delta of 51 Trial Components. Please use AWS Service Quotas to request an increase for this quota. If AWS Service Quotas is not available, contact AWS support to request an increase for this quota.

In [92]:
%%html

<!-- Shows a notice and programmatically clicks a hidden button bound to the
     "kernelmenu:shutdown" command. -->
<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    // Declared with const so the lookup does not create an implicit global `els`.
    const els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp: best effort only; if the button is missing or the click fails,
    // the kernel is simply left running.
}    
</script>