In [2]:
%pip install -q xgboost --upgrade sagemaker  --ignore-installed PyYAML

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest-astropy 0.8.0 requires pytest-cov>=2.0, which is not installed.
pytest-astropy 0.8.0 requires pytest-filter-subpackage>=0.1, which is not installed.
spyder 4.0.1 requires pyqt5<5.13; python_version >= "3", which is not installed.
spyder 4.0.1 requires pyqtwebengine<5.13; python_version >= "3", which is not installed.
aiobotocore 2.4.2 requires botocore<1.27.60,>=1.27.59, but you have botocore 1.29.160 which is incompatible.
awscli 1.27.154 requires botocore==1.29.154, but you have botocore 1.29.160 which is incompatible.
awscli 1.27.154 requires PyYAML<5.5,>=3.10, but you have pyyaml 6.0 which is incompatible.
awscli 1.27.154 requires rsa<4.8,>=3.1.2, but you have rsa 4.9 which is incompatible.
python-language-server 0.31.7 requires jedi<0.16,>=0.14.1, but you have jedi 0.18.2 which is incompatible.
py

In [3]:
import pandas as pd
import numpy as np 
import json
import joblib
import sagemaker
import boto3
import os
from time import gmtime, strftime, sleep
from sklearn.metrics import roc_auc_score
from sagemaker.experiments.run import Run, load_run

sagemaker.__version__

'2.165.0'

In [4]:
target_col = "DC_POWER"

In [5]:
session = sagemaker.Session()
sm = session.sagemaker_client

In [6]:
#read data and save it in pandas dataframe
df_gen1 = pd.read_csv("data/Plant_1_Generation_Data.csv")
df_gen2 = pd.read_csv("data/Plant_2_Generation_Data.csv")

df_weather1 = pd.read_csv("data/Plant_1_Weather_Sensor_Data.csv")
df_weather2 = pd.read_csv("data//Plant_2_Weather_Sensor_Data.csv")

## Create an experiment

In [7]:
experiment_name = f"Solar-energy-experiment-{strftime('%d-%H-%M-%S', gmtime())}"

## Feature engineering

In [61]:
# Adjust datetime format
df_gen1['DATE_TIME'] = pd.to_datetime(df_gen1['DATE_TIME'], format='%d-%m-%Y %H:%M')
df_weather1['DATE_TIME'] = pd.to_datetime(df_weather1['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')
df_gen2['DATE_TIME'] = pd.to_datetime(df_gen2['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')  # Updated format
df_weather2['DATE_TIME'] = pd.to_datetime(df_weather2['DATE_TIME'], format='%Y-%m-%d %H:%M:%S')

# Drop unnecessary columns and merge dataframes
df_plant1 = pd.merge(
    df_gen1.drop(columns=['PLANT_ID','AC_POWER','TOTAL_YIELD']),
    df_weather1.drop(columns=['PLANT_ID', 'SOURCE_KEY']),
    on='DATE_TIME'
)

df_plant2 = pd.merge(
    df_gen2.drop(columns=['PLANT_ID','AC_POWER','TOTAL_YIELD']),
    df_weather2.drop(columns=['PLANT_ID', 'SOURCE_KEY']),
    on='DATE_TIME'
)

combined_plant = pd.concat([df_plant1, df_plant2])

# adding separate time and date columns
combined_plant["DATE"] = pd.to_datetime(combined_plant["DATE_TIME"]).dt.date # add new column with date
combined_plant["TIME"] = pd.to_datetime(combined_plant["DATE_TIME"]).dt.time # add new column with time

#drop
combined_plant['DATE_TIME'] = combined_plant['DATE_TIME'].astype(np.int64) // 10**9

# add hours and minutes for ml models
combined_plant['HOURS'] = pd.to_datetime(combined_plant['TIME'],format='%H:%M:%S').dt.hour
combined_plant['MINUTES'] = pd.to_datetime(combined_plant['TIME'],format='%H:%M:%S').dt.minute
combined_plant['MINUTES_PASS'] = combined_plant['MINUTES'] + combined_plant['HOURS']*60

combined_plant.drop(['SOURCE_KEY', 'DATE', 'TIME'], axis=1)



Unnamed: 0,DATE_TIME,DC_POWER,DAILY_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,HOURS,MINUTES,MINUTES_PASS
0,1589500800,0.0,0.0,25.184316,22.857507,0.0,0,0,0
1,1589500800,0.0,0.0,25.184316,22.857507,0.0,0,0,0
2,1589500800,0.0,0.0,25.184316,22.857507,0.0,0,0,0
3,1589500800,0.0,0.0,25.184316,22.857507,0.0,0,0,0
4,1589500800,0.0,0.0,25.184316,22.857507,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...
67693,1592437500,0.0,4157.0,23.202871,22.535908,0.0,23,45,1425
67694,1592437500,0.0,3931.0,23.202871,22.535908,0.0,23,45,1425
67695,1592437500,0.0,4322.0,23.202871,22.535908,0.0,23,45,1425
67696,1592437500,0.0,4218.0,23.202871,22.535908,0.0,23,45,1425


In [62]:
combined_plant.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136472 entries, 0 to 67697
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   DATE_TIME            136472 non-null  int64  
 1   SOURCE_KEY           136472 non-null  object 
 2   DC_POWER             136472 non-null  float64
 3   DAILY_YIELD          136472 non-null  float64
 4   AMBIENT_TEMPERATURE  136472 non-null  float64
 5   MODULE_TEMPERATURE   136472 non-null  float64
 6   IRRADIATION          136472 non-null  float64
 7   DATE                 136472 non-null  object 
 8   TIME                 136472 non-null  object 
 9   HOURS                136472 non-null  int64  
 10  MINUTES              136472 non-null  int64  
 11  MINUTES_PASS         136472 non-null  int64  
dtypes: float64(5), int64(4), object(3)
memory usage: 13.5+ MB


In [54]:
# Shuffle and split the dataset
train_data, validation_data, test_data = np.split(
    combined_plant.sample(frac=1, random_state=1729),
    [int(0.7 * len(combined_plant)), int(0.9 * len(combined_plant))],
)

print(f"Data split > train:{train_data.shape} | validation:{validation_data.shape} | test:{test_data.shape}")

Data split > train:(95530, 12) | validation:(27294, 12) | test:(13648, 12)


## Model training and validation

In [63]:
train_features = train_data.drop(target_col, axis=1)
train_label = pd.DataFrame(train_data[target_col])
train_features = train_features.drop('SOURCE_KEY', axis=1)

In [56]:
train_label.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95530 entries, 52906 to 32601
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   DC_POWER  95530 non-null  float64
dtypes: float64(1)
memory usage: 1.5 MB


In [64]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

train_features = train_data.drop(target_col, axis=1)
train_label = pd.DataFrame(train_data[target_col])

# Create and fit the linear regression model
hyperparams = {"fit_intercept" : True, 
               "n_jobs" : None, 
               "copy_X" : True,
               "n_features_in_" :6
              }

nfold=6
model = LinearRegression(hyperparams)

linear_model.fit(train_features, train_label)

# Evaluate the model
predictions = linear_model.predict(test_features)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Calculate R2 score
r2 = r2_score(y_test, predictions)

report_dict = {
    "regression_metrics": {
        "rmse": {
            "value": rmse,
        },
        "r2_score": {
            "value": r2,
        },
    },
}


ValueError: could not convert string to float: 'mqwcsP2rE7J0TFp'

In [57]:
from sklearn.linear_model import LinearRegression

hyperparams = {"fit_intercept" : True, 
               "n_jobs" : None, 
               "copy_X" : True,
               "n_features_in_" :6
              }

nfold=6
model = LinearRegression(hyperparams)


In [58]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error


# Perform cross-validation
cv_scores = cross_val_score(model, train_features, train_label, cv=nfold, scoring='neg_root_mean_squared_error')

# Convert the negative RMSE scores to positive
cv_scores = -cv_scores

# Print the mean and standard deviation of the RMSE scores
print("Mean RMSE:", np.mean(cv_scores))
print("Standard Deviation of RMSE:", np.std(cv_scores))


ValueError: could not convert string to float: 'YxYtjZvoooNbGkE'

ValueError: could not convert string to float: 'mqwcsP2rE7J0TFp'

ValueError: could not convert string to float: 'mqwcsP2rE7J0TFp'



Mean RMSE: nan
Standard Deviation of RMSE: nan


ValueError: could not convert string to float: 'mqwcsP2rE7J0TFp'

ValueError: could not convert string to float: 'mqwcsP2rE7J0TFp'

ValueError: could not convert string to float: 'mqwcsP2rE7J0TFp'



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

linear_model = LinearRegression()
linear_model.fit(train_features, train_label)

# Evaluate the model
predictions = linear_model.predict(test_features)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, predictions))

# Calculate R2 score
r2 = r2_score(y_test, predictions)

report_dict = {
    "regression_metrics": {
        "rmse": {
            "value": rmse,
        },
        "r2_score": {
            "value": r2,
        },
    },
}


In [None]:
dtrain = xgb.DMatrix(train_features, label=train_label)