In [246]:
!pip install Feature-engine
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from feature_engine.creation import CyclicalFeatures
import scipy.stats as stats

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Both worksheets are read from the same Excel workbook.
fileName = "Dataset.xlsx"

# "Plant_1_Data" sheet: PLANT_ID is dropped and DATE_TIME parsed to datetimes.
df_Labels = (
    pd.read_excel(fileName, sheet_name="Plant_1_Data")
    .drop(columns=["PLANT_ID"])
    .assign(DATE_TIME=lambda frame: pd.to_datetime(frame["DATE_TIME"]))
)

# "Plant_1_Sensor_Data" sheet: PLANT_ID and SOURCE_KEY are dropped, so the
# only SOURCE_KEY left after the later merge is the one from df_Labels.
df_Features = (
    pd.read_excel(fileName, sheet_name="Plant_1_Sensor_Data")
    .drop(columns=["PLANT_ID", "SOURCE_KEY"])
    .assign(DATE_TIME=lambda frame: pd.to_datetime(frame["DATE_TIME"]))
)

# Quick look at the first sensor rows.
print(df_Features.head())

In [None]:
# Pair each label row with the sensor row that has the same DATE_TIME.
# The inner join keeps only timestamps present in both frames.
df_Learn = df_Features.merge(df_Labels, how="inner", on="DATE_TIME")
df_Learn

In [None]:
# NOTE: a disabled prototype that built a separate train/val/test split per
# SOURCE_KEY used to sit here; this cell builds a single split over all rows.

# Feature matrix: every column except the power/yield columns.
# DC_POWER is the regression target.
X = df_Learn.drop(['DC_POWER', 'AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD'], axis = 1)
# Replace the raw timestamp with its hour and minute components.
X['hour'] = X['DATE_TIME'].dt.hour
X['min'] = X['DATE_TIME'].dt.minute
X = X.drop(['DATE_TIME'], axis = 1)
y = df_Learn['DC_POWER']
display(X)

# 90% train / 10% test, reproducible through the fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# train_test_split returns row selections of X; take explicit copies so the
# column assignments below write to independent frames and do not raise
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Standardise the continuous sensor columns. The scaler is fitted on the
# training rows only and then applied unchanged to the test rows.
cols_to_scale = ['AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION']
scaler = StandardScaler()
scaler.fit(X_train[cols_to_scale])
X_train[cols_to_scale] = scaler.transform(X_train[cols_to_scale])
X_test[cols_to_scale] = scaler.transform(X_test[cols_to_scale])
display(X_train)

In [None]:
# Show the training frame as it looks before the time columns are encoded.
display(X_train)

# Encode 'hour' and 'min' with feature_engine's CyclicalFeatures and drop the
# original columns. The encoder is fitted on the training rows only.
cyclical = CyclicalFeatures(drop_original=True, variables=['hour', 'min'])
cyclical.fit(X_train)
X_train, X_test = (cyclical.transform(part) for part in (X_train, X_test))

# Column count of the encoded training frame, plus one.
len(X_train.columns) + 1

In [None]:
# SOURCE_KEY is kept in X_train/X_test but excluded from the model inputs.
train_inputs = X_train.drop(columns=['SOURCE_KEY'])
test_inputs = X_test.drop(columns=['SOURCE_KEY'])

# RANSAC wrapper around an ordinary least-squares regressor.
ransacReg1 = RANSACRegressor(
    LinearRegression(),
    min_samples=50,
    loss='squared_error',
    random_state=42,
)
ransacReg1.fit(train_inputs, y_train)
prediction1 = ransacReg1.predict(test_inputs)

# Score of the fitted model on the held-out rows (shown as the cell output).
ransacReg1.score(test_inputs, y_test)

In [None]:
# Predict DC_POWER for one hand-written reading, reusing the fitted scaler,
# cyclical encoder and RANSAC model from the earlier cells.
# (Alternative inverter to try: SOURCE_KEY = 'HmiyD2TTLFNqkNe'.)
# The sensor readings are numeric literals, not strings, so the frame has
# float columns and scaler.transform does not depend on implicit
# object-to-float coercion.
vals = pd.DataFrame(
    [[pd.to_datetime('2020-05-15 12:30:00'), 32.14768473, 52.35325513, 0.6492476293, '1BY6WEcLGh8j5v7']],
    columns=['DATE_TIME', 'AMBIENT_TEMPERATURE', 'MODULE_TEMPERATURE', 'IRRADIATION', 'SOURCE_KEY'],
)
# Same feature engineering as the training data: hour/minute from the
# timestamp, then drop the timestamp itself.
vals['hour'] = vals['DATE_TIME'].dt.hour
vals['min'] = vals['DATE_TIME'].dt.minute
vals = vals.drop(['DATE_TIME'], axis = 1)
display(vals)
vals[cols_to_scale] = scaler.transform(vals[cols_to_scale])
vals = cyclical.transform(vals)

# predict() yields one value per input row; take the single entry as a plain
# float so the result has the same type whether or not it is clamped.
raw_prediction = ransacReg1.predict(vals.drop(['SOURCE_KEY'], axis = 1))
# Negative predictions are reported as 0.0, matching the clamp applied to the
# test-set predictions.
predicted_value = max(float(raw_prediction[0]), 0.0)
display("DC_POWER Prediction for sample: " + str(predicted_value))

In [None]:
# Predict on the held-out rows (SOURCE_KEY excluded, as during fitting).
unclipped = ransacReg1.predict(X_test.drop(columns=['SOURCE_KEY']))
# Replace every negative prediction with 0.0 before scoring.
pred = np.where(unclipped < 0, 0.0, unclipped)

# Mean absolute error of the clamped predictions against the ground truth.
mae = mean_absolute_error(y_test, pred)
display(mae)

# Significance Tests

Two-tailed Student's t-test comparing y_test and pred, with 4568 samples in each, at a 5% significance level (95% confidence level). We choose the t-test because the population variance is unknown.

In [None]:
# Pre-checks for the two-sample t-test: sample size and whether the two
# samples have comparable variance.
display("Size of samples: " + str(len(y_test)))

# Variance check for the two-tailed test.
# y_test holds raw DC_POWER values (no log transform is applied anywhere in
# this notebook), so its variance is taken directly. The previous
# np.var(np.exp(y_test)) - 1 exponentiated raw values and disagreed with the
# ratio below, which already used np.var(y_test).
var_pred = np.var(pred)
var_true = np.var(y_test)
display("Variance of Predicted Samples: " + str(var_pred), "Variance of Ground Truth: " + str(var_true))
display("Standard Deviation of the Sample: " + str(np.sqrt(var_pred)))

# Ratio of the two variances; a value near 1 indicates similar spread.
display("Ratio to check for nearly equal Variance: " + str(var_pred/var_true))

In [None]:
# Independent two-sample t-test between the predictions and the ground truth.
# equal_var=True selects the pooled-variance form of the test.
tstat, t_pval = stats.ttest_ind(pred, y_test, equal_var=True)

# Report both values rounded to three decimals.
display(f"T-Statistic: {tstat.round(3)!s}", f"PValue: {t_pval.round(3)!s}")

In [None]:
from sklearn.decomposition import PCA
import plotly.express as px

# Fit PCA on the training features (SOURCE_KEY excluded) and project them
# onto the principal components.
pca = PCA() #decomposition.PCA(n_components = 2)
view = pca.fit_transform(X_train.drop(columns=['SOURCE_KEY']))

# Axis captions keyed by component index (as a string), each showing the
# percentage of variance that component explains.
labels = dict(
    (str(idx), f"PC {idx+1} ({share:.1f}%)")
    for idx, share in enumerate(pca.explained_variance_ratio_ * 100)
)

# Scatter matrix of the first two components; the diagonal panels are hidden.
fig = px.scatter_matrix(view, dimensions=range(2), labels=labels)
fig.update_traces(diagonal_visible=False)
fig.show()

In [None]:
# inlier = ransacReg1.inlier_mask_
# outlier = np.logical_not(inlier)
# line_X = np.arange(3, 51, 2)
# print(X_test)
# line_y = ransacReg1.predict(line_X[:, np.newaxis])
# plt.scatter(X_test[inlier], y_test[inlier], c='lightblue', marker='o', label='Inliers')
# plt.scatter(X_test[outlier], y_test[outlier], c='green', marker='s', label='Outliers')
# plt.plot(line_X, line_y, color = 'black')
# plt.xlabel('AMBIENT_TEMPERATURE')
# plt.ylabel('DC_POWER')
# plt.legend(loc='upper left')

In [None]:
# One scatter plot per sensor feature against DC_POWER. Each figure carries
# its own axis labels and title so it can be read on its own.
for feature_name in ['MODULE_TEMPERATURE', 'AMBIENT_TEMPERATURE', 'IRRADIATION']:
    plt.figure()
    plt.scatter(df_Learn[feature_name], df_Learn['DC_POWER'])
    plt.xlabel(feature_name)
    plt.ylabel('DC_POWER')
    plt.title('DC_POWER vs ' + feature_name)
    plt.show()

# plt.scatter(np.log(df_Learn['AMBIENT_TEMPERATURE']), np.log(df_Learn['DC_POWER']))
# plt.show()

In [None]:
# Same hour/minute extraction as for the model features, but DC_POWER is kept
# so it can be plotted against the hour of day.
X_time = (
    df_Learn
    .drop(columns=['AC_POWER', 'DAILY_YIELD', 'TOTAL_YIELD'])
    .assign(
        hour=lambda frame: frame['DATE_TIME'].dt.hour,
        min=lambda frame: frame['DATE_TIME'].dt.minute,
    )
    .drop(columns=['DATE_TIME'])
)

plt.scatter(X_time['hour'], X_time['DC_POWER'])
plt.show()

In [None]:
# Predicted vs. true DC_POWER on the test split, with the identity line as a
# reference for a perfect prediction.
plt.figure(figsize=(15,15))
# Each red point is one test row: x = true value, y = model prediction.
plt.scatter(y_test, pred, c='red', label='Predictions')

# End points of the identity line: overall max and min across both series.
# np.max/np.min reduce the arrays directly instead of iterating element by
# element with the Python built-ins.
p1 = max(np.max(pred), np.max(y_test))
p2 = min(np.min(pred), np.min(y_test))
plt.plot([p1, p2], [p1, p2], 'green', label='Ideal fit (prediction = true value)')
plt.xlabel('True Values', fontsize=15)
plt.ylabel('Predictions', fontsize=15)
# Labels are attached to the artists above. The old positional legend named
# the prediction scatter 'Actual Value' and the identity line
# 'Predicted Value'.
plt.legend()
plt.axis('equal')
plt.show()