In [396]:
import numpy as np
import pandas as pd

df = pd.read_parquet("solar_cleaned.parquet")

In [397]:
middle = int(len(df) * 0.7)

train = df.loc[:middle].copy()
test = df.loc[middle:]
print("Training shape: " + str(train.shape))
print("Testing shape: " + str(test.shape))

Training shape: (5360, 39)
Testing shape: (36267, 39)


In [398]:
import cloudpickle as cp
feature_pipeline = cp.load(open('feature_pipeline.sav', 'rb'))

In [399]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection

rfc = RandomForestRegressor(n_estimators=50, max_depth=8)

svecs = feature_pipeline.fit_transform(train)

rfc.fit(svecs, train[['ghi_efficiency', 'dni_efficiency']])

RandomForestRegressor(max_depth=8, n_estimators=50)

In [400]:
from sklearn.neural_network import MLPRegressor

regr = MLPRegressor(random_state=1, max_iter=500).fit(svecs, train[['ghi_efficiency', 'dni_efficiency']])
regr.predict(feature_pipeline.fit_transform(test)[:2])

array([[1.00998243, 0.95450768],
       [1.00959982, 0.98312952]])

In [401]:
regr.score(feature_pipeline.fit_transform(test), test[['ghi_efficiency', 'dni_efficiency']].values)

0.5115056437887135

In [402]:
from sklearn.metrics import r2_score

predictions = rfc.predict(feature_pipeline.fit_transform(test))

print(r2_score(test[['ghi_efficiency', 'dni_efficiency']].values, predictions))

0.6024780864447157


In [403]:
l = list(enumerate(rfc.feature_importances_))
l.sort(key=lambda x: -x[1])
l[:5]

[(7, 0.754169715574614),
 (2, 0.09643603670668147),
 (3, 0.039330220460096064),
 (4, 0.02322511079640592),
 (0, 0.016598114951795837)]

In [404]:
import altair as alt

source = pd.DataFrame({
    'Variable': list(df.drop(columns=['dni_efficiency', 'ghi_efficiency', 'dhi_efficiency', 'STATION','DATE',
                                            'latitude','longitude'])),
    'Importance': (list(rfc.feature_importances_))
})

bars = alt.Chart(source).mark_bar().encode(
    y='Variable:N',
    x="Importance:Q"
)

text = bars.mark_text(
    align='left',
    baseline='middle',
    dx=3  # Nudges text to right so it doesn't appear on top of the bar
).encode(
    text='Importance:Q'
)

(bars + text)

In [405]:
from mlworkflows import util
util.serialize_to(rfc, "rfc.sav")