# Feature Engineering
In this notebook we will look at the features in our dataset, visualize and interpret their trends, and finally build a transformation pipeline that our model can use later.

In [5]:
import numpy as np
import pandas as pd

# Load the dataset from solar_cleaned.parquet into a DataFrame
# (presumably the output of an earlier cleaning stage — confirm).
df = pd.read_parquet("solar_cleaned.parquet")

## Visualizing features
We will construct two sets of graphs: weather measurements and sky observations. Each graph shows how a feature relates to efficiency. We use the DNI efficiency because it is more widely used than GHI and DHI.

In [6]:
import altair as alt

# Draw a reproducible sample for plotting: a fixed seed keeps the charts stable
# across "Restart & Run All", and capping n at len(df) avoids the ValueError
# that DataFrame.sample raises when asked for more rows than exist.
sample = df.sample(n=min(5000, len(df)), random_state=42)

# pick out the weather measurements (plus the target) from the sample
weather_info = sample[[
    'dni_efficiency',
    'cloud_cover',
    'temperature',
    'daily_precipitation',
    'dew_point',
    'relative_humidity',
    'wind_speed',
    'hourly_visibility',
    'station_pressure',
]]

# a loess line of dni_efficiency against each weather feature; the target
# column itself is skipped because smoothing it against itself shows nothing
lines = [
    alt.Chart(weather_info)
    .transform_loess(col, 'dni_efficiency')
    .mark_line(color="red")
    .encode(
        alt.X(col),
        alt.Y('dni_efficiency'),
    )
    .properties(width=150, height=150)
    for col in weather_info.columns
    if col != 'dni_efficiency'
]

alt.hconcat(*lines)

In [7]:
# pick out the target and all weather observation columns from the sample
weather_obv = sample[['dni_efficiency', 'cloudy', 'mostly_cloudy', 'partly_cloudy', 'mostly_clear',
       'clear', 'overcast', 'rain_light', 'tstorm',
       'drizzle', 'rain_heavy', 'rain', 'fog', 'snow_light', 'snow',
       'snow_heavy', 'freezing_rain', 'freezing_drizzle', 'ice_pellets',
       'ice_pellets_light', 'ice_pellets_heavy', 'flurries',
       'freezing_rain_heavy', 'freezing_rain_light', 'fog_light']]

# a loess line of dni_efficiency against each observation column; the target
# column itself is skipped because smoothing it against itself shows nothing
lines = [
    alt.Chart(weather_obv)
    .transform_loess(col, 'dni_efficiency')
    .mark_line(color="red")
    .encode(
        alt.X(col),
        alt.Y('dni_efficiency:Q'),
    )
    .properties(width=150, height=150)
    for col in weather_obv.columns
    if col != 'dni_efficiency'
]
alt.hconcat(*lines)

## Encoding features
Here we will clean the data further and get it ready for training our model later. First we drop the columns that we won't use and the columns that we want as our output. Then we apply a scaler to the rest of the columns.

In [8]:
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Every column is scaled except the three efficiency columns and the
# STATION / DATE / latitude / longitude columns listed here.
columns_to_scale = df.columns.drop([
    'dni_efficiency', 'ghi_efficiency', 'dhi_efficiency',
    'STATION', 'DATE',
    'latitude', 'longitude',
]).tolist()

# (name, transformer, columns) triple in the form ColumnTransformer expects
amount_scaler = (
    'amount_scaler',
    RobustScaler(),
    columns_to_scale,
)

scale_steps = [amount_scaler]
all_xforms = ColumnTransformer(transformers=scale_steps)

## Fit and save the feature extraction pipeline

In [20]:
# Wrap the column transformer in a single-step pipeline and fit it on the frame.
feature_steps = [('feature_extraction', all_xforms)]
feat_pipeline = Pipeline(steps=feature_steps)

feat_pipeline.fit(df)

import cloudpickle as cp
import os
def serialize_to(obj, default_filename):
    """Serialize ``obj`` with cloudpickle and write it to disk.

    The destination path is read from the ``S2I_PIPELINE_STAGE_SAVE_FILE``
    environment variable when it is set; otherwise ``default_filename`` is used.

    Args:
        obj: Object to serialize.
        default_filename: Path used when the environment variable is unset.
    """
    filename = os.getenv("S2I_PIPELINE_STAGE_SAVE_FILE", default_filename)
    # The context manager closes the handle (flushing the data) before the
    # function returns, including when dump raises.
    with open(filename, "wb") as out_file:
        cp.dump(obj, out_file)


# Persist the fitted pipeline. serialize_to writes to the path in
# S2I_PIPELINE_STAGE_SAVE_FILE when that variable is set, otherwise to the
# default filename given here.
serialize_to(feat_pipeline, "feature_pipeline.sav")
# NOTE(review): the message misspells "Engineering"; left unchanged in case
# anything downstream matches on this output.
print("Feature Enginerring done and saved.")