# Imports and Initializations

### Note: This document cannot be converted to pdf due to the presence of plotly graphs which require a paid subscription for the pdf service. Instead, please view the .html version of the document. The graphs are interactive - try hovering your mouse over points on it!

In [49]:
import pandas as pd
import numpy as np

import plotly
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Done to avoid flooding the screen with warnings for legacy numpy operations
# in pandas methods
import warnings
warnings.filterwarnings('ignore')

## Function Declarations

In [2]:
# Assumption: localminute column is clean and error free
def index_and_sort(data):
    """Index each meter's readings by UTC timestamp and add reading deltas.

    Rows are grouped by ``dataid``. Within each group ``localminute`` is
    parsed into timezone-aware UTC timestamps, the rows are ordered
    chronologically, and ``val_diff`` is set to the difference between
    consecutive ``meter_value`` readings (NaN for a meter's first reading).

    Args:
        data: DataFrame with ``dataid``, ``localminute`` and ``meter_value``
            columns. ``localminute`` is assumed to be clean and parseable.

    Returns:
        A new DataFrame containing every group one after another (ascending
        ``dataid``), indexed by a UTC ``DatetimeIndex`` named
        ``localminute``; the ``localminute`` column is removed and a
        ``val_diff`` column is added. If ``data`` has no rows, an empty
        DataFrame is returned. The input frame is not modified.
    """
    frames = []

    for _, readings in data.groupby("dataid"):
        # Parse BEFORE sorting: ordering the raw strings is only correct
        # while every row carries the same UTC offset, which breaks when the
        # local offset changes (e.g. at a daylight-saving transition).
        readings = readings.assign(
            localminute=pd.to_datetime(readings["localminute"], utc=True)
        )
        # A stable sort keeps the file order of readings with equal timestamps.
        readings = readings.sort_values(by="localminute", kind="mergesort")
        readings["val_diff"] = readings["meter_value"].diff()
        frames.append(readings.set_index("localminute"))

    if not frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; a single concat is also
    # linear instead of re-copying the accumulated frame for every meter.
    return pd.concat(frames)

In [3]:
# Function from question 1, used to remove irregular spikes in meter value reading
def remove_spikes(data_df):
    """Drop spike readings per meter and resample the rest to hourly means.

    A reading is treated as part of a spike when the reading that follows it
    is lower (a negative ``val_diff`` on the next row), since a cumulative
    meter value should never decrease. Such rows are removed repeatedly until
    the remaining ``meter_value`` series of the meter no longer decreases.

    Args:
        data_df: DataFrame with a ``DatetimeIndex`` and ``dataid``,
            ``meter_value`` and ``val_diff`` columns.

    Returns:
        A new DataFrame with one row per meter per hour (groups in ascending
        ``dataid`` order). ``meter_value`` is the hourly mean with empty hours
        linearly interpolated, ``dataid`` is filled in for empty hours, and
        ``val_diff`` is the hourly mean of the recomputed differences (NaN for
        empty hours). If ``data_df`` has no rows, an empty DataFrame is
        returned.
    """
    hourly_frames = []

    for house_id, df in data_df.groupby("dataid"):
        spikeless_df = df

        # Spikes can be wider than one reading, so one pass is not enough.
        # Iterate to a fixed point instead of a hard-coded number of passes:
        # every pass that finds a spike removes at least one row, so the loop
        # always terminates.
        while True:
            precedes_drop = spikeless_df["val_diff"].shift(-1) < 0
            spikeless_df = spikeless_df[~precedes_drop].copy()
            spikeless_df["val_diff"] = spikeless_df["meter_value"].diff()
            if not precedes_drop.any():
                break

        hourly = spikeless_df.resample("1h").mean()
        # Assign the result back instead of fillna(inplace=True) on a column
        # selection, which is chained assignment and may not update the frame.
        hourly["dataid"] = hourly["dataid"].fillna(house_id)
        hourly["meter_value"] = hourly["meter_value"].interpolate()
        hourly_frames.append(hourly)

    if not hourly_frames:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0.
    return pd.concat(hourly_frames)

In [4]:
# Preprocessing df into inputs ready for training
# input frame = [LOOKBACK_PERIOD val_diffs] + [ExpectedOutput] + [HourToPredictFor]
# returns: [inputs], [labels] where correspond by position
def make_io_frames(data_df):
    """Build training inputs and labels from hourly meter readings.

    For every ``dataid`` group the consecutive differences of ``meter_value``
    are taken (NaN differences dropped) and a window of
    ``LOOKBACK_PERIOD + 1`` differences is slid over them one step at a time.

    Args:
        data_df: DataFrame with a ``DatetimeIndex`` and ``dataid`` and
            ``meter_value`` columns.

    Returns:
        ``(X, Y)`` lists that correspond by position. Each ``X`` entry is the
        first ``LOOKBACK_PERIOD`` differences of a window followed by the hour
        of day of the window's last timestamp; the matching ``Y`` entry is the
        last difference of that window.
    """
    window = LOOKBACK_PERIOD + 1
    X = []
    Y = []

    for _, house_df in data_df.groupby('dataid'):
        diffs = house_df['meter_value'].diff().dropna()
        values = diffs.tolist()

        for start in range(len(values) - window + 1):
            frame = values[start:start + window]
            # The hour being predicted for is the hour of the window's last
            # difference; it is handed to the model as an extra feature.
            target_hour = diffs.index[start + window - 1].hour
            X.append(frame[:LOOKBACK_PERIOD] + [target_hour])
            Y.append(frame[-1])

    return X, Y

In [5]:
# This function is the frontend for the model that preprocesses input_data before  

# Assumptions: 
# input_data contain all data points for the past LOOKBACK+1 hours.
# input data has a DateTimeIndex
# input data is sorted 
# hour_to_predict is a pandas_timestamp. 
def predict(input_data, hour_to_predict, model):
    """Predict the meter reading for ``hour_to_predict``.

    The readings from the ``LOOKBACK_PERIOD + 1`` hours before
    ``hour_to_predict`` are averaged per hour, gaps between known hours are
    linearly interpolated, and the ``LOOKBACK_PERIOD`` hour-to-hour
    differences (plus the hour of day) are passed to ``model``. The predicted
    difference is added to the latest reading before ``hour_to_predict``.

    Args:
        input_data: DataFrame with a sorted ``DatetimeIndex`` and a
            ``meter_value`` column, covering at least the past
            ``LOOKBACK_PERIOD + 1`` hours.
        hour_to_predict: pandas Timestamp comparable with the index. The
            hourly bins are matched at whole-hour offsets from it, so it is
            expected to lie on an hour boundary.
        model: object whose ``predict`` accepts a list holding one feature
            row and returns a sequence whose first item is the predicted
            difference.

    Returns:
        The latest available reading plus the predicted difference.

    Raises:
        IndexError: if ``input_data`` has no reading before
            ``hour_to_predict``.
    """
    earlier = input_data[input_data.index < hour_to_predict]
    # .iloc: integer [] on a datetime-indexed Series is deprecated in pandas.
    latest_available_reading = earlier["meter_value"].iloc[-1]

    # LOOKBACK_PERIOD differences need LOOKBACK_PERIOD + 1 hourly readings.
    # The window used to span only LOOKBACK_PERIOD hours, so the oldest
    # features were always zero-filled, unlike the frames used in training.
    window_start = hour_to_predict - pd.Timedelta(hours=LOOKBACK_PERIOD + 1)
    meter_readings = earlier.loc[earlier.index >= window_start, "meter_value"]
    meter_readings = meter_readings.resample('1h').mean()

    # One slot per hour before hour_to_predict, oldest first; hours without
    # any reading become NaN and are interpolated from their neighbours.
    wanted_hours = pd.DatetimeIndex(
        [hour_to_predict - pd.Timedelta(hours=hours_prior)
         for hours_prior in range(LOOKBACK_PERIOD + 1, 0, -1)]
    )
    attributes = meter_readings.reindex(wanted_hours).interpolate()

    # Differences that are still unknown (leading gaps) count as "no change".
    diffs = attributes.diff().fillna(0).iloc[1:]

    prediction = model.predict([diffs.tolist() + [hour_to_predict.hour]])

    return latest_available_reading + prediction[0]

In [6]:
# Given a house, SIMULATE day to day predictions 
# Take last LOOKBACK PERIOD readings and simulating next hour (time t)
# After actual reading of time t is found, repeat for time t+1. 
# until last known data point. 
# NOTE: Inherent correction in prediction graph. 
def simulate_operation(df_for_house, model):
    """Simulate hour-by-hour operation: predict each hour from actual data.

    Every prediction is made from the real readings in ``df_for_house``, so
    an error in one prediction does not carry over into the next one.

    Args:
        df_for_house: DataFrame with a ``DatetimeIndex`` and a
            ``meter_value`` column.
        model: fitted model, handed through to ``predict``.

    Returns:
        ``(predicted_timestamps, predictions)``, two lists that correspond by
        position. The first ``LOOKBACK_PERIOD + 1`` hourly timestamps are
        skipped, and hours whose prediction failed are left out of both.
    """
    hourly_index = df_for_house.resample('1h').last().index
    timestamps = hourly_index[LOOKBACK_PERIOD+1:]

    predictions = []
    predicted_timestamps = []

    for time in timestamps:
        try:
            prediction = predict(df_for_house, time, model)
        except Exception:
            # Best effort: report the hour and carry on. Catching Exception
            # rather than using a bare except keeps Ctrl-C (KeyboardInterrupt)
            # and SystemExit able to stop a long simulation.
            print("ERROR PREDICTING ", time)
        else:
            predictions.append(prediction)
            predicted_timestamps.append(time)

    return predicted_timestamps, predictions

In [7]:
# Given a house, SIMULATE day to day predictions 
# Take last LOOKBACK PERIOD readings and simulating next hour (time t)
# assume prediction is correct, repeat for time t+1. 
# NOTE: Long term predicting over predictions. 
def long_term_prediction(df_for_house, time_start, num_days_to_predict, model):
    """Predict recursively, feeding each prediction back in as a reading.

    Starting from the readings before ``time_start``, each hour is predicted
    and then appended to the seed data as if it were an actual reading, so
    errors accumulate over the horizon.

    Args:
        df_for_house: DataFrame with a ``DatetimeIndex`` and a
            ``meter_value`` column. ``val_diff`` and ``dataid`` columns are
            dropped from the seed when present. The frame is not modified.
        time_start: pandas Timestamp of the first hour to predict.
        num_days_to_predict: horizon in days; ``num_days_to_predict * 24``
            hourly timestamps are attempted.
        model: fitted model, handed through to ``predict``.

    Returns:
        ``(predicted_timestamps, predictions)``, two lists that correspond by
        position. Hours whose prediction failed are left out of both.
    """
    # drop() returns a new frame, so the caller's slice is never mutated.
    seed_data = df_for_house[df_for_house.index < time_start].drop(
        columns=['val_diff', 'dataid'], errors='ignore'
    )

    # Lower-case 'h': the upper-case 'H' alias is deprecated in recent pandas.
    timestamps = pd.date_range(time_start, periods=num_days_to_predict*24, freq='h')

    predictions = []
    predicted_timestamps = []

    for time in timestamps:
        try:
            prediction = predict(seed_data, time, model)
        except Exception:
            # Best effort: report the hour and carry on, without swallowing
            # KeyboardInterrupt / SystemExit as a bare except would.
            print("ERROR PREDICTING", time)
            continue

        predictions.append(prediction)
        predicted_timestamps.append(time)
        # DataFrame.append was removed in pandas 2.0. Inside the old bare
        # except that failure was silent: every hour was reported as an error
        # and the seed never grew.
        predicted_row = pd.DataFrame(data=[prediction], columns=['meter_value'], index=[time])
        seed_data = pd.concat([seed_data, predicted_row])

    return predicted_timestamps, predictions

In [8]:
# Obtain mean metervalues from cleaned dataframe
def mean_readings_for_area(df):
    """Build an area-wide hourly series from a cleaned readings frame.

    Every column of ``df`` is averaged per hour. ``meter_value`` is then
    replaced by the running total of the hourly mean ``val_diff``, which
    simulates a single meter for the whole area.

    Args:
        df: DataFrame with a ``DatetimeIndex`` and a ``val_diff`` column.

    Returns:
        The hourly-mean DataFrame with the simulated ``meter_value`` column.
    """
    hourly_means = df.resample('1h').mean()
    simulated_reading = hourly_means['val_diff'].cumsum()
    return hourly_means.assign(meter_value=simulated_reading)

In [9]:
## Tunable hyper-parameter: number of past hourly meter-value differences
## given to the model as features. It is read as a global by make_io_frames,
## predict and simulate_operation, so it must be set before they are called.
LOOKBACK_PERIOD = 6

# Question 2.1

## Question 2.1

In this part, you will be asked to build a model to forecast the hourly readings in the future (next hour). 

1. Can you explain why you may want to forecast the gas consumption in the future? Who would find this information valuable? 
2. What can you do if you have a good forecasting model?


As one of the fundamental driving forces of the world's economic activity, energy is a crucial consideration in many key decision-making processes. Because fossil fuels are non-renewable and demand for them is increasing rapidly, it is important to use them as efficiently as possible. Although natural gas is itself a fossil fuel, its combustion emits less greenhouse gas than that of other fossil fuels such as coal or oil, which makes it a cleaner and safer option.

A good forecasting model allows power and gas utility suppliers to predict the periods in which a given area will experience a larger increase in demand. Accurate optimization of the underground stock then lets companies avoid overstocking, which is costly because the unused gas must still be paid for under contractual agreements. Preventing understocking is equally important, in order to avoid downtime and the other catastrophic repercussions of being unable to meet demand.


## Question 2.2

Build a linear regression model to forecast the hourly readings in the future (next hour). 

Generate two plots: 

**1. Time series plot of the actual and predicted hourly meter readings**

**2. Scatter plot of actual vs predicted meter readings (along with the line showing how good the fit is)**

In [10]:
## Read CSV data
# Takes quite a bit of time for date-inference
# Optimization: Consider manual caching in a dict (top StOvflw answer)
data = pd.read_csv("dataport-export_gas_oct2015-mar2016.csv")
# Raw readings per house, indexed by UTC timestamp, with val_diff added
merged_df = index_and_sort(data)
# Spike-free readings resampled to one row per house per hour
clean_df = remove_spikes(merged_df)

# X: LOOKBACK_PERIOD hourly differences + hour of day; Y: the following hourly difference
X, Y = make_io_frames(clean_df)

In [11]:
# Hold out 30% of the frames for testing; the fixed seed makes the split repeatable.
# NOTE(review): make_io_frames slides its window one hour at a time, so
# neighbouring frames overlap; a shuffled split puts overlapping frames on
# both sides, which may make the test error optimistic - confirm.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [12]:
## Create the linear regression model and train it on the training frames
lin_model = LinearRegression()
lin_model.fit(X_train, Y_train)

LinearRegression()

In [13]:
## Predict the training frames with the linear regression model
predictions = lin_model.predict(X_train)
# Example of how to use the model through the predict() frontend:
# predict(merged_df[:50], pd.Timestamp('2015-10-04 17:00:00+00:00'), lin_model)

# Training-set MSE; as the last expression of the cell it is shown as the cell output
mean_squared_error(Y_train, predictions)

49.85421428855172

In [110]:
## Report the MSE of the linear regression model on the train and test sets
print('Train and test result for linear regression')

# Train-set predictions get their own name; they used to be stored in
# predictions_test, which was then overwritten by the real test predictions.
predictions_train = lin_model.predict(X_train)
train_mse = mean_squared_error(Y_train, predictions_train)
print("MSE error in train set is: {}".format(train_mse))

# The MSE is computed once; a second, discarded call was removed.
predictions_test = lin_model.predict(X_test)
test_mse = mean_squared_error(Y_test, predictions_test)
print("MSE error in test set is: {}".format(test_mse))

Train and test result for linear regression
MSE error in train set is: 49.85421428855172
MSE error in test set is: 43.81696857990257


The MSE on the test set is lower than on the train set, which suggests that the model generalizes well: it predicts data it was not fitted on at least as well as the data it was trained on. Overfitting to the train set is therefore unlikely.

**Architecture for inference:**


 **[Raw-data]** -> **[PreProcessing]** -> **[ModelFrontend]** -> **[Model]** -> **[Predictions]**

We present two modes of prediction with our model, namely *simulate_operation* and *long_term_prediction*. Instead of making predictions directly over all available data, each prediction is made from a sampling window whose size is defined by *LOOKBACK_PERIOD*.

- If each prediction covers only the next hour and is always corrected with the actual reading afterwards, the *simulate_operation* function is used *(refer to declaration above)*. 
- If we require recursive prediction, where later predictions are made from earlier predictions, we use *long_term_prediction* *(refer to declaration above)*. 

## Question 2.2 (a) - Time series plot of the actual and predicted hourly meter readings with linear regression

In [90]:
# Hour-by-hour predictions over the first 250 raw readings of house 35
timestamps, predictions = simulate_operation(merged_df[merged_df['dataid'] == 35][:250], lin_model)
# Recursive predictions from the same readings: 6 days starting at 2015-10-02 01:00 UTC
ltp_timestamps, ltp_predictions = long_term_prediction(merged_df[merged_df['dataid'] == 35][:250], pd.Timestamp('2015-10-02 01:00:00+00:00'), 6, lin_model)

In [91]:
# The index is already in UTC (index_and_sort parses it with utc=True), so
# the timestamps are plotted as they are.
# actual: first 250 raw readings of house 35.
# imputed_actual: first 250 hourly rows of the cleaned data for the same
# house, so the two traces do not cover the same time span.
actual = merged_df[merged_df['dataid'] == 35][:250]
imputed_actual = clean_df[clean_df['dataid'] == 35][:250]

# Start from an empty figure and add one trace per data category
fig = px.scatter()
fig.add_scatter(x=timestamps, y=predictions, name='predictions')
fig.add_scatter(x=actual.index, y=actual['meter_value'].tolist(), name='actual')
fig.add_scatter(x=imputed_actual.index, y=imputed_actual['meter_value'].tolist(), name='imputed')
fig.add_scatter(x=ltp_timestamps, y=ltp_predictions, name='long_term_prediction')
fig.update_layout(
    title='Single house prediction with linear regression model',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
)
fig.show()

In this initial scenario we carry out predictions using data from a single house with *dataid* 35. It can be observed that, under hourly simulation, the model predicts the meter value pessimistically. This is likely because the model learns the incremental increase within an hourly window; increasing LOOKBACK_PERIOD would make the model more optimistic.

One noticeable issue with the model is that imperfections in the imputation are carried over into the model, largely because the input data fed into the model was imputed.

In [92]:
# Area-wide hourly series: meter_value is the running total of the mean val_diff
mean_data = mean_readings_for_area(clean_df)
# Hour-by-hour predictions on the mean series.
# NOTE(review): this uses the first 259 rows while the long-term call below
# and the plotted truth use 250 - confirm whether 259 is intended.
mean_timestamps, mean_predictions = simulate_operation(mean_data[:259], lin_model)
# Recursive predictions: 6 days starting at 2015-10-02 01:00 UTC
mean_ltp_timestamps, mean_ltp_predictions = long_term_prediction(mean_data[:250], pd.Timestamp('2015-10-02 01:00:00+00:00'), 6, lin_model)

In [93]:
# First 250 hourly rows of the area-wide mean series, plotted as the reference
truth = mean_data[:250]
# Start from an empty figure and add one trace per data category
fig = px.scatter()
fig.add_scatter(x=mean_timestamps, y=mean_predictions, name='predictions')
fig.add_scatter(x=truth.index, y=truth['meter_value'].tolist(), name='mean')
fig.add_scatter(x=mean_ltp_timestamps, y=mean_ltp_predictions, name='long_term_prediction')
fig.update_layout(
    title='Mean predictions with linear regression model',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
)
fig.show()

In this scenario we use the mean of the hourly increase (val_diff) in meter value across all houses in the dataframe, as computed by the function *mean_readings_for_area*. The model generally does well on this "mean" data for the entire area, and predicts better than it did on the actual cumulative meter value readings of a single house previously. 
It could be theorized that running linear regression on mean data gives our model a better ability to generalize, and the result obtained above confirms this. Customizing the prediction by using only the readings of a single house leads to results that are akin to overfitting. It could be argued that the mean hourly val_diff dataset has lower deviation than the cumulative meter value readings, so prediction errors with linear regression are less drastic and produce smaller changes in the next data point. One can use this model to better predict the average gas usage of the entire area over the next hour.

However, it must be kept in mind that long-term predictions are still very poor and unreliable. Our group attempted to predict multiple hours ahead in an attempt to improve the predictions but did not see any considerable impact on the results.

## Question 2.2 (b) - Scatter plot of actual vs predicted meter readings (along with the line showing how good the fit is) with linear regression

In [94]:
# The index is already in UTC (index_and_sort parses it with utc=True), so
# the timestamps are plotted as they are.
actual = merged_df[merged_df['dataid'] == 35][:250]

fig = px.scatter()
# plot points from the actual dataset
fig.add_trace(go.Scatter(x=actual.index, y=actual['meter_value'].tolist(),
                           mode='markers',
                           name='actual'))
# plot points from the hourly linear regression predictions
fig.add_trace(go.Scatter(x=timestamps, y=predictions,
                           mode='markers',
                           name='predictions'))
# fit an OLS trendline to the predictions on a scratch figure; its second
# trace (index 1) is the fitted line
fig_t = px.scatter(x=timestamps, y=predictions, trendline='ols')
trendline = fig_t.data[1]
# overlay the OLS trendline on the scattered results. It gets its own legend
# entry ('predictions line', as in the SVR plots) instead of reusing
# 'predictions', which made the markers and the line indistinguishable.
fig.add_trace(go.Scatter(trendline,
                          mode='lines',
                          line=dict(color="Yellow"),
                          name='predictions line'))
fig.update_layout(
    title='Single house prediction with linear regression model',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
    showlegend=True
)
fig.show()

In the graph above, a scatter plot of the actual data of a single house (dataid 35) is plotted along with the hourly predictions of the linear regression model from Q2.2(a). At first glance, the scattered actual and predicted points agree relatively well. The trendline is obtained by fitting the single-house linear regression predictions with the ordinary least squares method. Overall, the linear regression model fares well against the scatter plot of the actual data.

In [95]:
fig = px.scatter()
# plot points from the actual (area-wide mean) dataset
fig.add_trace(go.Scatter(x=truth.index, y=truth['meter_value'].tolist(),
                           mode='markers',
                           name='mean actual'))
# plot points from the hourly linear regression predictions
fig.add_trace(go.Scatter(x=mean_timestamps, y=mean_predictions,
                           mode='markers',
                           name='mean predictions'))
# fit an OLS trendline to the predictions on a scratch figure; its second
# trace (index 1) is the fitted line
fig_t = px.scatter(x=mean_timestamps, y=mean_predictions, trendline='ols')
trendline = fig_t.data[1]
# overlay the OLS trendline on the scattered results. It gets its own legend
# entry ('predictions line', as in the SVR plots) instead of the ambiguous
# 'predictions'.
fig.add_trace(go.Scatter(trendline,
                          mode='lines',
                          line=dict(color="Yellow"),
                          name='predictions line'))
fig.update_layout(
    title='Mean predictions with linear regression model',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
    showlegend=True
)
fig.show()

We now extract a trendline by fitting the linear regression predictions for the mean of all houses with the ordinary least squares method. The actual mean data points are plotted alongside the hourly linear regression predictions for the mean of all houses. By the same argument as before, using the mean of val_diff lets us extract general trends that affect all houses rather than a single house. This shows that the mean change in meter value across houses tends to follow a linear relationship.

## Question 2.3

Do the same as Question 2.2 above but use support vector regression (SVR).

Generate two plots: 

**1. Time series plot of the actual and predicted hourly meter readings**

**2. Scatter plot of actual vs predicted meter readings (along with the line showing how good the fit is)**

In [75]:
# Create and train linear SVR model.
# random_state is fixed (same seed as the train/test split) so that the
# fitted coefficients, and therefore the reported errors and plots, can be
# reproduced from run to run.
svr_lin = LinearSVR(verbose=True, random_state=42)
svr_lin.fit(X_train, Y_train)

[LibLinear]

LinearSVR(verbose=True)

In [109]:
## Report the MSE of the linear SVR model on the train and test sets
print('Train and test result for linear SVR')

# Train-set predictions get their own name; they used to be stored in
# predictions_test, which was then overwritten by the real test predictions.
predictions_train = svr_lin.predict(X_train)
train_mse = mean_squared_error(Y_train, predictions_train)
print("MSE error in train set is: {}".format(train_mse))

# The MSE is computed once; a second, discarded call was removed.
predictions_test = svr_lin.predict(X_test)
test_mse = mean_squared_error(Y_test, predictions_test)
print("MSE error in test set is: {}".format(test_mse))

Train and test result for linear SVR
MSE error in train set is: 59.74622021127509
MSE error in test set is: 51.31677639453743


## Question 2.3 (a) - Time series plot of the actual and predicted hourly meter readings with SVR

In [88]:
# Produce predictions with linear SVR over the first 250 raw readings of house 35:
# hour-by-hour predictions first, then recursive predictions for 6 days
# starting at 2015-10-02 01:00 UTC
timestamps, predictions = simulate_operation(merged_df[merged_df['dataid'] == 35][:250], svr_lin)
ltp_timestamps, ltp_predictions = long_term_prediction(merged_df[merged_df['dataid'] == 35][:250], pd.Timestamp('2015-10-02 01:00:00+00:00'), 6, svr_lin)

An SVR model with an RBF kernel was also explored but later abandoned, because its training time was too long to be realistic within the project timeline; this is due to the added complexity of evaluating the Radial Basis Function. Theoretically, an SVR with an RBF kernel can project the data into higher dimensions, allowing better generalizability than an SVR with a linear kernel. An RBF-kernel SVR is better suited to regression where the data points are clustered and can be separated by circular shapes, which allows non-linear relationships in the input data to be identified. In the case of our project, however, it could be argued that an RBF kernel is unnecessary, since the meter value usage likely follows a linear relationship, as the predictability demonstrated with linear regression suggests. For the same reason, the polynomial kernel was not pursued, as our data points can be regressed linearly.

In [89]:
# The index is already in UTC (index_and_sort parses it with utc=True), so
# the timestamps are plotted as they are.
# actual: first 250 raw readings of house 35.
# imputed_actual: first 250 hourly rows of the cleaned data for the same
# house, so the two traces do not cover the same time span.
actual = merged_df[merged_df['dataid'] == 35][:250]
imputed_actual = clean_df[clean_df['dataid'] == 35][:250]

# Start from an empty figure and add one trace per data category
fig = px.scatter()
fig.add_scatter(x=timestamps, y=predictions, name='predictions')
fig.add_scatter(x=actual.index, y=actual['meter_value'].tolist(), name='actual')
fig.add_scatter(x=imputed_actual.index, y=imputed_actual['meter_value'].tolist(), name='imputed')
fig.add_scatter(x=ltp_timestamps, y=ltp_predictions, name='long_term_prediction')
fig.update_layout(
    title='Predictions with linear kernel SVR',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
)
fig.show()

In the graph above, the time series of the actual data, the data after imputation, the predicted data and the long-term predictions are plotted. As with linear regression, prediction is first done for a single house with dataid 35. Theoretically, SVR is expected to give better predictions on this data due to its iterative nature in finding the best pattern. Compared to linear regression, SVR with a linear kernel is seen to overfit less and to generalize better to the overall data. When the long-term predictor is used to predict over its own predictions, the linear SVR is more accurate in predicting the increase in meter value, although a rapid increase in the long-term prediction around 2 October introduces an error that is carried into all subsequent predicted values.

In [99]:
# Area-wide hourly series: meter_value is the running total of the mean val_diff
mean_data = mean_readings_for_area(clean_df)
# Hour-by-hour predictions on the mean series.
# NOTE(review): this uses the first 259 rows while the long-term call below
# and the plotted truth use 250 - confirm whether 259 is intended.
mean_timestamps, mean_predictions = simulate_operation(mean_data[:259], svr_lin)
# Recursive predictions: 6 days starting at 2015-10-02 01:00 UTC
mean_ltp_timestamps, mean_ltp_predictions = long_term_prediction(mean_data[:250], pd.Timestamp('2015-10-02 01:00:00+00:00'), 6, svr_lin)

In [100]:
# First 250 hourly rows of the area-wide mean series, plotted as the reference
truth = mean_data[:250]
# Start from an empty figure and add one trace per data category
fig = px.scatter()
fig.add_scatter(x=mean_timestamps, y=mean_predictions, name='predictions')
fig.add_scatter(x=truth.index, y=truth['meter_value'].tolist(), name='mean')
fig.add_scatter(x=mean_ltp_timestamps, y=mean_ltp_predictions, name='long_term_prediction')
fig.update_layout(
    title='Mean predictions with linear kernel SVR',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
)
fig.show()

In this scenario mean readings are again used, with the *mean_readings_for_area* function doing the preprocessing. The model generally does well on "mean" data for the entire area, and predicts better than it did on the actual cumulative meter value readings previously. Compared to linear regression, the mean predictions are better, and they generalize better than predictions made from the readings of a single house. The reason is the same as explained in 2.2(a): the mean hourly val_diff dataset has lower deviation than the cumulative meter value readings, so prediction errors with SVR are less drastic and result in smaller changes. The long-term prediction is also shown to be more accurate with SVR than with linear regression. From early October until 3 October the long-term prediction stays within a reasonable margin of error of the mean prediction and the actual data, before diverging as the error accumulates and influences the subsequent predictions. One can use this model to better predict the average gas usage of the entire area over the next hour.

## Question 2.3 (b) - Scatter plot of actual vs predicted meter readings (along with the line showing how good the fit is) with SVR

In [113]:
# Produce predictions with linear SVR for house 35 again (same calls as in
# the time-series section), so that timestamps/predictions hold the
# single-house values for the scatter plot below
timestamps, predictions = simulate_operation(merged_df[merged_df['dataid'] == 35][:250], svr_lin)
ltp_timestamps, ltp_predictions = long_term_prediction(merged_df[merged_df['dataid'] == 35][:250], pd.Timestamp('2015-10-02 01:00:00+00:00'), 6, svr_lin)

In [114]:
# The index is already in UTC (index_and_sort parses it with utc=True), so
# the timestamps are plotted as they are.
actual = merged_df[merged_df['dataid'] == 35][:250]

fig = px.scatter()
# plot points from the actual dataset
fig.add_trace(go.Scatter(x=actual.index, y=actual['meter_value'].tolist(),
                           mode='markers',
                           name='actual'))
# plot points from the hourly linear SVR predictions
fig.add_trace(go.Scatter(x=timestamps, y=predictions,
                           mode='markers',
                           name='predictions'))
# fit an OLS trendline to the predictions on a scratch figure; its second
# trace (index 1) is the fitted line
fig_t = px.scatter(x=timestamps, y=predictions, trendline='ols')
trendline = fig_t.data[1]
# overlay the OLS trendline on the scattered results
fig.add_trace(go.Scatter(trendline,
                          mode='lines',
                          line=dict(color="Yellow"),
                          name='predictions line'))
fig.update_layout(
    title='Single house prediction with linear kernel SVR',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
    showlegend=True
)
fig.show()

In the graph above, scatter plot of the actual data of a single house (dataid 35) is plotted along with the hourly prediction result with linear kernel SVR from Q2.3(a). Results are similar to linear regression model.

In [111]:
# Area-wide hourly series: meter_value is the running total of the mean val_diff
mean_data = mean_readings_for_area(clean_df)
# Hour-by-hour predictions on the mean series.
# NOTE(review): this uses the first 259 rows while the long-term call below
# uses 250 - confirm whether 259 is intended.
mean_timestamps, mean_predictions = simulate_operation(mean_data[:259], svr_lin)
# Recursive predictions: 6 days starting at 2015-10-02 01:00 UTC
mean_ltp_timestamps, mean_ltp_predictions = long_term_prediction(mean_data[:250], pd.Timestamp('2015-10-02 01:00:00+00:00'), 6, svr_lin)

In [117]:
# NOTE: `truth` is not defined in this cell; it must already exist from an
# earlier cell (truth = mean_data[:250]).
fig = px.scatter()
# plot points from the actual (area-wide mean) dataset
fig.add_trace(go.Scatter(x=truth.index, y=truth['meter_value'].tolist(),
                           mode='markers',
                           name='mean actual'))
# plot points from the hourly linear SVR predictions
fig.add_trace(go.Scatter(x=mean_timestamps, y=mean_predictions,
                           mode='markers',
                           name='mean predictions'))
# plot points from the recursive long-term (ltp) linear SVR predictions
fig.add_trace(go.Scatter(x=mean_ltp_timestamps, y=mean_ltp_predictions,
                           mode='markers',
                           name='mean ltp predictions'))

# fit an OLS trendline to the predictions on a scratch figure; its second
# trace (index 1) is the fitted line
fig_t = px.scatter(x=mean_timestamps, y=mean_predictions, trendline='ols')
trendline = fig_t.data[1]
# overlay the OLS trendline on the scattered results
fig.add_trace(go.Scatter(trendline,
                          mode='lines',
                          line=dict(color="Yellow"),
                          name='predictions line'))

# fit an OLS trendline to the long-term predictions in the same way
fig_t = px.scatter(x=mean_ltp_timestamps, y=mean_ltp_predictions, trendline='ols')
trendline = fig_t.data[1]
# overlay the long-term OLS trendline on the scattered results
fig.add_trace(go.Scatter(trendline,
                          mode='lines',
                          line=dict(color="Cyan"),
                          name='ltp predictions line'))

fig.update_layout(
    title='Mean predictions with linear kernel SVR',
    xaxis_title='Date and time',
    yaxis_title='meter value reading',
    legend_title='Data Category',
    showlegend=True
)
fig.show()

In this analysis the long-term prediction trendline is also added, and it is shown to be better than the long-term prediction trendline of linear regression.

## Appendix: Results with RBF kernel and Polynomial kernel SVR
#### Warning, takes a really long time to train!

In [None]:
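# NOTE: SVR is not imported at the top of this notebook; uncommenting this
# cell also needs:  from sklearn.svm import SVR
# svr_rbf = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
# svr_poly = SVR(kernel='poly', C=100, gamma='auto', degree=3, epsilon=.1,
#                coef0=1)
# Fit the kernels created above (this line used to re-fit svr_lin, leaving
# svr_rbf, which the cells below use, untrained)
# svr_rbf.fit(X_train, Y_train)
# svr_poly.fit(X_train, Y_train)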

In [None]:
# timestamps, predictions = simulate_operation(merged_df[merged_df['dataid'] == 35][:250], svr_rbf)
# ltp_timestamps, ltp_predictions = long_term_prediction(merged_df[merged_df['dataid'] == 35][:250], pd.Timestamp('2015-10-02 01:00:00+00:00'), 6, svr_rbf)

In [None]:
# # before plotting actual, need to take to utc because plotly doesn't do it for us.
# actual = merged_df[merged_df['dataid'] == 35][:250]
# imputed_actual = clean_df[clean_df['dataid'] == 35][:250]

# fig = px.scatter()
# fig.add_scatter(x=timestamps, y=predictions, name='predictions')
# fig.add_scatter(x=actual.index, y=actual['meter_value'].tolist(), name='actual')
# fig.add_scatter(x=imputed_actual.index, y=imputed_actual['meter_value'].tolist(), name='imputed')
# fig.add_scatter(x=ltp_timestamps, y=ltp_predictions, name='long_term_prediction')
# fig.update_layout(
#     title='Predictions with custom kernel SVR',
#     xaxis_title='Date and time',
#     yaxis_title='meter value reading',
#     legend_title='Data Category',
# )
# fig.show()

In [None]:
# truth = mean_data[:250]
# fig = px.scatter()
# fig.add_scatter(x=mean_timestamps, y=mean_predictions, name='predictions')
# fig.add_scatter(x=truth.index, y=truth['meter_value'].tolist(), name='mean')
# fig.add_scatter(x=mean_ltp_timestamps, y=mean_ltp_predictions, name='long_term_prediction')
# fig.update_layout(
#     title='Mean predictions with custom kernel SVR',
#     xaxis_title='Date and time',
#     yaxis_title='meter value reading',
#     legend_title='Data Category',
# )
# fig.show()