In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from prophet import Prophet
from sklearn.metrics import mean_absolute_error

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# load the dataset from its public URL and preview the first rows
data_url = r'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-GPXX032NEN/images/data/long_data_.csv'
df = pd.read_csv(data_url)
df.head()

In [2]:
# Load the dataset and preview the first rows.
# The local copy is machine-specific, so it is tried first and, when it is not
# present (e.g. on another machine), the public copy is downloaded instead so
# the notebook still runs from a fresh kernel.
# NOTE(review): the remote file is presumably the same dataset (same file name) - confirm.
LOCAL_CSV = r"C:\Users\Morinyo Baddestman\Documents\long_data_.csv"
REMOTE_CSV = r'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-GPXX032NEN/images/data/long_data_.csv'

try:
    df = pd.read_csv(LOCAL_CSV)
except FileNotFoundError:
    # local copy not available: fall back to the public source
    df = pd.read_csv(REMOTE_CSV)
df.head()

Unnamed: 0,States,Regions,latitude,longitude,Dates,Usage
0,Punjab,NR,31.519974,75.980003,02/01/2019 00:00:00,119.9
1,Haryana,NR,28.450006,77.019991,02/01/2019 00:00:00,130.3
2,Rajasthan,NR,26.449999,74.639981,02/01/2019 00:00:00,234.1
3,Delhi,NR,28.669993,77.230004,02/01/2019 00:00:00,85.8
4,UP,NR,27.599981,78.050006,02/01/2019 00:00:00,313.9


In [3]:
# dataframe features: shape, column dtypes and non-null counts
print('dataframe is of shape\n', df.shape)
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
df.info()

dataframe is of shape
 (16599, 6)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16599 entries, 0 to 16598
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   States     16599 non-null  object 
 1   Regions    16599 non-null  object 
 2   latitude   16599 non-null  float64
 3   longitude  16599 non-null  float64
 4   Dates      16599 non-null  object 
 5   Usage      16599 non-null  float64
dtypes: float64(3), object(3)
memory usage: 778.2+ KB
None


## Data cleaning and wrangling

In [4]:
# lower-case the column names so they are easier to reference
df.columns = df.columns.str.lower()

# parse the date strings (day-first, mixed formats) into datetime values
df['dates'] = pd.to_datetime(df['dates'], dayfirst=True, format='mixed')

# the coordinate columns are not needed downstream, so drop them
df = df.drop(columns=['longitude', 'latitude'])

In [5]:
# collapse to one row per date by averaging the numeric columns
rows_by_date = df.groupby('dates', as_index=False)
df = rows_by_date.mean(numeric_only=True)

# report the size of the aggregated frame and preview it
print('new shape\n', df.shape)
print(df.head())


new shape
 (498, 2)
       dates       usage
0 2019-01-02  102.224242
1 2019-01-03  103.142424
2 2019-01-04  100.124242
3 2019-01-05  100.269697
4 2019-01-06  100.512121


## Visualizations

In [6]:
# line chart of the aggregated usage over time
fig = px.line(data_frame=df, x='dates', y='usage')
fig.show()

## Modelling

In [7]:
# rename the two columns to the names Prophet uses: 'ds' (dates) and 'y' (values)
prophet_columns = ['ds', 'y']
df.columns = prophet_columns
df.head()


Unnamed: 0,ds,y
0,2019-01-02,102.224242
1,2019-01-03,103.142424
2,2019-01-04,100.124242
3,2019-01-05,100.269697
4,2019-01-06,100.512121


In [None]:
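# TODO: split the data into train and test sets (not implemented - the models below are fit and scored on the full series)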


In [8]:
# create a Prophet model with default settings and fit it on the (ds, y) frame;
# fit() returns the fitted model, so construction and fitting can be chained
model = Prophet().fit(df)

# show which components the fitted model treats as additive vs multiplicative
model.component_modes

09:34:32 - cmdstanpy - INFO - Chain [1] start processing
09:34:35 - cmdstanpy - INFO - Chain [1] done processing


{'additive': ['weekly',
  'additive_terms',
  'extra_regressors_additive',
  'holidays'],
 'multiplicative': ['multiplicative_terms', 'extra_regressors_multiplicative']}

In [19]:
# make a future dataframe: the historical dates followed by the forecast horizon
future_dates = model.make_future_dataframe(
    periods=365,           # number of periods to forecast
    freq='D',              # daily frequency; 'D' is the canonical pandas alias (lowercase 'd' is deprecated in recent pandas)
    include_history=True,  # include historical dates for prediction
)

# show dataframe
future_dates.head()

Unnamed: 0,ds
0,2019-01-02
1,2019-01-03
2,2019-01-04
3,2019-01-05
4,2019-01-06


In [20]:
# run the fitted model over every date in future_dates
prediction = model.predict(future_dates)

# report the size of the forecast frame, then preview its first rows
print(prediction.shape)
prediction.head()

(863, 16)


Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2019-01-02,101.80887,89.240257,114.080167,101.80887,101.80887,-0.127512,-0.127512,-0.127512,-0.127512,-0.127512,-0.127512,0.0,0.0,0.0,101.681359
1,2019-01-03,101.813825,89.846177,113.78793,101.813825,101.813825,-0.027117,-0.027117,-0.027117,-0.027117,-0.027117,-0.027117,0.0,0.0,0.0,101.786708
2,2019-01-04,101.81878,89.808072,113.595194,101.81878,101.81878,-0.073942,-0.073942,-0.073942,-0.073942,-0.073942,-0.073942,0.0,0.0,0.0,101.744839
3,2019-01-05,101.823736,88.650208,113.594725,101.823736,101.823736,-0.308206,-0.308206,-0.308206,-0.308206,-0.308206,-0.308206,0.0,0.0,0.0,101.51553
4,2019-01-06,101.828691,88.322168,114.688963,101.828691,101.828691,0.08151,0.08151,0.08151,0.08151,0.08151,0.08151,0.0,0.0,0.0,101.910201


In [21]:
# plot the forecast, its uncertainty interval and the observed data
trace_open = go.Scatter(
    x = prediction["ds"],
    y = prediction["yhat"],
    mode = 'lines',
    name="Forecast"
)
# fill="tonexty" shades the area between a trace and the trace listed before it in `data`
trace_high = go.Scatter(
    x = prediction["ds"],
    y = prediction["yhat_upper"],
    mode = 'lines',
    fill = "tonexty", 
    line = {"color": "#57b8ff"}, 
    name="Higher uncertainty interval"
)
trace_low = go.Scatter(
    x = prediction["ds"],
    y = prediction["yhat_lower"],
    mode = 'lines',
    fill = "tonexty", 
    line = {"color": "#57b8ff"}, 
    name="Lower uncertainty interval"
)
trace_close = go.Scatter(
    x = df["ds"],
    y = df["y"],
    name="Data values"
)

# collect all four scatter traces; their order determines what "tonexty" fills against
data = [trace_open,trace_high,trace_low,trace_close]

# layout object carrying the figure title
layout = go.Layout(title="Power consumption forecasting")

# pass the layout along with the traces; without it the title above is never shown
fig = go.Figure(data=data, layout=layout)
fig.show()

In [22]:
# observed series
actual_trace = go.Scatter(x=df['ds'], y=df['y'], mode='lines', name='Actual')

# model forecast (history + future horizon)
predicted_trace = go.Scatter(
    x=prediction['ds'],
    y=prediction['yhat'],
    mode='lines+markers',
    name='predicted',
)

# build the figure with both traces in one go and render it with the default renderer
fig = go.Figure(data=[actual_trace, predicted_trace])
fig.show()

In [23]:
# mean absolute error over the historical period (in-sample: the model was fit on these same rows)
# observed values as a NumPy array
y_true = df['y'].values

# `prediction` lists the historical dates first (include_history=True) and the
# forecast horizon after them, so keep only as many leading rows as there are
# observations. Using len(y_true) instead of a hard-coded row count keeps the
# two arrays the same length if the dataset changes.
y_pred = prediction['yhat'][:len(y_true)].values

# mean_absolute_error(y_true, y_pred): both arguments must have the same shape
mae = mean_absolute_error(y_true, y_pred)
print('MAE: %.3f' % mae)

MAE: 7.904


## Optimizing the model for better performance

In [24]:
# second model: daily seasonality switched on, plus a custom 'yearly' seasonality
model1 = Prophet(daily_seasonality=True)
# NOTE(review): fourier_order=70 is a very flexible seasonal term for ~500 daily rows - confirm it is not over-fitting
model1.add_seasonality(name='yearly', period=365, fourier_order=70)

# fit the tuned model on the same (ds, y) frame
model1.fit(df)

# show which components the fitted model treats as additive vs multiplicative
model1.component_modes

09:47:48 - cmdstanpy - INFO - Chain [1] start processing
09:47:50 - cmdstanpy - INFO - Chain [1] done processing


{'additive': ['yearly',
  'weekly',
  'daily',
  'additive_terms',
  'extra_regressors_additive',
  'holidays'],
 'multiplicative': ['multiplicative_terms', 'extra_regressors_multiplicative']}

In [26]:
# extend the timeline 365 periods past the history, then forecast over all of it
horizon = 365
future_dates1 = model1.make_future_dataframe(periods=horizon)
prediction1 = model1.predict(future_dates1)

In [27]:
# mean absolute error of the tuned model over the historical period (in-sample).
# mean_absolute_error is already imported in the first cell, so it is not re-imported here.
y_true = df['y'].values

# keep only the leading rows that line up with the observations; len(y_true)
# replaces the hard-coded row count so the slice follows the data
# NOTE(review): assumes prediction1 lists the historical dates first - confirm
y_pred = prediction1['yhat'][:len(y_true)].values

mae = mean_absolute_error(y_true, y_pred)
print('MAE: %.3f' % mae)

MAE: 4.020


In [28]:
import plotly.graph_objects as go

# observed series
actual_trace = go.Scatter(x=df['ds'], y=df['y'], mode='lines', name='Actual')

# forecast from the tuned model
predicted_trace = go.Scatter(
    x=prediction1['ds'],
    y=prediction1['yhat'],
    mode='lines+markers',
    name='predicted',
)

# build the figure with both traces and render it
fig = go.Figure(data=[actual_trace, predicted_trace])
fig.show()