Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

![Impressions](https://PixelServer20190423114238.azurewebsites.net/api/impressions/NotebookVM/how-to-use-azureml/automated-machine-learning/forecasting-bike-share/auto-ml-forecasting-bike-share.png)

# Automated Machine Learning
**Sales Forecasting**



## Setup


In [1]:
import azureml.core
import pandas as pd
import numpy as np
import logging
import warnings

from pandas.tseries.frequencies import to_offset

# Squash warning messages for cleaner output in the notebook
warnings.showwarning = lambda *args, **kwargs: None

from azureml.core.workspace import Workspace
from azureml.core.experiment import Experiment
from azureml.train.automl import AutoMLConfig
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
ws = Workspace.from_config()

# choose a name for the run history container in the workspace
experiment_name = 'automl-saleforecasting'

experiment = Experiment(ws, experiment_name)

output = {}
output['SDK version'] = azureml.core.VERSION
output['Subscription ID'] = ws.subscription_id
output['Workspace'] = ws.name
output['Resource Group'] = ws.resource_group
output['Location'] = ws.location
output['Run History Name'] = experiment_name
pd.set_option('display.max_colwidth', -1)
outputDf = pd.DataFrame(data = output, index = [''])
outputDf.T

Unnamed: 0,Unnamed: 1
SDK version,1.0.72
Subscription ID,d7143f7b-854d-4156-ba5d-01d92d5a774d
Workspace,tl289603
Resource Group,tl2896ws
Location,westus2
Run History Name,automl-saleforecasting


## Data

In [3]:
from azureml.core import Dataset, Run
clusters_eggfamily= Dataset.get_by_name(workspace=ws, name='10cities_eggfamily')
data= clusters_eggfamily.to_pandas_dataframe()
data.head()

Unnamed: 0,id,date,store_nbr,item_nbr,unit_sales,onpromotion,city,state,store_type,store_cluster,holiday_type,transferred,transactions,family,class,perishable,dcoilwtico
0,30,2013-01-01,25,158680,1.0,True,Salinas,Santa Elena,D,1,Holiday,False,770.0,EGGS,2502,1,93.14
1,56,2013-01-01,25,208384,3.0,True,Salinas,Santa Elena,D,1,Holiday,False,770.0,EGGS,2502,1,93.14
2,74,2013-01-01,25,227111,1.0,True,Salinas,Santa Elena,D,1,Holiday,False,770.0,EGGS,2502,1,93.14
3,81,2013-01-01,25,258268,6.0,True,Salinas,Santa Elena,D,1,Holiday,False,770.0,EGGS,2502,1,93.14
4,166,2013-01-01,25,376427,1.0,True,Salinas,Santa Elena,D,1,Holiday,False,770.0,EGGS,2502,1,93.14


In [4]:
data = data.drop(columns=['id','item_nbr'])

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 197557 entries, 0 to 197556
Data columns (total 15 columns):
date             197557 non-null datetime64[ns]
store_nbr        197557 non-null int64
unit_sales       197557 non-null float64
onpromotion      197557 non-null bool
city             197557 non-null object
state            197557 non-null object
store_type       197557 non-null object
store_cluster    197557 non-null int64
holiday_type     197557 non-null object
transferred      197557 non-null bool
transactions     197224 non-null float64
family           197557 non-null object
class            197557 non-null int64
perishable       197557 non-null int64
dcoilwtico       197557 non-null float64
dtypes: bool(2), datetime64[ns](1), float64(3), int64(4), object(5)
memory usage: 20.0+ MB


In [6]:
data = data.groupby(['date','store_nbr'] , as_index=False).agg(
{
    'unit_sales':sum,
    'onpromotion':'first',
    'city': 'first',
    'state': 'first',
    'store_type': 'first',
    'store_cluster': 'first',
    'holiday_type':'first',
    'transferred':'first',
    'transactions': sum,
    'family':'first',
    'class':'first',
    'perishable': 'first',
    'dcoilwtico':'first'
}
)
data.head()

Unnamed: 0,date,store_nbr,unit_sales,onpromotion,city,state,store_type,store_cluster,holiday_type,transferred,transactions,family,class,perishable,dcoilwtico
0,2013-01-01,25,46.0,True,Salinas,Santa Elena,D,1,Holiday,False,10010.0,EGGS,2502,1,93.14
1,2013-01-02,14,53.0,True,Riobamba,Chimborazo,C,7,Work Day,True,22022.0,EGGS,2502,1,93.14
2,2013-01-02,15,73.0,True,Ibarra,Imbabura,C,15,Work Day,True,16220.0,EGGS,2502,1,93.14
3,2013-01-02,19,40.0,True,Guaranda,Bolivar,C,15,Work Day,True,13690.0,EGGS,2502,1,93.14
4,2013-01-02,25,136.0,True,Salinas,Santa Elena,D,1,Work Day,True,19722.0,EGGS,2502,1,93.14


In [7]:
data['date'] = pd.to_datetime(data['date'])

## Split the data

splitting on time.

In [8]:
target_column_name = 'unit_sales'
time_column_name = 'date'
grain_column_names = ['store_nbr']
label =  "unit_sales"

In [9]:
train = data[data[time_column_name] < '2017-01-01']
test = data[data[time_column_name] >= '2017-01-01']

X_train = train.copy()
y_train = X_train.pop(target_column_name).values

X_test = test.copy()
y_test = X_test.pop(target_column_name).values

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(12788, 14)
(12788,)
(2379, 14)
(2379,)


# set up experiment

In [10]:
time_series_settings = {
    "time_column_name": "date",
    "grain_column_names":['store_nbr'],
    "max_horizon": 50,
    "target_lags": 2,
    "target_rolling_window_size": 10,
    "preprocess": True,
}

In [11]:
automl_config = AutoMLConfig(task='forecasting',
                             primary_metric='normalized_root_mean_squared_error',
                             experiment_timeout_minutes=15,
                             enable_early_stopping=True,
                             training_data=train,
                             label_column_name=label,
                             n_cross_validations=5,
                             enable_ensembling=False,
                             verbosity=logging.INFO,
                             **time_series_settings)

# run model and get best model

In [12]:
local_run = experiment.submit(automl_config, show_output=True)


Running on local machine
Parent Run ID: AutoML_ae679f6e-c321-4404-97d5-b791eec2b744
Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization. Beginning to featurize the CV split.
Current status: DatasetFeaturizationCompleted. Completed featurizing the CV split.
Current status: DatasetFeaturization.

### view local run summary

In [13]:
local_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-saleforecasting,AutoML_ae679f6e-c321-4404-97d5-b791eec2b744,automl,Running,Link to Azure Machine Learning studio,Link to Documentation


## get best model

In [14]:
best_run, fitted_model = local_run.get_output()
fitted_model

ForecastingPipelineWrapper(pipeline=Pipeline(memory=None,
     steps=[('timeseriestransformer', TimeSeriesTransformer(logger=None,
           pipeline_type=<TimeSeriesPipelineType.FULL: 1>)), ('stackensembleregressor', StackEnsembleRegressor(base_learners=[('18', Pipeline(memory=None,
     steps=[('standardscalerwrapper', <automl.client.core.runtime.model_wrapp...   random_state=None, selection='cyclic', tol=0.0001, verbose=0),
            training_cv_folds=5))]),
              stddev=None)

# monitoring runs

In [15]:
from azureml.widgets import RunDetails
RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…