# INSY 695 - AutoML using Ludwig

In [1]:
# Notebook setup: imports plus global pandas / matplotlib options.
import pandas as pd
import numpy as np
import os
import platform as pf
from IPython.display import display  # rich display of objects in cell output
# Turn off pandas' chained-assignment warning (the default setting is 'warn').
pd.options.mode.chained_assignment = None

# Packages for time-series analysis
import warnings
import itertools
import statsmodels.api as sm

# Date arithmetic, plotting, and statistics
from datetime import timedelta
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import math
from scipy import stats

# Use matplotlib's 'fivethirtyeight' style sheet for plots.
plt.style.use('fivethirtyeight')

## Import Data

In [16]:
#import helper file 
import helper
from helper import *

In [17]:
# Load the weather data via load_and_preprocess (presumably provided by the
# `from helper import *` star-import; its internals are not shown in this notebook).
df = load_and_preprocess()

In [18]:
# A bare last expression makes Jupyter render the frame; the output is pandas' truncated view.
df

Unnamed: 0,datetime,Description,Humidity,Wind Direction,Temperature,Pressure,Wind Speed,clouds,rain,mist,snow,shower,thunderstorm,fog,other,Intensity
1,2012-10-01 13:00:00,heavy clouds,93.0,1001.0,285.830000,230.0,4.0,1,0,0,0,0,0,0,0,3.0
2,2012-10-01 14:00:00,sky is clear,91.0,986.0,285.834650,230.0,4.0,0,0,0,0,0,0,0,0,0.0
3,2012-10-01 15:00:00,sky is clear,87.0,945.0,285.847790,231.0,4.0,0,0,0,0,0,0,0,0,0.0
4,2012-10-01 16:00:00,sky is clear,84.0,904.0,285.860929,233.0,4.0,0,0,0,0,0,0,0,0,0.0
5,2012-10-01 17:00:00,sky is clear,80.0,863.0,285.874069,234.0,3.0,0,0,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45248,2017-11-29 20:00:00,moderate clouds,64.0,1021.0,275.130000,300.0,10.0,1,0,0,0,0,0,0,0,2.0
45249,2017-11-29 21:00:00,moderate clouds,59.0,1023.0,274.130000,300.0,7.0,1,0,0,0,0,0,0,0,2.0
45250,2017-11-29 22:00:00,light clouds,66.0,1024.0,273.480000,290.0,7.0,1,0,0,0,0,0,0,0,1.0
45251,2017-11-29 23:00:00,light clouds,58.0,1026.0,272.480000,290.0,4.0,1,0,0,0,0,0,0,0,1.0


In [25]:
# Keep just the timestamp and temperature columns and renumber the rows from 0.
weather_df = df[['datetime', 'Temperature']].reset_index(drop=True)
weather_df

Unnamed: 0,datetime,Temperature
0,2012-10-01 13:00:00,285.830000
1,2012-10-01 14:00:00,285.834650
2,2012-10-01 15:00:00,285.847790
3,2012-10-01 16:00:00,285.860929
4,2012-10-01 17:00:00,285.874069
...,...,...
45247,2017-11-29 20:00:00,275.130000
45248,2017-11-29 21:00:00,274.130000
45249,2017-11-29 22:00:00,273.480000
45250,2017-11-29 23:00:00,272.480000


## Format into Train and Test sets 

Format the data for time-series prediction by specifying the train, validation, and test sets, and generate a new CSV file to store the formatted data in.

In [33]:
import numpy as np
import pandas as pd
from ludwig.utils.data_utils import add_sequence_feature_column

# Fractions of the series used for training and validation; the rest is the test set.
TRAIN_FRACTION = 0.6
VALIDATION_FRACTION = 0.2
# Number of preceding values used to predict each target value.
SEQUENCE_LENGTH = 20

# Keep only the temperature series and fill gaps: backfill first, then
# forward-fill whatever is still missing (e.g. trailing gaps).
ludwig_df = (
    weather_df[['Temperature']]
    .rename(columns={"Temperature": "temperature"})
    .bfill()
    .ffill()
)

# Normalize to zero mean and unit standard deviation.
# NOTE(review): the mean/std are computed over the full series, so validation
# and test rows influence the scaling; consider using training rows only.
ludwig_df.temperature = (
    (ludwig_df.temperature - ludwig_df.temperature.mean())
    / ludwig_df.temperature.std()
)

train_size = int(TRAIN_FRACTION * len(ludwig_df))
vali_size = int(VALIDATION_FRACTION * len(ludwig_df))

# Chronological split by row position: 0 = train, 1 = validation, 2 = test.
# The masks are built from ludwig_df's own row positions. Do not use `df.index`
# here: `df` is a different frame whose index labels start at 1, which would
# shift both split boundaries by one row.
row_position = np.arange(len(ludwig_df))
ludwig_df['split'] = 0
ludwig_df.loc[
    (row_position >= train_size) & (row_position < train_size + vali_size),
    'split'
] = 1
ludwig_df.loc[row_position >= train_size + vali_size, 'split'] = 2

# Prepare the time-series input feature column from the SEQUENCE_LENGTH
# preceding values (the return value is unused, so the frame is presumably
# extended in place with the `temperature_feature` column named in config.yaml).
add_sequence_feature_column(ludwig_df, 'temperature', SEQUENCE_LENGTH)
ludwig_df.to_csv('temperature_ludwig.csv')

## Time Series Forecast using Ludwig 

In [37]:
! pip install pyyaml



**Generate YAML config file** 

First, we store the configuration needed to model the time-series data in a YAML file (`config.yaml`). It specifies the input and output features and selects an RNN encoder for Ludwig. The cell below loads the file and prints its contents.

In [53]:
import yaml

# Read the Ludwig configuration from config.yaml into a plain Python dict and
# echo it so the feature definitions are visible in the notebook.
# NOTE(review): the printed config holds only plain mappings, lists and scalars,
# so yaml.safe_load would presumably work too; confirm before switching.
with open('config.yaml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

print(config)

{'input_features': [{'name': 'temperature_feature', 'type': 'timeseries', 'encoder': 'rnn', 'embedding_size': 32, 'state_size': 32}], 'output_features': [{'name': 'temperature', 'type': 'numerical'}]}


**Run Ludwig experiment in command line** 

Specify the formatted time-series CSV as the dataset, together with the config.yaml file, to run the experiment. The train, validation, and test losses are recorded at each epoch. Training stops early once 5 epochs have passed without any validation improvement, and the losses and R^2 score are recorded.

The logs are saved so that they can be visualized afterwards.

In [65]:
!ludwig experiment \
--dataset temperature_ludwig.csv \
--config_file config.yaml

  warn('no type annotations present -- not typechecking {}'.format(function_name(func)))
███████████████████████
█ █ █ █  ▜█ █ █ █ █   █
█ █ █ █ █ █ █ █ █ █ ███
█ █   █ █ █ █ █ █ █ ▌ █
█ █████ █ █ █ █ █ █ █ █
█     █  ▟█     █ █   █
███████████████████████
ludwig v0.3.3 - Experiment

2021-04-08 21:52:55.004941: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
Experiment name: experiment
Model name: run
Output directory: results/experiment_run


ludwig_version: '0.3.3'
command: ('/opt/anaconda3/bin/ludwig experiment --dataset temperature_ludwig.csv '
 '--config_file config.yaml')
commit_hash: '432035164631'
random_seed: 42
dataset: 'temperature_ludwig.csv'
data_format: 'csv'
config: {   'combiner': {'type': 'concat'},
    'input_features': [   {   'column': 'temperature_feature',
                              'embedding_size': 32,
                              'encoder': 'rnn',
                              'name': 'temperature_

In [69]:
!ludwig visualize --visualization learning_curves \
  --output_feature_name temperature \
  --training_statistics results/experiment_run/training_statistics.json \
  --model_names Model1