# Forecasting Time Series Analysis

In [3]:
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import ipywidgets as widgets
from IPython.display import display

# Set up plotly's offline notebook mode so the py.iplot call further down can render its figure inline.
py.init_notebook_mode(connected=True)

In [4]:
# Read the pre-processed series and parse the DATA column into datetimes
df = pd.read_csv('../data/processed/forecasting_data.csv').assign(
    DATA=lambda frame: pd.to_datetime(frame['DATA'])
)

# Summarise columns, dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5033 entries, 0 to 5032
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   DATA               5033 non-null   datetime64[ns]
 1   VALOR_REEMBOLSADO  5033 non-null   float64       
 2   COUNT              5033 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 118.1 KB


In [6]:
# Count rows per calendar year: a complete year has 365 rows (366 in a leap year)
date_col = df['DATA']
date_col.groupby(date_col.dt.year).count()


DATA
2009    285
2010    365
2011    365
2012    366
2013    365
2014    365
2015    365
2016    366
2017    365
2018    365
2019    365
2020    366
2021    365
2022    365
Name: DATA, dtype: int64

## Fixing year 2009
The counts above show that 2009 has only 285 rows, so some dates are missing from the time series. To handle this, we first need to find which dates are missing and then decide how to deal with them.


In [8]:
# Dates actually present in the data for 2009
dates_2009 = df.loc[df['DATA'].dt.year == 2009, 'DATA']

# Every calendar day of 2009, minus the ones present, leaves the missing days
full_2009 = pd.date_range(start='2009-01-01', end='2009-12-31')
missing_days = full_2009.difference(dates_2009)
missing_days

DatetimeIndex(['2009-01-01', '2009-01-02', '2009-01-03', '2009-01-04',
               '2009-01-05', '2009-01-06', '2009-01-07', '2009-01-08',
               '2009-01-09', '2009-01-10', '2009-01-11', '2009-01-12',
               '2009-01-13', '2009-01-14', '2009-01-15', '2009-01-16',
               '2009-01-17', '2009-01-18', '2009-01-19', '2009-01-20',
               '2009-01-21', '2009-01-22', '2009-01-23', '2009-01-24',
               '2009-01-25', '2009-01-27', '2009-01-28', '2009-01-29',
               '2009-01-30', '2009-01-31', '2009-02-01', '2009-02-02',
               '2009-02-03', '2009-02-04', '2009-02-05', '2009-02-06',
               '2009-02-07', '2009-02-08', '2009-02-09', '2009-02-10',
               '2009-02-11', '2009-02-12', '2009-02-13', '2009-02-14',
               '2009-02-15', '2009-02-16', '2009-02-17', '2009-02-18',
               '2009-02-19', '2009-02-20', '2009-02-21', '2009-02-22',
               '2009-02-23', '2009-02-24', '2009-02-25', '2009-02-26',
      

We can see above that up until April, most of the dates are missing. Since the missing dates are concentrated in those first months, the approach chosen was to drop the few dates available from these months and begin the time series in April.

In [9]:
# Keep only the rows dated on or after 2009-04-01. The explicit .copy() makes the
# result an independent frame, so later column assignments on df do not trigger
# pandas' SettingWithCopyWarning.
df = df[df['DATA'] >= '2009-04-01'].copy()

## Plotting the data

In [10]:
# Axis styling used as the starting point for both chart axes
default_layout = {
    # Fonts for the axis title and the tick labels
    "titlefont": {"size": 18, "color": "darkblue"},
    "tickfont": {"size": 14, "color": "black"},
    # Which axis decorations are drawn
    "showgrid": True,
    "zeroline": True,
    "showline": True,
    "mirror": True,
    # Grid, zero-line and axis-line appearance
    "gridcolor": "lightgrey",
    "gridwidth": 1,
    "zerolinecolor": "grey",
    "zerolinewidth": 2,
    "linecolor": "black",
    "linewidth": 2,
}

In [11]:
# Build each axis layout from the shared defaults plus that axis' own title
xaxis_layout = {**default_layout, "title": "Data"}
yaxis_layout = {**default_layout, "title": "Valor reembolsado (R$)"}

In [13]:
# Line trace of the reimbursed amount (VALOR_REEMBOLSADO) against the date (DATA)
trace = go.Scatter(
    x=df["DATA"],
    y=df["VALOR_REEMBOLSADO"],
    mode="lines",
    marker=dict(size=10, color="blue", symbol="circle"),
    line=dict(width=2, color="blue"),
)

# Chart layout. The title text and font are nested under `title` because the flat
# layout-level `titlefont` attribute is deprecated in plotly in favour of `title.font`.
layout = go.Layout(
    title=dict(
        text="Reimbursement by Year",
        font=dict(size=24, color="darkblue"),
    ),
    xaxis=xaxis_layout,
    yaxis=yaxis_layout,
    hovermode="closest",
    plot_bgcolor="white",
)

# Render the figure inline in the notebook
py.iplot({"data": [trace], "layout": layout})

## Checking stationarity of the time series
A time series is stationary when its statistical properties, like its mean and variance, do not change over time.
![stationary series](https://miro.medium.com/v2/resize:fit:720/format:webp/0*Dyml4bSlkE5WHdcc)

One way to check whether a time series is stationary is to use the [Dickey-Fuller Test](https://medium.com/@ritusantra/tests-for-stationarity-in-time-series-dickey-fuller-test-augmented-dickey-fuller-adf-test-d2e92e214360).

In [20]:
# Preview the rows as a plain array with ISO-formatted date strings. The formatted
# dates exist only in this temporary frame: df['DATA'] keeps its datetime dtype, so
# the cell can be re-run safely (the .dt accessor fails on a column that has already
# been converted to strings) and later cells still see real datetimes.
df.assign(DATA=df['DATA'].dt.strftime('%Y-%m-%d')).values

array([['2009-04-01', 50218.99, 45],
       ['2009-04-02', 24003.67, 50],
       ['2009-04-03', 9656.97, 52],
       ...,
       ['2022-12-29', 23237.61, 18],
       ['2022-12-30', 155183.29, 14],
       ['2022-12-31', 17113.68, 7]], dtype=object)

In [21]:
# Augmented Dickey-Fuller stationarity test, currently disabled (commented out).
# NOTE(review): the ValueError recorded under this cell ("could not convert string to
# float: '2009-04-01'") comes from `X = df.values`, which hands adfuller every column
# of df, including the date strings. Passing only the numeric series, e.g.
# `X = df['VALOR_REEMBOLSADO'].values`, should avoid it - confirm before re-enabling.
# from statsmodels.tsa.stattools import adfuller
# X = df.values
# result = adfuller(X)
# print('ADF Statistic: %f' % result[0])
# print('p-value: %f' % result[1])
# print('Critical Values:')
# for key, value in result[4].items():
# 	print('\t%s: %.3f' % (key, value))

ValueError: could not convert string to float: '2009-04-01'