In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from tinyshift.series import trend_significance
from utilsforecast.preprocessing import fill_gaps
from tinyshift.plot import corr_heatmap
import plotly.express as px
from utils import remove_leading_zeros, is_obsolete, forecastability, plot_acf_pacf_adf, add_in_date_information

In [2]:
# Monthly airline-passengers CSV; columns are renamed to ds / y and a constant
# unique_id column is added so the frame carries id, timestamp and target columns.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/airline-passengers.csv'
df = (
    pd.read_csv(url, parse_dates=['Month'])
    .assign(unique_id="1")
    .rename(columns={"Month": "ds", "Passengers": "y"})
)

In [3]:
# Number of missing values in each column.
df.isna().sum()

ds           0
y            0
unique_id    0
dtype: int64

In [4]:
# Display the loaded frame (columns ds, y, unique_id after the rename).
df

Unnamed: 0,ds,y,unique_id
0,1949-01-01,112,1
1,1949-02-01,118,1
2,1949-03-01,132,1
3,1949-04-01,129,1
4,1949-05-01,121,1
...,...,...,...
139,1960-08-01,606,1
140,1960-09-01,508,1
141,1960-10-01,461,1
142,1960-11-01,390,1


In [5]:
# Run utilsforecast's fill_gaps on the frame, keyed by unique_id / ds.
# NOTE(review): the timestamps displayed above are month-start (e.g. 1949-01-01)
# while freq="ME" is the month-end alias — confirm this is the intended frequency.
df = fill_gaps(
    df,
    freq="ME",
    end="per_serie",
    id_col="unique_id",
    time_col="ds",
)

In [6]:
# Apply remove_leading_zeros to each unique_id group, then rebuild a flat
# 0..n-1 index so the group key does not stay in the index.
df = (
    df.groupby("unique_id")[df.columns]
    .apply(remove_leading_zeros)
    .reset_index(drop=True)
)

In [7]:
# Threshold forwarded to is_obsolete as its extra positional argument
# (presumably a number of days, judging by the name — confirm in utils).
days_obsoletes = 180
obsolete_series = (
    df.groupby("unique_id")[df.columns]
    .apply(is_obsolete, days_obsoletes)
)

In [8]:
# Keep only the ids whose is_obsolete result is True.
# NOTE(review): "obsolote" is a misspelling of "obsolete"; the name is kept
# because the next cell refers to it.
obsolote_ids = obsolete_series.loc[obsolete_series].index.tolist()

In [9]:
# Display the ids selected from obsolete_series (empty list in the saved output).
obsolote_ids

[]

In [10]:
# Evaluate tinyshift's trend_significance on the target of each series.
(
    df.groupby("unique_id")["y"]
    .apply(trend_significance)
)

unique_id
1    (0.853638165603188, 4.020274506593391e-61)
Name: y, dtype: object

In [11]:
# Line chart of y over ds, one facet column per unique_id.
px.line(
    df,
    x="ds",
    y="y",
    facet_col="unique_id",
)

In [12]:
# Line chart of y over ds, one facet column per unique_id.
# NOTE(review): this cell is identical to the previous one and looks redundant.
px.line(
    df,
    x="ds",
    y="y",
    facet_col="unique_id",
)

In [13]:
# Compute the forecastability metrics (helper from utils) for each series' target.
(
    df.groupby("unique_id")["y"]
    .apply(forecastability)
)

unique_id                   
1          foreCA                                               0.571263
           theoretical_limit                                    0.086973
           stability_index                                      0.183962
           adi_cv                             (1.0, 0.18190737443351804)
           hurst_exponent       (0.5203628364992473, 0.8044229708616066)
Name: y, dtype: object

# ADF Test - Order of Diff 1

In [14]:
def generate_lag(X, lag=1):
    """Return the lag-``lag`` difference of a one-dimensional series.

    Despite the name, this does not shift the series: element ``i`` of the
    result is ``X[i] - X[i - lag]``. The first ``lag`` positions, which have
    no predecessor to subtract, are filled with NaN so the output always has
    the same length as the input.

    Parameters
    ----------
    X : array-like
        One-dimensional numeric data; it is converted to a float64 array.
    lag : int, default 1
        Distance between the two elements being subtracted. Must be >= 1.

    Returns
    -------
    numpy.ndarray
        float64 array with the same length as ``X``. It is entirely NaN when
        ``lag`` is greater than or equal to ``len(X)``.

    Raises
    ------
    ValueError
        If ``X`` is not one-dimensional or ``lag`` is smaller than 1.
    """
    X = np.asarray(X, dtype=np.float64)

    # Reject scalars (ndim == 0) as well as matrices; slicing needs exactly 1-D.
    if X.ndim != 1:
        raise ValueError("Input array must be one-dimensional.")

    # lag == 0 would make X[:-lag] empty, and a negative lag has no meaning here.
    if lag < 1:
        raise ValueError("lag must be a positive integer.")

    n_obs = X.shape[0]
    diffed = np.full(n_obs, np.nan)

    # Only fill in differences when at least one pair (i, i - lag) exists;
    # otherwise keep the all-NaN result so the length still matches the input.
    if lag < n_obs:
        diffed[lag:] = X[lag:] - X[:-lag]

    return diffed

In [15]:
# Difference each series on its own so values never leak across unique_id
# boundaries (every other step in this notebook is also grouped by unique_id).
# Despite their names, "lag_1" holds the first difference of y and "lag_12"
# holds the 12-step difference of that first difference.
df["lag_1"] = df.groupby("unique_id")["y"].transform(generate_lag, lag=1)
df["lag_12"] = df.groupby("unique_id")["lag_1"].transform(generate_lag, lag=12)
# The NaNs produced by differencing are not dropped here; the line below stays disabled.
#df = df.dropna()

In [16]:
# Display the frame with the new lag_1 and lag_12 columns.
df

Unnamed: 0,unique_id,ds,y,lag_1,lag_12
0,1,1949-01-01,112,,
1,1,1949-02-01,118,6.0,
2,1,1949-03-01,132,14.0,
3,1,1949-04-01,129,-3.0,
4,1,1949-05-01,121,-8.0,
...,...,...,...,...,...
139,1,1960-08-01,606,-16.0,-27.0
140,1,1960-09-01,508,-98.0,-2.0
141,1,1960-10-01,461,-47.0,9.0
142,1,1960-11-01,390,-71.0,-26.0


In [17]:
#df.groupby("unique_id")["lag_1"].apply(adfuller)

# Domingo

- #TODO: Adicionar gráfico de STL
- #TODO: Interpretar
- #TODO: Adicionar gráficos da aula do Valeriy
- #TODO: Comparativo de dados diferenciados
- #TODO: Trazer gráficos de ipynbs dele

In [18]:
# Run the plot_acf_pacf_adf helper (from utils) on the raw target and on
# both differenced columns.
plot_acf_pacf_adf(
    df,
    ["y", "lag_1", "lag_12"],
)

## Features
- STL
- ~~Holidays~~
- ~~Série de fourier para padrões cíclicos (seno e cosseno) - dia da semana, dia do ano~~
- ~~mês~~
- ~~fim do mês~~
- ~~Lags~~
- Rolling Windows: médias, desvio padrão e entropia de permutação

In [19]:
# Run add_in_date_information (from utils) keyed on the "ds" column, then drop
# every row that still contains a NaN (e.g. the leading rows of the lag columns).
df = add_in_date_information(df, "ds").dropna()

In [20]:
# Display the frame after add_in_date_information and dropna were applied.
df

Unnamed: 0,unique_id,ds,y,lag_1,lag_12,month,is_holiday,is_month_end,dow_sin,dow_cos,yr_sin,yr_cos
13,1,1950-02-01,126,11.0,5.0,2,False,False,0.974928,-0.222521,-8.660254e-01,-0.500000
14,1,1950-03-01,141,15.0,1.0,3,False,False,0.974928,-0.222521,-4.777360e-15,1.000000
15,1,1950-04-01,135,-6.0,-3.0,4,False,False,-0.974928,-0.222521,-5.000000e-01,-0.866025
16,1,1950-05-01,125,-10.0,-2.0,5,True,False,0.000000,1.000000,5.000000e-01,0.866025
17,1,1950-06-01,149,24.0,10.0,6,False,False,0.433884,-0.900969,-8.660254e-01,-0.500000
...,...,...,...,...,...,...,...,...,...,...,...,...
139,1,1960-08-01,606,-16.0,-27.0,8,False,False,0.000000,1.000000,-8.660254e-01,0.500000
140,1,1960-09-01,508,-98.0,-2.0,9,False,False,0.433884,-0.900969,5.000000e-01,-0.866025
141,1,1960-10-01,461,-47.0,9.0,10,False,False,-0.974928,-0.222521,-5.000000e-01,0.866025
142,1,1960-11-01,390,-71.0,-26.0,11,False,False,0.781831,0.623490,9.798412e-15,-1.000000


In [None]:
# Correlation heatmap over everything from the third column onward; the first
# two columns (unique_id and ds in the frame displayed above) are skipped.
corr_heatmap(
    df.loc[:, df.columns[2:]],
    fig_type="png",
)

# Modelagem (Decomposição + Modelagem Residual)

1. modelo estatístico + ML residual
2. ver como fazer tendência aditiva, sazonalidade multiplicativa
3. ver como fazer tendência aditiva, sazonalidade aditiva