#### **Python env**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import numpy as np

from sklearn.linear_model import LinearRegression

#### **Import data**

In [None]:
# Read the book-sales CSV with "Date" parsed as a datetime index,
# then drop the "Paperback" column.
book_sales_csv = "store-sales-time-series-forecasting/book_sales.csv"
df = pd.read_csv(book_sales_csv, index_col = "Date", parse_dates = ["Date"])
df = df.drop(columns = ["Paperback"])
df.head()

#### **Linear regression**

The features of interest for this problem are time and lags. To solve the problem, a combination of the two could be used.

In [None]:
# Time dummy: consecutive integers 0, 1, 2, ... with one value per row.
df["Time"] = range(len(df))

In [None]:
# "seaborn-whitegrid" was renamed to "seaborn-v0_8-whitegrid" in Matplotlib 3.6
# and the old alias was later removed, so use whichever name this installation
# provides (falling back to the default style if neither is available).
whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(whitegrid_style)

# Figure-level defaults shared by the plots below.
plt.rc(
    "figure",
    autolayout = True,
    figsize = (11, 4),
    titlesize = 18,
    titleweight = "bold"
)

# Axes-level defaults: bold labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)

# Grey line for the raw series, with a regression fit (95% confidence band)
# and the scatter points drawn on the same axes.
fig, ax = plt.subplots()
ax.plot("Time", "Hardcover", data = df, color = '0.7')
sns.regplot(x = "Time", y = "Hardcover", data = df, ci = 95, scatter_kws = dict(color = "0.25"), ax = ax)
ax.set_title('Time Plot of Hardcover Sales')
ax.grid(False);

In [None]:
# Regress Hardcover sales on the time dummy alone.
# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
time_feature = df["Time"].to_numpy().reshape(-1, 1)
hardcover_target = df["Hardcover"].to_numpy()

lr = LinearRegression().fit(time_feature, hardcover_target)

# Slope and intercept of the fitted line.
lr.coef_[0], lr.intercept_

In [None]:
# Show the raw target values as a NumPy array.
df["Hardcover"].to_numpy()

In [None]:
# One-step lag of the target: each row receives the previous row's value,
# so the first row of Lag_1 is NaN.
df["Lag_1"] = df["Hardcover"].shift(periods = 1)
df.head()

In [None]:
# If the series is correlated with its own lag, that lag should be taken
# into account in the analysis. This is serial dependence: the sales of the
# previous day carry information about the sales of the current day.

fig, ax = plt.subplots()
lag_pairs = df.loc[:, ["Lag_1", "Hardcover"]]
sns.regplot(
    x = "Lag_1",
    y = "Hardcover",
    data = lag_pairs,
    ci = 95,
    scatter_kws = dict(color = "0.25"),
    ax = ax,
)
ax.set_title('Lag Plot of Hardcover Sales')
ax.grid(False);

In [None]:
# Regress Hardcover sales on the one-step lag alone.
# The first row is skipped because it has no lagged value.
rows_with_lag = df.iloc[1:]
lag_feature = rows_with_lag["Lag_1"].to_numpy().reshape(-1, 1)
hardcover_target = rows_with_lag["Hardcover"].to_numpy()

lr = LinearRegression().fit(lag_feature, hardcover_target)

# Slope and intercept of the fitted line.
lr.coef_[0], lr.intercept_

In [None]:
# Regress Hardcover sales on both the time dummy and the one-step lag.
# The first row is skipped because it has no lagged value.
lr = LinearRegression()

lr.fit(X = df[["Time", "Lag_1"]].values[1:,:], y = df["Hardcover"].values[1:])

# Two features were fitted, so show the whole coefficient vector
# (order: Time, Lag_1) rather than only its first entry.
lr.coef_, lr.intercept_

#### **Trend**

A _trend_ represents the change in the mean of a time series and it is the slowest-moving part of a series.

Generally speaking, a trend is any slow-moving, persistent change in a series; it can involve the mean but also other statistics, such as the median. Moreover, 
it can be linear, or it can take the form of a persistent, slow-moving seasonal change. 

So, in order to highlight the type of trend, a rolling mean can be computed so that short-term fluctuations are smoothed out. For this to work, the width of
the rolling window should be larger than the seasonal period.   

In [None]:
# Load the tunnel dataset; from here on `df` refers to this frame.
# "Day" is converted to datetime and a 0-based time dummy is appended.
df = pd.read_csv("archive/tunnel.csv")
df = df.assign(
    Day = pd.to_datetime(df["Day"]),
    Time = range(len(df)),
)
df.head()

In [None]:
# "seaborn-whitegrid" was renamed to "seaborn-v0_8-whitegrid" in Matplotlib 3.6
# and the old alias was later removed, so use whichever name this installation
# provides (falling back to the default style if neither is available).
whitegrid_style = next(
    (
        name
        for name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if name in plt.style.available
    ),
    "default",
)
plt.style.use(whitegrid_style)

# Figure-level defaults shared by the plots below.
plt.rc(
    "figure",
    autolayout = True,
    figsize = (11, 4),
    titlesize = 18,
    titleweight = "bold"
)

# Axes-level defaults: bold labels and titles.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=16,
    titlepad=10,
)


In [None]:
# Grey line for the raw vehicle counts, with a regression fit (95% confidence
# band) and the scatter points drawn on the same axes.
fig, ax = plt.subplots()
ax.plot("Time", "NumVehicles", data = df, color = '0.7', zorder = 0)
sns.regplot(x = "Time", y = "NumVehicles", data = df, ci = 95, scatter_kws = dict(color = "0.25"), ax = ax)
# The plotted column is NumVehicles, so the title must not mention the
# Hardcover sales used earlier in the notebook.
ax.set_title('Time Plot of Tunnel Traffic')
ax.grid(False);

In [None]:
# Smooth only the numeric target. Rolling over the whole frame would also try
# to average the datetime "Day" column, which pandas 2.x rejects with an error
# (older versions silently dropped such columns).
moving_average = df[["NumVehicles"]].rolling(
    window = 365,        # 365-row window; presumably one year of daily data - TODO confirm
    center = True,       # centre the window on each observation
    min_periods = 183    # about half the window, so the edges still get a value
).mean()

# Raw series with the moving average drawn on top as a thicker line.
ax = df["NumVehicles"].plot()
moving_average["NumVehicles"].plot(ax = ax, linewidth = 3)
ax.grid(False);

# The moving average suggests that the trend is roughly linear.

In [None]:
# Once the trend has been identified it can be modelled: here the features
# are obtained from a deterministic process.

from statsmodels.tsa.deterministic import DeterministicProcess

dp = DeterministicProcess(
    index = df["Time"],  # time dummy used as the index of the process
    constant = False,    # no constant (bias) column among the features
    order = 1,           # polynomial order of the trend: 1 = linear
    drop = True,         # drop terms where needed to avoid collinearity
)

# Features generated for the training (in-sample) period.

X = dp.in_sample()
X.head(5)

In [None]:
# Fit a linear trend to the vehicle counts using the in-sample features.
y = df["NumVehicles"]

# The deterministic process was built without a constant column, so the
# intercept is estimated by the regression itself.
model = LinearRegression(fit_intercept = True)
model.fit(X, y)

# In-sample fitted values, aligned with the feature index.
y_pred = pd.Series(model.predict(X), index = X.index)

# Keep the forecast features under their own name. Rebinding `X` here would
# make a re-run of this cell fit the model on the 30 forecast rows instead of
# the training features.
X_fore = dp.out_of_sample(steps = 30)

# Forecast for the 30 steps following the training period.
y_fore = pd.Series(model.predict(X_fore), index = X_fore.index)

In [None]:
# Overlay the observed series, the fitted trend and the forecast on one axes.
fig, ax = plt.subplots(figsize = (10, 5))
ax.plot(y, color = "red", alpha = .7, label = "Data")
ax.plot(y_pred, color = "black", alpha = .7, label = "In-sample predictions")
ax.plot(y_fore, color = "black", ls = "--", alpha = .7, label = "Out-sample predictions")
ax.legend()
ax.grid(False)