In [None]:
import pathlib
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from statsmodels.tsa.seasonal import seasonal_decompose
from tqdm import tqdm

In [None]:
# Read the taxi CSV with its first column parsed as a datetime index,
# put the rows in chronological order, then print a schema summary.
data_taxi = pd.read_csv("/datasets/taxi.csv", index_col=[0], parse_dates=[0]).sort_index()
data_taxi.info()

In [None]:
# Aggregate the data into one-hour bins by summing every column.
# The lowercase 'h' alias is used because recent pandas releases deprecate
# the uppercase 'H' spelling; older releases accept the lowercase form too.
data_taxi = data_taxi.resample('1h').sum()
# Shared figure size reused by the plotting cells below.
figsize = (18, 10)
data_taxi.plot(figsize=figsize)

In [None]:
# Zoom in on 25-31 August 2018 using an explicit label-based row slice.
data_taxi.loc['2018-08-25':'2018-08-31'].plot(figsize=figsize)

In [None]:
# Decompose the full resampled series and plot its trend component.
decomposed_data_taxi = seasonal_decompose(data_taxi)
decomposed_data_taxi.trend.plot(figsize=figsize)

In [None]:
# Seasonal component of whichever decomposition `decomposed_data_taxi` currently holds.
decomposed_data_taxi.seasonal.plot(figsize=figsize)

In [None]:
# Residual component of whichever decomposition `decomposed_data_taxi` currently holds.
decomposed_data_taxi.resid.plot(figsize=figsize)

In [None]:
# Decompose only the 25-31 August 2018 slice and plot its trend.
# NOTE: this rebinds `decomposed_data_taxi`, so any cell that reads it after
# this point sees the one-week decomposition, not the full-series one.
decomposed_data_taxi = seasonal_decompose(data_taxi['2018-08-25':'2018-08-31'])
decomposed_data_taxi.trend.plot(figsize=figsize)

In [None]:
# Seasonal component of whichever decomposition `decomposed_data_taxi` currently holds.
decomposed_data_taxi.seasonal.plot(figsize=figsize)

In [None]:
# Residual component of whichever decomposition `decomposed_data_taxi` currently holds.
decomposed_data_taxi.resid.plot(figsize=figsize)

In [None]:
# First-order differences of the order counts, plotted with the rolling
# mean and standard deviation of those same differences.
# Only `num_orders` is selected so the cell gives the same result if it is
# re-run after extra feature columns have been added to `data_taxi`.
orders = data_taxi[['num_orders']]
taxi_shift = orders - orders.shift()
# Rolling statistics are taken over the differenced series (24-row window),
# so all three plotted lines describe the same quantity.
taxi_shift['mean'] = taxi_shift['num_orders'].rolling(24).mean()
taxi_shift['std'] = taxi_shift['num_orders'].rolling(24).std()
taxi_shift.plot(figsize=figsize)

In [None]:
max_lag = 24
rolling_mean_size = 48


def make_features(data, column, max_lag, rolling_mean_size):
    """Add calendar, lag and rolling-mean feature columns to ``data`` in place.

    Args:
        data: DataFrame whose index exposes ``day`` and ``dayofweek``.
        column: Name of the column the lag and rolling features are built from.
        max_lag: Number of lag columns to create (``lag_1`` .. ``lag_<max_lag>``).
        rolling_mean_size: Window length of the rolling mean.

    Returns:
        None. The new columns are written directly into ``data``.
    """
    timestamps = data.index
    data['day'] = timestamps.day
    data['dayofweek'] = timestamps.dayofweek

    for step in range(1, max_lag + 1):
        data[f'lag_{step}'] = data[column].shift(step)

    # Shift by one row first so the window never includes the current row's value.
    previous_values = data[column].shift()
    data['rolling_mean'] = previous_values.rolling(rolling_mean_size).mean()

In [None]:
# Snapshot the frame before make_features() adds columns to it in place.
data_taxi_copy = data_taxi.copy()

make_features(data_taxi, 'num_orders', max_lag, rolling_mean_size)

# Chronological split: no shuffling, the last 10% of rows form the test set.
train, test = train_test_split(
    data_taxi,
    test_size=0.1,
    shuffle=False,
    random_state=1515,
)
# Drop the training rows that contain NaN.
train = train.dropna()

target_train = train['num_orders']
features_train = train.drop(columns='num_orders')
target_test = test['num_orders']
features_test = test.drop(columns='num_orders')

In [None]:
# Display the training target series as a quick sanity check.
target_train

In [None]:
def create_sequences(X, y, time_steps=2):
    """Cut ``X`` into overlapping windows and pair each with the summed targets of that window.

    Window ``i`` covers rows ``i .. i + time_steps - 1`` of ``X``; its target is
    the sum of ``y`` over the same rows. Start indices run over
    ``range(len(X) - time_steps)``, so the window that begins at
    ``len(X) - time_steps`` is not emitted.

    Args:
        X: Sliceable sequence of feature rows.
        y: Sliceable sequence of targets aligned with ``X``.
        time_steps: Number of consecutive rows per window.

    Returns:
        Tuple ``(X_sequences, y_sequences)`` of NumPy arrays.
    """
    window_starts = range(len(X) - time_steps)
    X_sequences = [X[start:start + time_steps] for start in window_starts]
    y_sequences = [sum(y[start:start + time_steps]) for start in window_starts]
    return np.array(X_sequences), np.array(y_sequences)

In [None]:
# Windowed copies of the train and test sets; the trailing underscore marks
# the sequence versions, the tabular frames stay untouched.
features_train_, target_train_ = create_sequences(
    features_train.to_numpy(),
    target_train.to_numpy(),
)
features_test_, target_test_ = create_sequences(
    features_test.to_numpy(),
    target_test.to_numpy(),
)

In [None]:
# Baseline: ordinary linear regression on the tabular features.
model = LinearRegression()
model.fit(features_train, target_train)
target_predict = model.predict(features_test)
final_RMSE_LR = mean_squared_error(target_test, target_predict) ** 0.5

# Keep the linear predictions under their own name without predicting a
# second time; `target_predict` is rebound when a later model is evaluated.
liniar_prediction = target_predict

# Actual vs. predicted on the test period, with a legend so the two lines
# can be told apart.
plt.figure(figsize=figsize)
plt.plot(target_test.index, target_test, 'b-', label='Actual')
plt.plot(target_test.index, target_predict, 'r-', label='Predicted')
plt.legend()

In [None]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
# Show which device was selected.
device

In [None]:
class LSTMModel(nn.Module):
    """Two-layer LSTM followed by a small dense head that emits a single value.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the LSTM hidden state and of the dense head.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.lstm_1 = nn.LSTM(input_size, hidden_size, num_layers=2, batch_first=True)
        self.relu = nn.LeakyReLU()
        self.norm = nn.LayerNorm(hidden_size)
        self.fc_in = nn.Linear(hidden_size, hidden_size)
        self.fc_out = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Run one sequence through the network.

        ``x`` gets a leading axis of size 1 added before it enters the LSTM.
        The result is the head's output at the last position along dim 1.
        """
        # Pass through the LSTM stack.
        hidden, _state = self.lstm_1(x.unsqueeze(0))
        # Two rounds of activation -> layer norm -> dense; both rounds reuse
        # the same `norm` and `fc_in` modules.
        for _round in range(2):
            hidden = self.fc_in(self.norm(self.relu(hidden)))
        prediction = self.fc_out(self.relu(hidden))
        return prediction[:, -1]
# Size the LSTM input from the feature frame instead of hard-coding the
# column count, so changing the lag settings cannot silently break the model.
model = LSTMModel(features_train.shape[1], 14).to(device)

In [None]:
# Move the training sequences to the selected device as float32 tensors.
# as_tensor() accepts arrays and existing tensors alike, so re-running this
# cell does not trigger the copy-construct warning that torch.tensor() emits
# when it is handed a tensor.
features_train_ = torch.as_tensor(features_train_, dtype=torch.float32, device=device)
target_train_ = torch.as_tensor(target_train_, dtype=torch.float32, device=device)

In [None]:
# Train the network one sequence at a time (one optimizer step per sequence).
criterion = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    # zip() has no length, so pass the total explicitly to get a real progress bar.
    for inputs, targets in tqdm(zip(features_train_, target_train_), total=len(features_train_)):
        optimizer.zero_grad()
        outputs = model(inputs)
        # Give the prediction the target's shape so MSELoss compares like with
        # like instead of broadcasting (and warning) on every step.
        loss = criterion(outputs.reshape(targets.shape), targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    average_loss = total_loss / len(features_train_)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {average_loss:.4f}')
# Switch to inference mode for the evaluation cells that follow.
model.eval()

In [None]:
# Move the test sequences to the selected device as a float32 tensor.
# as_tensor() is safe to re-run: an input that is already a tensor is
# accepted without a copy-construct warning.
features_test_ = torch.as_tensor(features_test_, dtype=torch.float32, device=device)

In [None]:
# Container for the network's test-set predictions.
target_predict_ = []

In [None]:
# Build the prediction list from scratch so re-running this cell cannot
# append a second copy of the predictions. Gradients are not needed for
# inference, so the forward passes run under no_grad().
# NOTE(review): astype('int') truncates the predictions; kept so the output
# format is unchanged.
with torch.no_grad():
    target_predict_ = [
        model(sequence).cpu().numpy().astype('int')[0]
        for sequence in tqdm(features_test_)
    ]

In [None]:
# Actual vs. predicted windowed targets, plotted against the window number.
plt.figure(figsize=figsize)
window_numbers = range(len(target_predict_))
plt.plot(window_numbers, target_test_, 'b-', label='Actual')
plt.plot(window_numbers, target_predict_, 'r-', label='Predicted')
# The labels above are only rendered once a legend is requested.
plt.legend()

In [None]:
# RMSE via an explicit square root: the `squared=False` keyword is deprecated
# and later removed in newer scikit-learn releases, and `** 0.5` matches the
# other RMSE computations in this notebook.
# The result is halved because the targets were summed pairwise.
print(f"{mean_squared_error(target_test_, target_predict_) ** 0.5 / 2:.4f}")

In [None]:
# Grid search over decision-tree depth and minimum leaf size.
regressor = DecisionTreeRegressor()
max_depth_list = list(range(2, 15))
hyperparams = [{'max_depth': max_depth_list,
                "min_samples_leaf": [2, 3],
                'random_state': [1515]}]

print('# Tuning hyper-parameters for root_mean_squared_error')
print()
# The rows are time-ordered, so validate with forward-chaining splits: each
# fold trains on earlier rows and validates on later ones. The default K-fold
# scheme would validate some folds on rows that precede the training data.
clf = GridSearchCV(regressor, hyperparams,
                   scoring='neg_mean_squared_error',
                   cv=TimeSeriesSplit(n_splits=5))
clf.fit(features_train, target_train)
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
# Scores are negated MSE values; flip the sign and take the root to get RMSE.
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.6f for %r" % ((mean * -1) ** 0.5, params))
print()

# Best (lowest) cross-validated RMSE across the grid.
cv_RMSE_DTR = (max(means) * -1) ** 0.5

In [None]:
# Fit the final tree with the hyper-parameters chosen by the grid search
# (`clf`) instead of hard-coded values, so this cell cannot drift from the
# search result.
model = DecisionTreeRegressor(**clf.best_params_)
model.fit(features_train, target_train)
target_predict = model.predict(features_test)
final_RMSE_DTR = mean_squared_error(target_test, target_predict) ** 0.5

# Actual vs. predicted on the test period, with a legend so the two lines
# can be told apart.
plt.figure(figsize=figsize)
plt.plot(target_test.index, target_test, 'b-', label='Actual')
plt.plot(target_test.index, target_predict, 'r-', label='Predicted')
plt.legend()

In [None]:
# Test RMSE of the linear-regression baseline. An explicit square root is
# used because the `squared=False` keyword is deprecated and later removed in
# newer scikit-learn releases.
print(mean_squared_error(target_test, liniar_prediction) ** 0.5)