In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
import pycaret
from pycaret.regression import *

import matplotlib.pyplot as plt
import plotly.express as px
import tensorflow as tf
from sklearn.base import BaseEstimator, RegressorMixin
import xgboost as xgb
import plotly.graph_objects as go

# Preprocessing

In [4]:
# načtení datasetu z CSV souboru
data = pd.read_csv('procraft_corrosion_kbely_data_v02.csv')
dataframe = pd.DataFrame(data)
datetime = dataframe['Datetime']
dataframe = dataframe.drop(columns=['corrosion', 'Datetime'])
target = dataframe['corrosion_diff']



# Vyhlazení dat pomocí klouzavého průměru
dataframe = dataframe.rolling(window=10).mean()



# Normalizace dat
scaler = MinMaxScaler(feature_range=(0, 1))
dataframe = pd.DataFrame(scaler.fit_transform(dataframe), columns=dataframe.columns)
dataframe.fillna(0, inplace=True)

# Rozdělení dat na roční období pro trénování a zbytek pro ověření modelu
one_year = datetime[8761]
one_year_data = dataframe[:8762]


# Vlastní model

In [5]:
from pycaret.regression import add_metric
from sklearn.base import BaseEstimator, RegressorMixin
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

class LSTMRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, input_shape=(10, 1), units=50, epochs=10, batch_size=32, learning_rate=0.001):
        self.input_shape = input_shape
        self.units = units
        self.epochs = epochs
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.model = None

    def build_model(self):
        model = Sequential()
        model.add(LSTM(self.units, input_shape=self.input_shape))
        model.add(Dense(1))
        model.compile(optimizer=Adam(learning_rate=self.learning_rate), loss='mse')
        return model

    def fit(self, X, y):
        X = X.values.reshape((X.shape[0], X.shape[1], 1))  
        self.model = self.build_model()
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        X = X.values.reshape((X.shape[0], X.shape[1], 1))  
        return self.model.predict(X).flatten()

# Porovnávání modelů


In [6]:
exp = RegressionExperiment()
exp.setup(data=one_year_data, target='corrosion_diff')
models = exp.models().index.tolist()
models.append(LSTMRegressor())
best_model = exp.compare_models(include = models, sort='R2', turbo=False)

Unnamed: 0,Description,Value
0,Session id,1497
1,Target,corrosion_diff
2,Target type,Regression
3,Original data shape,"(8762, 25)"
4,Transformed data shape,"(8762, 25)"
5,Transformed train set shape,"(6133, 25)"
6,Transformed test set shape,"(2629, 25)"
7,Numeric features,24
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
18,Extra Trees Regressor,0.0262,0.002,0.0447,0.8765,0.0389,0.2611,0.219
15,K Neighbors Regressor,0.0269,0.0028,0.053,0.8277,0.0467,0.2865,0.088
17,Random Forest Regressor,0.0348,0.0031,0.0557,0.8102,0.0479,0.356,4.583
22,Extreme Gradient Boosting,0.0351,0.0031,0.0559,0.8092,0.0486,0.3221,10.614
23,Light Gradient Boosting Machine,0.0406,0.0036,0.06,0.78,0.0528,0.4126,0.631
20,Gradient Boosting Regressor,0.0576,0.0061,0.078,0.6292,0.0687,0.5827,2.247
16,Decision Tree Regressor,0.0293,0.0071,0.0839,0.5717,0.0727,0.3019,0.1
19,AdaBoost Regressor,0.0841,0.0079,0.0891,0.5147,0.0814,0.5602,0.309
14,Support Vector Regression,0.0974,0.012,0.1094,0.2739,0.0952,0.4747,0.4
21,MLP Regressor,0.0755,0.0128,0.113,0.229,0.0924,0.6219,0.234


# Učení

In [7]:
best_model = exp.create_model(best_model[0])
best_model = exp.tune_model(best_model, n_iter=10, optimize='R2')
best_model = exp.finalize_model(best_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0248,0.0063,0.0791,0.6516,0.0667,0.2892
1,0.0264,0.0059,0.077,0.6933,0.0676,0.2478
2,0.0225,0.0046,0.068,0.7355,0.0602,0.2585
3,0.0307,0.0063,0.0791,0.6521,0.0705,0.3465
4,0.0268,0.0057,0.0758,0.6841,0.0675,0.2468
5,0.0238,0.0049,0.07,0.617,0.062,0.2481
6,0.0241,0.005,0.0704,0.6356,0.0632,0.2181
7,0.0225,0.0046,0.0681,0.6949,0.0607,0.268
8,0.0274,0.0064,0.08,0.6391,0.0691,0.2697
9,0.0343,0.0101,0.1006,0.3758,0.0839,0.2888


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.0535,0.0068,0.0826,0.6198,0.0728,0.553
1,0.0501,0.0074,0.086,0.617,0.0752,0.5027
2,0.0478,0.006,0.0774,0.6574,0.0682,0.5254
3,0.0474,0.0065,0.0805,0.6404,0.0694,0.5166
4,0.0507,0.0077,0.0879,0.5749,0.0761,0.4766
5,0.0461,0.0056,0.0746,0.5639,0.0657,0.4999
6,0.0536,0.0074,0.086,0.4562,0.0758,0.5538
7,0.0499,0.0068,0.0823,0.5538,0.0726,0.4988
8,0.051,0.006,0.0773,0.6629,0.0691,0.5222
9,0.0519,0.0087,0.0931,0.4653,0.079,0.5242


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


# Vykrelsení výsledků

In [8]:
zbytek = dataframe[8762:] #zbytek se musí definovat až tady pro opakovatelnost spuštění této buňky
cor_diff = np.cumsum(dataframe['corrosion_diff'])

# Inicializace grafu
fig = go.Figure()


# První graf: původní časová řada
fig.add_trace(
    go.Scatter(
        x=datetime,
        y=cor_diff,
        mode='lines',
        name = 'Původní časová řada'
    )
)

zbytek = zbytek.drop(columns=['corrosion_diff'])
predictions = exp.predict_model(best_model, data=zbytek)
predictions = predictions.rename(columns={'prediction_label': 'corrosion_diff'})

# posunutí odhadů pro porovnání s původní časovou řadou
pred_shifted = predictions['corrosion_diff']
pred_shifted[8762] += cor_diff[8761]  # přidání poslední hodnoty původní řady

# Druhý graf: predikovaná časová řada
fig.add_trace(
    go.Scatter(
        x=datetime[8762:],
        y=np.cumsum(pred_shifted),
        mode='lines',
        name = 'Extra trees regressor'
    )
)


# Update rozložení grafu
fig.update_layout(
    legend=dict(
        bgcolor='rgba(0,0,0,0)',  # Set background to transparent
        x=0,
        y=1,
        xanchor='left',
        yanchor='top'),
    xaxis_title='Datum',
    yaxis_title='Míra koroze',
    
)


fig.show()

In [None]:
# exp.save_model(best_model, 'corrosion_regressor_model')
# exp.save_experiment('corrosion_regression_experiment')
# exp.create_api(best_model, 'corrosion_regressor_api')
# exp.create_docker('corrosion_regressor_api')

Transformation Pipeline and Model Successfully Saved
API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python corrosion_regressor_api.py
Writing requirements.txt
Writing Dockerfile
Dockerfile and requirements.txt successfully created.
    To build image you have to run --> !docker image build -f "Dockerfile" -t IMAGE_NAME:IMAGE_TAG .
            
