In [21]:
# Gerekli kütüphaneleri yükle
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.model_selection import train_test_split
import statistics

In [22]:
def get_mape(actual_values,predicted_values):
   mape = np.mean(np.abs((actual_values - predicted_values) / actual_values))
   return mape


In [23]:
df = pd.read_csv("train1.csv")
df.head()

Unnamed: 0,Tarih,energy
0,2018-01-01 00:00:00,1593.944216
1,2018-01-01 01:00:00,1513.933887
2,2018-01-01 02:00:00,1402.612637
3,2018-01-01 03:00:00,1278.527266
4,2018-01-01 04:00:00,1220.697701


## detect and remove outlier

In [24]:
q1 = df["energy"].quantile(0.25)
q3 = df["energy"].quantile(0.75)
iqr = q3-q1
low_limit = q1-1.5*iqr
high_limit = q3+1.5*iqr
print(iqr)
print(df.shape)

630.2429502129785
(40152, 2)


In [25]:
df = df[~((df["energy"]<low_limit)|(df["energy"]>high_limit))]
dfcopy = df
print(df.shape)
print(statistics.variance(df["energy"]))

(40026, 2)
175617.3813957277


## trend and seasonality

In [26]:
#logarithmic transformation
df["energy"] = np.log(df["energy"])
print(df.shape) 
print(statistics.variance(df["energy"]))

(40026, 2)
0.053885003420288874


## stationary data control

In [27]:
from statsmodels.tsa.stattools import adfuller

result = adfuller(df["energy"])
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))

ADF Statistic: -9.837840
p-value: 0.000000
Critical Values:
	1%: -3.431
	5%: -2.862
	10%: -2.567


In [28]:
df["energy"] = df["energy"].diff()
df["energy"] = df["energy"].fillna(value=df["energy"].mode()[0])
df.head()

Unnamed: 0,Tarih,energy
0,2018-01-01 00:00:00,-0.240834
1,2018-01-01 01:00:00,-0.0515
2,2018-01-01 02:00:00,-0.076375
3,2018-01-01 03:00:00,-0.092628
4,2018-01-01 04:00:00,-0.046286


## model operations before logarithmic transformation

In [29]:
X = df["energy"]
y = df["energy"]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state = 42)
X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size=0.25, random_state=42)

In [30]:
#determination of the most appropriate p, d, q values
from pmdarima import auto_arima

model_auto_arima2 =  auto_arima(y_val, seasonal=False, trace=True, error_action='ignore', suppress_warnings=True)
print(model_auto_arima2.summary())

Performing stepwise search to minimize aic
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=-21298.258, Time=0.65 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=-21305.518, Time=0.22 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=-21303.543, Time=0.12 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=-21303.543, Time=0.36 sec
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=-21301.544, Time=0.67 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=-21307.544, Time=0.63 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=-21305.557, Time=0.35 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=-21305.557, Time=0.35 sec
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=-21303.638, Time=1.87 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0] intercept
Total fit time: 5.225 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                 8005
Model:                        SARIMAX   Log Likelihood               10655.772
Date:                Thu, 13 Ap

In [31]:
print("Optimal p,d,q values: ",model_auto_arima2.order)

Optimal p,d,q values:  (0, 0, 0)


In [32]:
#model = ARIMA(X_train,order=(1,0,0))
model = ARIMA(X_train,order=(0,0,0))
model_fit = model.fit()

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


In [33]:
predicts = model_fit.predict(start=len(X_train), end=len(X_train)+len(X_test)-1, dynamic=False)

  return get_prediction_index(


In [34]:
print("MAPE score: ",get_mape(X_train,predicts))

MAPE score:  1.3439691138601848


In [35]:
y_test_pred = model_fit.predict(start=len(X_train),end=len(X_train)+len(X_test)-1)

  return get_prediction_index(


In [36]:
print("MAPE score: ",get_mape(X_test,y_test_pred))

MAPE score:  1.0201611333958684


In [37]:
y_val_pred = model_fit.predict(start=len(X_train)+len(X_test), end=len(X_train)+len(X_test)+len(X_val)-1)

  return get_prediction_index(


In [38]:
print("MAPE score: ",get_mape(X_val,y_val_pred))

MAPE score:  1.0089839664565705
