In [147]:
import numpy as np
from numpy import sin
import holidays
from numpy.linalg import inv
from tqdm import tqdm_notebook as tqdm
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from random import random
from random import sample
%matplotlib inline

In [98]:
def get_weights(A, y):
    """Return the least-squares weights ``w`` minimising ``||A @ w - y||``.

    The previous implementation evaluated the normal equations literally,
    ``inv(A.T @ A) @ A.T @ y``. Forming ``A.T @ A`` squares the condition
    number of ``A`` and ``inv`` raises ``LinAlgError`` as soon as two columns
    are exactly linearly dependent. ``np.linalg.lstsq`` solves the same
    problem without forming the Gram matrix; for a full-rank ``A`` the result
    is the same up to round-off, and for a rank-deficient ``A`` it returns
    the minimum-norm solution instead of failing.

    Parameters
    ----------
    A : array-like, 2-D
        Design matrix, one row per observation and one column per feature.
    y : array-like
        Targets, one entry (or row) per row of ``A``.

    Returns
    -------
    numpy.ndarray
        Weight vector with one entry per column of ``A``.
    """
    design = np.asarray(A, dtype=float)
    target = np.asarray(y, dtype=float)
    # rcond=None selects the machine-precision based cutoff for small
    # singular values and avoids the FutureWarning of older numpy versions.
    weights, _residuals, _rank, _singular_values = np.linalg.lstsq(
        design, target, rcond=None
    )
    return weights

In [233]:
def get_error(A_train, A_test, y_train, y_test):
    """Fit on the training split and return the residuals on the test split.

    The previous body read a global ``y`` both for fitting and for the
    residual, so it ignored ``y_train`` / ``y_test`` and failed with
    ``NameError`` on a fresh kernel. The weights are now fitted on
    ``(A_train, y_train)`` and the residual is taken against ``y_test``.

    Parameters
    ----------
    A_train : design matrix used to fit the weights.
    A_test : design matrix the fitted weights are applied to; it must have
        the same number of columns as ``A_train``.
    y_train : targets matching the rows of ``A_train``.
    y_test : targets matching the rows of ``A_test``.

    Returns
    -------
    ``y_test - A_test @ w``, one residual per row of ``A_test``.
    """
    w = get_weights(A_train, y_train)
    prediction = A_test @ w
    return y_test - prediction

In [234]:
# Load the energy-consumption table from an Excel workbook.
# NOTE(review): the path is absolute and machine-specific, so the notebook
# only runs where this exact file location exists.
data = pd.read_excel("/home/andreizoltan/ml/1/EnergyConsumption.xls")
# Print the time span covered by the 'Date' column (latest minus earliest).
# The message text is Russian for "Energy consumption data for {span}".
print("Данные по энергопотреблению за {}".format(data['Date'].max()-data['Date'].min()))
# Show the first rows as the cell output.
data.head()

Данные по энергопотреблению за 364 days 23:00:00


Unnamed: 0,Date,WeekDay,Hour,ConsumprionMWtH,TemperatureC
0,2005-01-01 00:00:00,6,00:00:00,1289.617859,-10.676569
1,2005-01-01 01:00:00,6,01:00:00,1265.573853,-12.041959
2,2005-01-01 02:00:00,6,02:00:00,1273.188232,-13.198006
3,2005-01-01 03:00:00,6,03:00:00,1279.703979,-13.999279
4,2005-01-01 04:00:00,6,04:00:00,1292.839539,-14.782819


In [235]:
# Ordered hold-out split without shuffling: the first 80% of the rows are
# used for fitting, the remaining rows are kept for testing.
n_train = int(0.8 * len(data))
data_train, data_test = data[:n_train], data[n_train:]

In [236]:
# Raw regressors taken from the training rows.
h = data_train['Hour'].apply(lambda clock: clock.hour)       # hour component of 'Hour'
d = data_train['WeekDay']                                    # weekday code as stored in the table
m = data_train['Date'].apply(lambda stamp: stamp.dayofyear)  # day of the year (not the month)
t = data_train['TemperatureC']                               # temperature column

# Target column (the spelling 'ConsumprionMWtH' is the column's actual name)
# and an evenly spaced axis on [0, 1] with one point per training row.
y_train = data_train['ConsumprionMWtH']
x_train = np.linspace(0, 1, len(y_train))

### Normalization

In [237]:
# Standardise every regressor and the target: subtract the mean and divide
# by the population standard deviation (ddof=0, which is what np.std uses).
h = (h - h.mean()) / h.std(ddof=0)
d = (d - d.mean()) / d.std(ddof=0)
m = (m - m.mean()) / m.std(ddof=0)
t = (t - t.mean()) / t.std(ddof=0)
y_train = (y_train - y_train.mean()) / y_train.std(ddof=0)

In [238]:
def is_holiday(day):
    """Encode whether ``day`` falls on a Russian holiday as +1 / -1.

    ``day`` must provide a ``.date()`` method; the resulting date is looked
    up in a ``holidays.Russia()`` calendar that is built on every call.
    Returns 1 when the date is in the calendar and -1 otherwise.
    """
    calendar = holidays.Russia()
    return 1 if day.date() in calendar else -1

In [301]:
def get_model_matrix(h, d, m, t, holiday, x=None):
    """Build the design matrix from the hour, weekday, day, temperature and
    holiday regressors.

    Two defects of the previous version are fixed:

    * the intercept column came from the global ``ones``, which is sized to
      the training set, so the function could not be used for input of any
      other length; it is now created from ``len(h)``;
    * the column ``x*np.sqrt(x)`` referenced a name ``x`` that is neither a
      parameter nor defined anywhere above, which raises ``NameError`` on a
      fresh kernel. ``x`` is now an optional argument: the column is added
      (in its original position) only when ``x`` is passed.

    Parameters
    ----------
    h, d, m, t : 1-D array-likes of equal length
        Regressors supporting elementwise arithmetic.
    holiday : 1-D array-like of the same length
        Holiday indicator, appended unchanged as the last column.
    x : 1-D array-like of the same length, optional
        Extra regressor; when given, the column ``x*sqrt(x)`` is inserted
        after the ``h`` columns that precede the sinusoids.

    Returns
    -------
    numpy.ndarray
        Matrix with one row per observation: 23 columns without ``x``,
        24 columns with it.

    Notes
    -----
    ``omega`` is read from the notebook's global namespace and must be
    defined before the function is called. ``np.log(abs(h))`` and
    ``np.log(abs(t))`` evaluate to ``-inf`` wherever the argument is zero.
    """
    ones = np.ones(len(h))  # intercept sized to the input itself
    columns = [ones, h, h**2, np.arcsinh(h), np.log(abs(h))]
    if x is not None:
        columns.append(x*np.sqrt(x))
    columns += [
        sin(omega*h + 0.2), sin(omega*h/2), sin(omega*h*4),
        d, d**2, sin(327*d), sin(250*d), sin(29*d),
        m, m**2, m**3, sin(m), np.log(abs(m) + 1),
        t, t**3, np.sinh(t), np.log(abs(t)),
        holiday,
    ]
    return np.column_stack(columns)

In [296]:
# Intercept column and holiday indicator (+1 / -1) for the first
# len(x_train) rows of the table, i.e. the training rows.
ones = np.ones(len(y_train))
holiday = data['Date'][:len(x_train)].apply(is_holiday)

# Frequency factor shared by the sinusoidal features of the hour.
omega = 2 * np.pi / (2 / 365)

# Training design matrix: intercept, hour features, weekday features,
# day-of-year features, temperature features and the holiday indicator.
A = np.column_stack((
    ones, h, h**2, np.arcsinh(h), np.log(abs(h)),
    sin(omega*h + 0.2), sin(omega*h/2), sin(omega*h*4),
    d, d**2, sin(327*d), sin(250*d), sin(29*d),
    m, m**2, m**3, sin(m), np.log(abs(m) + 1),
    t, t**3, np.sinh(t), np.log(abs(t)),
    holiday,
))

### Set the parameters of the algorithm

In [240]:
# Settings of the search below:
#   P - number of binary vectors in the initial population
#   Q - number of positions exchanged between the two children
#   R - NOTE(review): not used in the visible cells; the later header
#       "repeat R times" suggests an outer iteration count - confirm
P, Q, R = 20, 5, 40
# Length of each binary vector: one entry per column of the design matrix A.
n = A.shape[1]

In [261]:
# Display the number of columns of the design matrix A.
n

23

### 1. Choose population

In [297]:
# Initial population: P random 0/1 vectors of length n, one per row.
# randint(2, ...) draws from the half-open interval [0, 2).
# NOTE(review): no random seed is set, so each run gives a different population.
A_population = np.random.randint(2, size=(P, n))

### 2. get 2 vectors

In [268]:
# Pick two distinct parents by row index. The indices must range over the
# rows of the population; the previous range(n) ranged over the number of
# columns (23), which exceeds the P = 20 rows of the initial population and
# could raise IndexError.
p, q = sample(range(A_population.shape[0]), 2)
# Indexing a row of the array gives a view of the population, not a copy.
a, b = A_population[p], A_population[q]

### 3. choose random number $\nu$

In [269]:
# Split position for the two vectors, drawn uniformly from 0 .. n-1.
u = np.random.randint(0, n)

### 4. split both vectors and change their parts

In [270]:
def swap(a, b, u):
    """Single-point crossover of two vectors at position ``u``.

    Returns two newly allocated arrays: the first is ``b[:u]`` followed by
    ``a[u:]``, the second is ``a[:u]`` followed by ``b[u:]``. Neither input
    is modified.
    """
    first_child = np.concatenate([b[:u], a[u:]])
    second_child = np.concatenate([a[:u], b[u:]])
    return first_child, second_child

In [271]:
# Exchange the leading u entries of the two vectors; swap() returns new
# arrays, so a and b are rebound to fresh copies.
a, b = swap(a, b, u)

### 5. choose random numbers $\eta_{1}, ..., \eta_{Q}$

In [272]:
# Q distinct positions in 0 .. n-1, drawn without replacement.
ns = sample(range(n), k=Q)

### 6. invert positions $\eta_{1}, ..., \eta_{Q}$ of the vectors a', b'

In [273]:
def swap_positions(a, b, ns):
    """Exchange the entries of ``a`` and ``b`` at every index listed in ``ns``.

    Both sequences are modified in place and the same two objects are
    returned. The entries are swapped between the vectors; they are not
    flipped.
    """
    for idx in ns:
        held = a[idx]
        a[idx] = b[idx]
        b[idx] = held
    return a, b

In [274]:
# Exchange the entries at the sampled positions ns between a and b
# (swap_positions modifies both vectors in place and returns them).
a, b = swap_positions(a, b, ns)

### 7. repeat 2-6 P/2 times

In [283]:
def from2to6(A):
    """Run steps 2-6 once: pick two rows of ``A`` and build two children.

    Fixes relative to the previous version:

    * parent indices were sampled from ``range(n)``, the number of columns,
      although they index rows of ``A``; whenever ``A`` has fewer than ``n``
      rows this could raise ``IndexError``. They are now sampled from the
      rows ``A`` actually has;
    * the vector length is taken from ``A`` itself instead of the global
      ``n``;
    * the step comments are renumbered to match the section headers (2-6).

    Parameters
    ----------
    A : numpy.ndarray, 2-D
        Population matrix, one candidate vector per row. It is not modified:
        ``swap`` returns new arrays before any in-place exchange happens.

    Returns
    -------
    tuple of two 1-D arrays
        The two children.

    Notes
    -----
    ``Q`` (number of exchanged positions) is read from the notebook's global
    namespace.
    """
    n_rows, n_cols = A.shape
    p, q = sample(range(n_rows), 2)          # step 2: two distinct parent rows
    a, b = A[p], A[q]
    u = np.random.randint(n_cols)            # step 3: split position
    a, b = swap(a, b, u)                     # step 4: exchange the leading parts
    ns = sample(range(n_cols), Q)            # step 5: Q distinct positions
    a, b = swap_positions(a, b, ns)          # step 6: exchange those positions
    return a, b

In [298]:
# Produce P/2 pairs of children and append each pair to the population.
# The population grows inside the loop, so the array handed to from2to6
# already contains the children of earlier iterations. Re-running this cell
# appends another P rows.
for i in range(P // 2):
    a, b = from2to6(A_population)
    A_population = np.vstack([A_population, a, b])

In [299]:
# Display the (rows, columns) shape of the population after adding the children.
A_population.shape

(40, 23)

### Evaluate the obtained models and repeat R times

In [None]:
# NOTE(review): this cell was never executed (In [None]) and may be unfinished.
# del_models is presumably meant to collect the models to discard; nothing
# is appended to it in the visible lines.
del_models = list()
for i in range(int(P)):
    # NOTE(review): get_error is defined with four parameters
    # (A_train, A_test, y_train, y_test); this two-argument call raises
    # TypeError. A_population[i] is also a single 0/1 row of the population,
    # not a design matrix - presumably it should first select columns of A;
    # confirm the intended call.
    error = get_error(A_population[i], y_train)