# Сначала реализуем класс лог регрессии

In [1]:
import numpy as np
import torch as T
import time
device = 'cpu'


class LogisticRegression:

    def __init__(self, num_features):
        self.x_dim = num_features
        self.b = np.array(1)

        lo = -0.01
        hi = 0.01
        self.w = T.rand(dim_x, dtype=T.float32, requires_grad=True).to(device)
        self.w = (hi - lo) * self.w + lo
        self.w.grad = T.zeros(dim_x)
        self.w.retain_grad()

        self.b = T.zeros(1, dtype=T.float32, requires_grad=True).to(device)
        self.b.grad = T.zeros(1)
        self.b.retain_grad()

    @staticmethod
    def forward(x, w, b):
        z = T.dot(x, w).reshape(1)
        z += b
        p = 1 / (1 + T.exp(-z))
        return p

    def train(self, train_x, train_y, num,  lrn_rate, epochs, reg=0, verbose=0):
        indices = np.arange(num)

        for epoch in range(0, epochs):
            tot_loss = T.zeros(1, dtype=T.float32, requires_grad=True).to(device)
            tot_loss.grad = T.zeros(1)
            tot_loss.retain_grad()

            np.random.shuffle(indices)
            for ii in range(len(indices)):
                i = indices[ii]
                x = train_x[i]
                target = train_y[i]
                out = self.forward(x, self.w, self.b)
                loss = (out - target).pow(2).sum()  # l2
                tot_loss = loss + tot_loss

            if reg == 1:
                tot_loss = tot_loss + T.norm(self.w, p=1)
            elif reg == 2:
                tot_loss = tot_loss + T.norm(self.w, p=2)

            tot_loss.backward(retain_graph=True)  # compute gradients

            self.w.data += -1 * lrn_rate * self.w.grad.data
            self.b.data += -1 * lrn_rate * self.b.grad.data

            self.w.grad = T.zeros(dim_x)
            self.b.grad = T.zeros(1)

            if epoch % verbose == 0:
                print("epoch = %4d " % epoch, end="")
                print("   loss = %6.4f" % (tot_loss / num))

    def predict(self, x):
        return T.matmul(x, self.w) + self.b

    def results(self, name=''):
        print("Коэффициенты модели", name)
        print(self.w.detach().numpy(), self.b.detach().numpy())



In [2]:
seed = time.time()
T.manual_seed(int(seed))
np.random.seed(int(seed))

num = 100  # общее число точек в данных
dim_x = 5  # число фич
sigma = 0  # дисперсия случайного шума в выборке

# Проверим на сгенерированных данных

In [4]:
true_w = np.random.rand(dim_x)
true_b = np.random.rand(1)
# Считаем что данные (точки) тоже нормально распределены. Зато можно их не нормировать!
train_x = np.random.normal(size=(num, dim_x))
train_y = np.dot(train_x, true_w) + true_b
# Добавляем шум (гауссовский с дисперсией sigma)
if sigma != 0:
    train_x += np.random.normal(scale=sigma, size=(num, dim_x))

train_x = T.tensor(train_x, dtype=T.float32).to(device)
train_y = T.tensor(np.sign(train_y), dtype=T.long).to(device)

# Теперь делаем логистическую регрессию

res = LogisticRegression(dim_x)
res.train(train_x, train_y, num, 0.05, 100, reg=0, verbose=10)
res.results('без регуляризация')

# Сделаем проверку коэффциентов
print("Оригинальные коэффициенты:")
c = true_w.mean() / res.w.detach().numpy().mean()
print(true_w / c, true_b / c)

epoch =    0    loss = 1.2500
epoch =   10    loss = 0.6171
epoch =   20    loss = 0.5928
epoch =   30    loss = 0.5807
epoch =   40    loss = 0.5731
epoch =   50    loss = 0.5676
epoch =   60    loss = 0.5635
epoch =   70    loss = 0.5601
epoch =   80    loss = 0.5574
epoch =   90    loss = 0.5551
Коэффициенты модели без регуляризация
[4.0834093 2.5109665 3.8863547 4.554378  4.599452 ] [-1.2394233]
Оригинальные коэффициенты:
[4.48947665 2.82845077 3.86774258 4.45327993 3.99560923] [0.39382708]


In [63]:
# Сделаем проверку на второй части выборки
num_2 = 10  # Число точек в проверочной выборке
x = np.random.normal(size=(num_2, dim_x))
y = np.dot(train_x, true_w) + true_b
x = T.tensor(x, dtype=T.float32).to(device)
y = T.tensor(y, dtype=T.float32).to(device).detach().numpy()
y_p = res.predict(x).detach().numpy()

print("Проверяем модель:")
errors = 0
for i in range(num_2):
    print(y[i], y_p[i])
    if y[i] * y_p[i] < 0:
        errors += 1
print("Ошибок произведено", errors, "из", num_2)

Проверяем модель:
-3.0102863 -0.87553096
-3.0102863 3.6029117
-2.7049305 1.9667728
-2.5397189 -4.5294027
-2.5397189 -4.2862234
-2.5397189 2.1967945
-2.353759 -3.9826279
-2.6958566 -0.8252412
-2.9329054 -1.4006189
-3.3503954 -3.4682229
Ошибок произведено 3 из 10


Так себе, но допустим

# Теперь используем для погоды в Шанхае

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [24]:
data = pd.read_csv('Shanghai_HMT_2010.csv')
data = data.dropna()

In [25]:
for x in ['day', 'hour', 'PM_Jingan', 'PM_US Post' , 'PM_Xuhui', 'Iws', 'precipitation', 'Iprec', 'No', 'year', 'cbwd']:
    data = data.drop(x, axis=1)

In [30]:
data -= data.mean()
data /= data.std()
data

Unnamed: 0,month,season,DEWP,HUMI,PRES,TEMP
26304,-1.557791,1.347979,-1.704071,0.018112,0.774887,-1.961308
26305,-1.557791,1.347979,-1.704071,0.018112,0.774887,-1.961308
26307,-1.557791,1.347979,-1.598843,0.317001,0.774887,-1.961308
26308,-1.557791,1.347979,-1.598843,0.628044,0.774887,-2.072632
26309,-1.557791,1.347979,-1.598843,0.628044,0.774887,-2.072632
...,...,...,...,...,...,...
52578,1.607977,1.347979,-1.388386,-0.450938,2.114730,-1.404688
52579,1.607977,1.347979,-1.283157,-0.195695,2.114730,-1.404688
52580,1.607977,1.347979,-1.072701,0.088277,2.114730,-1.293365
52581,1.607977,1.347979,-1.072701,0.088277,2.226384,-1.293365


In [31]:
data['PRES'] = data['PRES'].apply(lambda x: 1 if x > 0 else 0)

In [32]:
data

Unnamed: 0,month,season,DEWP,HUMI,PRES,TEMP
26304,-1.557791,1.347979,-1.704071,0.018112,1,-1.961308
26305,-1.557791,1.347979,-1.704071,0.018112,1,-1.961308
26307,-1.557791,1.347979,-1.598843,0.317001,1,-1.961308
26308,-1.557791,1.347979,-1.598843,0.628044,1,-2.072632
26309,-1.557791,1.347979,-1.598843,0.628044,1,-2.072632
...,...,...,...,...,...,...
52578,1.607977,1.347979,-1.388386,-0.450938,1,-1.404688
52579,1.607977,1.347979,-1.283157,-0.195695,1,-1.404688
52580,1.607977,1.347979,-1.072701,0.088277,1,-1.293365
52581,1.607977,1.347979,-1.072701,0.088277,1,-1.293365


In [33]:
data.describe()

Unnamed: 0,month,season,DEWP,HUMI,PRES,TEMP
count,21436.0,21436.0,21436.0,21436.0,21436.0,21436.0
mean,0.0,2.121419e-17,0.0,-2.121419e-17,0.48078,1.0607090000000001e-17
std,1.0,1.0,1.0,1.0,0.499642,1.0
min,-1.557791,-1.293593,-2.966812,-3.070773,0.0,-2.29528
25%,-0.982197,-1.293593,-0.862244,-0.6791105,0.0,-0.8480687
50%,-0.118806,-0.4130691,0.084812,0.1103756,0.0,0.04252288
75%,0.744585,0.4674549,0.926639,0.7921291,1.0,0.8217905
max,1.607977,1.347979,1.768467,1.730783,1.0,2.602974


In [51]:
train_x = T.tensor(data.drop(['PRES'], axis=1).to_numpy(), dtype=T.float32).to(device)
train_y = T.tensor(data['PRES'].to_numpy(), dtype=T.long).to(device)

Проблемы с быстродействием. Уменьшим число эпох

In [54]:
num = int(train_y.size()[0])
dim_x = 5
res = LogisticRegression(dim_x)
res.train(train_x, train_y, num, 0.0005, 30, reg=0, verbose=10)
res.results('без регуляризация')

epoch =    0    loss = 0.2512
epoch =   10    loss = 0.0640
epoch =   20    loss = 0.0631
Коэффициенты модели без регуляризация
[ 1.0856626   1.1052082  -2.019232   -0.23652948 -2.227636  ] [0.531966]


# Посмотрим как покажет себя sklearn

In [55]:
from sklearn.linear_model import LogisticRegression
res3 = LogisticRegression()
res3.fit(data.drop(['PRES'], axis=1).to_numpy(), data['PRES'].to_numpy())

LogisticRegression()

In [56]:
print(res3.coef_)

[[ 1.65479326  0.75872407 -0.62277762 -1.12085302 -4.15814051]]


In [59]:
print(res3.intercept_[:,None])

[[0.34836558]]


In [62]:
from sklearn.metrics import log_loss
log_loss(data['PRES'].to_numpy(), res3.predict(data.drop(['PRES'], axis=1).to_numpy()))

2.9840686667280534