In [1]:
import numpy as np
import pandas as pd
import random
import sys
sys.path.insert(0, '../ml/')

import warnings

warnings.filterwarnings('ignore')

from sklearn.datasets import load_boston, load_breast_cancer
from sklearn.preprocessing import StandardScaler
import sklearn.linear_model

from linear_model import LinearRegression

np.random.seed(40)

In [7]:
X = load_boston().data
y = load_boston().target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [3]:
sklr = sklearn.linear_model.LinearRegression()
sklr.fit(X_scaled, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [4]:
lr = LinearRegression(learning_rate=0.0005, max_iter=1000, random_state=2019)
lr.fit(X_scaled, y)

<linear_model.LinearRegression at 0x106334898>

In [5]:
list(zip(sklr.coef_,lr.coef_))

[(-0.9204111277001574, -0.9204109281529967),
 (1.0809805781703272, 1.0809802235981993),
 (0.14296712402019407, 0.1429660227403188),
 (0.6822034602840216, 0.6822036176494952),
 (-2.060092463540669, -2.060092271173144),
 (2.6706414133249385, 2.6706416140852207),
 (0.02112063062127656, 0.021120460971634156),
 (-3.104448051483856, -3.104448131620553),
 (2.6587865409067524, 2.658783764888558),
 (-2.07589814093026, -2.0758949841662204),
 (-2.062155925973757, -2.0621558229357366),
 (0.8566404358167781, 0.8566404145587317),
 (-3.7486798191194586, -3.7486797105461305)]

## Логистическая

In [3]:
X = load_breast_cancer().data
y = load_breast_cancer().target
y = y.astype(float)

In [4]:
sk_lg = sklearn.linear_model.LogisticRegression(max_iter=10000)

In [5]:
sk_lg.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=10000, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [6]:
def logistic_regression(X, y, alpha, max_iter):
    X_ = np.ones((X.shape[0], X.shape[1] + 1))
    X_[:, 1:] = X
    w = np.random.normal(loc=0.0, scale=1.0, size=X_.shape[1])
    cost = []
    w_hist = []
    for i in range(max_iter):
        w_hist.append(w)
        predict = 1. / (1. + np.exp(np.clip(-np.dot(X_, w), -250, 250)))
        error = y - predict
        for idx, _ in enumerate(w):
            w[idx] += error.T.dot(X_[:, idx]) * alpha
            cost_ = -y.dot(np.log(predict)) - ((1 - y).dot(np.log(1 - predict)))
            cost.append(cost_)
    return w, cost

def predict(X, w):
    X_ = np.ones((X.shape[0], X.shape[1] + 1))
    X_[:, 1:] = X
    return np.where(X_.dot(w)>=0.5, 1, 0)

In [8]:
w, cost = logistic_regression(X, y, alpha=0.001, max_iter=10000)
w

array([ 5.02841506e+01,  3.69521882e+02, -2.29969649e+02,  1.64449341e+03,
        2.66583006e+02,  1.48924577e+00, -1.71974984e+01, -2.66509631e+01,
       -1.13717052e+01, -1.03278529e+00,  6.39464783e-01,  3.63462374e+00,
       -8.46442725e+00, -7.14865856e+01, -7.30960078e+02, -4.55814716e-01,
       -3.69522743e+00, -6.24204874e+00, -1.89794641e+00, -3.26519470e-03,
       -7.36575715e-01,  3.94993134e+02, -5.90344065e+02,  1.12329586e+03,
       -4.97215342e+02, -3.41072477e+00, -6.10305780e+01, -8.10033688e+01,
       -2.08636284e+01, -8.35751424e+00, -2.19193352e+00])

In [13]:
sk_lg.intercept_, sk_lg.coef_

(array([0.40705725]),
 array([[ 2.16249887,  0.11754616, -0.07831771, -0.00266282, -0.15929505,
         -0.40812037, -0.66342543, -0.3535881 , -0.23183976, -0.02541806,
         -0.02324736,  1.24011836,  0.04378481, -0.09733039, -0.01784774,
          0.01063528, -0.04533986, -0.0418176 , -0.04233063,  0.00715805,
          1.30262095, -0.34440521, -0.12411562, -0.02470996, -0.29577272,
         -1.13455596, -1.61295028, -0.67995191, -0.70343074, -0.11274612]]))

In [18]:
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2)

In [21]:
w = logistic_regression(X_train, y_train, alpha=0.01, max_iter=10000)
pr = predict(X_test, w)

In [22]:
sklearn.metrics.accuracy_score(y_test, pr)

0.9298245614035088

In [4]:
def check_random_state(seed):
    """Turn seed into a np.random.RandomState instance

    Parameters
    ----------
    seed : None | int | instance of RandomState
        If seed is None, return the RandomState singleton used by np.random.
        If seed is an int, return a new RandomState instance seeded with seed.
        If seed is already a RandomState instance, return it.
        Otherwise raise ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, numbers.Integral):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
                     ' instance' % seed)

In [20]:
from sklearn.utils._seq_dataset import ArrayDataset32, CSRDataset32
from sklearn.utils._seq_dataset import ArrayDataset64, CSRDataset64
from scipy import sparse as sp

In [21]:
def make_dataset(X, y, sample_weight, random_state=None):
    """Create ``Dataset`` abstraction for sparse and dense inputs.

    This also returns the ``intercept_decay`` which is different
    for sparse datasets.

    Parameters
    ----------
    X : array_like, shape (n_samples, n_features)
        Training data

    y : array_like, shape (n_samples, )
        Target values.

    sample_weight : numpy array of shape (n_samples,)
        The weight of each sample

    random_state : int, RandomState instance or None (default)
        Determines random number generation for dataset shuffling and noise.
        Pass an int for reproducible output across multiple function calls.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    dataset
        The ``Dataset`` abstraction
    intercept_decay
        The intercept decay
    """

    rng = check_random_state(random_state)
    # seed should never be 0 in SequentialDataset64
    seed = rng.randint(1, np.iinfo(np.int32).max)

    if X.dtype == np.float32:
        CSRData = CSRDataset32
        ArrayData = ArrayDataset32
    else:
        CSRData = CSRDataset64
        ArrayData = ArrayDataset64

    if sp.issparse(X):
        dataset = CSRData(X.data, X.indptr, X.indices, y, sample_weight,
                          seed=seed)
        intercept_decay = SPARSE_INTERCEPT_DECAY
    else:
        X = np.ascontiguousarray(X)
        dataset = ArrayData(X, y, sample_weight, seed=seed)
        intercept_decay = 1.0

    return dataset, intercept_decay

In [29]:
make_dataset(X=X, y=y, sample_weight=np.array(1., 2.), random_state=17)

TypeError: data type not understood

In [10]:
import numbers

In [50]:
lr = sklearn.linear_model.LinearRegression(fit_intercept=True, copy_X=True, n_jobs=-1, normalize=True)

In [51]:
lr.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=True)

In [52]:
lr.coef_

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])

In [53]:
lr.intercept_

36.45948838508992

In [54]:
lr.score(X, y)

0.7406426641094094

In [55]:
lr.score(X, y)

0.7406426641094094