<a href="https://colab.research.google.com/github/Metamorphozis/model_ml/blob/main/Logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

class MyLogReg:
    """Binary logistic regression trained with (mini-batch) gradient descent.

    An intercept column of ones is prepended to the features, so the first
    entry of the weight vector is the bias term.

    Parameters
    ----------
    n_iter : int
        Number of gradient steps performed by ``fit``.
    learning_rate : float
        Step size applied to every gradient update.
    weights : array-like or None
        Value stored in ``self.weights`` until ``fit`` is called; ``fit``
        always restarts from a zero vector sized to the training data.
    l1_reg, l2_reg : float
        L1 / L2 penalty strengths.
    elastic_net_alpha, elastic_net_l1_ratio : float
        Overall elastic-net strength and the share of it given to L1.
        Only one penalty is applied per step, with precedence
        ``l1_reg`` > ``l2_reg`` > ``elastic_net_alpha``.
    sgd_sample : int, float or None
        Mini-batch size: an int is a row count, a float is a fraction of the
        training rows, ``None`` uses the full data set on every step.
    random_state : int
        Seed passed to ``random.seed`` at the start of ``fit``.
    metric : str or None
        One of ``'accuracy'``, ``'precision'``, ``'recall'``, ``'f1'``,
        ``'roc_auc'``; evaluated on the training data.
    """

    # Metric names accepted by ``metric``; validated before training starts.
    _SUPPORTED_METRICS = ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')

    def __init__(self, n_iter=10, learning_rate=0.1, weights=None, l1_reg=0, l2_reg=0, elastic_net_alpha=0, elastic_net_l1_ratio=0.5, sgd_sample=None, random_state=42, metric=None):
        self.n_iter = n_iter
        self.learning_rate = learning_rate
        # Placeholder only: fit() replaces this with zeros matching the data.
        self.weights = weights if weights is not None else np.zeros(16)
        self.l1_reg = l1_reg
        self.l2_reg = l2_reg
        self.elastic_net_alpha = elastic_net_alpha
        self.elastic_net_l1_ratio = elastic_net_l1_ratio
        self.sgd_sample = sgd_sample
        self.random_state = random_state
        self.metric = metric
        self.best_score = None

    @staticmethod
    def _design_matrix(X):
        """Return ``X`` as a float array with a leading column of ones.

        Working on plain arrays avoids ``pd.concat``'s index alignment, which
        produced NaN rows for frames with a non-default index, and lets NumPy
        arrays and DataFrames be handled the same way.
        """
        features = np.asarray(X, dtype=float)
        if features.ndim == 1:
            # A 1-D input is treated as a single feature column.
            features = features.reshape(-1, 1)
        return np.hstack([np.ones((features.shape[0], 1)), features])

    @staticmethod
    def _sigmoid(z):
        """Logistic function applied element-wise."""
        return 1 / (1 + np.exp(-z))

    @staticmethod
    def _log_loss(y_true, proba):
        """Mean binary cross-entropy; 1e-15 keeps the logarithms finite."""
        return -(y_true * np.log(proba + 1e-15) + (1 - y_true) * np.log(1 - proba + 1e-15)).mean()

    def _draw_batch(self, X, y):
        """Return the rows used for one gradient step.

        With ``sgd_sample`` set, rows are drawn with replacement through the
        module-level ``random`` generator; otherwise the full data is used.
        """
        if self.sgd_sample is None:
            return X, y
        n_rows = X.shape[0]
        if isinstance(self.sgd_sample, float):
            size = int(self.sgd_sample * n_rows)
        else:
            size = self.sgd_sample
        # Clamp to [1, n_rows]: an empty batch would yield NaN loss/gradient.
        size = max(1, min(size, n_rows))
        picked = random.choices(range(n_rows), k=size)
        return X[picked], y[picked]

    def _penalty_gradient(self):
        """Gradient of the active regularisation term (0 when none is set)."""
        if self.l1_reg > 0:
            return self.l1_reg * np.sign(self.weights)
        if self.l2_reg > 0:
            return self.l2_reg * self.weights
        if self.elastic_net_alpha > 0:
            l1_part = self.elastic_net_alpha * self.elastic_net_l1_ratio
            l2_part = self.elastic_net_alpha * (1 - self.elastic_net_l1_ratio)
            return l1_part * np.sign(self.weights) + l2_part * self.weights
        return 0

    def _evaluate(self, X, y):
        """Score the current weights on ``X`` (intercept already added)."""
        proba = self._sigmoid(np.dot(X, self.weights))
        if self.metric == 'roc_auc':
            # ROC AUC ranks scores, so it needs probabilities, not 0/1 labels.
            return self._calculate_metric(y, proba)
        return self._calculate_metric(y, (proba > 0.5).astype(int))

    def fit(self, X, y, verbose=False):
        """Fit the weights by gradient descent on the log-loss.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array or DataFrame).
        y : array-like of shape (n_samples,)
            Binary targets encoded as 0/1.
        verbose : bool
            When true, print the loss (and metric, if configured) per step.

        Raises
        ------
        ValueError
            If ``metric`` is not supported, ``X`` is empty, or ``X`` and ``y``
            disagree on the number of samples.
        """
        # Fail before training instead of after the last iteration.
        if self.metric and self.metric not in self._SUPPORTED_METRICS:
            raise ValueError("Unsupported metric")

        random.seed(self.random_state)
        X = self._design_matrix(X)
        y = np.asarray(y).ravel()
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != X.shape[0]:
            raise ValueError("X and y must have the same number of samples")

        self.weights = np.zeros(X.shape[1])

        # This draw is made even when not verbose so the random stream, and
        # therefore the fitted weights, do not depend on the verbose flag.
        X_batch, y_batch = self._draw_batch(X, y)
        if verbose:
            start_loss = self._log_loss(y_batch, self._sigmoid(np.dot(X_batch, self.weights)))
            print(f"start | loss: {start_loss:.2f}", end=" | ")

        for i in range(1, self.n_iter + 1):
            X_batch, y_batch = self._draw_batch(X, y)
            proba = self._sigmoid(np.dot(X_batch, self.weights))
            grad = np.dot(X_batch.T, proba - y_batch) / X_batch.shape[0]

            # Update the weights, including the regularisation gradient.
            self.weights -= self.learning_rate * (grad + self._penalty_gradient())

            if verbose:
                # The reported loss belongs to the weights before this update.
                batch_loss = self._log_loss(y_batch, proba)
                print(f"{i} | loss: {batch_loss:.2f} | learning_rate: {self.learning_rate:.2f}", end=" | ")
                if self.metric:
                    score = self._evaluate(X, y)
                    print(f"{self.metric}: {score:.2f}")
                else:
                    print("")

        # Store the metric of the final weights on the full training set.
        if self.metric:
            self.best_score = self._evaluate(X, y)

    def get_coef(self):
        """Return the weight vector (intercept first)."""
        return self.weights

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        return self._sigmoid(np.dot(self._design_matrix(X), self.weights))

    def predict(self, X):
        """Return 0/1 class labels using a 0.5 probability threshold."""
        return (self.predict_proba(X) > 0.5).astype(int)

    def get_best_score(self):
        """Return the metric stored by ``fit``.

        Returns the string ``"No score available"`` when no score was stored.
        """
        if self.best_score is None:
            return "No score available"
        return self.best_score

    def _calculate_metric(self, y_true, y_pred):
        """Apply the configured scikit-learn metric to ``y_true``/``y_pred``.

        Raises ``ValueError`` when ``self.metric`` is not a supported name.
        """
        if self.metric == 'accuracy':
            return accuracy_score(y_true, y_pred)
        if self.metric == 'precision':
            return precision_score(y_true, y_pred)
        if self.metric == 'recall':
            return recall_score(y_true, y_pred)
        if self.metric == 'f1':
            return f1_score(y_true, y_pred)
        if self.metric == 'roc_auc':
            return roc_auc_score(y_true, y_pred)
        raise ValueError("Unsupported metric")

# Тестовые данные
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np

# Build a small synthetic binary-classification data set and split it 80/20
X, y = make_classification(
    n_samples=100,
    n_features=5,
    n_informative=2,
    n_redundant=0,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Mini-batch model: sgd_sample=0.1 is a float, i.e. a fraction of the training rows
log = MyLogReg(n_iter=50, learning_rate=0.1, sgd_sample=0.1)
log.fit(X_train, y_train)
print(log.get_coef().mean())

0.16637872008745783
