In [41]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OneHotEncoder
import os

# Кастомный класс датасета
class CSVDataset(Dataset):
    def __init__(self, csv_file, target_column, column_types=None, scaler_type='minmax', ignore_columns=None):
        self.df = pd.read_csv(csv_file)
        self.target_column = target_column
        self.ignore_columns = ignore_columns if ignore_columns is not None else []
        self.scaler_type = scaler_type

        self.df = self.df.drop(columns=[col for col in self.ignore_columns if col in self.df.columns])

        if target_column not in self.df.columns:
            raise ValueError(f"Целевая переменная '{target_column}' не найдена в CSV-файле")

        self.column_types = column_types if column_types is not None else self._infer_column_types()
        self.features = self.df.drop(columns=[target_column])
        self.target = self.df[target_column]
        self._preprocess()

    def _infer_column_types(self):
        column_types = {}
        for col in self.df.columns:
            if col == self.target_column:
                continue
            if self.df[col].dtype in [np.float64, np.float32, np.int64, np.int32]:
                unique_values = self.df[col].nunique()
                if unique_values == 2:
                    column_types[col] = 'binary'
                else:
                    column_types[col] = 'numeric'
            else:
                column_types[col] = 'categorical'
        return column_types

    def _preprocess(self):
        # Обработка пропущенных значений в признаках
        for col in self.features.columns:
            if self.column_types.get(col, 'numeric') in ['numeric', 'binary']:
                # Проверяем, что столбец можно преобразовать в числовой тип
                try:
                    self.features[col] = pd.to_numeric(self.features[col], errors='coerce')
                    self.features[col] = self.features[col].fillna(self.features[col].median())
                except:
                    print(f"Предупреждение: не удалось вычислить медиану для столбца '{col}'. Заполняю модой.")
                    self.features[col] = self.features[col].fillna(self.features[col].mode()[0])
            else:
                self.features[col] = self.features[col].fillna(self.features[col].mode()[0])

        # Обработка пропущенных значений в целевой переменной
        try:
            self.target = pd.to_numeric(self.target, errors='coerce')
            self.target = self.target.fillna(self.target.median())
            self.is_target_categorical = False
        except:
            self.target = self.target.fillna(self.target.mode()[0])
            self.is_target_categorical = True

        self.scaler = MinMaxScaler() if self.scaler_type == 'minmax' else StandardScaler()
        self.label_encoder = LabelEncoder()
        self.onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

        numeric_cols = [col for col in self.features.columns if self.column_types.get(col, 'numeric') == 'numeric']
        categorical_cols = [col for col in self.features.columns if self.column_types.get(col, 'numeric') == 'categorical']
        binary_cols = [col for col in self.features.columns if self.column_types.get(col, 'numeric') == 'binary']

        if numeric_cols:
            self.features[numeric_cols] = self.scaler.fit_transform(self.features[numeric_cols])

        if categorical_cols:
            encoded_cats = self.onehot_encoder.fit_transform(self.features[categorical_cols])
            encoded_cat_cols = self.onehot_encoder.get_feature_names_out(categorical_cols)
            self.features = self.features.drop(columns=categorical_cols)
            self.features[encoded_cat_cols] = encoded_cats

        for col in binary_cols:
            self.features[col] = (self.features[col] == self.features[col].unique()[1]).astype(float)

        if self.is_target_categorical:
            self.target = self.label_encoder.fit_transform(self.target)

        self.X = torch.tensor(self.features.values, dtype=torch.float32)
        self.y = torch.tensor(self.target.values, dtype=torch.long if self.is_target_categorical else torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

    def get_feature_names(self):
        return self.features.columns.tolist()

    def get_num_features(self):
        return self.X.shape[1]

    def get_num_classes(self):
        return len(self.label_encoder.classes_) if self.is_target_categorical else None

# Определение функции mse
def mse(y_pred, y_true):
    return ((y_pred - y_true) ** 2).mean().item()

# Модель для регрессии и классификации
class LinearRegression(nn.Module):
    def __init__(self, in_features):
        super().__init__()
        self.linear = nn.Linear(in_features, 1)

    def forward(self, x):
        return self.linear(x)

class LogisticRegression(nn.Module):
    def __init__(self, in_features, num_classes):
        super().__init__()
        self.linear = nn.Linear(in_features, 1 if num_classes == 2 else num_classes)
        self.sigmoid = nn.Sigmoid() if num_classes == 2 else nn.Identity()

    def forward(self, x):
        return self.sigmoid(self.linear(x))

# Функции для метрик и визуализации
def compute_regression_metrics(y_true, y_pred):
    y_true = y_true.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    mse_val = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse_val)
    r2 = r2_score(y_true, y_pred)
    return mse_val, rmse, r2

def compute_classification_metrics(y_true, y_pred, y_pred_proba, num_classes):
    y_true = y_true.cpu().numpy()
    y_pred = y_pred.cpu().numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=0)
    try:
        roc_auc = roc_auc_score(y_true, y_pred_proba[:, 1] if num_classes == 2 else y_pred_proba, multi_class='ovr')
    except ValueError:
        roc_auc = float('nan')
    return precision, recall, f1, roc_auc

def plot_confusion_matrix(y_true, y_pred, num_classes, epoch, task='classification'):
    cm = confusion_matrix(y_true.cpu().numpy(), y_pred.cpu().numpy(), labels=range(num_classes))
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
                xticklabels=[f'Class {i}' for i in range(num_classes)],
                yticklabels=[f'Class {i}' for i in range(num_classes)])
    plt.title(f'Confusion Matrix at Epoch {epoch} ({task})')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'confusion_matrix_{task}_epoch_{epoch}.png')
    plt.close()

def log_epoch(epoch, loss, **metrics):
    msg = f"Epoch {epoch}: loss={loss:.4f}"
    for k, v in metrics.items():
        msg += f", {k}={v:.4f}"
    print(msg)

# Функция для обучения модели
def train_model(model, criterion, optimizer, train_dataloader, val_dataloader, epochs, num_classes, task='classification'):
    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0
        total_acc = 0

        for i, (batch_X, batch_y) in enumerate(train_dataloader):
            optimizer.zero_grad()
            outputs = model(batch_X)
            if task == 'regression':
                outputs = outputs.squeeze(-1)
                loss = criterion(outputs, batch_y)
                acc = mse(outputs, batch_y)
            elif task == 'classification' and num_classes == 2:
                outputs = outputs.squeeze(-1)
                loss = criterion(outputs, batch_y.float())
                acc = ((outputs > 0.5).float() == batch_y).float().mean().item()
            else:
                loss = criterion(outputs, batch_y)
                acc = (torch.argmax(outputs, dim=1) == batch_y).float().mean().item()

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_acc += acc

        avg_loss = total_loss / (i + 1)
        avg_acc = total_acc / (i + 1)

        model.eval()
        val_y_true = []
        val_y_pred = []
        val_y_pred_proba = []
        total_val_loss = 0
        total_val_acc = 0

        with torch.no_grad():
            for i, (batch_X, batch_y) in enumerate(val_dataloader):
                outputs = model(batch_X)
                if task == 'regression':
                    outputs = outputs.squeeze(-1)
                    loss = criterion(outputs, batch_y)
                    y_pred = outputs
                    y_pred_proba = outputs
                    acc = mse(outputs, batch_y)
                elif task == 'classification' and num_classes == 2:
                    outputs = outputs.squeeze(-1)
                    loss = criterion(outputs, batch_y.float())
                    y_pred = (outputs > 0.5).float()
                    y_pred_proba = torch.sigmoid(outputs).unsqueeze(-1)
                    acc = (y_pred == batch_y).float().mean().item()
                else:
                    loss = criterion(outputs, batch_y)
                    y_pred = torch.argmax(outputs, dim=1)
                    y_pred_proba = torch.softmax(outputs, dim=1)
                    acc = (y_pred == batch_y).float().mean().item()

                val_y_true.append(batch_y)
                val_y_pred.append(y_pred)
                val_y_pred_proba.append(y_pred_proba)

                total_val_loss += loss.item()
                total_val_acc += acc

        avg_val_loss = total_val_loss / (i + 1)
        avg_val_acc = total_val_acc / (i + 1)

        val_y_true = torch.cat(val_y_true)
        val_y_pred = torch.cat(val_y_pred)
        val_y_pred_proba = torch.cat(val_y_pred_proba)

        if task == 'classification':
            unique_classes = torch.unique(val_y_true).numel()
            if unique_classes < num_classes:
                print(f"Предупреждение на эпохе {epoch}: в валидационной выборке только {unique_classes} из {num_classes} классов")

        if task == 'regression':
            mse_val, rmse, r2 = compute_regression_metrics(val_y_true, val_y_pred)
            metrics = {'mse': mse_val, 'rmse': rmse, 'r2': r2}
        else:
            precision, recall, f1, roc_auc = compute_classification_metrics(val_y_true, val_y_pred, y_pred_proba, num_classes)
            metrics = {'precision': precision, 'recall': recall, 'f1_score': f1, 'roc_auc': roc_auc}
            if epoch % 10 == 0:
                plot_confusion_matrix(val_y_true, val_y_pred, num_classes, epoch, task)

        if epoch % 10 == 0:
            log_epoch(epoch, avg_loss, accuracy=avg_acc, validation_loss=avg_val_loss, validation_accuracy=avg_val_acc, **metrics)

    return model

In [42]:

# Основной код
if __name__ == '__main__':
    # 1. Регрессия: Boston Housing Dataset
    print("Обучение линейной регрессии на Boston Housing Dataset")
    regression_dataset = CSVDataset(
        csv_file='https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
        target_column='medv',
        column_types={'chas': 'binary'},
        scaler_type='minmax',
        ignore_columns=[]
    )

    print(f"Количество признаков: {regression_dataset.get_num_features()}")
    print(f"Имена признаков: {regression_dataset.get_feature_names()}")

    train_size = int(0.8 * len(regression_dataset))
    val_size = len(regression_dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(regression_dataset, [train_size, val_size])

    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32)

    print(f"Размер тренировочного датасета: {len(train_dataset)}")
    print(f"Размер валидационного датасета: {len(val_dataset)}")

    regression_model = LinearRegression(in_features=regression_dataset.get_num_features())
    criterion = nn.MSELoss()
    optimizer = optim.SGD(regression_model.parameters(), lr=0.01, weight_decay=0.01)

    regression_model = train_model(
        regression_model, criterion, optimizer, train_dataloader, val_dataloader, 
        epochs=100, num_classes=1, task='regression'
    )

    regression_model_path = 'linear_regression.pth'
    torch.save(regression_model.state_dict(), regression_model_path)
    print(f"Модель сохранена в {regression_model_path}")

    # 2. Бинарная классификация: Breast Cancer Wisconsin Dataset
    print("\nОбучение логистической регрессии на Breast Cancer Wisconsin Dataset")
    
    # Загружаем датасет
    breast_cancer_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
    df = pd.read_csv(breast_cancer_url, header=None)
    df.columns = ['id'] + [f'feature_{i}' for i in range(1, 31)] + ['diagnosis']
    
    # Проверяем пропущенные значения в исходных данных
    print("Пропущенные значения в датасете до обработки:")
    print(df.isna().sum())
    
    # Проверяем уникальные значения в столбце 'diagnosis'
    print("Уникальные значения в столбце 'diagnosis':", df['diagnosis'].unique())
    
    # Удаляем строки с некорректными значениями в 'diagnosis' (не 'M' или 'B')
    df = df[df['diagnosis'].isin(['M', 'B'])]
    
    # Проверяем, что после фильтрации нет пропущенных значений
    if df['diagnosis'].isna().any():
        raise ValueError("Пропущенные значения в столбце 'diagnosis' после фильтрации")
    
    # Преобразуем целевую переменную
    df['target'] = df['diagnosis'].map({'M': 1, 'B': 0})
    
    # Проверяем, что все значения в 'target' корректны
    if df['target'].isna().any():
        raise ValueError("Пропущенные значения в целевой переменной после маппинга")
    
    # Удаляем столбец 'diagnosis', так как теперь у нас есть 'target'
    df = df.drop(columns=['diagnosis'])
    
    # Сохраняем датасет
    df.to_csv('breast_cancer.csv', index=False)
    
    # Создаем датасет
    classification_dataset = CSVDataset(
        csv_file='breast_cancer.csv',
        target_column='target',  # Исправлено с 'id' на 'target'
        column_types={f'feature_{i}': 'numeric' for i in range(1, 31)},
        scaler_type='minmax',
        ignore_columns=['id']
    )

    # Разделяем на тренировочную и валидационную выборки
    train_size = int(0.8 * len(classification_dataset))
    val_size = len(classification_dataset) - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(classification_dataset, [train_size, val_size])

    train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=32)

    print(f"Размер тренировочного датасета: {len(train_dataset)}")
    print(f"Размер валидационного датасета: {len(val_dataset)}")

    # Модель логистической регрессии
    classification_model = LogisticRegression(in_features=classification_dataset.get_num_features(), num_classes=2)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.SGD(classification_model.parameters(), lr=0.1)

    # Обучаем модель
    classification_model = train_model(
        classification_model, criterion, optimizer, train_dataloader, val_dataloader, 
        epochs=100, num_classes=2, task='classification'
    )

    # Сохраняем модель
    torch.save(classification_model.state_dict(), 'logistic_regression.pth')

Обучение линейной регрессии на Boston Housing Dataset
Количество признаков: 13
Имена признаков: ['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat']
Размер тренировочного датасета: 404
Размер валидационного датасета: 102
Epoch 10: loss=64.9597, accuracy=64.9597, validation_loss=60.6057, validation_accuracy=60.6057, mse=71.4220, rmse=8.4512, r2=0.1987
Epoch 20: loss=50.3488, accuracy=50.3488, validation_loss=47.0618, validation_accuracy=47.0618, mse=56.6790, rmse=7.5285, r2=0.3641
Epoch 30: loss=44.9113, accuracy=44.9113, validation_loss=42.4440, validation_accuracy=42.4440, mse=50.9118, rmse=7.1352, r2=0.4288
Epoch 40: loss=42.3955, accuracy=42.3955, validation_loss=39.7146, validation_accuracy=39.7146, mse=47.3298, rmse=6.8797, r2=0.4690
Epoch 50: loss=40.3412, accuracy=40.3412, validation_loss=37.6679, validation_accuracy=37.6679, mse=44.5879, rmse=6.6774, r2=0.4997
Epoch 60: loss=35.3990, accuracy=35.3990, validation_loss=35.8536, validat

ValueError: Found array with 0 sample(s) (shape=(0, 30)) while a minimum of 1 is required by MinMaxScaler.