# Машинное обучение - Лабораторная работа 2

**Выполнила:**  
Идрисова Лена  
  

### Импорт библиотек

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_squared_error, mean_absolute_error

import gdown
from google.colab import drive

## Линейная регрессия

Считываем набор данных

In [None]:
url = url = 'https://drive.google.com/uc?export=download&id=1Y7MM-B-QaSWuzgQxjt6iPvezZC3Zzjs8'
output = 'advertising_new.csv'
gdown.download(url, output, quiet=False)
Advertising = pd.read_csv('/content/advertising_new.csv')
Advertising.head()

Downloading...
From: https://drive.google.com/uc?export=download&id=1Y7MM-B-QaSWuzgQxjt6iPvezZC3Zzjs8
To: /content/advertising_new.csv
100%|██████████| 14.8k/14.8k [00:00<00:00, 19.0MB/s]


Unnamed: 0,TV,radio,newspaper,sales
0,0.969852,0.981522,1.778945,1.552053
1,-1.197376,1.082808,0.669579,-0.696046
2,0.05205,1.217855,1.286405,0.86033
3,0.394182,-0.841614,1.281802,-0.215683
4,-1.045577,0.643905,-0.324708,-0.427043


Разделим данные в файле Advertising на обучающие и проверочные данные, соотношение составляет 20%.

In [None]:
X = Advertising.drop('sales', axis = 1)
y = Advertising['sales']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=42)

In [None]:
X.head()

Unnamed: 0,TV,radio,newspaper
0,0.969852,0.981522,1.778945
1,-1.197376,1.082808,0.669579
2,0.05205,1.217855,1.286405
3,0.394182,-0.841614,1.281802
4,-1.045577,0.643905,-0.324708


#### SGDRegressor

In [None]:
'squared_error', 'huber', 'epsilon_insensitive'
'l1', 'l2', 'elasticnet'

('l1', 'l2', 'elasticnet')

In [None]:
models = []
loss_functions = ['squared_error', 'huber', 'epsilon_insensitive']
regularizations = ['l1', 'l2']

for loss_func in loss_functions:
    for penalty_type in regularizations:
        model = SGDRegressor(
            loss=loss_func,      # Выбор функции потерь
            penalty=penalty_type, # Тип регуляризации
            alpha=0.0001,        # Коэффициент регуляризации
            max_iter=1000,       # Ограничение на количество итераций
            tol=1e-3,            # Условие остановки
            random_state=42      # Фиксированное значение для воспроизводимости
        )
        model.fit(X_train, y_train)
        models.append((loss_func, penalty_type, model))

print("\nModel Evaluations:")
for i, (loss_func, penalty_type, model) in enumerate(models, 1):
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)
    print(f"Model {i}: Loss Function = {loss_func}, Regularization = {penalty_type}")
    print(f"  Mean Squared Error (MSE): {mse:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.4f}\n")



Model Evaluations:
Model 1: Loss Function = squared_error, Regularization = l1
  Mean Squared Error (MSE): 0.0942
  Mean Absolute Error (MAE): 0.2627

Model 2: Loss Function = squared_error, Regularization = l2
  Mean Squared Error (MSE): 0.0942
  Mean Absolute Error (MAE): 0.2627

Model 3: Loss Function = huber, Regularization = l1
  Mean Squared Error (MSE): 0.1809
  Mean Absolute Error (MAE): 0.3198

Model 4: Loss Function = huber, Regularization = l2
  Mean Squared Error (MSE): 0.1806
  Mean Absolute Error (MAE): 0.3196

Model 5: Loss Function = epsilon_insensitive, Regularization = l1
  Mean Squared Error (MSE): 0.1028
  Mean Absolute Error (MAE): 0.2638

Model 6: Loss Function = epsilon_insensitive, Regularization = l2
  Mean Squared Error (MSE): 0.1028
  Mean Absolute Error (MAE): 0.2638



In [None]:
pipeline_reg = Pipeline(steps=[
    ('scaler', StandardScaler()),  # Масштабирование данных
    ('sgdregressor', SGDRegressor(random_state=42))  # Модель регрессии
])

In [None]:
param_grid_reg = {
    'sgdregressor__loss': ['squared_error', 'huber', 'epsilon_insensitive'],
    'sgdregressor__penalty': ['l1', 'l2', 'elasticnet'],
    'sgdregressor__alpha': [0.0001, 0.001, 0.01, 0.1],
    'sgdregressor__l1_ratio': [0.15, 0.5, 0.85]  # Только для elasticnet
}

# GridSearchCV
grid_search_reg = GridSearchCV(pipeline_reg, param_grid_reg, cv=5, scoring='neg_mean_squared_error')
grid_search_reg.fit(X_train, y_train)
best_params_reg = grid_search_reg.best_params_
print(f'Best parameters for regressor: {best_params_reg}')

Best parameters for regressor: {'sgdregressor__alpha': 0.0001, 'sgdregressor__l1_ratio': 0.15, 'sgdregressor__loss': 'squared_error', 'sgdregressor__penalty': 'l1'}


In [None]:
y_pred_reg = grid_search_reg.predict(X_val)
mse = mean_squared_error(y_val, y_pred_reg)
mae = mean_absolute_error(y_val, y_pred_reg)
print(f'MSE for regressor: {mse}')
print(f'MAE for regressor: {mae}')

MSE for regressor: 0.0945816122507953
MAE for regressor: 0.2629131966374521


##Вывод
Model 1: Loss Function = squared_error, Regularization = l1
такая модель выиграла при поиске нашего личного решения.

Model from GridSearchCV: Loss Function = squared_error, Regularization = l1

как видим наши модели совпадают.

##Классификация

In [None]:
url = 'https://drive.google.com/uc?export=download&id=1bZinjn1gh3ZovYM2e6m_lwdkScYsooDG'
output = 'heart_new.csv'
gdown.download(url, output, quiet=False)
Heart = pd.read_csv('/content/heart_new.csv')
Heart.head()
marker = Heart.pop("target")

Downloading...
From: https://drive.google.com/uc?export=download&id=1bZinjn1gh3ZovYM2e6m_lwdkScYsooDG
To: /content/heart_new.csv
100%|██████████| 7.55k/7.55k [00:00<00:00, 10.7MB/s]


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
1,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
2,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
3,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
4,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1


In [None]:
X_train, X_test = train_test_split(Heart, test_size=0.2, random_state=42)
y_train, y_test = train_test_split(marker, test_size=0.2, random_state=42)
y_test = y_test.values

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score
classifiers = []
loss_functions = ['perceptron', 'hinge', 'squared_hinge']
regularizations = ['l1', 'l2']
for loss_func in loss_functions:
    for penalty_type in regularizations:
        clf = SGDClassifier(
            loss=loss_func,      # Выбор функции потерь
            penalty=penalty_type, # Тип регуляризации
            alpha=0.0001,        # Коэффициент регуляризации
            max_iter=1000,       # Ограничение на количество итераций
            tol=1e-3,            # Условие остановки
            random_state=42      # Фиксированное значение для воспроизводимости
        )
        clf.fit(X_train, y_train)
        classifiers.append((loss_func, penalty_type, clf))
print("\nClassifier Evaluations:")
for i, (loss_func, penalty_type, clf) in enumerate(classifiers, 1):
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')


    print(f"Classifier {i}: Loss Function = {loss_func}, Regularization = {penalty_type}")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1 Score: {f1:.4f}\n")



Classifier Evaluations:
Classifier 1: Loss Function = perceptron, Regularization = l1
  Accuracy: 0.4390
  F1 Score: 0.2942

Classifier 2: Loss Function = perceptron, Regularization = l2
  Accuracy: 0.4390
  F1 Score: 0.2942

Classifier 3: Loss Function = hinge, Regularization = l1
  Accuracy: 0.4390
  F1 Score: 0.2942

Classifier 4: Loss Function = hinge, Regularization = l2
  Accuracy: 0.4390
  F1 Score: 0.2942

Classifier 5: Loss Function = squared_hinge, Regularization = l1
  Accuracy: 0.6098
  F1 Score: 0.6054

Classifier 6: Loss Function = squared_hinge, Regularization = l2
  Accuracy: 0.6585
  F1 Score: 0.6304



In [None]:
classifiers = []

# Комбинации параметров для GridSearchCV
param_grid_clf = {
    'loss': ['perceptron', 'hinge', 'squared_hinge'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [0.0001, 0.001, 0.01],
    'l1_ratio': [0.15, 0.5, 0.85]  # Только для elasticnet
}
clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
grid_search_clf = GridSearchCV(clf, param_grid_clf, cv=5, scoring='f1_weighted')
grid_search_clf.fit(X_train, y_train)
best_clf = grid_search_clf.best_estimator_
best_params_clf = grid_search_clf.best_params_
print(f"Best Classifier Parameters: {best_params_clf}")
y_pred_best = best_clf.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
f1_best = f1_score(y_test, y_pred_best, average='weighted')

print("\nBest Classifier Evaluation:")
print(f"  Accuracy: {accuracy_best:.4f}")
print(f"  F1 Score: {f1_best:.4f}")


Best Classifier Parameters: {'alpha': 0.001, 'l1_ratio': 0.85, 'loss': 'perceptron', 'penalty': 'elasticnet'}

Best Classifier Evaluation:
  Accuracy: 0.6098
  F1 Score: 0.5172


##Вывод
Classifier 1: Loss Function = perceptron, Regularization = l1
такая модель выиграла при поиске нашего личного решения.

GridSearchCV Classifier : Loss Function = perceptron, Regularization = elasticnet

как видим наши модели не совпадают, при этом модель выбранная GridSearchCV показала более плохие результаты. Отсюда делаем вывод, что это может быть связано с переобучением модели, и GridSearchCV не дает гарантии на поиск идеальной модели