## Лабораторная работа №3

### Задание

Провести классификацию найденного датасета, методами линеной и логистической регрессий . В формате Markdown написать пояснения. Объяснить почему были выбраны именно такие гиперпараметры, была ли перекрестная проверка, и т.д.

### Выполнение работы:

#### Шаг 1. Загрузка датасета

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import pandas as pd

file_path = "../../dataset/result_dataset.csv"
data = pd.read_csv(file_path)

data

Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
0,1.385424e+09,26,0,2013,19,0,0,0,0,0,0,0,8,45,120,1,360,950
1,1.448496e+09,26,0,2015,19,0,0,0,0,0,0,0,8,45,120,1,360,950
2,1.395533e+09,23,1,2014,49,1,0,1,1,0,0,0,23,45,120,1,1035,2401
3,1.458691e+09,23,1,2016,49,1,0,1,1,0,0,0,20,45,120,1,900,2088
4,1.400112e+09,15,2,2014,47,1,1,1,1,0,0,0,4,45,120,0,180,418
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113031,1.460419e+09,12,11,2016,41,1,0,5,9,1,16,129,3,24,64,0,72,184
113032,1.396397e+09,2,11,2014,18,0,0,1,8,1,16,128,22,24,64,1,528,1183
113033,1.459555e+09,2,11,2016,18,0,0,1,8,1,16,128,22,24,64,1,528,1183
113034,1.393891e+09,4,1,2014,37,1,1,4,17,1,16,127,24,24,64,1,576,1260


#### Шаг 2. Разделение данных и нормализация

In [2]:
# Разделение на признаки (X) и целевую переменную (y)
X = data.drop('Profit', axis=1)
y = data['Profit']

# Разделение данных на тренировочный и тестовый наборы
# random_state=42 - гарантирует, что данные каждый раз будут одинакого разбиваться
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование признаков (нормализация)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### Шаг 3. Линейная регрессия

In [3]:
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
linear_accuracy = linear_model.score(X_test_scaled, y_test)

#### Шаг 4. Логистическая регрессия

Гиперпараметры:

C: обратная сила регуляризации. Меньшие значения C указывают на более сильную регуляризацию.
penalty: задает тип регуляризации (например, L1 или L2).
solver: алгоритм, используемый для оптимизации весов (например, 'liblinear', 'saga', 'lbfgs' и другие).

In [4]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['saga', 'lbfgs']
}

#### Шаг 5. Перекрестная проверка гиперпараметров

In [5]:
logistic = LogisticRegression(max_iter=3000)
grid = GridSearchCV(logistic, param_grid, refit=True, verbose=3)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
[CV 1/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.984 total time=   1.7s
[CV 2/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.982 total time=  13.9s
[CV 3/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.982 total time=  13.3s
[CV 4/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.981 total time=  13.3s
[CV 5/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.981 total time=  13.1s
[CV 1/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END ...C=0.01, penalty=l2, solver=saga;, score=0.929 total time=   0.2s
[CV 2/5] END ...C=0.01, penalty=l2, solver=saga;



[CV 2/5] END ......C=1, penalty=l1, solver=saga;, score=0.997 total time=  42.9s




[CV 3/5] END ......C=1, penalty=l1, solver=saga;, score=0.997 total time=  43.2s




[CV 4/5] END ......C=1, penalty=l1, solver=saga;, score=0.997 total time=  42.9s




[CV 5/5] END ......C=1, penalty=l1, solver=saga;, score=0.997 total time=  43.0s
[CV 1/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END ......C=1, penalty=l2, solver=saga;, score=0.994 total time=   2.5s
[CV 2/5] END ......C=1, penalty=l2, solver=saga;, score=0.993 total time=  17.1s
[CV 3/5] END ......C=1, penalty=l2, solver=saga;, score=0.992 total time=  17.1s
[CV 4/5] END ......C=1, penalty=l2, solver=saga;, score=0.992 total time=  17.2s
[CV 5/5] END ......C=1, penalty=l2, solver=saga;, score=0.993 total time=  17.2s
[CV 1/5] END .....C=1, penalty=l2, solver=lbfgs;, score=0.995 total time=   0.2s
[CV 2/5] END .....C=1, penal



[CV 1/5] END .....C=10, penalty=l1, solver=saga;, score=0.999 total time=  43.8s




[CV 2/5] END .....C=10, penalty=l1, solver=saga;, score=0.997 total time=  43.1s




[CV 3/5] END .....C=10, penalty=l1, solver=saga;, score=0.997 total time=  43.2s




[CV 4/5] END .....C=10, penalty=l1, solver=saga;, score=0.997 total time=  43.2s




[CV 5/5] END .....C=10, penalty=l1, solver=saga;, score=0.997 total time=  46.0s
[CV 1/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END .....C=10, penalty=l2, solver=saga;, score=0.998 total time=  13.0s
[CV 2/5] END .....C=10, penalty=l2, solver=saga;, score=0.997 total time=  37.2s
[CV 3/5] END .....C=10, penalty=l2, solver=saga;, score=0.997 total time=  38.6s
[CV 4/5] END .....C=10, penalty=l2, solver=saga;, score=0.996 total time=  37.4s
[CV 5/5] END .....C=10, penalty=l2, solver=saga;, score=0.997 total time=  37.3s
[CV 1/5] END ....C=10, penalty=l2, solver=lbfgs;, score=0.998 total time=   0.2s
[CV 2/5] END ....C=10, penal

20 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/ns/dynamic_languages/venv/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/ns/dynamic_languages/venv/lib/python3.11/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ns/dynamic_languages/venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
 

#### Шаг 6. Вывод результатов

In [6]:
# Оценка производительности модели логистической регрессии на тестовом наборе
best_logistic = grid.best_estimator_
logistic_accuracy = best_logistic.score(X_test_scaled, y_test)
print(f"Лучшие гиперпараметры логистической регрессии: {grid.best_params_}")
print(f"Точность модели логистической регрессии: {logistic_accuracy}")
print(f"Точность модели линейной регрессии: {linear_accuracy}")

# Предсказания на тестовой выборке с использованием лучшей модели
predictions = best_logistic.predict(X_test_scaled)
# Отчет по классификации
print("\nОтчет по классификации:")
print(classification_report(y_test, predictions))

Лучшие гиперпараметры логистической регрессии: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Точность модели логистической регрессии: 0.9984961075725407
Точность модели линейной регрессии: 0.5855844265790964

Отчет по классификации:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15691
           1       1.00      1.00      1.00      6917

    accuracy                           1.00     22608
   macro avg       1.00      1.00      1.00     22608
weighted avg       1.00      1.00      1.00     22608

