## Методы линеной и логистической регрессий

### Лабораторная работа №3

#### Задание 1
Провести классификацию найденного датасета, методами линеной и логистической регрессий . В формате Markdown написать пояснения. Объяснить почему были выбраны именно такие гиперпараметры, была ли перекрестная проверка, и т.д.

In [1]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Загрузка данных
file_path = "../Electric_Vehicle_Population_Data.csv"
df = pd.read_csv(file_path)
df.dropna(inplace=True)
# Разделение на признаки (X) и целевую переменную (y)
X = df[['Postal Code', 'Model Year', 'Electric Range', 'Base MSRP', 'Legislative District', 'DOL Vehicle ID', '2020 Census Tract']][0:10000]
y = df['Electric Range'][0:10000]
# Разделение данных на тренировочный и тестовый наборы
# random_state=42 - гарантирует, что данные каждый раз будут одинакого разбиваться
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Масштабирование признаков (нормализация)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
linear_accuracy = linear_model.score(X_test_scaled, y_test)



Гиперпараметры:
   - C: обратная сила регуляризации. Меньшие значения C указывают на более сильную регуляризацию.
   - penalty: задает тип регуляризации (например, L1 или L2).
   - solver: алгоритм, используемый для оптимизации весов (например, 'liblinear', 'saga', 'lbfgs' и другие).

In [2]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga', 'lbfgs']
}

Перекрестная проверка гиперпараметров

In [3]:
logistic = LogisticRegression(max_iter=1000)
grid = GridSearchCV(logistic, param_grid, refit=True, verbose=3)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




[CV 1/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.563 total time=   0.3s
[CV 2/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.566 total time=   0.3s
[CV 3/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.562 total time=   0.2s
[CV 4/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.562 total time=   0.2s
[CV 5/5] END C=0.01, penalty=l1, solver=liblinear;, score=0.566 total time=   0.3s
[CV 1/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.562 total time=   2.3s
[CV 2/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.565 total time=   2.3s
[CV 3/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.561 total time=   2.3s
[CV 4/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.562 total time=   2.3s
[CV 5/5] END ...C=0.01, penalty=l1, solver=saga;, score=0.565 total time=   2.2s
[CV 1/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ....C=0.01, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ....C



[CV 1/5] END ......C=1, penalty=l1, solver=saga;, score=0.772 total time=  38.5s




[CV 2/5] END ......C=1, penalty=l1, solver=saga;, score=0.777 total time=  40.0s




[CV 3/5] END ......C=1, penalty=l1, solver=saga;, score=0.776 total time=  45.4s




[CV 4/5] END ......C=1, penalty=l1, solver=saga;, score=0.766 total time= 1.0min




[CV 5/5] END ......C=1, penalty=l1, solver=saga;, score=0.782 total time=  59.4s
[CV 1/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END .......C=1, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END .C=1, penalty=l2, solver=liblinear;, score=0.606 total time=   0.9s
[CV 2/5] END .C=1, penalty=l2, solver=liblinear;, score=0.618 total time=   0.9s
[CV 3/5] END .C=1, penalty=l2, solver=liblinear;, score=0.611 total time=   1.0s
[CV 4/5] END .C=1, penalty=l2, solver=liblinear;, score=0.608 total time=   1.1s
[CV 5/5] END .C=1, penalty=l2, solver=liblinear;, score=0.610 total time=   1.1s
[CV 1/5] END ......C=1, penalty=l2, solver=saga;, score=0.696 total time=  24.0s
[CV 2/5] END ......C=1, pena



[CV 1/5] END .....C=10, penalty=l1, solver=saga;, score=0.786 total time=  38.0s




[CV 2/5] END .....C=10, penalty=l1, solver=saga;, score=0.791 total time=  52.9s




[CV 3/5] END .....C=10, penalty=l1, solver=saga;, score=0.794 total time=  59.5s




[CV 4/5] END .....C=10, penalty=l1, solver=saga;, score=0.786 total time=  34.4s




[CV 5/5] END .....C=10, penalty=l1, solver=saga;, score=0.796 total time=  33.9s
[CV 1/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END ......C=10, penalty=l1, solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END C=10, penalty=l2, solver=liblinear;, score=0.644 total time=   0.6s
[CV 2/5] END C=10, penalty=l2, solver=liblinear;, score=0.646 total time=   0.6s
[CV 3/5] END C=10, penalty=l2, solver=liblinear;, score=0.641 total time=   0.6s
[CV 4/5] END C=10, penalty=l2, solver=liblinear;, score=0.637 total time=   0.6s
[CV 5/5] END C=10, penalty=l2, solver=liblinear;, score=0.636 total time=   0.6s




[CV 1/5] END .....C=10, penalty=l2, solver=saga;, score=0.775 total time=  26.1s




[CV 2/5] END .....C=10, penalty=l2, solver=saga;, score=0.777 total time=  26.0s




[CV 3/5] END .....C=10, penalty=l2, solver=saga;, score=0.775 total time=  25.7s




[CV 4/5] END .....C=10, penalty=l2, solver=saga;, score=0.764 total time=  25.7s




[CV 5/5] END .....C=10, penalty=l2, solver=saga;, score=0.782 total time=  25.6s
[CV 1/5] END ....C=10, penalty=l2, solver=lbfgs;, score=0.784 total time=   8.6s
[CV 2/5] END ....C=10, penalty=l2, solver=lbfgs;, score=0.786 total time=   9.4s
[CV 3/5] END ....C=10, penalty=l2, solver=lbfgs;, score=0.786 total time=   8.8s
[CV 4/5] END ....C=10, penalty=l2, solver=lbfgs;, score=0.777 total time=   8.5s
[CV 5/5] END ....C=10, penalty=l2, solver=lbfgs;, score=0.791 total time=   8.7s


20 fits failed out of a total of 120.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/home/marcus/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/marcus/.local/lib/python3.10/site-packages/sklearn/base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/marcus/.local/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/home/marcus/.local/lib/python3.10/site-packages/sklearn/linear_model/_logisti

In [4]:
# Оценка производительности модели логистической регрессии на тестовом наборе
best_logistic = grid.best_estimator_
logistic_accuracy = best_logistic.score(X_test_scaled, y_test)
print(f"Лучшие гиперпараметры логистической регрессии: {grid.best_params_}")
print(f"Точность модели логистической регрессии: {logistic_accuracy}")
print(f"Точность модели линейной регрессии: {linear_accuracy}")

# Предсказания на тестовой выборке с использованием лучшей модели
predictions = best_logistic.predict(X_test_scaled)
# Отчет по классификации
print("\nОтчет по классификации:")
print(classification_report(y_test, predictions))

Лучшие гиперпараметры логистической регрессии: {'C': 10, 'penalty': 'l1', 'solver': 'saga'}
Точность модели логистической регрессии: 0.804
Точность модели линейной регрессии: 1.0

Отчет по классификации:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       971
           6       0.80      1.00      0.89         8
           8       0.00      0.00      0.00         1
          10       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         6
          14       0.50      0.40      0.44        10
          16       0.25      0.08      0.12        13
          17       0.00      0.00      0.00         5
          18       0.36      0.44      0.40        18
          19       0.65      0.89      0.75        36
          20       0.00      0.00      0.00        10
          21       0.54      0.87      0.67        39
          22       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
