## ДЗ 1

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split # разбиение данных на тренировочные и тестовые

from sklearn.compose import ColumnTransformer # преобразование столбцов
from sklearn.preprocessing import OneHotEncoder # кодирование категориальных переменных
from sklearn.preprocessing import LabelEncoder

from sklearn.preprocessing import StandardScaler # нормализация и масштабирование данных

from sklearn.linear_model import LogisticRegression # логистическая регрессия
from sklearn.neighbors import KNeighborsClassifier # k-ближайших новых соседей
from sklearn.svm import SVC # SVM для классификации
from sklearn.naive_bayes import GaussianNB # Naive bayes
from sklearn.tree import DecisionTreeClassifier, export_graphviz # Дерево Решений
from sklearn.ensemble import RandomForestClassifier  # Random Forest


from sklearn.tree import plot_tree # отрисовка дерева

from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay

from sklearn.model_selection import cross_val_score # кроссвалидация
from sklearn.model_selection import GridSearchCV # подбор гиперпараметров с кроссвалидацией

### Шаг 1: Разведочный анализ данных (Exploratory Data Analysis, EDA)

In [14]:
names = ['Age', 'Workclass', 'fnlwgt', 'Education', 'Education_Num', 'Martial_Status', 'Occupation',
         'Relationship', 'Race', 'Sex', 'Capital_Gain', 'Capital_Loss', 'Hours_per_week', 'Country', 'Target']

data = pd.read_csv('adult.data', header=None, names=names, na_values="?", skipinitialspace=True)
data_test = pd.read_csv('adult.data', header=None, names=names, na_values="?", skipinitialspace=True)

data.fillna(data.mean(), inplace=True)
data_test.fillna(data_test.mean(), inplace=True)

data = pd.get_dummies(data, columns=['Workclass', 'Education', 'Martial_Status', 'Occupation',
                                     'Relationship', 'Race', 'Sex', 'Country'])
data_test = pd.get_dummies(data_test, columns=['Workclass', 'Education', 'Martial_Status', 'Occupation',
                                     'Relationship', 'Race', 'Sex', 'Country'])

X_train = data.drop('Target', axis=1)
y_train = data['Target']

X_test = data_test.drop('Target', axis=1)
y_test = data_test['Target']

X_train = X_train.drop('Country_Holand-Netherlands', axis=1)
X_test = X_test.drop('Country_Holand-Netherlands', axis=1)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  data.fillna(data.mean(), inplace=True)
  data_test.fillna(data_test.mean(), inplace=True)


### Шаг 2: Создание признака для Голландии в тестовой выборке

In [15]:
data['Country_Holland'] = 0
data_test['Country_Holland'] = 0

### Шаг 3: Обучение Logistic Regression (baseline)

In [16]:
# Обучение Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)

# Оценка модели
test_score = logreg.score(X_test_scaled, y_test)
y_pred_logreg = logreg.predict(X_test_scaled)
print("Accuracy Logistic Regression:", accuracy_score(y_test, y_pred_logreg))

Accuracy Logistic Regression: 0.8533521697736556


### Шаг 4-7: Обучение других моделей

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

svc = SVC()
svc.fit(X_train_scaled, y_train)

nb = GaussianNB()
nb.fit(X_train_scaled, y_train)

tree = DecisionTreeClassifier()
tree.fit(X_train_scaled, y_train)

forest = RandomForestClassifier()
forest.fit(X_train_scaled, y_train)

### Шаг 5: Обучение SVC с GridSearchCV

In [18]:
svc_params = {'gamma':[0.001, 0.01, 0.1, 1, 10, 100], 'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter':[50, 100, 200, 500, 1000]}
svc_grid = GridSearchCV(svc, svc_params, n_jobs=-1, cv=3)
svc_grid.fit(X_train_scaled, y_train)
best_svc = svc_grid.best_estimator_



### Шаг 6: Обучение Decision Tree с GridSearchCV

In [19]:
tree_params = {'max_depth': range(2, 11)}
tree_grid = GridSearchCV(tree, tree_params, n_jobs=-1, cv=5)
tree_grid.fit(X_train_scaled, y_train)
best_tree = tree_grid.best_estimator_

### Шаг 7: Обучение Random Forest с GridSearchCV

In [23]:
forest_params = {'max_depth': range(10, 21, 2), 'max_features': range(5, 105, 20)}
forest_grid = GridSearchCV(forest, forest_params, n_jobs=-1, cv=3)
forest_grid.fit(X_train_scaled, y_train)
best_forest = forest_grid.best_estimator_

### Шаг 9: Сравнение всех моделей с baseline

In [24]:
models = [logreg, knn, svc, nb, tree, forest, best_svc, best_tree, best_forest]
model_names = ['Logistic Regression', 'k-NN', 'SVC', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'Best SVC', 'Best Decision Tree', 'Best Random Forest']

for model, name in zip(models, model_names):
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy {name}: {acc}")

Accuracy Logistic Regression: 0.8533521697736556
Accuracy k-NN: 0.8764165719726053
Accuracy SVC: 0.8631491661803998
Accuracy Naive Bayes: 0.5285771321519609
Accuracy Decision Tree: 0.9999692884125181
Accuracy Random Forest: 0.9999692884125181
Accuracy Best SVC: 0.7822548447529253
Accuracy Best Decision Tree: 0.8639783790424127
Accuracy Best Random Forest: 0.9018150548201836


### Шаг 10: Вывод confusion matrix

In [25]:
conf_matrix_logreg = confusion_matrix(y_test, y_pred_logreg)
print("Confusion Matrix Logistic Regression:")
print(conf_matrix_logreg)

Confusion Matrix Logistic Regression:
[[23039  1681]
 [ 3094  4747]]


## ДЗ 2

### Шаг 1: Разбиение датасета digits на обучающую и тестовую выборку

In [26]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split

# Загрузка датасета
digits = load_digits()
X = digits.data
y = digits.target

# Разделение на обучающую и тестовую выборку
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

### Шаг 2: Поиск лучшего классификатора и параметров

In [27]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

svm = SVC()

param_grid = {'C': [0.1, 1, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1], 'kernel': ['linear', 'rbf', 'poly']}

grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Лучшие параметры:", best_params)

best_classifier = grid_search.best_estimator_

Лучшие параметры: {'C': 0.1, 'gamma': 0.001, 'kernel': 'poly'}


### Шаг 3: Вывод матрицы несоответствия

In [28]:
from sklearn.metrics import confusion_matrix

# Предсказание на тестовой выборке
y_pred = best_classifier.predict(X_test)

# Вывод матрицы несоответствия
conf_matrix = confusion_matrix(y_test, y_pred)
print("Матрица несоответствия:")
print(conf_matrix)

Матрица несоответствия:
[[23  0  0  0  0  0  0  0  0  0]
 [ 0 18  0  0  0  0  0  0  0  0]
 [ 0  0 26  0  0  0  0  0  0  0]
 [ 0  0  0 24  0  0  0  0  0  0]
 [ 0  0  0  0 37  0  0  0  0  0]
 [ 0  0  0  0  0 33  0  0  0  0]
 [ 0  0  0  0  0  0 30  0  0  0]
 [ 0  0  0  0  0  0  0 25  0  1]
 [ 0  0  0  0  0  0  0  0 19  0]
 [ 0  0  0  0  0  0  0  1  0 33]]
