In [2]:
# Импорт библиотек
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Пункт 1: Підготовка даних
# Загрузка данных
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
                'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
                'hours_per_week', 'native_country', 'income']
df = pd.read_csv(url, na_values=' ?', header=None, names=column_names)

# Просмотр данных
print("Shape:", df.shape)
display(df.head())

# Удаление пропусков
print("Пустые значения до замены:")
print(df.isna().sum())

# Замена пропущенных значений
# Для числовых столбцов - среднее значение
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_columns] = df[numeric_columns].apply(lambda x: x.fillna(x.mean()))

# Для категориальных столбцов - мода
categorical_columns = df.select_dtypes(include=['object']).columns
df[categorical_columns] = df[categorical_columns].apply(lambda x: x.fillna(x.mode()[0]))

print("Пустые значения после замены:")
print(df.isna().sum())

# Кодирование категориальных признаков
df = pd.get_dummies(df, columns=['workclass', 'education', 'marital_status', 'occupation',
                                  'relationship', 'race', 'sex', 'native_country'], drop_first=True)

# Кодирование целевой переменной
label_encoder = LabelEncoder()
df['income'] = label_encoder.fit_transform(df['income'])

# Масштабирование количественных признаков
scaler = MinMaxScaler()
numerical_columns = ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
# Просмотр данных
print("Shape:", df.shape)
display(df.head())

Shape: (32561, 15)


Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Пустые значения до замены:
age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64
Пустые значения после замены:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64
Shape: (32561, 98)


Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,income,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,0.452055,0.048238,0.8,0.0,0.0,0.122449,0,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959,0,False,False,True,...,False,False,False,False,False,False,False,True,False,False
3,0.493151,0.151068,0.4,0.0,0.0,0.397959,0,False,False,True,...,False,False,False,False,False,False,False,True,False,False
4,0.150685,0.221488,0.8,0.0,0.0,0.397959,0,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [3]:
# Пункт 2: Перевірка балансу класів
print("Баланс классов:")
print(df['income'].value_counts())


Баланс классов:
income
0    24720
1     7841
Name: count, dtype: int64


In [4]:
# Пункт 4: Розділення даних
X = df.drop('income', axis=1)
y = df['income']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [5]:
# Пункт 5: Навчання логістичної регресії без регуляризації
log_reg = LogisticRegression(penalty=None, max_iter=1000, solver='lbfgs')
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)


In [6]:
# Пункт 6: Оцінка якості моделі
print("\nЛогистическая регрессия без регуляризации:")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Precision:", precision_score(y_test, y_pred_log))
print("Recall:", recall_score(y_test, y_pred_log))
print("F1-Score:", f1_score(y_test, y_pred_log))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log))


Логистическая регрессия без регуляризации:
Accuracy: 0.8547446002661481
Precision: 0.7378888322284549
Recall: 0.6152210884353742
F1-Score: 0.6709946672849525
Confusion Matrix:
 [[6903  514]
 [ 905 1447]]


In [7]:
# Пункт 7: Оцінка якості моделі на основі перехресної перевірки
cv_scores = cross_val_score(log_reg, X, y, cv=5, scoring='accuracy')
print("\nСредняя точность (кросс-валидация):", cv_scores.mean())


Средняя точность (кросс-валидация): 0.8508646805802496


In [8]:
# Пункт 8: Навчання логістичної регресії з регуляризацією
log_reg_l2 = LogisticRegression(penalty='l2', max_iter=1000, solver='lbfgs', C=1.0)
log_reg_l2.fit(X_train, y_train)
y_pred_log_l2 = log_reg_l2.predict(X_test)

print("\nЛогистическая регрессия с L2-регуляризацией:")
print("Accuracy:", accuracy_score(y_test, y_pred_log_l2))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_log_l2))

# GridSearchCV для подбора параметра регуляризации
param_grid = {'C': [0.1, 1.0, 10.0]}
grid_search = GridSearchCV(log_reg_l2, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)
print("Лучший параметр регуляризации:", grid_search.best_params_)



Логистическая регрессия с L2-регуляризацией:
Accuracy: 0.8560753403623708
Confusion Matrix:
 [[6923  494]
 [ 912 1440]]
Лучший параметр регуляризации: {'C': 10.0}


In [9]:
# Підбір гіперпараметрів для k-NN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report

# Упрощенные параметры для RandomizedSearchCV
param_dist = {
    'n_neighbors': np.arange(1, 11),  # Уменьшили диапазон до 10
    'weights': ['uniform', 'distance'],  # Только два варианта веса
    'metric': ['euclidean', 'manhattan']  # Убрали minkowski для ускорения
}

# Підбір гіперпараметрів
knn = KNeighborsClassifier()
random_search = RandomizedSearchCV(
    knn,
    param_distributions=param_dist,
    n_iter=5,  # Уменьшили количество итераций до 5
    cv=2,  # Уменьшили количество фолдов до 2
    n_jobs=-1,  # Используем все доступные процессоры
    random_state=42
)

# Навчання моделі
random_search.fit(X_train, y_train)

# Кращі параметри
best_params = random_search.best_params_
print(f"Кращі параметри для k-NN: {best_params}")

# Оцінка на тестовій вибірці
best_knn = random_search.best_estimator_
y_pred_knn = best_knn.predict(X_test)

# Виведення звіту про класифікацію
print("Звіт про класифікацію для k-NN:")
print(classification_report(y_test, y_pred_knn))



Кращі параметри для k-NN: {'weights': 'uniform', 'n_neighbors': 9, 'metric': 'euclidean'}
Звіт про класифікацію для k-NN:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      7417
           1       0.67      0.59      0.63      2352

    accuracy                           0.83      9769
   macro avg       0.77      0.75      0.76      9769
weighted avg       0.82      0.83      0.83      9769



In [10]:
# Оценка k-NN
print("\nK-Nearest Neighbors:")
print("Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Precision:", precision_score(y_test, y_pred_knn))
print("Recall:", recall_score(y_test, y_pred_knn))
print("F1-Score:", f1_score(y_test, y_pred_knn))


K-Nearest Neighbors:
Accuracy: 0.8303818200429931
Precision: 0.665871121718377
Recall: 0.5931122448979592
F1-Score: 0.6273892511805712


In [11]:
# Пункт 12: Порівняння класифікаторів
print("\nСравнение моделей:")
print("\nЛогистическая регрессия:")
print(classification_report(y_test, y_pred_log))
print("\nk-NN:")
print(classification_report(y_test, y_pred_knn))


Сравнение моделей:

Логистическая регрессия:
              precision    recall  f1-score   support

           0       0.88      0.93      0.91      7417
           1       0.74      0.62      0.67      2352

    accuracy                           0.85      9769
   macro avg       0.81      0.77      0.79      9769
weighted avg       0.85      0.85      0.85      9769


k-NN:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      7417
           1       0.67      0.59      0.63      2352

    accuracy                           0.83      9769
   macro avg       0.77      0.75      0.76      9769
weighted avg       0.82      0.83      0.83      9769

