In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import GridSearchCV

In [2]:
#читаем данные
df = pd.read_csv('adult.csv')

In [3]:
#смотрим информацию датасета
#df.info()
#df.head()

In [4]:
#удалим строки, где есть пустые значения
df.dropna(inplace = True)
#найдем категориальные переменные из переменных типа int64
print(df['class'].unique(), '\n', df['age'].unique(), '\n', df['capitalgain'].unique(), '\n', \
      df['capitalloss'].unique(), '\n', df['hoursperweek'].unique())
#предполагаю, что это все именно категории

['<=50K' '>50K'] 
 [2 3 1 0 4] 
 [1 0 4 2 3] 
 [0 3 1 2 4] 
 [2 0 3 4 1]


In [5]:
#получим массив целевой переменной, закодируем классы
Y = [0 if x == '<=50K' else 1 for x in df['class']]

In [6]:
#закодируем категориальные переменные, уберем проблему мультиколинеарности (?)
df['age'] = df['age'].astype('category')
df['capitalgain'] = df['capitalgain'].astype('category')
df['capitalloss'] = df['capitalloss'].astype('category')
df['hoursperweek'] = df['hoursperweek'].astype('category')

X_dumm = pd.get_dummies(df.drop(['class'], axis = 1), drop_first = True)

#стандартизируем переменные
X_dumm_scalled = StandardScaler().fit_transform(X_dumm)
#X_dumm_scalled = MinMaxScaler().fit_transform(X_dumm)

In [7]:
#получим тренировочный и тестовый сэмплы
X_train, X_test, Y_train, Y_test = train_test_split(X_dumm_scalled, Y, test_size = 0.3, \
                                                    random_state = 1, stratify = Y)

In [8]:
#------------- Logistic Regression -------------------------

In [9]:
lrc = LogisticRegression(random_state = 42, max_iter = 1000).fit(X_train, Y_train)
lrc.score(X_train, Y_train), lrc.predict_proba(X_train)

(0.8526615068709524,
 array([[0.95048217, 0.04951783],
        [0.6790194 , 0.3209806 ],
        [0.63642399, 0.36357601],
        ...,
        [0.97354797, 0.02645203],
        [0.12135552, 0.87864448],
        [0.86125818, 0.13874182]]))

In [10]:
lrc.score(X_test, Y_test), lrc.predict_proba(X_test)

(0.8552369720645684,
 array([[8.86201620e-01, 1.13798380e-01],
        [9.93265003e-01, 6.73499743e-03],
        [5.97142720e-01, 4.02857280e-01],
        ...,
        [3.16926286e-04, 9.99683074e-01],
        [5.18297851e-01, 4.81702149e-01],
        [9.73605370e-01, 2.63946299e-02]]))

In [None]:
#------------- SVM -------------------------

In [None]:
%%time
svc = SVC(random_state = 42, gamma = 'auto').fit(X_train, Y_train)

In [None]:
%%time
svc.score(X_train, Y_train)

In [None]:
%%time
svc.score(X_test, Y_test)

In [None]:
#--------------- Вывод -----------------------
# Logistic Regression и SVM в лайт версии (без подбора гиперпараметров) показали одинаковый результат

In [None]:
#------------- Multi Layer Perceptron -------------------------

In [72]:
%%time
from sklearn.neural_network import MLPClassifier

#mlp = MLPClassifier(solver = 'adam', random_state = 42, hidden_layer_sizes = [107, 107], activation='relu', \
#                    alpha = .01, max_iter = 10000).fit(X_train, Y_train)


mlp = MLPClassifier(random_state = 42, max_iter = 10000).fit(X_train, Y_train)

CPU times: user 3min 7s, sys: 4.42 s, total: 3min 12s
Wall time: 1min 39s


In [73]:
%%time
mlp.score(X_train, Y_train)

CPU times: user 109 ms, sys: 9.19 ms, total: 118 ms
Wall time: 61.2 ms


0.9151160954035698

In [74]:
%%time
mlp.score(X_test, Y_test)

CPU times: user 53.7 ms, sys: 5.17 ms, total: 58.9 ms
Wall time: 29.9 ms


0.8208889216481168