In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import GridSearchCV

In [29]:
#читаем данные
df = pd.read_csv('adult.csv')

In [30]:
#смотрим информацию датасета
#df.info()
#df.head()

In [31]:
#удалим строки, где есть пустые значения
df.dropna(inplace = True)
#найдем категориальные переменные из переменных типа int64
print(df['class'].unique(), '\n', df['age'].unique(), '\n', df['capitalgain'].unique(), '\n', \
      df['capitalloss'].unique(), '\n', df['hoursperweek'].unique())
#предполагаю, что это все именно категории

['<=50K' '>50K'] 
 [2 3 1 0 4] 
 [1 0 4 2 3] 
 [0 3 1 2 4] 
 [2 0 3 4 1]


In [32]:
#получим массив целевой переменной, закодируем классы
Y = [0 if x == '<=50K' else 1 for x in df['class']]

In [33]:
#закодируем категориальные переменные, уберем проблему мультиколинеарности (?)
df['age'] = df['age'].astype('category')
df['capitalgain'] = df['capitalgain'].astype('category')
df['capitalloss'] = df['capitalloss'].astype('category')
df['hoursperweek'] = df['hoursperweek'].astype('category')

X_dumm = pd.get_dummies(df.drop(['class'], axis = 1), drop_first = True)

#стандартизируем переменные
X_dumm_scalled = StandardScaler().fit_transform(X_dumm)
#X_dumm_scalled = MinMaxScaler().fit_transform(X_dumm)

In [34]:
#получим тренировочный и тестовый сэмплы
X_train, X_test, Y_train, Y_test = train_test_split(X_dumm_scalled, Y, test_size = 0.3, \
                                                    random_state = 1, stratify = Y)

In [20]:
#------------- Logistic Regression -------------------------

In [23]:
lrc = LogisticRegression(random_state = 42, max_iter = 1000).fit(X_train, Y_train)
lrc.score(X_train, Y_train), lrc.predict_proba(X_train)

(0.8527246880429632,
 array([[0.95082581, 0.04917419],
        [0.67614764, 0.32385236],
        [0.66209342, 0.33790658],
        ...,
        [0.97230293, 0.02769707],
        [0.12953193, 0.87046807],
        [0.85886229, 0.14113771]]))

In [24]:
lrc.score(X_test, Y_test), lrc.predict_proba(X_test)

(0.8550158472764797,
 array([[0.84941404, 0.15058596],
        [0.9931956 , 0.0068044 ],
        [0.59803775, 0.40196225],
        ...,
        [0.00152487, 0.99847513],
        [0.51760229, 0.48239771],
        [0.97063888, 0.02936112]]))

In [None]:
#------------- SVM -------------------------

In [None]:
%%time
svc = SVC(random_state = 42, gamma = 'auto').fit(X_train, Y_train)

In [None]:
%%time
svc.score(X_train, Y_train)

In [None]:
%%time
svc.score(X_test, Y_test)

In [None]:
#--------------- Вывод -----------------------
# Logistic Regression и SVM в лайт версии (без подбора гиперпараметров) показали одинаковый результат

In [None]:
#------------- Multi Layer Perceptron -------------------------

In [None]:
%%time
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(solver='adam', random_state=0, hidden_layer_sizes=[70], activation='tanh', \
                    alpha = 0.0001, max_iter = 10000).fit(X_train, Y_train)

In [39]:
%%time
mlp.score(X_train, Y_train)

CPU times: user 54.8 ms, sys: 4.64 ms, total: 59.4 ms
Wall time: 31.5 ms


0.870194282103933

In [40]:
%%time
mlp.score(X_test, Y_test)

CPU times: user 23 ms, sys: 2.69 ms, total: 25.7 ms
Wall time: 16.6 ms


0.8454337731259675