In [8]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix

In [75]:
data = pd.read_csv('adult.csv', na_values='?')
data.columns = data.columns.str.replace('-','_')
data.dropna(how='any', inplace=True)

In [78]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income
0,25,2,226802,1,7,4,6,3,2,1,0,0,40,38,0
1,38,2,89814,11,9,2,4,0,4,1,0,0,50,38,0
2,28,1,336951,7,12,2,10,0,4,1,0,0,40,38,1
3,44,2,160323,15,10,2,6,0,2,1,7688,0,40,38,1
5,34,2,198693,0,6,4,7,1,4,1,0,0,30,38,0


In [77]:
le = LabelEncoder()
columns = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'gender', 'native_country', 'income']
for column in columns:
    le.fit(data[column])
    data[column] = le.transform(data[column])

In [80]:
X = data.copy()
X.drop(['fnlwgt', 'income'], axis=1, inplace=True)
Y = data['income']

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

** LogisticRegression

In [88]:
model_LR = make_pipeline(StandardScaler(),LogisticRegression(max_iter=1000))

In [89]:
model_LR.fit(X_train, y_train )

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=1000,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [91]:
model_LR.score(X_test, y_test)

0.820078130758458

In [106]:
conf_mat = confusion_matrix(y_test, model_LR.predict(X_test))
conf_mat

array([[9628,  613],
       [1828, 1498]], dtype=int64)

In [112]:
print(f'Precision: {(conf_mat[0][0] / (conf_mat[0][0] + conf_mat[1][0])):.4}')
print(f'Recall:    {(conf_mat[0][0] / (conf_mat[0][0] + conf_mat[0][1])):.4}')

Precision: 0.8404
Recall:    0.9401


** SVM

In [92]:
model_SVM = make_pipeline(StandardScaler(), SVC(gamma='auto'))
model_SVM.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='auto', kernel='rbf', max_iter=-1, probability=False,
                     random_state=None, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [94]:
model_SVM.score(X_test, y_test)

0.8496351440996536

In [104]:
data_predict1 = model_SVM.predict(X_test)
conf_mat1 = confusion_matrix(y_test, data_predict1)
conf_mat1

array([[9664,  577],
       [1463, 1863]], dtype=int64)

In [113]:
print(f'Precision: {(conf_mat1[0][0] / (conf_mat1[0][0] + conf_mat1[1][0])):.4}')
print(f'Recall:    {(conf_mat1[0][0] / (conf_mat1[0][0] + conf_mat1[0][1])):.4}')

Precision: 0.8685
Recall:    0.9437
