In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/adult-incomes-in-the-united-states/adult.data
/kaggle/input/adult-incomes-in-the-united-states/adult.names
/kaggle/input/adult-incomes-in-the-united-states/old.adult.names
/kaggle/input/adult-incomes-in-the-united-states/adult.test


In [2]:
from sklearn import preprocessing, neighbors, metrics
from sklearn import model_selection, ensemble, pipeline, feature_selection
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
df = pd.read_csv('/kaggle/input/adult-incomes-in-the-united-states/adult.data', header=None, na_values='?')
df = df.dropna()

In [3]:
from collections import Counter
target = df.values[:,-1]
counter = Counter(target)
for k,v in counter.items():
	per = v / len(target) * 100
	print('Class=%s, Count=%d, Percentage=%.3f%%' % (k, v, per))

Class= <=50K, Count=24720, Percentage=75.919%
Class= >50K, Count=7841, Percentage=24.081%


In [4]:
df.columns=['age','workclass','fnlwgt','education','education-num','marital-status','occupation',
                  'relationship','race','sex','capital-gain','capital-loss','hours-per-week','native country','label']

In [5]:
X, y = df.drop('label', axis=1), df['label']

In [6]:
cat_ix = X.select_dtypes(include=['object', 'bool']).columns
num_ix = X.select_dtypes(include=['int64', 'float64']).columns
y = LabelEncoder().fit_transform(y)

In [7]:

steps = [('c',OneHotEncoder(handle_unknown='ignore'),cat_ix), ('n',MinMaxScaler(),num_ix)]
ct = ColumnTransformer(steps)
xval = model_selection.KFold(10, shuffle=True)
pipe = pipeline.Pipeline([('t',ct),
    ('fs', feature_selection.SelectFromModel(ensemble.RandomForestClassifier(n_estimators=100), threshold='median')),
    ('model', ensemble.RandomForestClassifier(n_estimators=100))])

In [8]:
result = model_selection.cross_validate(pipe, X, y, scoring='accuracy', return_train_score=True, return_estimator=True, cv=xval)


train_r2 = result['train_score'].mean()
train_r2_sd = result['train_score'].std()
test_r2 = result['test_score'].mean()
test_r2_sd = result['test_score'].std()

In [9]:
print(f'Train mean: ', train_r2)
print(f'Train std: ', train_r2_sd)
print(f'Test mean: ', test_r2)
print(f'Test std: ', test_r2_sd)

Train mean:  0.9999351644147755
Train std:  2.3886742033755128e-05
Test mean:  0.8548266764685248
Test std:  0.0052682875120746275


In [10]:
scorer = metrics.make_scorer(metrics.cohen_kappa_score)
result = model_selection.cross_validate(pipe, X, y, scoring=scorer, return_train_score=True, return_estimator=True, cv=xval)


train_c = result['train_score'].mean()
train_c_sd = result['train_score'].std()
test_c = result['test_score'].mean()
test_c_sd = result['test_score'].std()

In [11]:
print(f'Train mean: ', train_c)
print(f'Train std: ', train_c_sd)
print(f'Test mean: ', test_c)
print(f'Test std: ', test_c_sd)

Train mean:  0.9998133104738199
Train std:  8.37055218510151e-05
Test mean:  0.5829776446347192
Test std:  0.011010457294418985


In [17]:
def test_feature_remover(X, y, feature_name):
    cat_ix = X.select_dtypes(include=['object', 'bool']).columns
    num_ix = X.select_dtypes(include=['int64', 'float64']).columns
    y = LabelEncoder().fit_transform(y)
    
    steps = [('c',OneHotEncoder(handle_unknown='ignore'),cat_ix),
             ('n',MinMaxScaler(),num_ix)]
    ct = ColumnTransformer(steps)
    xval = model_selection.KFold(10, shuffle=True)
    pipe = pipeline.Pipeline([('t',ct),
        ('model', ensemble.RandomForestClassifier(n_estimators=100))])
    scorer = metrics.make_scorer(metrics.cohen_kappa_score)
    result = model_selection.cross_validate(pipe, X, y, 
            scoring=scorer, return_train_score=True, return_estimator=True, cv=xval)


    train_c = result['train_score'].mean()
    train_c_sd = result['train_score'].std()
    test_c = result['test_score'].mean()
    test_c_sd = result['test_score'].std()
    print(f'{feature_name} mean: ', test_c)
    print(f'{feature_name} std: ', test_c_sd)

In [18]:
for col in df.columns:
    X, y = df.drop(['label', col], axis=1), df['label']
    test_feature_remover(X, y, col)

age mean:  0.5332312025001275
age std:  0.009824065355395515
workclass mean:  0.5703804852358824
workclass std:  0.010497615208541733
fnlwgt mean:  0.562181065742511
fnlwgt std:  0.013304078688073305
education mean:  0.5851071328763173
education std:  0.02368839350694559
education-num mean:  0.5812460227564431
education-num std:  0.008317711907197864
marital-status mean:  0.579266419256227
marital-status std:  0.014494953743316892
occupation mean:  0.5613884680899812
occupation std:  0.01653551515906972
relationship mean:  0.5802722986843708
relationship std:  0.019589362617857268
race mean:  0.580719189983953
race std:  0.023953012874600377
sex mean:  0.5775973960723408
sex std:  0.017448473397268634
capital-gain mean:  0.5187278459799699
capital-gain std:  0.011446305508669688
capital-loss mean:  0.5569415760404486
capital-loss std:  0.010513360491182625
hours-per-week mean:  0.5616360113683614
hours-per-week std:  0.023716277407740557
native country mean:  0.5798036209234116
native 


age mean:  0.5332312025001275
age std:  0.009824065355395515
workclass mean:  0.5703804852358824
workclass std:  0.010497615208541733
fnlwgt mean:  0.562181065742511
fnlwgt std:  0.013304078688073305
education mean:  0.5851071328763173
education std:  0.02368839350694559
education-num mean:  0.5812460227564431
education-num std:  0.008317711907197864
marital-status mean:  0.579266419256227
marital-status std:  0.014494953743316892
occupation mean:  0.5613884680899812
occupation std:  0.01653551515906972
relationship mean:  0.5802722986843708
relationship std:  0.019589362617857268
race mean:  0.580719189983953
race std:  0.023953012874600377
sex mean:  0.5775973960723408
sex std:  0.017448473397268634
capital-gain mean:  0.5187278459799699
capital-gain std:  0.011446305508669688
capital-loss mean:  0.5569415760404486
capital-loss std:  0.010513360491182625
hours-per-week mean:  0.5616360113683614
hours-per-week std:  0.023716277407740557
native country mean:  0.5798036209234116
native country std:  0.015791037987065142