In [17]:
import numpy as np 
import numpy.random as random 
import scipy as sp 
from pandas import Series,DataFrame 
import pandas as pd 

import matplotlib.pyplot as plt 
import matplotlib as mpl 
import seaborn as sns 
%matplotlib inline 

import sklearn 

%precision 3

import requests,zipfile 
import io 

#データを取得
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
res = requests.get(url).content

data = pd.read_csv(io.StringIO(res.decode('utf-8')),header = None)

data.columns =['age','workclass','fnlwgt','education','education-num','marital-status',
              'occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','fig-50K']

<h2>個人の年齢や性別,職業などからその人の収入が50K(5万ドル)を超えるのかを予測するモデルを構築する</h2>

In [18]:
data.shape

(32561, 15)

In [19]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,fig-50K
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
age               32561 non-null int64
workclass         32561 non-null object
fnlwgt            32561 non-null int64
education         32561 non-null object
education-num     32561 non-null int64
marital-status    32561 non-null object
occupation        32561 non-null object
relationship      32561 non-null object
race              32561 non-null object
sex               32561 non-null object
capital-gain      32561 non-null int64
capital-loss      32561 non-null int64
hours-per-week    32561 non-null int64
native-country    32561 non-null object
fig-50K           32561 non-null object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [21]:
data.isnull().any()

age               False
workclass         False
fnlwgt            False
education         False
education-num     False
marital-status    False
occupation        False
relationship      False
race              False
sex               False
capital-gain      False
capital-loss      False
hours-per-week    False
native-country    False
fig-50K           False
dtype: bool

In [22]:
data.groupby('fig-50K').size()

fig-50K
 <=50K    24720
 >50K      7841
dtype: int64

In [23]:
data['fin_flg'] = data['fig-50K'].map(lambda x: 1 if x == ' >50K' else 0)
data.groupby('fin_flg').size()   #←これを目的変数にする

fin_flg
0    24720
1     7841
dtype: int64

In [35]:
print(data.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
fig-50K           0
fin_flg           0
dtype: int64


In [36]:
data.corr()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,fin_flg
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756,0.234037
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768,-0.009463
education-num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123,0.335154
capital-gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409,0.223329
capital-loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256,0.150526
hours-per-week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0,0.229689
fin_flg,0.234037,-0.009463,0.335154,0.223329,0.150526,0.229689,1.0


In [39]:
data.groupby('native-country').size()

native-country
 ?                               583
 Cambodia                         19
 Canada                          121
 China                            75
 Columbia                         59
 Cuba                             95
 Dominican-Republic               70
 Ecuador                          28
 El-Salvador                     106
 England                          90
 France                           29
 Germany                         137
 Greece                           29
 Guatemala                        64
 Haiti                            44
 Holand-Netherlands                1
 Honduras                         13
 Hong                             20
 Hungary                          13
 India                           100
 Iran                             43
 Ireland                          24
 Italy                            73
 Jamaica                          81
 Japan                            62
 Laos                             18
 Mexico                

In [50]:
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 

X = data[['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']]
y = data['fin_flg']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.6)
model = LogisticRegression()
model.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [51]:
model.score(X_train,y_train)

0.79745085995086

In [52]:
model.score(X_test,y_test)

0.7961816041357425

In [54]:
model.coef_

array([[-3.932e-03, -3.495e-06, -2.739e-03,  3.530e-04,  7.412e-04,
        -1.196e-02]])