In [1]:
import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

from google.colab import files

#前処理

In [76]:
# Load the training and test tables; both are tab-separated UTF-8 files
# read from the current working directory.
train, test = (
    pd.read_csv(path, sep='\t', encoding='utf-8')
    for path in ('train.tsv', 'test.tsv')
)

In [77]:
# Columns carried forward from each table. `label_test` is the same list
# without the target column 'Y'.
label = ['id', 'Y', 'age', 'education-num', 'race', 'sex', 'hours-per-week', 'native-country']
label_test = ['id', 'age', 'education-num', 'race', 'sex', 'hours-per-week', 'native-country']

# Independent copies restricted to the selected columns (same row index as
# the source tables).
train1 = train[label].copy()
test1 = test[label_test].copy()

# Stack the train rows on top of the test rows. `ignore_index` is not passed,
# so each frame keeps its own row labels, and rows coming from `test1` get
# NaN in 'Y'.
train2 = pd.concat([train1, test1], axis=0)

In [78]:
# Inspect the stacked train + test frame (bare expression -> rendered as cell output).
train2

Unnamed: 0,id,Y,age,education-num,race,sex,hours-per-week,native-country
0,5,<=50K,90,7,White,Male,40,United-States
1,10,<=50K,46,12,White,Male,55,United-States
2,11,>50K,37,9,White,Female,40,United-States
3,13,<=50K,45,14,Black,Female,40,United-States
4,14,>50K,40,12,White,Male,42,United-States
...,...,...,...,...,...,...,...,...
16276,32549,,59,13,White,Male,40,United-States
16277,32556,,21,10,Asian-Pac-Islander,Female,40,United-States
16278,32558,,59,10,White,Female,14,Cuba
16279,32559,,57,9,White,Male,40,United-States


In [79]:
# Count missing values per column of train2.
train2.isna().sum()

id                    0
Y                 16281
age                   0
education-num         0
race                  0
sex                   0
hours-per-week        0
native-country        0
dtype: int64

In [80]:
# Dummy variables: 0/1 indicators derived from the categorical columns.
# Vectorised comparisons replace the former per-row `.iat` loop; a row whose
# value is missing or different from the reference category gets 0, exactly
# as the loop did.
train2['white'] = train2['race'].eq('White').astype('int64')
train2['male'] = train2['sex'].eq('Male').astype('int64')
train2['US'] = train2['native-country'].eq('United-States').astype('int64')

# Numeric target: 1.0 for '<=50K' and 0.0 for '>50K'. Every other value,
# including the NaN 'Y' of the rows that came from `test1`, maps to NaN.
train2['Y_2'] = train2['Y'].map({'<=50K': 1.0, '>50K': 0.0})

# Drop the raw categorical columns now that their encoded versions exist.
train3 = train2.drop(['race', 'sex', 'native-country', 'Y'], axis=1)

In [81]:
# Inspect train3: train2 without the raw 'race', 'sex', 'native-country' and 'Y' columns.
train3

Unnamed: 0,id,age,education-num,hours-per-week,white,male,US,Y_2
0,5,90,7,40,1,1,1,1.0
1,10,46,12,55,1,1,1,1.0
2,11,37,9,40,1,0,1,0.0
3,13,45,14,40,0,0,1,1.0
4,14,40,12,42,1,1,1,0.0
...,...,...,...,...,...,...,...,...
16276,32549,59,13,40,1,1,1,
16277,32556,21,10,40,0,0,1,
16278,32558,59,10,14,1,0,0,
16279,32559,57,9,40,1,1,1,


#基礎分析

In [82]:
# Rows with a known target become the labelled set (train4); rows whose
# Y_2 is NaN become the set to predict (test4).
has_target = train2['Y_2'].notna()
train4 = train3[has_target]
test4 = train3[~has_target]

In [83]:
# Inspect train4, the rows of train3 whose Y_2 is not NaN.
train4

Unnamed: 0,id,age,education-num,hours-per-week,white,male,US,Y_2
0,5,90,7,40,1,1,1,1.0
1,10,46,12,55,1,1,1,1.0
2,11,37,9,40,1,0,1,0.0
3,13,45,14,40,0,0,1,1.0
4,14,40,12,42,1,1,1,0.0
...,...,...,...,...,...,...,...,...
16275,32552,42,10,40,1,1,1,1.0
16276,32553,59,9,60,1,1,1,1.0
16277,32554,42,11,40,1,0,1,1.0
16278,32555,19,10,15,1,0,1,1.0


In [84]:
# Summary statistics of the labelled rows.
train4.describe()

Unnamed: 0,id,age,education-num,hours-per-week,white,male,US,Y_2
count,16280.0,16280.0,16280.0,16280.0,16280.0,16280.0,16280.0,16280.0
mean,16286.900921,38.603808,10.089558,40.410012,0.852641,0.665602,0.894103,0.754791
std,9389.421853,13.672102,2.570911,12.311716,0.354474,0.471794,0.307715,0.430224
min,5.0,17.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,8169.5,28.0,9.0,40.0,1.0,0.0,1.0,1.0
50%,16267.5,37.0,10.0,40.0,1.0,1.0,1.0,1.0
75%,24458.25,48.0,12.0,45.0,1.0,1.0,1.0,1.0
max,32557.0,90.0,16.0,99.0,1.0,1.0,1.0,1.0


In [85]:
# Summary statistics of the unlabelled rows (Y_2 is NaN for all of them).
test4.describe()

Unnamed: 0,id,age,education-num,hours-per-week,white,male,US,Y_2
count,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0,16281.0,0.0
mean,16273.099502,38.559487,10.071801,40.464898,0.855906,0.672809,0.897611,
std,9410.240397,13.609075,2.574576,12.383353,0.351196,0.469202,0.303169,
min,0.0,17.0,1.0,1.0,0.0,0.0,0.0,
25%,8120.0,28.0,9.0,40.0,1.0,0.0,1.0,
50%,16293.0,37.0,10.0,40.0,1.0,1.0,1.0,
75%,24383.0,48.0,12.0,45.0,1.0,1.0,1.0,
max,32560.0,90.0,16.0,99.0,1.0,1.0,1.0,


#検証

In [86]:
# Hold out half of the labelled rows for validation. The split is stratified
# on the target and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train4.drop(columns='Y_2'),
    train4['Y_2'],
    test_size=0.5,
    stratify=train4['Y_2'],
    random_state=0,
)

In [87]:
# Candidate predictor sets, from the smallest to the full model (予測変数L).
# Column order is kept exactly as listed because it is the order the
# estimators are fitted with.
_demographics = ['white', 'male']
予測変数1 = ['age', 'US']
予測変数2 = ['age', *_demographics, 'US']
予測変数3 = ['age', *_demographics, 'hours-per-week', 'US']
予測変数L = ['age', 'education-num', *_demographics, 'hours-per-week', 'US']

In [88]:
def model_accuracy(モデル, 辞書):
  """Fit one estimator on each predictor set and record its hold-out accuracy.

  Args:
    モデル: estimator exposing ``fit`` and ``predict``; the same instance is
      refitted once per predictor set.
    辞書: dict updated in place. Keys are the positions (0-3) of the predictor
      sets in [予測変数1, 予測変数2, 予測変数3, 予測変数L]; values are accuracies.

  Reads the notebook-level X_train / y_train / X_test / y_test split.
  """
  feature_sets = [予測変数1, 予測変数2, 予測変数3, 予測変数L]
  for position, columns in enumerate(feature_sets):
    モデル.fit(X_train[columns], y_train)
    matrix = confusion_matrix(y_test, モデル.predict(X_test[columns]))
    # The diagonal cells hold the correctly classified samples of each class.
    辞書[position] = (matrix[0, 0] + matrix[1, 1]) / matrix.sum()


# One result dict per classifier, filled in place by model_accuracy.
accuracy_Logistic = {}
model_accuracy(LogisticRegression(), accuracy_Logistic)

accuracy_SVC = {}
model_accuracy(SVC(gamma=0.001, C=1), accuracy_SVC)

accuracy_DecisionTree = {}
model_accuracy(
    DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=0),
    accuracy_DecisionTree,
)

In [89]:
# Hold-out accuracy of LogisticRegression; index 0-3 = 予測変数1, 予測変数2, 予測変数3, 予測変数L.
pd.Series(accuracy_Logistic)

0    0.739558
1    0.732432
2    0.748034
3    0.798034
dtype: float64

In [90]:
# Hold-out accuracy of SVC; index 0-3 = 予測変数1, 予測変数2, 予測変数3, 予測変数L.
pd.Series(accuracy_SVC)

0    0.754791
1    0.754791
2    0.754668
3    0.797789
dtype: float64

In [91]:
# Hold-out accuracy of DecisionTreeClassifier; index 0-3 = 予測変数1, 予測変数2, 予測変数3, 予測変数L.
pd.Series(accuracy_DecisionTree)

0    0.754300
1    0.756511
2    0.758845
3    0.787961
dtype: float64

#投稿

In [92]:
# Final model for the submission. The 50% hold-out split (X_train / X_test)
# exists only to compare models; fitting the submitted model on X_train alone
# would throw away half of the labelled data. Refit on every labelled row
# (train4) using the full predictor set.
model = LogisticRegression()
model.fit(train4[予測変数L], train4['Y_2'])

# Predictions for the unlabelled rows (1.0 encodes '<=50K', 0.0 encodes '>50K').
model.predict(test4[予測変数L])

array([1., 1., 1., ..., 1., 1., 1.])

In [104]:
# Build the submission frame: one row per test id with its predicted label.
df1 = test4['id']

# Predict once and decode the numeric classes back to the original labels
# (1 was encoded from '<=50K'; anything else is treated as '>50K').
predicted = model.predict(test4[予測変数L])
list_df2 = ['<=50K' if value == 1 else '>50K' for value in predicted]

# pd.concat(axis=1) aligns rows by index label, not by position. Predictions
# come back in the row order of test4, so give them test4's index explicitly;
# a default 0..n-1 index would only line up if test4 happened to be labelled
# 0..n-1 as well.
df2 = pd.DataFrame(list_df2, index=df1.index)

df = pd.concat([df1, df2], axis=1)

In [105]:
# Inspect the submission frame: 'id' plus the predicted label (column 0).
df

Unnamed: 0,id,0
0,0,<=50K
1,1,<=50K
2,2,<=50K
3,3,>50K
4,4,>50K
...,...,...
16276,32549,>50K
16277,32556,<=50K
16278,32558,<=50K
16279,32559,<=50K


In [106]:
# Write the submission without header or index column, then trigger a
# browser download through the Colab helper.
submit_path = 'sample_submit_2.csv'
df.to_csv(submit_path, header=False, index=False)
files.download(submit_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

#出力

In [103]:
# Export the summary statistics of the labelled rows and download the file
# through the Colab helper.
stats_path = "BigData2_基本統計量.csv"
summary_stats = train4.describe()
summary_stats.to_csv(stats_path)
files.download(stats_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>