In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager, rc
%matplotlib inline

from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import VotingClassifier
from imblearn.under_sampling import *
from imblearn.combine import SMOTETomek
from sklearn.metrics import classification_report
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
human = pd.read_csv('human.csv', encoding = 'cp949')
human.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,성별,자본 이득,자본 손실,주당 시간,모국
0,H20001,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,H20002,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,H20003,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,H20004,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,H20005,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [3]:
human['성별'].value_counts()

 Male      21790
 Female    10771
Name: 성별, dtype: int64

In [4]:
#타입 바꾸기
obj = ['성별','노동 계급','학력','혼인 상태','직업','관계','인종','모국']
human[obj] = human[obj].apply(lambda x: x.astype('category').cat.codes)

In [5]:
human.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   아이디     32561 non-null  object
 1   나이      32561 non-null  int64 
 2   노동 계급   32561 non-null  int8  
 3   fnlwgt  32561 non-null  int64 
 4   학력      32561 non-null  int8  
 5   교육 수    32561 non-null  int64 
 6   혼인 상태   32561 non-null  int8  
 7   직업      32561 non-null  int8  
 8   관계      32561 non-null  int8  
 9   인종      32561 non-null  int8  
 10  성별      32561 non-null  int8  
 11  자본 이득   32561 non-null  int64 
 12  자본 손실   32561 non-null  int64 
 13  주당 시간   32561 non-null  int64 
 14  모국      32561 non-null  int8  
dtypes: int64(6), int8(8), object(1)
memory usage: 2.0+ MB


In [6]:
human.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,성별,자본 이득,자본 손실,주당 시간,모국
0,H20001,39,6,77516,9,13,4,0,1,4,1,2174,0,40,38
1,H20002,50,5,83311,9,13,2,3,0,4,1,0,0,13,38
2,H20003,38,3,215646,11,9,0,5,1,4,1,0,0,40,38
3,H20004,53,3,234721,1,7,2,5,0,2,1,0,0,40,38
4,H20005,28,3,338409,9,13,2,9,5,2,0,0,0,40,4


In [7]:
human=human.fillna('*')

In [8]:
human.corr()

Unnamed: 0,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,성별,자본 이득,자본 손실,주당 시간,모국
나이,1.0,0.003787,-0.076646,-0.010508,0.036527,-0.266288,-0.020947,-0.263698,0.028718,0.088832,0.077674,0.057775,0.068756,-0.001151
노동 계급,0.003787,1.0,-0.016656,0.023513,0.052085,-0.064731,0.254892,-0.090461,0.049742,0.095981,0.033835,0.012216,0.138962,-0.00769
fnlwgt,-0.076646,-0.016656,1.0,-0.028145,-0.043195,0.028153,0.001597,0.008931,-0.021291,0.026858,0.000432,-0.010252,-0.018768,-0.051966
학력,-0.010508,0.023513,-0.028145,1.0,0.359153,-0.038407,-0.02126,-0.010876,0.014131,-0.027356,0.030046,0.016746,0.05551,0.064288
교육 수,0.036527,0.052085,-0.043195,0.359153,1.0,-0.069304,0.109697,-0.094153,0.031838,0.01228,0.12263,0.079923,0.148123,0.05084
혼인 상태,-0.266288,-0.064731,0.028153,-0.038407,-0.069304,1.0,-0.009654,0.185451,-0.068013,-0.129314,-0.043393,-0.034187,-0.190519,-0.023819
직업,-0.020947,0.254892,0.001597,-0.02126,0.109697,-0.009654,1.0,-0.075607,0.006763,0.080296,0.025505,0.017987,0.080383,-0.012543
관계,-0.263698,-0.090461,0.008931,-0.010876,-0.094153,0.185451,-0.075607,1.0,-0.116055,-0.582454,-0.057919,-0.061062,-0.248974,-0.005507
인종,0.028718,0.049742,-0.021291,0.014131,0.031838,-0.068013,0.006763,-0.116055,1.0,0.087204,0.011145,0.018899,0.04191,0.137852
성별,0.088832,0.095981,0.026858,-0.027356,0.01228,-0.129314,0.080296,-0.582454,0.087204,1.0,0.04848,0.045567,0.229309,-0.008119


In [9]:
from sklearn.model_selection import train_test_split

x = human.drop(['아이디','성별'], axis=1)
y = human['성별']
#y = train.GENDER.values
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(24420, 13) (8141, 13) (24420,) (8141,)


##### 1. Decision Trees #####

In [10]:
tree = DecisionTreeClassifier(max_depth=6, random_state=0)

In [11]:
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [12]:
pred_tree = tree.predict(X_test); pred_tree

array([0, 0, 1, ..., 1, 1, 0], dtype=int8)

In [13]:
display(tree.score(X_test, y_test))

0.825451418744626

In [14]:
humanew =pd.read_csv('human_new.csv',encoding='cp949')
humanew.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,자본 이득,자본 손실,주당 시간,모국
0,H0001,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,40,United-States
1,H0002,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,50,United-States
2,H0003,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,40,United-States
3,H0004,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,7688,0,40,United-States
4,H0005,18,,103497,Some-college,10,Never-married,,Own-child,White,0,0,30,United-States


In [15]:
#타입 바꾸기
obj = ['노동 계급','학력','혼인 상태','직업','관계','인종','모국']
humanew[obj] = humanew[obj].apply(lambda x: x.astype('category').cat.codes)

In [16]:
humanew['SEX'] = tree.predict(humanew.loc[:,'나이':'모국'])

##### 2. SVM

In [17]:
from sklearn.svm import SVC 
svm = SVC(random_state=0)
svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=0, shrinking=True, tol=0.001,
    verbose=False)

In [18]:
display(svm.score(X_test, y_test))

0.6680997420464316

##### 3. Neural Networks

In [19]:
from sklearn.neural_network import MLPClassifier 
mlp = MLPClassifier()
mlp.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

In [20]:
display(mlp.score(X_test, y_test))

0.7230070015968554

In [21]:
from sklearn.dummy import DummyClassifier
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
pred_dummy = dummy.predict(X_test)
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=2).fit(X_train, y_train)
pred_tree = tree.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score
print("Dummy model:")
print(accuracy_score(y_test, pred_dummy))
print("Decision tree:")
print(accuracy_score(y_test, pred_tree))

Dummy model:
0.6682225770789829
Decision tree:
0.7750890553985997


In [23]:
from sklearn.metrics import confusion_matrix
print("Dummy model:")
print(confusion_matrix(y_test, pred_dummy))
print("Decision tree:")
print(confusion_matrix(y_test, pred_tree))

Dummy model:
[[   0 2701]
 [   0 5440]]
Decision tree:
[[1066 1635]
 [ 196 5244]]


In [24]:
logreg = LogisticRegression()
tree = DecisionTreeClassifier()
knn = KNeighborsClassifier()
voting = VotingClassifier(
    estimators = [('logreg', logreg), ('tree', tree), ('knn', knn)],
    voting = 'hard')
# hard -> voting, soft -> averaging
# soft 방법이 일반적인 보팅방법

In [25]:
from sklearn.metrics import accuracy_score
for clf in (logreg, tree, knn, voting) :
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, 
          accuracy_score(y_test, clf.predict(X_test)))

LogisticRegression 0.6759611841297138
DecisionTreeClassifier 0.805552143471318
KNeighborsClassifier 0.6567989190517135
VotingClassifier 0.7402039061540351


### Averaging predictions

In [26]:
averaging = VotingClassifier(
    estimators = [('logreg', logreg), ('tree', tree), ('knn', knn)],
    voting = 'soft')
averaging.fit(X_train, y_train)

VotingClassifier(estimators=[('logreg',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('tree',
                              DecisionTreeClassifier(ccp_alpha=0.0,
                                                     class_weight=None,
                                                     criterion='...
                                        

In [27]:
averaging.fit(X_train, y_train).score(X_test, y_test)

0.7944969905417025

In [28]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
svm = SVC()
mlp = MLPClassifier(alpha=1)

In [29]:
from mlxtend.classifier import StackingClassifier
import numpy as np
np.random.seed(1671)

stacking = StackingClassifier(classifiers=[tree, knn, svm, mlp], 
                              meta_classifier=logreg, # blender or meta-learner
                              use_probas=False,
                              average_probas=False)

for clf in (tree, knn, svm, mlp, stacking) :
    clf.fit(X_train, y_train)
    print(clf.__class__.__name__, accuracy_score(
        y_test, clf.predict(X_test)))

DecisionTreeClassifier 0.8069033288293821
KNeighborsClassifier 0.6567989190517135
SVC 0.6680997420464316
MLPClassifier 0.3467632968922737
StackingClassifier 0.8078860090897924


### Bagging

In [30]:
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(base_estimator=KNeighborsClassifier(), random_state=0, n_estimators=200)
bagging.fit(X_train, y_train).score(X_test, y_test)

0.663063505711829

### Boosting
- AdaBoost(Adaptive Boosting)
- Gradient Boosting

In [31]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=200, random_state=0)
ada.fit(X_train, y_train).score(X_test, y_test)

0.8415428080088441

In [32]:
from sklearn.ensemble import GradientBoostingClassifier

gbm = GradientBoostingClassifier(n_estimators=200, random_state=0)
gbm.fit(X_train, y_train).score(X_test, y_test)

0.8437538385947673

In [33]:
# 파라미터 서치
def bestGBDTNextModel(model, isKfold, nfold, searchCV, Xtrain, ytrain, Xtest, ytest, nIter, scoring, errScore, verbose, nJobs):
    # GridSearchCV을 위해 파라미터 값을 제한함.
    grd_prams = {}
    classifier = XGBClassifier(random_state=0, objective='binary:logistic')
    cv = KFold(n_splits=nfold, shuffle=True, random_state=0)
    
    if model == 'LGBM':
        # 그래디언트 부스팅 결정 트리(GBDT)    
        grd_prams.update({'max_depth': [50, 100],
              'learning_rate' : [0.01, 0.05],
              'num_leaves': [150, 200],
              'n_estimators': [300, 400],
              'num_boost_round':[4000, 5000],
              'subsample': [0.5, 1],
              'reg_alpha': [0.01, 0.1],
              'reg_lambda': [0.01, 0.1],
              'min_data_in_leaf': [20, 30],
              'lambda_l1': [0.01, 0.1],
              'lambda_l2': [0.01, 0.1]
            })
        
        #grd_prams.update({'max_depth': [50, 75, 90, 100],
        #      'learning_rate' : [0.01, 0.05, 0.07, 0.1],
        #      'num_leaves': [300,600,900,1200],
        #      'n_estimators': [100, 300, 500, 900],
        #      'num_boost_round':[1000, 2000, 3000, 4000],
        #      'num_leaves': [30, 60, 120, 150, 200],
        #      'reg_alpha': [0.01, 0.1, 0.5, 0.7, 1.0],
        #      'min_data_in_leaf': [50, 100, 300, 800],
        #      'lambda_l1': [0, 0.1, 0.5, 1.0],
        #      'lambda_l2': [0, 0.01, 1.0]})
        
        classifier = LGBMClassifier(random_state=0, boosting_type='gbdt', objective='binary', metric='auc')
        
    elif model == 'XGB':
        grd_prams.update({'n_estimators': [300, 500],
            'learning_rate': [0.001, 0.01],
            'subsample': [0.5, 1],
            'max_depth': [5, 6],
            'colsample_bytree': [0.97, 1.24],
            'min_child_weight': [1, 2],
            'gamma': [0.001, 0.005],
            'nthread': [3, 4],
            'reg_lambda': [0.5, 1.0],
            'reg_alpha': [0.01, 0.1]
          })
        
        #grd_prams.update({'n_estimators': [300, 500, 700],
        #    'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.09],
        #    'subsample': [0.5, 1],
        #    'max_depth': [4, 5, 6, 7, 8, 9, 10],
        #    'colsample_bytree': [0.52, 0.97, 1,55, 2.32, 3.46],
        #    'min_child_weight': [1, 2, 3, 4],
        #    'gamma': [0.001, 0.01, 0.1, 0, 1],
        #    'nthread': [3, 4, 5],
        #    'reg_lambda': [0.01, 0.1, 0.5, 0.7, 1.0],
        #    'reg_alpha': [0.01, 0.1, 0.5, 0.7, 1.0]
        #  })
    
    if isKfold == False:
        cv = StratifiedShuffleSplit(n_splits=nfold, test_size=0.2, random_state=0)
    
    grid_ = RandomizedSearchCV(classifier, param_distributions=grd_prams, n_iter=nIter, scoring=scoring, error_score=errScore, verbose=verbose, n_jobs=nJobs, cv=cv)

    # 속도 이슈
    if searchCV == 'GRID': 
        grid_ = GridSearchCV(classifier, param_grid=grd_prams, n_jobs=nJobs, scoring=scoring, verbose=verbose, cv=cv)
    
    grid_.fit(Xtrain, ytrain)
    score_ = grid_.score(Xtest, ytest)
    
    #best = {"best_param":grid_.best_params_, 
    #        "best_score":grid_.best_score_, 
    #        "best_estimator":grid_.best_estimator_,
    #        "test_score":score_
    #       }
    
    print("{} grid_.best_score {}".format(model, np.round(grid_.best_score_,3)))
    print("{} grid_.best_score {}".format(model, np.round(score_,3)))
    print("{} best_estimator {}".format(model, grid_.best_estimator_))

    return grid_.best_params_

In [34]:
# scoring="roc_auc|f1" => 0.755
# categories_indices = [X_train.columns.get_loc(col) for col in ['주구매코너']]
best_param1 = bestGBDTNextModel('XGB', False, 5, 'RANDOM', X_train, y_train, X_test, y_test, 15, 'roc_auc', 0, 3, -1)
xgb = XGBClassifier(**best_param1)
score_xgb = xgb.fit(X_train, y_train).score(X_test, y_test)
print("score_lgbm1 ::: {}".format(score_xgb))
print("-----------------------------------")
y_xgb = xgb.predict(X_test)
print(classification_report(y_test, y_xgb))

# 최고=0.755 Score LGBM best_estimator
#  LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
#         importance_type='split', lambda_l1=0.01, lambda_l2=0,
#         learning_rate=0.01, max_depth=50, metric='auc',
#         min_child_samples=20, min_child_weight=0.001, min_data_in_leaf=20,
#         min_split_gain=0.0, n_estimators=300, n_jobs=-1,
#         num_boost_round=4000, num_leaves=150, objective='binary',
#         random_state=0, reg_alpha=0.1, reg_lambda=0.0, silent=True,
#         subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  4.7min finished


XGB grid_.best_score 0.932
XGB grid_.best_score 0.928
XGB best_estimator XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.97, gamma=0.001, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=2, missing=nan, monotone_constraints=None,
              n_estimators=300, n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0.01,
              reg_lambda=1.0, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)
score_lgbm1 ::: 0.8374892519346517
-----------------------------------
              precision    recall  f1-score   support

           0       0.74      0.78      0.76      2701
           1       0.89      0.87      0.88      5440

    accuracy                           0.84      814

In [45]:
human_GBM = humanew.copy()
human_GBM.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,자본 이득,자본 손실,주당 시간,모국,SEX
0,H0001,25,3,226802,1,7,4,6,3,2,0,0,40,37,1
1,H0002,38,3,89814,11,9,2,4,0,4,0,0,50,37,1
2,H0003,28,1,336951,7,12,2,10,0,4,0,0,40,37,1
3,H0004,44,3,160323,15,10,2,6,0,2,7688,0,40,37,1
4,H0005,18,-1,103497,15,10,4,-1,3,4,0,0,30,37,0


In [46]:
new = pd.read_csv('human_new.csv',encoding='cp949')
new.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,자본 이득,자본 손실,주당 시간,모국
0,H0001,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,0,0,40,United-States
1,H0002,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,0,0,50,United-States
2,H0003,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,0,0,40,United-States
3,H0004,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,7688,0,40,United-States
4,H0005,18,,103497,Some-college,10,Never-married,,Own-child,White,0,0,30,United-States


In [47]:
obj = ['노동 계급', '학력', '혼인 상태', '직업', '관계', '인종', '모국']
new[obj] = new[obj].apply(lambda x: x.astype('category').cat.codes)

In [48]:
new.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,자본 이득,자본 손실,주당 시간,모국
0,H0001,25,3,226802,1,7,4,6,3,2,0,0,40,37
1,H0002,38,3,89814,11,9,2,4,0,4,0,0,50,37
2,H0003,28,1,336951,7,12,2,10,0,4,0,0,40,37
3,H0004,44,3,160323,15,10,2,6,0,2,7688,0,40,37
4,H0005,18,-1,103497,15,10,4,-1,3,4,0,0,30,37


In [49]:
new['SEX'] =gbm.predict(new.loc[:,'나이':'모국'])

In [50]:
new.head()

Unnamed: 0,아이디,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,자본 이득,자본 손실,주당 시간,모국,SEX
0,H0001,25,3,226802,1,7,4,6,3,2,0,0,40,37,1
1,H0002,38,3,89814,11,9,2,4,0,4,0,0,50,37,1
2,H0003,28,1,336951,7,12,2,10,0,4,0,0,40,37,1
3,H0004,44,3,160323,15,10,2,6,0,2,7688,0,40,37,1
4,H0005,18,-1,103497,15,10,4,-1,3,4,0,0,30,37,0


In [51]:
new.rename(columns={"아이디":"ID"}, inplace = True)

In [52]:
new.head()

Unnamed: 0,ID,나이,노동 계급,fnlwgt,학력,교육 수,혼인 상태,직업,관계,인종,자본 이득,자본 손실,주당 시간,모국,SEX
0,H0001,25,3,226802,1,7,4,6,3,2,0,0,40,37,1
1,H0002,38,3,89814,11,9,2,4,0,4,0,0,50,37,1
2,H0003,28,1,336951,7,12,2,10,0,4,0,0,40,37,1
3,H0004,44,3,160323,15,10,2,6,0,2,7688,0,40,37,1
4,H0005,18,-1,103497,15,10,4,-1,3,4,0,0,30,37,0


In [53]:
str = ['ID','SEX']
new[str].head()

Unnamed: 0,ID,SEX
0,H0001,1
1,H0002,1
2,H0003,1
3,H0004,1
4,H0005,0


In [54]:
new[str].to_csv('kaggle2_data.csv',index=False)