## 1. Understanding Data

In [1]:
import os
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings(action='ignore')

### Before Start
본격적으로 시작하기 전에 데이터에 대해서 아주 약간만 깊이 있게 이해해보는 시간을 가져보려고 합니다.

### Q. 디스크의 CSV파일의 용량은 그렇게 높진 않은데 메모리로 읽기만 하면 몇 배로 늘어나는 이유는?
캐글을 하시다 보면 이런 경험이 한번쯤은 다들 있으실 것 같습니다.   
분명히 CSV 파일로는 1GB 보다 아래였는데 판다스로 read를 하면 2~3GB로 늘어나는 경우가 종종 있는데, 이유가 무엇일까요?

### Load data

In [126]:
# Competition input directory; the trailing slash is kept because the file
# names are appended to it directly.
PATH = '../input/kakr-4th-competition/'
train, test = (pd.read_csv(f'{PATH}{split}.csv') for split in ('train', 'test'))

### 데이터 확인

* id
* age : 나이
* workclass : 고용 형태
* fnlwgt : 사람 대표성을 나타내는 가중치 (final weight의 약자)
* education : 교육 수준
* education_num : 교육 수준 수치
* marital_status: 결혼 상태
* occupation : 업종
* relationship : 가족 관계
* race : 인종
* sex : 성별
* capital_gain : 양도 소득
* capital_loss : 양도 손실
* hours_per_week : 주당 근무 시간
* native_country : 국적
* income : 수익 (예측해야 하는 값)
    * \>50K : 1
    * <=50K : 0

### Pandas 라이브러리
Pandas 라이브러리에서 많이 쓰이는 것들을 위주로 살펴보도록 하겠습니다.

loc, iloc, info, describe, value_counts, head, tail, sample, map, apply, groupby 등

In [None]:
train.shape

In [None]:
# Row-wise apply demo: adds each row's age and hours_per_week.
# The same values can be computed without apply as train['age'] + train['hours_per_week'].
train.apply(lambda x: x['age'] + x['hours_per_week'], axis=1)

In [None]:
## Pandas usage tips
train.describe()

In [None]:
train.capital_gain < 99999

In [None]:
train

In [None]:
# Encode the target as an integer: 1 for '>50K', 0 for anything else.
# The dtype guard keeps the cell idempotent; without it a second run would
# compare the already-encoded 0/1 values against '>50K' and zero out every label.
if not pd.api.types.is_numeric_dtype(train['income']):
    train['income'] = (train['income'] == '>50K').astype(int)

In [None]:
train.age.plot(kind='box')

## 2. Data Preprocessing

### 2.1 결측치 처리

In [None]:
# Rows that contain the '?' placeholder in any column.
# An element-wise comparison replaces the per-row Python lambda; equality
# against a string is simply False for the numeric columns.
train[train.eq('?').any(axis=1)]

In [None]:
train.occupation.value_counts()

In [None]:
train.workclass.value_counts()

### 결측치 처리
삭제? 대치?

In [None]:
# Rows where both workclass and occupation hold the '?' placeholder.
train.query("workclass == '?' and occupation == '?'")

In [None]:
# Re-label the '?' workclass as 'No' for people aged 30 or older (modifies train in place).
is_unknown_30_plus = train['workclass'].eq('?') & train['age'].ge(30)
train.loc[is_unknown_30_plus, 'workclass'] = 'No'

In [None]:
train.workclass.value_counts()

In [None]:
train.native_country.value_counts()

### Null check 관련 팁
데이터를 일일이 다 확인하지 않고 결측치가 있는지 확인하는 방법은?

In [None]:
train.info()

### 2.2 이상치 처리
수치형 데이터에 이상한 값이 없는지 한번 확인해보도록 하겠습니다.

In [None]:
train.describe()

In [None]:
# Rows with capital_gain below 99999, largest capital_gain first.
train[train['capital_gain'].lt(99999)].sort_values(by='capital_gain', ascending=False)

### log 표현

In [None]:
# Natural log of capital_gain, with zeros mapped to 0.
# np.log(x, where=(x != 0)) without an `out` buffer leaves the skipped entries
# uninitialized, so the zero rows previously received arbitrary values.
# Replacing the zeros with 1 first makes them come out as log(1) = 0.
train["log_capital_gain"] = np.log(train["capital_gain"].replace(0, 1))

In [None]:
train[['capital_gain', 'log_capital_gain']].describe()

In [None]:
# Summary of the log-transformed capital_gain (zeros map to 0).
# Zeros are replaced by 1 before the log because np.log(x, where=(x != 0))
# without an `out` buffer returns uninitialized values for the skipped entries.
np.log(train['capital_gain'].replace(0, 1)).describe()

### 2.3 Scaling

Min-max Scaler: 
범위가 정해진 값이 필요할 때  
아웃라이어에 민감함
  
Standard Scaler: 
평균을 0, 표준편차를 1로 맞추어 정규분포의 특성을 가지도록 만듦  
아웃라이어에 영향을 덜 받음

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

mm_scaler = MinMaxScaler()
st_scaler = StandardScaler()

In [None]:
# Scale fnlwgt and age with both scalers. Each scaler is fit on train only and
# then applied to test, so test statistics never influence the transformation.
# The loop replaces four copy-pasted stanzas; the resulting column names and the
# order in which the scalers are (re)fit are unchanged.
for prefix, scaler in (('MM', mm_scaler), ('ST', st_scaler)):
    for col in ('fnlwgt', 'age'):
        # ravel() turns the (n, 1) scaler output into the 1-D array a single column expects.
        train[f'{prefix}_{col}'] = scaler.fit_transform(train[[col]].to_numpy()).ravel()
        test[f'{prefix}_{col}'] = scaler.transform(test[[col]].to_numpy()).ravel()

In [None]:
# Summary statistics of the min-max scaled columns only.
train[['MM_fnlwgt', 'MM_age']].describe()

In [None]:
# Summary statistics of the standardized columns only, rounded to 6 decimals.
train[['ST_fnlwgt', 'ST_age']].describe().round(6)

## 3. Feature Engineering

### 3.1 변수의 종류
변수 종류에 따라서 어떤 차이점이 있는지 살펴봅시다.

In [None]:
train

### 3.2 인코딩
알고리즘이 이해하기 어려운 Feature들을 어떻게 처리할 지 알아봅시다

In [None]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
train

### Onehot encoder

In [None]:
# One-hot encode workclass; the values are passed as a single-column 2-D array.
oe = OneHotEncoder()
oe_result = oe.fit_transform(train['workclass'].to_numpy()[:, np.newaxis])

In [None]:
# Names of the generated one-hot columns.
# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2, so
# get_feature_names_out is preferred and the old method is only a fallback.
(oe.get_feature_names_out if hasattr(oe, 'get_feature_names_out') else oe.get_feature_names)(['workclass'])

In [120]:
oe_result

<26049x10 sparse matrix of type '<class 'numpy.float64'>'
	with 26049 stored elements in Compressed Sparse Row format>

In [123]:
# Dense frame of the one-hot columns, aligned to train's index so that a
# column-wise pd.concat with train pairs rows correctly even if train has been
# filtered or re-indexed.
# get_feature_names was removed in scikit-learn 1.2, so get_feature_names_out is
# preferred and the old method is only a fallback for older releases.
_feature_names = getattr(oe, 'get_feature_names_out', None) or oe.get_feature_names
sub = pd.DataFrame(data=oe_result.toarray(), columns=_feature_names(['workclass']), index=train.index)

In [125]:
pd.concat([train, sub], axis=1)

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,...,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_No,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26044,26044,57,Private,250201,11th,7,Married-civ-spouse,Other-service,Husband,White,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26045,26045,23,Private,238092,Bachelors,13,Never-married,Prof-specialty,Own-child,White,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
26046,26046,78,No,165694,Masters,14,Widowed,?,Not-in-family,White,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
26047,26047,26,Self-emp-not-inc,151626,HS-grad,9,Never-married,Prof-specialty,Own-child,Black,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


### Label encoder

In [129]:
# LabelEncoder works on a 1-D array of labels, so the column is passed directly.
# The previous (n, 1) reshape only worked via an implicit conversion whose
# warning is hidden by the notebook's global warnings filter.
le = LabelEncoder()
le.fit_transform(train['workclass'])

array([4, 4, 4, ..., 0, 6, 0])

In [135]:
# Map each distinct workclass value to an integer code in order of first appearance.
# enumerate sizes the codes to the data; the previous hard-coded 0-9 list would
# silently drop any category beyond the tenth because zip stops at the shorter input.
workclass_to_num = {label: code for code, label in enumerate(train['workclass'].unique())}

In [136]:
train['workclass'].map(workclass_to_num)

0        0
1        0
2        0
3        0
4        0
        ..
26044    0
26045    0
26046    2
26047    3
26048    2
Name: workclass, Length: 26049, dtype: int64

In [130]:
train['workclass'].unique()

array(['Private', 'State-gov', '?', 'Self-emp-not-inc', 'Local-gov',
       'Federal-gov', 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

사이킷런 인코딩은 불편한점이 너무 많아서.. 직접 만드는 게 나을 수도 있습니다.

### (Target) Mean Encoding

In [None]:
train

In [137]:
# Income labels split by sex (every row of each group, not only the positive class).
male_positive = train.loc[train['sex'].eq('Male'), 'income']
female_positive = train.loc[train['sex'].eq('Female'), 'income']

In [144]:
# Share of males in the positive (>50K) class, i.e. the target mean for sex == 'Male'.
# The label is matched explicitly (raw '>50K' or encoded 1) instead of taking
# .iloc[1] of value_counts(), which depends on frequency ordering and would
# return the <=50K share if the positive class were the majority.
male_positive.isin(['>50K', 1]).mean()

0.3061434618464707

In [141]:
# Per-label share of the female rows: label counts divided by the group size.
female_positive.value_counts().div(len(female_positive))

<=50K    0.888759
>50K     0.111241
Name: income, dtype: float64

### onehot 인코딩 일괄적으로 하는 간단한 방법~

In [127]:
dummied = pd.get_dummies(train)

In [128]:
dummied

Unnamed: 0,id,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,workclass_?,workclass_Federal-gov,workclass_Local-gov,...,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia,income_<=50K,income_>50K
0,0,40,168538,9,0,0,60,0,0,0,...,0,0,0,0,0,1,0,0,0,1
1,1,17,101626,5,0,0,20,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,2,18,353358,10,0,0,16,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,3,21,151158,10,0,0,25,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,4,24,122234,10,0,0,20,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26044,26044,57,250201,7,0,0,52,0,0,0,...,0,0,0,0,0,1,0,0,1,0
26045,26045,23,238092,13,0,0,40,0,0,0,...,0,0,0,0,0,1,0,0,1,0
26046,26046,78,165694,14,0,0,15,1,0,0,...,0,0,0,0,0,1,0,0,1,0
26047,26047,26,151626,9,0,0,40,0,0,0,...,0,0,0,0,0,1,0,0,1,0


### 3.3 PCA

In [145]:
from sklearn.decomposition import PCA
pca = PCA(n_components=60, svd_solver='full')

In [146]:
# Keep only predictors: drop the row id and every form of the target
# ('income' when it was already encoded to 0/1, or its two dummy columns when it
# was still a string at get_dummies time), so the label cannot leak into PCA.
# errors='ignore' makes the cell safe to re-run and independent of which form exists.
dummied = dummied.drop(columns=['id', 'income', 'income_<=50K', 'income_>50K'], errors='ignore')

In [148]:
X_train_std = st_scaler.fit_transform(dummied)

In [153]:
# Standardize the dummy-encoded features (this refits st_scaler on `dummied`),
# then fit PCA on the standardized matrix and keep its projection.
X_train_std = st_scaler.fit_transform(dummied)
X_train_pca = pca.fit_transform(X_train_std)

In [155]:
X_train_pca.shape

(26049, 60)

In [161]:
# Shape of the standardized feature matrix that was fed to PCA.
# X_train_2 is only created in a later cell, so referencing it here fails on a
# fresh top-to-bottom run; it is X_train_std minus its column means and
# therefore has the same shape.
X_train_std.shape

(26049, 107)

In [164]:
pca.explained_variance_ratio_

array([0.0423312 , 0.02803306, 0.02437229, 0.02290246, 0.02174415,
       0.01786761, 0.01638822, 0.01516837, 0.01409289, 0.01337622,
       0.01293731, 0.0123117 , 0.01209563, 0.01166806, 0.01151036,
       0.01129384, 0.01126632, 0.01100187, 0.01096332, 0.01076095,
       0.01062694, 0.01055432, 0.0104432 , 0.0103509 , 0.01021698,
       0.01020578, 0.01017242, 0.01013916, 0.01005222, 0.01001369,
       0.00991158, 0.00987461, 0.00982822, 0.00977661, 0.00976044,
       0.00968834, 0.00967712, 0.00963459, 0.00959399, 0.00956318,
       0.00949365, 0.00945525, 0.00944413, 0.00942466, 0.00940088,
       0.00938556, 0.00937015, 0.00936648, 0.00936014, 0.00935912,
       0.00935466, 0.0093523 , 0.00934787, 0.00934071, 0.00933429,
       0.0093288 , 0.00931446, 0.00929649, 0.00928916, 0.0092711 ])

In [159]:
# Re-derive the PCA projection by hand: subtract the column means, then
# multiply by the transposed component matrix.
X_train_2 = np.subtract(X_train_std, X_train_std.mean(axis=0))

res = X_train_2.dot(pca.components_.T)
res

array([[ 2.15104545,  1.10863836, -0.89692251, ...,  0.0590396 ,
         0.06308399, -0.36166509],
       [-1.82588016,  3.1441548 , -1.52213373, ..., -0.32314159,
        -0.18343577, -0.44857784],
       [-2.1404661 ,  1.32485889, -1.88436268, ..., -0.18708453,
         0.09141367,  0.09783785],
       ...,
       [-1.6781281 , -3.24235079,  1.50911091, ..., -0.27485239,
         0.2678334 ,  0.13183017],
       [-2.46116536, -0.67196355,  0.64926634, ...,  0.03788511,
        -0.27461167, -0.20737092],
       [-3.17066038, -0.44892892, -0.66250624, ..., -0.08734472,
        -0.0287194 , -0.27267377]])

In [162]:
np.allclose(X_train_pca, res)

True

## 4. Before Modeling
시간이 남는다면..