### How to handle missing values

#### 데이터가 없을 때 취할 수 있는 방법들
 - sample(row)을 drop
 - 결측이 많은 feature는 feature 자체를 drop
 - 최빈값, 평균값 등으로 비어있는 데이터를 채우기

In [3]:
import pandas as pd
import numpy as np

In [5]:
# Small example roster used throughout the missing-value examples.
# Row 1 is missing in every column; row 2 is missing only the two test scores.
raw_data = dict(
    first_name=['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    age=[42, np.nan, 36, 24, 73],
    sex=['m', np.nan, 'f', 'm', 'f'],
    preTestScore=[4, np.nan, np.nan, 2, 3],
    postTestScore=[25, np.nan, np.nan, 62, 70],
)
# The dict keys are already listed in the intended column order.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


#### Data Drop

In [7]:
# Count the missing values in each column.
df.isna().sum()

first_name       1
last_name        1
age              1
sex              1
preTestScore     2
postTestScore    2
dtype: int64

In [8]:
# Drop every row that has at least one missing value (the dropna defaults, spelled out).
df_no_missing = df.dropna(axis=0, how='any')
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [9]:
# Drop only the rows in which every column is missing.
df_cleaned = df.dropna(axis=0, how='all')
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [10]:
df_cleaned = df.dropna(thresh = 2, axis = 1)    # column-wise (axis=1): keep only columns with at least 2 non-missing values; drop the rest
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


#### Fill data

In [12]:
# Return a copy with every NaN replaced by 0 (df itself is not modified).
df.fillna(value=0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,0,0,0.0,0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [13]:
# Fill missing preTestScore values with the column mean and store the result in df.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on df['preTestScore']: that chained in-place call works on
# an intermediate Series, emits a chained-assignment warning in recent pandas, and
# leaves df unchanged when Copy-on-Write is enabled.
df['preTestScore'] = df['preTestScore'].fillna(df['preTestScore'].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,
2,Tina,Ali,36.0,f,3.0,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [14]:
# Fill missing postTestScore values with the mean postTestScore of the same sex.
# transform('mean') yields a Series aligned to df's index, so fillna can use it
# row by row; rows whose sex is missing get NaN from the transform and stay NaN.
# The result is assigned back to the column rather than using
# fillna(inplace=True) on df['postTestScore'], which acts on an intermediate
# Series and does not update df when Copy-on-Write is enabled.
df['postTestScore'] = df['postTestScore'].fillna(
    df.groupby('sex')['postTestScore'].transform('mean')
)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,
2,Tina,Ali,36.0,f,3.0,70.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [15]:
# Keep the rows where both age and sex are present.
df.loc[df[['age', 'sex']].notna().all(axis=1)]

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,3.0,70.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [18]:
# Mean postTestScore of each row's sex group, aligned to df's index.
df['postTestScore'].groupby(df['sex']).transform('mean')

0    43.5
1     NaN
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

### How to handle categorical data

#### One-Hot encoding
범주형 변수가 가질 수 있는 값(category)의 개수만큼 binary feature를 생성(더미 처리)

In [19]:
# Small edge list with one categorical column ('color') for the encoding examples.
edges = pd.DataFrame(
    dict(
        source=[0, 1, 2],
        target=[2, 2, 3],
        weight=[3, 4, 5],
        color=['red', 'blue', 'green'],
    )
)

0    0
1    1
2    2
Name: source, dtype: int64

In [28]:
edges.dtypes    # inspect column dtypes: get_dummies(edges) one-hot encodes only the non-numeric (object) column

color     object
source     int64
target     int64
weight     int64
dtype: object

In [29]:
pd.get_dummies(edges)   # one-hot encode using the pandas built-in function

Unnamed: 0,source,target,weight,color_blue,color_green,color_red
0,0,2,3,0,0,1
1,1,2,4,1,0,0
2,2,3,5,0,1,0


In [30]:
# Passing a single column (a Series) names the dummy columns after the values only, with no 'color_' prefix.
pd.get_dummies(edges['color'])

Unnamed: 0,blue,green,red
0,0,0,1
1,1,0,0
2,0,1,0


In [31]:
pd.get_dummies(edges[['color']])    # double brackets select a one-column DataFrame, so the dummy columns get the 'color_' prefix

Unnamed: 0,color_blue,color_green,color_red
0,0,0,1
1,1,0,0
2,0,1,0


In [33]:
# Lookup table from numeric weight to a size label.
weight_dict = {
    3: 'M',
    4: 'L',
    5: 'XL',
}
# Series.map applies the lookup element-wise to produce the encoded column.
size_labels = edges['weight'].map(weight_dict)
edges['weight_sign'] = size_labels
edges

Unnamed: 0,color,source,target,weight,weight_sign
0,red,0,2,3,M
1,blue,1,2,4,L
2,green,2,3,5,XL


In [34]:
# One-hot encode the mapped size labels: one indicator column per distinct label.
weight_sign = pd.get_dummies(edges['weight_sign'])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [36]:
# Append the indicator columns to the right of the original frame (column-wise concat).
pd.concat([edges, weight_sign], axis='columns')

Unnamed: 0,color,source,target,weight,weight_sign,L,M,XL
0,red,0,2,3,M,0,1,0
1,blue,1,2,4,L,1,0,0
2,green,2,3,5,XL,0,0,1


#### Data Binning
데이터의 구간을 나누는 것 (ordinal data로 변환)

In [50]:
# Bin edges: with pd.cut's defaults these define the intervals (0, 3] and (3, 5].
# The left edge is exclusive, so a score of exactly 0 would fall outside every bin.
bins = [0, 3, 5]
# One label per interval, in the same order as the intervals.
group_names = ['Low', 'High']
# Convert the numeric score into an ordered category.
df['categories'] = pd.cut(df['preTestScore'], bins=bins, labels=group_names)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,categories
0,Jason,Miller,42.0,m,4.0,25.0,High
1,,,,,3.0,,Low
2,Tina,Ali,36.0,f,3.0,70.0,Low
3,Jake,Milner,24.0,m,2.0,62.0,Low
4,Amy,Cooze,73.0,f,3.0,70.0,Low


In [52]:
from sklearn import preprocessing

In [56]:
# Label-encode preTestScore: each distinct value is replaced by its rank among
# the sorted unique values (2.0 -> 0, 3.0 -> 1, 4.0 -> 2 for the data above).
le = preprocessing.LabelEncoder()
# fit_transform learns the classes and encodes the same data in one call;
# `le` stays fitted afterwards, as with a separate fit() then transform().
le.fit_transform(df['preTestScore'])

array([2, 1, 1, 0, 1])

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,categories
0,Jason,Miller,42.0,m,4.0,25.0,High
1,,,,,3.0,,Low
2,Tina,Ali,36.0,f,3.0,70.0,Low
3,Jake,Milner,24.0,m,2.0,62.0,Low
4,Amy,Cooze,73.0,f,3.0,70.0,Low
