### How to handle missing values

#### 데이터가 없을 때 취할 수 있는 방법들
 - sample(row)을 drop
 - 결측이 많은 feature는 feature 자체를 drop
 - 최빈값, 평균값 등으로 비어있는 데이터를 채우기

In [63]:
import pandas as pd
import numpy as np

In [64]:
# Toy roster of five people: row 1 is entirely missing and row 2 lacks both
# test scores. The column order is passed explicitly to the constructor.
raw_data = {
    'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    'age': [42, np.nan, 36, 24, 73],
    'sex': ['m', np.nan, 'f', 'm', 'f'],
    'preTestScore': [4, np.nan, np.nan, 2, 3],
    'postTestScore': [25, np.nan, np.nan, 62, 70],
}
df = pd.DataFrame(
    raw_data,
    columns=[
        'first_name',
        'last_name',
        'age',
        'sex',
        'preTestScore',
        'postTestScore',
    ],
)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


#### Data Drop

In [65]:
df.isnull().sum()   # Number of missing values in each column

first_name       1
last_name        1
age              1
sex              1
preTestScore     2
postTestScore    2
dtype: int64

In [66]:
# dropna() with default arguments: drop every row that contains at least one missing value.
df_no_missing = df.dropna()
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [67]:
df_cleaned = df.dropna(how = 'all')    # Drop a row only when every value in it is missing
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [68]:
df_cleaned = df.dropna(thresh = 2, axis = 1)    # Column-wise: keep a column only if it has at least 2 non-missing values
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


#### Fill data

In [69]:
df.fillna(0)   # Replace every NaN with 0 (the result is a new frame; df is not reassigned)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,0,0,0.0,0,0.0,0.0
2,Tina,Ali,36.0,f,0.0,0.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [70]:
# Fill missing preTestScore values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df['preTestScore']: that chained in-place form
# acts on an intermediate Series and does not update df under pandas
# Copy-on-Write.
df['preTestScore'] = df['preTestScore'].fillna(df['preTestScore'].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,
2,Tina,Ali,36.0,f,3.0,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [71]:
# Fill missing postTestScore values with the mean postTestScore of the row's
# 'sex' group. transform('mean') returns a Series aligned to df's index, so it
# can be passed directly as the fill values.
# The result is assigned back to the column instead of using a chained
# fillna(..., inplace=True), which does not update df under pandas
# Copy-on-Write.
df['postTestScore'] = df['postTestScore'].fillna(
    df.groupby('sex')['postTestScore'].transform('mean')
)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,3.0,
2,Tina,Ali,36.0,f,3.0,70.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [72]:
df[df['age'].notnull() & df['sex'].notnull()]     # Select rows where both age and sex are non-null

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
2,Tina,Ali,36.0,f,3.0,70.0
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


In [73]:
df.groupby('sex')['postTestScore'].transform('mean')   # Mean postTestScore of each row's sex group, aligned to the original rows

0    43.5
1     NaN
2    70.0
3    43.5
4    70.0
Name: postTestScore, dtype: float64

### How to handle categorical data

#### One-Hot encoding
범주형 변수가 가질 수 있는 고유 값(category)의 개수만큼 binary feature를 생성(더미 처리)

In [74]:
# Small edge list: three integer columns plus one string-valued 'color' column.
edges = pd.DataFrame(
    {
        'source': [0, 1, 2],
        'target': [2, 2, 3],
        'weight': [3, 4, 5],
        'color': ['red', 'blue', 'green'],
    }
)

In [75]:
edges.dtypes    # Inspect column dtypes; only the object-typed column gets one-hot encoded by get_dummies here

color     object
source     int64
target     int64
weight     int64
dtype: object

In [76]:
pd.get_dummies(edges)   # One-hot encode using the pandas built-in function

Unnamed: 0,source,target,weight,color_blue,color_green,color_red
0,0,2,3,0,0,1
1,1,2,4,1,0,0
2,2,3,5,0,1,0


In [77]:
pd.get_dummies(edges['color'])    # One-hot encode just the 'color' Series; the result columns are the bare category values

Unnamed: 0,blue,green,red
0,0,0,1
1,1,0,0
2,0,1,0


In [78]:
pd.get_dummies(edges[['color']])    # Double brackets select a one-column DataFrame, so the dummy columns carry the 'color_' prefix

Unnamed: 0,color_blue,color_green,color_red
0,0,0,1
1,1,0,0
2,0,1,0


In [79]:
# Lookup table from numeric weight to a size label.
weight_dict = {3 : 'M', 4 : 'L', 5 : 'XL'}
edges['weight_sign'] = edges['weight'].map(weight_dict)    # Encode each weight as its label via Series.map
edges

Unnamed: 0,color,source,target,weight,weight_sign
0,red,0,2,3,M
1,blue,1,2,4,L
2,green,2,3,5,XL


In [80]:
# One-hot encode the 'weight_sign' labels: one indicator column per distinct label.
weight_sign = pd.get_dummies(edges['weight_sign'])
weight_sign

Unnamed: 0,L,M,XL
0,0,1,0
1,1,0,0
2,0,0,1


In [81]:
pd.concat([edges, weight_sign], axis =1)    # Place the indicator columns next to edges (column-wise concatenation)

Unnamed: 0,color,source,target,weight,weight_sign,L,M,XL
0,red,0,2,3,M,0,1,0
1,blue,1,2,4,L,1,0,0
2,green,2,3,5,XL,0,0,1


#### Data Binning
데이터의 구간을 나누는 것 (ordinal data로 변환)

In [88]:
# Twelve soldiers: three regiments of four, each split into a '1st' and a
# '2nd' company of two. The column order is passed explicitly.
raw_data = {
    'regiment': ['Nighthawks'] * 4 + ['Dragoons'] * 4 + ['Scouts'] * 4,
    'company': ['1st', '1st', '2nd', '2nd'] * 3,
    'name': [
        'Miller', 'Jacobson', 'Ali', 'Milner',
        'Cooze', 'Jacon', 'Ryaner', 'Sone',
        'Sloan', 'Piger', 'Riani', 'Ali',
    ],
    'preTestScore': [4, 24, 31, 2, 3, 4, 24, 31, 2, 3, 2, 3],
    'postTestScore': [25, 94, 57, 62, 70, 25, 94, 57, 62, 70, 62, 70],
}
df = pd.DataFrame(
    raw_data,
    columns=['regiment', 'company', 'name', 'preTestScore', 'postTestScore'],
)
df

Unnamed: 0,regiment,company,name,preTestScore,postTestScore
0,Nighthawks,1st,Miller,4,25
1,Nighthawks,1st,Jacobson,24,94
2,Nighthawks,2nd,Ali,31,57
3,Nighthawks,2nd,Milner,2,62
4,Dragoons,1st,Cooze,3,70
5,Dragoons,1st,Jacon,4,25
6,Dragoons,2nd,Ryaner,24,94
7,Dragoons,2nd,Sone,31,57
8,Scouts,1st,Sloan,2,62
9,Scouts,1st,Piger,3,70


In [89]:
bins = [0, 25, 50, 75, 100]    # Bin edges: five edges define four consecutive intervals
group_names = ['Low', 'Okay', 'Good', 'Great']    # Label for each interval, lowest to highest
categories = pd.cut(df['postTestScore'], bins, labels = group_names)    # Assign each score the label of the interval it falls in
categories

0       Low
1     Great
2      Good
3      Good
4      Good
5       Low
6     Great
7      Good
8      Good
9      Good
10     Good
11     Good
Name: postTestScore, dtype: category
Categories (4, object): [Low < Okay < Good < Great]

In [90]:
from sklearn import preprocessing

In [91]:
le = preprocessing.LabelEncoder()
le.fit(df['preTestScore'])    # Fit the encoder to the data
le.transform(df['preTestScore'])     # Encode using the fitted mapping
# fit and transform are separate steps so the same fitted mapping can be applied to both the train set and the test set

array([2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 0, 1])

In [106]:
# Convert the frame to a NumPy array.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values yields the same array representation and exists in old and new pandas.
raw_example = df.values
# Separate copy that later cells modify, leaving raw_example untouched.
data = raw_example.copy()

In [102]:
le.fit(raw_example[:, 0])    # Refit the label encoder on column 0 of the array
le.classes_    # The distinct classes the encoder learned

array(['Dragoons', 'Nighthawks', 'Scouts'], dtype=object)

In [107]:
data[:, 0] = le.transform(raw_example[:, 0])    # Overwrite column 0 of the copy with the encoded integer labels

In [109]:
one_hot_enc = preprocessing.OneHotEncoder()
one_hot_enc.fit(data[:, 0].reshape(-1, 1))    # Reshape to 2 dimensions (a single column) before fitting
onehotlabels = one_hot_enc.transform(data[:, 0].reshape(-1, 1)).toarray()    # Convert the transform result to a dense array
onehotlabels

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

### Feature Scaling

#### Min-Max normalization

In [110]:
# Two numeric columns on different scales plus one categorical column.
df = pd.DataFrame(
    {
        'A': [14.00, 90.20, 90.95, 96.27, 91.21],
        'B': [103.02, 107.26, 110.35, 114.23, 114.68],
        'C': ['big', 'small', 'big', 'small', 'small'],
    }
)
df

Unnamed: 0,A,B,C
0,14.0,103.02,big
1,90.2,107.26,small
2,90.95,110.35,big
3,96.27,114.23,small
4,91.21,114.68,small


In [111]:
df['A'] - df['A'].min()    # Shift column A so that its minimum becomes 0

0     0.00
1    76.20
2    76.95
3    82.27
4    77.21
Name: A, dtype: float64

In [112]:
(df['A'] - df['A'].min()) / (df['A'].max() - df['A'].min())    # Min-max scale column A into [0, 1]

0    0.000000
1    0.926219
2    0.935335
3    1.000000
4    0.938495
Name: A, dtype: float64

In [114]:
# Min-max scale A to [0, 1], then stretch by (5 - 1) and shift by 1 so the result spans [1, 5].
df['A'] = (df['A'] - df['A'].min()) / (df['A'].max() - df['A'].min()) * (5 - 1) + 1
df

Unnamed: 0,A,B,C
0,1.0,103.02,big
1,4.704874,107.26,small
2,4.741339,110.35,big
3,5.0,114.23,small
4,4.753981,114.68,small


In [115]:
df['B'] = (df['B'] - df['B'].mean()) / (df['B'].std())    # Z-score standardization of B: subtract the mean, divide by the standard deviation

In [116]:
df    # Display the frame after rescaling columns A and B

Unnamed: 0,A,B,C
0,1.0,-1.40525,big
1,4.704874,-0.54023,small
2,4.741339,0.090174,big
3,5.0,0.881749,small
4,4.753981,0.973556,small


In [119]:
# Load the first three columns of the wine data set from the given URL.
# header=None tells pandas the file has no header row, so the column names
# are assigned explicitly afterwards.
# Uses the public entry point pd.read_csv rather than the internal module
# path pd.io.parsers.read_csv.
df = pd.read_csv(
    'https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv',
    header=None,
    usecols=[0, 1, 2],
)

df.columns = ['Class label', 'Alcohol', 'Malic acid']

df.head()

Unnamed: 0,Class label,Alcohol,Malic acid
0,1,14.23,1.71
1,1,13.2,1.78
2,1,13.16,2.36
3,1,14.37,1.95
4,1,13.24,2.59


In [121]:
# Fit a MinMaxScaler on the two feature columns, then apply it to the same columns.
minmax_scale = preprocessing.MinMaxScaler().fit(df[['Alcohol', 'Malic acid']])
df_minmax = minmax_scale.transform(df[['Alcohol', 'Malic acid']])
df_minmax[:3]    # First three scaled rows

array([[0.84210526, 0.1916996 ],
       [0.57105263, 0.2055336 ],
       [0.56052632, 0.3201581 ]])

In [123]:
# Fit a StandardScaler on the two feature columns, then apply it to the same columns.
std_scaler = preprocessing.StandardScaler().fit(df[['Alcohol', 'Malic acid']])
df_std = std_scaler.transform(df[['Alcohol', 'Malic acid']])
df_std[:3]    # First three standardized rows

array([[ 1.51861254, -0.5622498 ],
       [ 0.24628963, -0.49941338],
       [ 0.19687903,  0.02123125]])