In [28]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

## 결측값 처리

In [29]:
df = pd.DataFrame(np.random.randn(5, 3),
                 columns = ['C1', 'C2', 'C3'])

df.iloc[0, 0] = None
df.loc[1, ['C1', 'C3']] = np.nan
df.loc[2, 'C2'] = np.nan
df.loc[3, 'C2'] = np.nan
df.loc[4, 'C3'] = np.nan
df

Unnamed: 0,C1,C2,C3
0,,0.356155,-0.705823
1,,0.95973,
2,0.06632,,-0.223484
3,-1.827293,,-0.558154
4,1.236629,0.164454,


#### 1. 결측값을 특정 값으로 채우기

In [30]:
df_0 = df.fillna(0)
df_0

Unnamed: 0,C1,C2,C3
0,0.0,0.356155,-0.705823
1,0.0,0.95973,0.0
2,0.06632,0.0,-0.223484
3,-1.827293,0.0,-0.558154
4,1.236629,0.164454,0.0


In [31]:
df_missing = df.fillna('missing')
df_missing

Unnamed: 0,C1,C2,C3
0,missing,0.356155,-0.705823
1,missing,0.95973,missing
2,0.06632,missing,-0.223484
3,-1.82729,missing,-0.558154
4,1.23663,0.164454,missing


#### 2. 결측값을 앞 방향 혹은 뒷방향으로 채우기

In [32]:
# 앞방향으로 채우기 1
df.fillna(method = 'ffill')

Unnamed: 0,C1,C2,C3
0,,0.356155,-0.705823
1,,0.95973,-0.705823
2,0.06632,0.95973,-0.223484
3,-1.827293,0.95973,-0.558154
4,1.236629,0.164454,-0.558154


In [33]:
# 앞방향으로 채우기 2
df.fillna(method = 'pad')

Unnamed: 0,C1,C2,C3
0,,0.356155,-0.705823
1,,0.95973,-0.705823
2,0.06632,0.95973,-0.223484
3,-1.827293,0.95973,-0.558154
4,1.236629,0.164454,-0.558154


In [34]:
# 뒷방향으로 채우기 1
df.fillna(method = 'bfill')

Unnamed: 0,C1,C2,C3
0,0.06632,0.356155,-0.705823
1,0.06632,0.95973,-0.223484
2,0.06632,0.164454,-0.223484
3,-1.827293,0.164454,-0.558154
4,1.236629,0.164454,


In [35]:
# 뒷방향으로 채우기 2
df.fillna(method = 'backfill')

Unnamed: 0,C1,C2,C3
0,0.06632,0.356155,-0.705823
1,0.06632,0.95973,-0.223484
2,0.06632,0.164454,-0.223484
3,-1.827293,0.164454,-0.558154
4,1.236629,0.164454,


#### 2-1. 앞/뒤 방향으로 결측값 채우는 횟수를 제한하기

In [36]:
# 앞 방향이나 뒷 방향으로 채워나갈 때 fillna(limit=1)를 
# 사용해서 결측값 채우는 '개수'를 '1'개로 한정해 보겠습니다.  
# 시계열 데이터 분석할 때 유용하게 사용하는 기능 중의 하나입니다.

df.fillna(method = 'ffill', limit = 1)

Unnamed: 0,C1,C2,C3
0,,0.356155,-0.705823
1,,0.95973,-0.705823
2,0.06632,0.95973,-0.223484
3,-1.827293,,-0.558154
4,1.236629,0.164454,-0.558154


In [37]:
df.fillna(method = 'bfill', limit = 1)

Unnamed: 0,C1,C2,C3
0,,0.356155,-0.705823
1,0.06632,0.95973,-0.223484
2,0.06632,,-0.223484
3,-1.827293,0.164454,-0.558154
4,1.236629,0.164454,


#### 1-3. 결측값을 변수별 평균값으로 대체하기

In [38]:
df.fillna(df.mean(axis = 0))

Unnamed: 0,C1,C2,C3
0,-0.174781,0.356155,-0.705823
1,-0.174781,0.95973,-0.49582
2,0.06632,0.493446,-0.223484
3,-1.827293,0.493446,-0.558154
4,1.236629,0.164454,-0.49582


#### 1-4. 결측값을 다른 변수의 값으로 대체하기

In [39]:
df['C2_new'] = np.where(pd.notnull(df['C2']) == True, df['C2'], df['C1'])
df

Unnamed: 0,C1,C2,C3,C2_new
0,,0.356155,-0.705823,0.356155
1,,0.95973,,0.95973
2,0.06632,,-0.223484,0.06632
3,-1.827293,,-0.558154,-1.827293
4,1.236629,0.164454,,0.164454


#### 이 외에 방식
- interpolate(시계열 보간법) <br>
- EDA를 통해 판단 (왜 결측되었는지, 0으로 측정된건 아닌지 등)<br>
- 지도, 비지도 학습 (학습을 통해 결측치처리)

## 이상치 처리

In [78]:
data = pd.read_csv('./[Dataset] 작업형 제2유형/X_train.csv',
            encoding = 'cp949')

In [79]:
# IQR 기반 예제 코드
def outliers_iqr(ys):
    quartile_1, quartile_3 = np.percentile(ys, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((ys > upper_bound) | (ys < lower_bound))

In [80]:
outlier_index = outliers_iqr(data['총구매액'])[0]
print('이상치 제거 전 데이터 수: ',len(data))
data.drop(outlier_index, axis = 0, inplace = True)
print('이상치 제거 후 데이터 수: ',len(data))

이상치 제거 전 데이터 수:  3500
이상치 제거 후 데이터 수:  3144


## 데이터 분포 변환

In [81]:
data = pd.read_csv('./[Dataset] 작업형 제2유형/X_train.csv',
            encoding = 'cp949')

In [82]:
numerical_feats = data.dtypes[data.dtypes != 'object'].index

In [83]:
for col in numerical_feats:
    print('{:15}'.format(col),
         'Skewness:{:.2f}'.format(data[col].skew()),
         'Kurtosis:{:.2f}'.format(data[col].kurt()))

cust_id         Skewness:0.00 Kurtosis:-1.20
총구매액            Skewness:4.17 Kurtosis:28.14
최대구매액           Skewness:6.11 Kurtosis:80.48
환불금액            Skewness:4.92 Kurtosis:35.53
내점일수            Skewness:2.82 Kurtosis:11.80
내점당구매건수         Skewness:2.95 Kurtosis:15.26
주말방문비율          Skewness:0.94 Kurtosis:0.14
구매주기            Skewness:2.10 Kurtosis:5.55


In [84]:
data['log_최대구매액'] = np.log1p(data['최대구매액'])
print('Skewness:{:.2f}'.format(data['log_최대구매액'].skew()),
         'Kurtosis:{:.2f}'.format(data['log_최대구매액'].kurt()))

Skewness:-0.85 Kurtosis:0.73


## 데이터 스케일링

#### 스케일러 종류
- StandardScaler: 기본 스케일, 평균과 표준편차 사용 <br>
- MinMaxScaler: 최대/최소값이 각각 1, 0이 되도록 스케일링 <br>
- MaxAbsScaler: 최대 절대값과 0이 각각 1, 0이 되도록 스케일링 <br>
- RobustScaler: 중앙값과 IQR 사용, 아웃라이어 영향을 최소화 <br>

In [96]:
data = pd.read_csv('./[Dataset] 작업형 제2유형/X_train.csv',
            encoding = 'cp949')

In [97]:
numerical_feats = data.dtypes[data.dtypes != 'object'].index

#### 1. StandardScaler
- 평균을 제거하고 데이터를 단위 분산으로 조정
- 이상치가 있다면 영향을 미쳐 변환된 데이터의 확산은 매우 달라짐

In [99]:
from sklearn.preprocessing import StandardScaler
standardscaler = StandardScaler()
data[numerical_feats] = standardscaler.fit_transform(data[numerical_feats])
data.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,-1.731556,-0.14458,-0.262608,-0.36291,기타,강남점,-0.009338,0.554247,0.758623,-0.159962
1,-1.730566,-0.54919,-0.547967,-0.501176,스포츠,잠실점,-0.635003,-0.698168,-1.06053,-0.806554
2,-1.729577,-0.5427,-0.563504,,남성 캐주얼,관악점,-0.635003,-0.436675,-1.06053,-0.806554
3,-1.728587,-0.463911,-0.460465,,기타,광주점,-0.046142,-0.204236,0.037746,-0.200374
4,-1.727597,-0.384561,0.135544,,보석,본 점,-0.635003,-0.698168,-1.06053,2.588052


#### 2. MinMaxScaler
- 모든 feature 값이 0~1사이에 있도록 데이터를 재조정
- 다만 이상치가 있는 경우 변환된 값이 매우 좁은 범위로 압출될 수 있음

In [104]:
from sklearn.preprocessing import MinMaxScaler
minmaxscaler = MinMaxScaler()
data[numerical_feats] = minmaxscaler.fit_transform(data[numerical_feats])
data.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0.0,0.05081,0.02009,0.012159,기타,강남점,0.06338,0.1373,0.527027,0.10241
1,0.000286,0.022966,0.007226,0.000522,스포츠,잠실점,0.003521,0.023715,0.0,0.006024
2,0.000572,0.023412,0.006526,,남성 캐주얼,관악점,0.003521,0.047431,0.0,0.006024
3,0.000857,0.028834,0.011171,,기타,광주점,0.059859,0.068511,0.318182,0.096386
4,0.001143,0.034295,0.038037,,보석,본 점,0.003521,0.023715,0.0,0.512048


#### 3. MaxAbsScaler
- 절대값이 0~1 사이에 매핑되도도록 함
- 즉 -1 ~ 1 사이로 재조정
- 양수 데이터로만 구성된 특징 데이터셋에는 MinMax와 유사하게 동작
- 이상치에 민감

In [106]:
from sklearn.preprocessing import MaxAbsScaler
maxabsscaler = MaxAbsScaler()
data[numerical_feats] = maxabsscaler.fit_transform(data[numerical_feats])
data.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,0.0,0.05081,0.02009,0.012159,기타,강남점,0.06338,0.1373,0.527027,0.10241
1,0.000286,0.022966,0.007226,0.000522,스포츠,잠실점,0.003521,0.023715,0.0,0.006024
2,0.000572,0.023412,0.006526,,남성 캐주얼,관악점,0.003521,0.047431,0.0,0.006024
3,0.000857,0.028834,0.011171,,기타,광주점,0.059859,0.068511,0.318182,0.096386
4,0.001143,0.034295,0.038037,,보석,본 점,0.003521,0.023715,0.0,0.512048


#### 4. RobustScaler
- 아웃라이어의 영향을 최소화한 기법
- 중앙값과 IQR을 사용하기 때문에 표준화 후 동일한 값을 더 넓게 분포시킴

In [107]:
from sklearn.preprocessing import RobustScaler
robustscaler = RobustScaler()
data[numerical_feats] = robustscaler.fit_transform(data[numerical_feats])
data.head()

Unnamed: 0,cust_id,총구매액,최대구매액,환불금액,주구매상품,주구매지점,내점일수,내점당구매건수,주말방문비율,구매주기
0,-1.0,0.393669,0.071039,-0.024336,기타,강남점,0.478261,0.913992,0.641745,0.166667
1,-0.999428,-0.256353,-0.383373,-0.324413,스포츠,잠실점,-0.26087,-0.487805,-0.608056,-0.5
2,-0.998857,-0.245927,-0.408114,,남성 캐주얼,관악점,-0.26087,-0.195122,-0.608056,-0.5
3,-0.998285,-0.119349,-0.244032,,기타,광주점,0.434783,0.065041,0.146486,0.125
4,-0.997714,0.00813,0.705065,,보석,본 점,-0.26087,-0.487805,-0.608056,3.0
