# 수치형 데이터 다루기

## 특성 스케일 바꾸기

In [1]:
import numpy as np
from sklearn import preprocessing

# Single-column feature matrix (5 samples x 1 feature) used by the scaling examples.
feature = np.array([-500.5, -100.1, 0, 100.1, 900.9]).reshape(-1, 1)

In [2]:
# Build a scaler that maps the fitted column onto the range [0, 1].
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))

# Learn the column's min/max first, then rescale the same data in a second step.
minmax_scale.fit(feature)
scaled_feature = minmax_scale.transform(feature)



In [3]:
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

In [4]:
# Fit a fresh MinMaxScaler on only the first three rows and scale those same rows.
preprocessing.MinMaxScaler().fit_transform(feature[:3])

array([[0. ],
       [0.8],
       [1. ]])

In [5]:
# Fit a separate scaler on the last two rows; this subset gets its own 0-1 range.
preprocessing.MinMaxScaler().fit_transform(feature[3:])

array([[0.],
       [1.]])

In [6]:
# Learn the min/max from the first three rows only and keep the fitted scaler for reuse.
scaler = preprocessing.MinMaxScaler()
scaler.fit(feature[:3])
scaler.transform(feature[:3])

array([[0. ],
       [0.8],
       [1. ]])

In [7]:
# Reuse the scaler fitted on the first three rows: the remaining rows lie above
# the learned maximum, so their scaled values come out greater than 1.
scaler.transform(feature[3:])

array([[1.2],
       [2.8]])

## 특성을 표준화하기

In [10]:
# Single-column feature matrix (5 samples x 1 feature); the last value is far larger than the rest.
x = np.array([-1000.1, -200.2, 500.5, 600.6, 9000.9]).reshape(-1, 1)

# Standardize the column (the cells below confirm mean 0 and standard deviation 1).
scaler = preprocessing.StandardScaler()
scaler.fit(x)
standardized = scaler.transform(x)
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

In [12]:
# Mean of the standardized column, rounded for display.
print("평균 : ",round(standardized.mean()))

평균 :  0.0


In [13]:
# Standard deviation of the standardized column.
print("표준편차 : ",standardized.std())

표준편차 :  1.0


In [14]:
# RobustScaler is an alternative for data with outliers: per the scikit-learn
# docs it centers on the median and scales by the interquartile range.
robust_scaler = preprocessing.RobustScaler()

# The median sample maps to 0 in the output.
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

In [15]:
# Map the column onto its quantiles using the transformer's default settings;
# the five sorted samples land on evenly spaced values between 0 and 1.
preprocessing.QuantileTransformer().fit_transform(x)



array([[0.  ],
       [0.25],
       [0.5 ],
       [0.75],
       [1.  ]])

## 정규화하기

In [16]:
from sklearn.preprocessing import Normalizer

# Feature matrix with 5 samples (rows) and 2 features (columns).
features = np.array([
    (0.5, 0.5),
    (1.1, 3.4),
    (1.5, 20.2),
    (1.63, 34.4),
    (10.9, 3.3),
])



In [18]:
# Rescale every sample (row) to unit L2 norm.
normalizer = Normalizer(norm='l2')

# transform() is called directly here, without a prior fit() call.
normalizer.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [19]:
feature_l2_norm = Normalizer(norm='l2').transform(features)
# L2 is the straight-line (Euclidean) distance.

In [20]:
feature_l2_norm

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [21]:
feature_l1_norm = Normalizer(norm='l1').transform(features)
# L1 is the distance a person covers walking along the streets (Manhattan distance).

In [22]:
feature_l1_norm

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [24]:
# After L1 normalization the two values of the first sample add up to 1.
print( "첫 번째 샘플값의 합 :", feature_l1_norm[0,0]+ feature_l1_norm[0,1])

첫 번째 샘플값의 합 : 1.0


In [25]:
# L1 normalization by hand: divide each row by the sum of its absolute values.
features / np.sum(np.abs(features),axis = 1, keepdims=True)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [26]:
# L2 normalization by hand: divide each row by the square root of its sum of squares.
features/ np.sqrt(np.sum(np.square(features),axis =1,keepdims = True))

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

In [27]:
# 'max' norm: each row is scaled so that its largest entry becomes 1.
Normalizer(norm='max').transform(features)

array([[1.        , 1.        ],
       [0.32352941, 1.        ],
       [0.07425743, 1.        ],
       [0.04738372, 1.        ],
       [1.        , 0.30275229]])

## 다항 특성과 교차항 특성 생성하기

In [28]:
from sklearn.preprocessing import PolynomialFeatures

# Three identical samples, each with the two feature values 2 and 3.
features = np.tile([2, 3], (3, 1))



In [29]:
# Degree-2 polynomial expansion without the constant (bias) column.
Polynomial_interaction = PolynomialFeatures(degree =2, include_bias = False)

In [31]:
# Each row becomes x0, x1, x0^2, x0*x1, x1^2.
Polynomial_interaction.fit_transform(features)

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [32]:
# interaction_only=True keeps the cross-product term and drops the pure powers (x0^2, x1^2).
interaction = PolynomialFeatures(degree = 2, interaction_only=True, include_bias = False)

In [35]:
# Each row becomes x0, x1, x0*x1.
interaction.fit_transform(features)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

In [47]:
# include_bias=True adds a constant column of ones; the transformer is fitted right away.
polynomial_bias = PolynomialFeatures(degree =2 , include_bias= True).fit(features)

In [41]:
# Refits the already-fitted transformer and returns the expanded matrix,
# which now starts with the constant column of ones.
polynomial_bias.fit_transform(features)

array([[1., 2., 3., 4., 6., 9.],
       [1., 2., 3., 4., 6., 9.],
       [1., 2., 3., 4., 6., 9.]])

In [48]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
# list(...) keeps the displayed value a plain list of strings on both paths.
if hasattr(polynomial_bias, "get_feature_names_out"):
    feature_names = list(polynomial_bias.get_feature_names_out())
else:
    feature_names = polynomial_bias.get_feature_names()
feature_names

['1', 'x0', 'x1', 'x0^2', 'x0 x1', 'x1^2']

## 특성 변환하기

In [49]:
from sklearn.preprocessing import FunctionTransformer

# Three identical rows [2, 3] used to demonstrate custom transformations.
features = np.array([[2, 3]] * 3)

In [50]:
def add_ten(x):
    """Return ``x`` with 10 added (element-wise for arrays and DataFrames)."""
    increment = 10
    return x + increment

# Wrap the plain function so it can be used through the transformer interface.
ten_transformer = FunctionTransformer(add_ten)

# Apply it to the feature matrix; every value is increased by 10.
ten_transformer.transform(features)

array([[12, 13],
       [12, 13],
       [12, 13]])

In [51]:
import pandas as pd

# Put the feature matrix into a DataFrame with named columns.
df= pd.DataFrame(features, columns =['feature1','feature2'])

In [53]:
# pandas equivalent of the transformer above: apply the same function to the DataFrame.
df.apply(add_ten)

Unnamed: 0,feature1,feature2
0,12,13
1,12,13
2,12,13


In [56]:
# With validate=False the function is applied to a 1-D array as-is.
FunctionTransformer(add_ten, validate =False).transform(np.array([1,2,3]))

array([11, 12, 13])

In [57]:
# ColumnTransformer lets you apply a different transformation to each column of a feature array or DataFrame.

from sklearn.compose import ColumnTransformer

def add_hundred(x):
    """Return ``x`` with 100 added (element-wise for array-like inputs)."""
    increment = 100
    return x + increment



In [61]:
# Route each DataFrame column to its own transformer:
# 'feature1' gets +10 and 'feature2' gets +100.
ct = ColumnTransformer(
    transformers=[
        ("add_ten", FunctionTransformer(add_ten, validate=True), ['feature1']),
        ("add_hundred", FunctionTransformer(add_hundred, validate=True), ['feature2']),
    ]
)

## 이상치 감지하기

In [62]:
# Fit and apply both column-wise transformers to the DataFrame in one call.
ct.fit_transform(df)

array([[ 12, 103],
       [ 12, 103],
       [ 12, 103]])

In [64]:
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Create simulated data: 10 two-feature samples around a single center
# (random_state fixes the generated values).
features,_= make_blobs(n_samples =10,
                      n_features= 2,
                      centers = 1,
                      random_state =1)

In [65]:
# Turn the first sample into an extreme outlier by overwriting its first two coordinates.
features[0, :2] = 10000

In [66]:
# contamination is the assumed share of outliers in the data (10% here).
outlier_detector = EllipticEnvelope(contamination=0.1)

# fit() returns the detector itself, so fitting and predicting can be chained;
# the prediction is -1 for outliers and 1 for inliers.
outlier_detector.fit(features).predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [73]:
# Take the first column as a 1-D array for the IQR-based check below
# (this rebinds the name `feature` used in the scaling examples).
feature = features[:,0]

def indices_of_outliers(x, multiplier=1.5):
    """Locate values lying outside the interquartile-range (IQR) fences.

    A value is flagged when it is below ``Q1 - multiplier * IQR`` or above
    ``Q3 + multiplier * IQR``, where Q1 and Q3 are the 25th and 75th
    percentiles of ``x``.

    Args:
        x: Array-like of numeric values (list, NumPy array, pandas Series, ...).
        multiplier: Width of the fences in IQR units. Defaults to 1.5, the
            value that used to be hard-coded.

    Returns:
        The tuple produced by ``np.where``: one index array per dimension of
        ``x`` holding the positions of the flagged values.
    """
    values = np.asarray(x)  # lets plain Python lists be compared element-wise
    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * multiplier)
    upper_bound = q3 + (iqr * multiplier)
    return np.where((values > upper_bound) | (values < lower_bound))

In [74]:
# The sample that was overwritten with 10000 (index 0) is reported as the outlier.
indices_of_outliers(feature)

(array([0], dtype=int64),)

## 이상치 다루기

In [75]:
# Start from an empty DataFrame; the columns are added in the next cell.
houses = pd.DataFrame()

In [76]:
# Example data: the last house is an obvious outlier (116 bathrooms, 48000 square feet).
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms']=[2,3.5,2,116]
houses['Square_feet']=[1500,2500,1500,48000]

In [77]:
# Option 1: drop the outlier by keeping only houses with fewer than 20 bathrooms.
houses[houses['Bathrooms']<20]

Unnamed: 0,Price,Bathrooms,Square_feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [78]:
# Option 2: keep every row and mark outliers with a flag column
# (0 when Bathrooms < 20, otherwise 1).
houses['Outlier'] = np.where(houses['Bathrooms']<20,0,1)

In [79]:
houses

Unnamed: 0,Price,Bathrooms,Square_feet,Outlier
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


In [81]:
# Option 3: dampen the outlier's effect with a log transform.
# np.log is vectorized, so it is applied to the whole column at once instead of
# looping over the values in Python; the resulting column holds the same values.
houses['Log_of_Square_Feet'] = np.log(houses['Square_feet'])

In [82]:
houses

Unnamed: 0,Price,Bathrooms,Square_feet,Outlier,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956


## 특성 이산화하기


In [83]:
from sklearn.preprocessing import Binarizer

# Single-column feature matrix of ages (5 samples x 1 feature).
age = np.array([6, 12, 20, 36, 65]).reshape(-1, 1)



In [84]:
# Threshold at 18: values above 18 become 1, all others 0.
# The threshold is passed by keyword because Binarizer's parameters are
# keyword-only in scikit-learn 1.0+, where Binarizer(18) raises a TypeError.
binarizer = Binarizer(threshold=18)




In [85]:
# Apply the threshold to the ages: 6 and 12 map to 0; 20, 36 and 65 map to 1.
binarizer.fit_transform(age)

array([[0],
       [0],
       [1],
       [1],
       [1]])

In [87]:
# Bin the ages using the edges 20, 30 and 64; each value is replaced by its bin index.
# A value equal to an edge goes into the bin to the right of it (20 -> 1).
np.digitize(age, bins=[20,30,64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [88]:
# right=True makes each bin include its right edge, so 20 now falls into bin 0.
np.digitize(age, bins=[20,30,64], right = True)

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

In [89]:
# With a single edge, digitize acts as a threshold; the result here matches the Binarizer output.
np.digitize(age, bins=[18])

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

In [90]:
from sklearn.preprocessing import KBinsDiscretizer

# Four quantile-based bins; 'ordinal' returns each sample's bin index in a single column.
kb = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='quantile')
kb.fit(age)
kb.transform(age)

array([[0.],
       [1.],
       [2.],
       [3.],
       [3.]])

In [91]:
# Same quantile bins, but one-hot encoded as a dense array (one column per bin).
kb = KBinsDiscretizer(n_bins=4, strategy='quantile', encode='onehot-dense')
kb.fit(age).transform(age)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]])

In [93]:
# Equal-width bins instead of quantile bins, again one-hot encoded as a dense array.
kb = KBinsDiscretizer(n_bins=4, strategy='uniform', encode='onehot-dense')
kb.fit(age).transform(age)

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [95]:
# Inspect the bin boundaries learned by the fitted discretizer (one array per feature).
kb.bin_edges_

array([array([ 6.  , 20.75, 35.5 , 50.25, 65.  ])], dtype=object)

In [96]:
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [99]:
# Create simulated data: 50 two-feature samples around three centers
# (random_state fixes the generated values).
features,_= make_blobs(n_samples = 50,
                      n_features = 2,
                      centers = 3,
                      random_state = 1)

In [100]:
# Three-cluster k-means with a fixed seed.
clusterer = KMeans(n_clusters=3, random_state=0)

# Put the two generated feature columns into a DataFrame.
dataframe = pd.DataFrame(features, columns=['features_1', 'features_2'])

In [102]:
# Fit the clusterer on the generated features.
clusterer.fit(features)

KMeans(n_clusters=3, random_state=0)

In [103]:
# Store each sample's predicted cluster as a discrete 'group' feature.
dataframe['group'] = clusterer.predict(features)

In [106]:
# Show the first five rows, including the new group column.
dataframe.head(5)

Unnamed: 0,features_1,features_2,group
0,-9.877554,-3.336145,2
1,-7.28721,-8.353986,0
2,-6.943061,-7.023744,0
3,-7.440167,-8.791959,0
4,-6.641388,-8.075888,0


## 누락된 값을 가진 샘플을 삭제하기

In [110]:
# Feature matrix (5 samples x 2 features) whose last row has a missing first value.
features = np.array([
    (1.1, 11.1),
    (2.2, 22.2),
    (3.3, 33.3),
    (4.4, 44.4),
    (np.nan, 55),
])

In [111]:
# Keep only the rows without any NaN: the mask marks rows containing a NaN and ~ inverts it.
features[~np.isnan(features).any(axis=1)]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

In [112]:
# Load the same data (including the NaN row) into a DataFrame.
dataframe = pd.DataFrame(features, columns=['feature1','feature2'])



In [114]:
# pandas equivalent: drop every row that contains a missing value.
dataframe.dropna()

Unnamed: 0,feature1,feature2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


In [115]:
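# MCAR (Missing Completely At Random): the chance that a value is missing is unrelated to any data.
# MAR (Missing At Random): the chance that a value is missing depends only on other, observed features.
# MNAR (Missing Not At Random): the chance that a value is missing depends on information not captured in the features.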

## 누락된 값 채우기

In [121]:
# Re-run this cell after upgrading TensorFlow (the import below fails because Keras requires TensorFlow 2.2 or higher).
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs



ImportError: Keras requires TensorFlow 2.2 or higher. Install TensorFlow via `pip install tensorflow`