<a href="https://colab.research.google.com/github/JakeOh/202007_itw_bd18/blob/master/lab_python/python58_transform.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Fix the NumPy random seed so the randomly drawn incomes are reproducible.
np.random.seed(1)

# Toy frame: three 'M' rows followed by three 'F' rows, each with a random
# integer income drawn by randint(1, 11), i.e. from 1 to 10 inclusive.
df = pd.DataFrame(data={'gender': ['M'] * 3 + ['F'] * 3,
                        'income': np.random.randint(1, 11, 6)})

df

Unnamed: 0,gender,income
0,M,6
1,M,9
2,M,10
3,F,6
4,F,1
5,F,1


* **표준화(standardization)**: 변수(컬럼)의 평균을 0으로, 표준편차를 1로 변환.
* **정규화(normalization)**: 변수(컬럼)의 최솟값을 0으로, 최댓값을 1로, 사이의 값들은 0 ~ 1 사이의 값으로 변환.

In [3]:
def standardization(x):
    """Standardize x by subtracting its mean and dividing by its std.

    Computes ``x_prime = (x - x.mean()) / x.std()``.

    Args:
        x: Array-like object (numpy.ndarray, pandas.Series, ...) that
            provides ``mean()`` and ``std()`` methods.

    Returns:
        The transformed values ``x_prime``.
    """
    centered = x - x.mean()
    return centered / x.std()

In [4]:
def normalization(x):
    """Min-max scale x so its minimum maps to 0 and its maximum to 1.

    Computes ``x_prime = (x - x.min()) / (x.max() - x.min())``.

    Args:
        x: Array-like object that provides ``min()`` and ``max()`` methods.

    Returns:
        The transformed values ``x_prime``.
    """
    lowest = x.min()
    value_range = x.max() - lowest
    return (x - lowest) / value_range

In [5]:
# Standardize the whole income column (all rows together, ignoring gender).
income_std = standardization(df['income'])
income_std

0    0.130410
1    0.912871
2    1.173691
3    0.130410
4   -1.173691
5   -1.173691
Name: income, dtype: float64

In [6]:
income_std.mean()  #> mean of the transformed data = 0

0.0

In [7]:
income_std.std()  #> standard deviation of the transformed data = 1

1.0

In [8]:
# Min-max scale the whole income column into the range 0..1.
income_norm = normalization(df['income'])
income_norm

0    0.555556
1    0.888889
2    1.000000
3    0.555556
4    0.000000
5    0.000000
Name: income, dtype: float64

In [9]:
# Same result as calling standardization(df['income']) directly, but expressed
# through Series.transform.
df['income'].transform(standardization)

0    0.130410
1    0.912871
2    1.173691
3    0.130410
4   -1.173691
5   -1.173691
Name: income, dtype: float64

In [10]:
# Passing a list of functions yields a DataFrame with one column per function,
# each column named after its function.
df['income'].transform([standardization, normalization])

Unnamed: 0,standardization,normalization
0,0.13041,0.555556
1,0.912871,0.888889
2,1.173691,1.0
3,0.13041,0.555556
4,-1.173691,0.0
5,-1.173691,0.0


lambda expression(람다 표현식):
```
lambda param1, param2, ...: return_value
```

In [11]:
# Inline lambda version of standardization; the output matches the named
# function's result.
df['income'].transform(lambda x: (x - x.mean()) / x.std())

0    0.130410
1    0.912871
2    1.173691
3    0.130410
4   -1.173691
5   -1.173691
Name: income, dtype: float64

In [12]:
# Lower-case every gender label; here x is a single string value, so
# str.lower() applies.
df['gender'].transform(lambda x: x.lower())

0    m
1    m
2    m
3    f
4    f
5    f
Name: gender, dtype: object

In [13]:
# Encode gender as an integer: 'M' -> 0, any other value -> 1.
df['gender'].transform(lambda x: 0 if x == 'M' else 1)

0    0
1    0
2    0
3    1
4    1
5    1
Name: gender, dtype: int64

In [14]:
# Store the standardized income as a new column next to the original one.
df['income_std'] = df['income'].transform(standardization)
df

Unnamed: 0,gender,income,income_std
0,M,6,0.13041
1,M,9,0.912871
2,M,10,1.173691
3,F,6,0.13041
4,F,1,-1.173691
5,F,1,-1.173691


In [15]:
# Store the min-max scaled income as a new column.
df['income_norm'] = df['income'].transform(normalization)
df

Unnamed: 0,gender,income,income_std,income_norm
0,M,6,0.13041,0.555556
1,M,9,0.912871,0.888889
2,M,10,1.173691,1.0
3,F,6,0.13041,0.555556
4,F,1,-1.173691,0.0
5,F,1,-1.173691,0.0


In [16]:
# Group-wise standardization: each income is standardized using the mean and
# std of its own gender group, so the values differ from the whole-column
# result above.
df.groupby('gender')['income'].transform(standardization)

0   -1.120897
1    0.320256
2    0.800641
3    1.154701
4   -0.577350
5   -0.577350
Name: income, dtype: float64

In [17]:
# Group-wise min-max scaling: each gender group is scaled to 0..1 separately.
df.groupby('gender')['income'].transform(normalization)

0    0.00
1    0.75
2    1.00
3    1.00
4    0.00
5    0.00
Name: income, dtype: float64

* 결측치 대체: 
  * 평균으로 대체, 최빈값 대체, ...
  * 그룹별 변환(transform)을 이용한 결측치(missing value) 대체

In [18]:
# Rebuild df with missing incomes (np.nan), one in each gender group, to
# demonstrate missing-value replacement.
df = pd.DataFrame(data={'gender': ['M'] * 3 + ['F'] * 3,
                        'income': [1, np.nan, 3, np.nan, 4, 6]})
df

Unnamed: 0,gender,income
0,M,1.0
1,M,
2,M,3.0
3,F,
4,F,4.0
5,F,6.0


In [19]:
# The mean is taken over the four non-missing values only.
df['income'].mean()  # (1 + 3 + 4 + 6) / 4

3.5

In [20]:
# Replace missing incomes with the overall mean. The result is only displayed,
# not assigned back, so df itself is unchanged.
df['income'].fillna(df['income'].mean())

0    1.0
1    3.5
2    3.0
3    3.5
4    4.0
5    6.0
Name: income, dtype: float64

In [21]:
# Mean income per gender; the resulting Series is indexed by gender label.
s = df.groupby('gender')['income'].mean()
s

gender
F    5.0
M    2.0
Name: income, dtype: float64

In [22]:
# Attempt to fill the missing incomes with the per-gender means in s. As the
# output below shows, nothing gets filled.
# NOTE(review): presumably because DataFrame.fillna matches the Series labels
# ('F', 'M') against column names rather than against each row's gender --
# confirm against the pandas documentation.
df.fillna(s)

Unnamed: 0,gender,income
0,M,1.0
1,M,
2,M,3.0
3,F,
4,F,4.0
5,F,6.0


In [23]:
# Group-wise replacement: within each gender group, fill missing incomes with
# that group's own mean.
df.groupby('gender')['income'].transform(lambda x: x.fillna(x.mean()))

0    1.0
1    2.0
2    3.0
3    5.0
4    4.0
5    6.0
Name: income, dtype: float64

* seaborn 패키지에 포함된 iris 데이터 세트를 데이터 프레임으로 생성.
* 품종(species)을 제외한 모든 변수들을 표준화/정규화
* 품종(species)을 제외한 모든 변수들을 품종별로 표준화/정규화

In [24]:
# Load seaborn's bundled iris sample data set as a DataFrame.
iris = sns.load_dataset('iris')
iris

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [26]:
iris.describe()  # Summary of descriptive statistics.

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [29]:
# The four numeric measurement columns (every column except species).
cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
iris_std = iris[cols].transform(standardization)  # Standardize each variable.
iris_std

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.897674,1.015602,-1.335752,-1.311052
1,-1.139200,-0.131539,-1.335752,-1.311052
2,-1.380727,0.327318,-1.392399,-1.311052
3,-1.501490,0.097889,-1.279104,-1.311052
4,-1.018437,1.245030,-1.335752,-1.311052
...,...,...,...,...
145,1.034539,-0.131539,0.816859,1.443994
146,0.551486,-1.278680,0.703564,0.919223
147,0.793012,-0.131539,0.816859,1.050416
148,0.430722,0.786174,0.930154,1.443994


In [30]:
# Check the result: each column's mean is ~0 (up to floating-point error) and
# its std is 1.
iris_std.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,-1.457168e-15,-1.638319e-15,-1.2923e-15,-5.543714e-16
std,1.0,1.0,1.0,1.0
min,-1.86378,-2.42582,-1.562342,-1.442245
25%,-0.8976739,-0.5903951,-1.222456,-1.179859
50%,-0.05233076,-0.1315388,0.3353541,0.1320673
75%,0.672249,0.5567457,0.7602115,0.7880307
max,2.483699,3.080455,1.779869,1.706379


In [37]:
# Positional alternative to iris[cols]: keep every column except the last one,
# which leaves the four measurement columns.
iris_df = iris.iloc[:, :-1]
iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [40]:
# Min-max scale each measurement column over the whole data set.
iris_norm = iris[cols].transform(normalization)
iris_norm

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


In [41]:
# Check the result: each column's min is 0 and its max is 1.
iris_norm.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [45]:
# Standardize each measurement within its species group. This rebinds
# iris_std, replacing the whole-data-set version computed earlier.
iris_std = iris.groupby('species')[cols].transform(standardization)
iris_std

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.266674,0.189941,-0.357011,-0.436492
1,-0.300718,-1.129096,-0.357011,-0.436492
2,-0.868111,-0.601481,-0.932836,-0.436492
3,-1.151807,-0.865288,0.218813,-0.436492
4,-0.017022,0.453749,-0.357011,-0.436492
...,...,...,...,...
145,0.176134,0.080621,-0.637803,0.997633
146,-0.452916,-1.469783,-1.000191,-0.458766
147,-0.138391,0.080621,-0.637803,-0.094666
148,-0.610178,1.320944,-0.275415,0.997633


In [48]:
# Summary of rows 50-99 only; the mean is ~0 and the std is 1 within this
# slice.
# NOTE(review): presumably these 50 rows are exactly one species (iris assumed
# to be ordered by species) -- confirm.
iris_std.iloc[50:100, :].describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,50.0,50.0,50.0,50.0
mean,1.14353e-16,-1.486589e-15,4.196643e-16,8.204548e-16
std,1.0,1.0,1.0,1.0
min,-2.007086,-2.453805,-2.681359,-1.648524
25%,-0.6509469,-0.7807562,-0.5532963,-0.6371595
50%,-0.06974431,0.09560281,0.1915256,-0.1314774
75%,0.7051925,0.7329548,0.7235413,0.8798869
max,2.061332,2.007659,1.787573,2.396933


In [50]:
# Min-max scale each measurement within its species group. This rebinds
# iris_norm, replacing the whole-data-set version computed earlier.
iris_norm = iris.groupby('species')[cols].transform(normalization)
iris_norm

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.533333,0.571429,0.444444,0.200000
1,0.400000,0.333333,0.444444,0.200000
2,0.266667,0.428571,0.333333,0.200000
3,0.200000,0.380952,0.555556,0.200000
4,0.466667,0.619048,0.444444,0.200000
...,...,...,...,...
145,0.600000,0.500000,0.291667,0.818182
146,0.466667,0.187500,0.208333,0.454545
147,0.533333,0.500000,0.291667,0.545455
148,0.433333,0.750000,0.375000,0.818182


In [52]:
# Summary of rows 50-99 only; the min is 0 and the max is 1 within this slice.
# NOTE(review): presumably these 50 rows are exactly one species (iris assumed
# to be ordered by species) -- confirm.
iris_norm.iloc[50:100, :].describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,50.0,50.0,50.0,50.0
mean,0.493333,0.55,0.6,0.4075
std,0.245796,0.224142,0.223767,0.247191
min,0.0,0.0,0.0,0.0
25%,0.333333,0.375,0.47619,0.25
50%,0.47619,0.571429,0.642857,0.375
75%,0.666667,0.714286,0.761905,0.625
max,1.0,1.0,1.0,1.0


* seaborn 패키지의 tips 샘플 데이터 프레임 로딩.
* 성별, 시간별 영수증금액의 평균, 팁의 최댓값과 최솟값.
  * pivot_table
  * groupby

In [53]:
# Load seaborn's bundled tips sample data set and preview the first rows.
tips = sns.load_dataset('tips')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [55]:
# Pivot by sex and time: mean of total_bill, plus max and min of tip.
# The aggfunc dict maps each value column to its own aggregation(s).
tips.pivot_table(values=['total_bill', 'tip'],
                 index=['sex', 'time'],
                 aggfunc={'total_bill': np.mean,
                          'tip': [np.max, np.min]})

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,tip,total_bill
Unnamed: 0_level_1,Unnamed: 1_level_1,amax,amin,mean
sex,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Male,Lunch,6.7,1.44,18.048485
Male,Dinner,10.0,1.0,21.461452
Female,Lunch,5.17,1.25,16.339143
Female,Dinner,6.5,1.0,19.213077


In [57]:
# Same summary via groupby: per (sex, time) group, the mean of total_bill and
# the max and min of tip.
# The two columns are selected with a list (double brackets). Selecting with a
# bare tuple of keys -- ['total_bill', 'tip'] -- is deprecated in pandas and
# is rejected by newer versions.
tips.groupby(['sex', 'time'])[['total_bill', 'tip']].aggregate({'total_bill': np.mean,
                                                                'tip': [np.max, np.min]})

  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip,tip
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,amax,amin
sex,time,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Male,Lunch,18.048485,6.7,1.44
Male,Dinner,21.461452,10.0,1.0
Female,Lunch,16.339143,5.17,1.25
Female,Dinner,19.213077,6.5,1.0
