<a href="https://colab.research.google.com/github/JakeOh/202105_itw_bd26/blob/main/lab_ml/ml01_numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# `np.ndarray` 속성(attribute)

In [2]:
# Create a 3x4 array in which every element is 1.0.
# dtype is left at its default, which gives 64-bit float elements.
arr = np.ones(shape=(3, 4))
print(arr)

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


`ndarray` 클래스의 속성들(attributes)

In [3]:
arr.dtype  # data type of the array's elements

dtype('float64')

In [4]:
arr.shape  # tuple holding the number of elements along each axis

(3, 4)

In [5]:
arr.size  # total number of elements in the array

12

In [6]:
arr.ndim  # number of dimensions, i.e. the number of axes

2

In [7]:
# 1D array of the integers in the half-open range [1, 6), with an explicit step of 1.
arr = np.arange(1, 6, 1)
print(arr)

[1 2 3 4 5]


In [8]:
# Show the four basic attributes of arr, one per line.
print(f'dtype: {arr.dtype}')
print(f'shape: {arr.shape}')
print(f'size: {arr.size}')
print(f'ndim: {arr.ndim}')

dtype: int64
shape: (5,)
size: 5
ndim: 1


In [9]:
# 3D array: the integers 1..24 arranged with shape (2, 3, 4).
arr = np.reshape(np.arange(1, 25), (2, 3, 4))
print(arr)

[[[ 1  2  3  4]
  [ 5  6  7  8]
  [ 9 10 11 12]]

 [[13 14 15 16]
  [17 18 19 20]
  [21 22 23 24]]]


In [10]:
# Show the four basic attributes of the 3D array, one per line.
print(f'dtype: {arr.dtype}')
print(f'ndim: {arr.ndim}')
print(f'shape: {arr.shape}')
print(f'size: {arr.size}')

dtype: int64
ndim: 3
shape: (2, 3, 4)
size: 24


# `np.ndarray` indexing

In [11]:
print(arr[0])

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [12]:
print(arr[0, 0])

[1 2 3 4]


In [13]:
print(arr[0, 0, 0])

1


* Python list의 인덱스 사용방법: `list[a][b][c]...`
* numpy ndarray의 인덱스 사용방법:
    * `array[a][b][c]...`
    * `array[a, b, c, ...]`

# `np.ndarray` slicing

In [14]:
# 1D array of 10 random floats; seeding first makes the printed values repeatable.
np.random.seed(1)
arr = np.random.rand(10)
print(arr)

[4.17022005e-01 7.20324493e-01 1.14374817e-04 3.02332573e-01
 1.46755891e-01 9.23385948e-02 1.86260211e-01 3.45560727e-01
 3.96767474e-01 5.38816734e-01]


In [15]:
arr[1:5]  # subset made of the elements at indices [1, 5); the result is a 1D array

array([7.20324493e-01, 1.14374817e-04, 3.02332573e-01, 1.46755891e-01])

In [16]:
# 2D array: the integers 1..20 laid out as 4 rows by 5 columns.
arr = np.arange(1, 21).reshape(4, 5)
print(arr)

[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]


In [17]:
# Keep every row and only the first two columns.
print(arr[:, :2])

[[ 1  2]
 [ 6  7]
 [11 12]
 [16 17]]


In [18]:
# Keep the first two rows together with all of their columns.
print(arr[:2, :])

[[ 1  2  3  4  5]
 [ 6  7  8  9 10]]


In [19]:
print(arr[:2, :3])

[[1 2 3]
 [6 7 8]]


In [20]:
# 3D array: the integers 1..60 arranged with shape (3, 4, 5).
arr = np.arange(1, 61).reshape(3, 4, 5)
print(arr)

[[[ 1  2  3  4  5]
  [ 6  7  8  9 10]
  [11 12 13 14 15]
  [16 17 18 19 20]]

 [[21 22 23 24 25]
  [26 27 28 29 30]
  [31 32 33 34 35]
  [36 37 38 39 40]]

 [[41 42 43 44 45]
  [46 47 48 49 50]
  [51 52 53 54 55]
  [56 57 58 59 60]]]


In [21]:
print(arr[:2, :, :])

[[[ 1  2  3  4  5]
  [ 6  7  8  9 10]
  [11 12 13 14 15]
  [16 17 18 19 20]]

 [[21 22 23 24 25]
  [26 27 28 29 30]
  [31 32 33 34 35]
  [36 37 38 39 40]]]


In [22]:
print(arr[:1, :, :])  #> slicing -> the result is still a 3D array

[[[ 1  2  3  4  5]
  [ 6  7  8  9 10]
  [11 12 13 14 15]
  [16 17 18 19 20]]]


In [23]:
print(arr[0])  #> element access by index -> the result is a 2D array

[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]


# `np.ndarray` 모양(shape) 변경

* `np.ndarray.reshape(shape)`
* `np.ndarray.ravel()`
* `np.ndarray.flatten()`
* `np.newaxis` 이용


In [24]:
# Build the 1D array 1..12 by shifting 0..11 up by one.
arr_1d = np.arange(12) + 1
print(arr_1d)

[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [25]:
# Turn the 1D array arr_1d into a 2D array with shape (3, 4).
arr_2d = np.reshape(arr_1d, (3, 4))
print(arr_2d)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]


In [26]:
# Turn the 1D array arr_1d into a 2D array with 2 rows.
# One dimension of the target shape may be written as -1; its length is then
# worked out automatically from the remaining dimensions.
arr_2d = np.reshape(arr_1d, (2, -1))
print(arr_2d)

[[ 1  2  3  4  5  6]
 [ 7  8  9 10 11 12]]


In [27]:
# Turn the 1D array arr_1d into a 2D array with 3 columns (row count inferred).
arr_2d = np.reshape(arr_1d, (-1, 3))
arr_2d

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [28]:
# Turn the 2D array arr_2d back into a 1D array of 4 * 3 elements.
result = np.reshape(arr_2d, 4 * 3)
result

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [29]:
result = arr_2d.reshape((12,))
result

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [30]:
arr_2d.reshape((-1,))

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [31]:
arr_1d = np.arange(1, 49)
arr_1d

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])

In [32]:
# Turn the 1D array arr_1d into a 3D array with shape (4, 4, 3).
arr_3d = np.reshape(arr_1d, (4, 4, 3))
arr_3d

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]],

       [[13, 14, 15],
        [16, 17, 18],
        [19, 20, 21],
        [22, 23, 24]],

       [[25, 26, 27],
        [28, 29, 30],
        [31, 32, 33],
        [34, 35, 36]],

       [[37, 38, 39],
        [40, 41, 42],
        [43, 44, 45],
        [46, 47, 48]]])

In [33]:
arr_1d.reshape((4, 4, -1))

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9],
        [10, 11, 12]],

       [[13, 14, 15],
        [16, 17, 18],
        [19, 20, 21],
        [22, 23, 24]],

       [[25, 26, 27],
        [28, 29, 30],
        [31, 32, 33],
        [34, 35, 36]],

       [[37, 38, 39],
        [40, 41, 42],
        [43, 44, 45],
        [46, 47, 48]]])

In [34]:
# Turn the 3D array arr_3d back into a 1D array of 4 * 4 * 3 elements.
result = np.reshape(arr_3d, 4 * 4 * 3)
result

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])

In [35]:
arr_3d.reshape((48,))

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])

In [36]:
arr_3d.reshape((-1,))

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])

In [37]:
arr_2d

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12]])

In [38]:
# Convert the 2D array into a 1D array.
raveled = arr_2d.ravel()  # here the 1D result is a view of arr_2d; per the NumPy docs ravel() copies only when a view is not possible.
raveled

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [39]:
flattened = arr_2d.flatten()  # returns a copy converted to 1D
flattened

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [40]:
flattened[0] = 100  # change an element in the result of flatten()
print(flattened)
print(arr_2d)  # the original 2D array is NOT affected

[100   2   3   4   5   6   7   8   9  10  11  12]
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [41]:
raveled[0] = 100  # change an element in the result of ravel()
print(raveled)
print(arr_2d)  # the original 2D array IS affected

[100   2   3   4   5   6   7   8   9  10  11  12]
[[100   2   3]
 [  4   5   6]
 [  7   8   9]
 [ 10  11  12]]


In [42]:
# 3D 배열을 1D 배열로 변환
arr_3d.ravel()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])

In [43]:
arr_3d.flatten()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48])

## `np.newaxis` 속성을 사용한 차원 늘리기

In [44]:
# 1D array with shape (5,), used as the starting point for adding axes.
a = np.arange(1, 6)
print(a)
print(f'ndim: {a.ndim}')
print(f'shape: {a.shape}')

[1 2 3 4 5]
ndim: 1
shape: (5,)


In [45]:
# np.newaxis after the existing axis adds a trailing length-1 axis: (5,) -> (5, 1).
col_vector = a[:, np.newaxis]
print(col_vector)
print(f'ndim: {col_vector.ndim}')
print(f'shape: {col_vector.shape}')

[[1]
 [2]
 [3]
 [4]
 [5]]
ndim: 2
shape: (5, 1)


In [46]:
a.reshape((5, 1))

array([[1],
       [2],
       [3],
       [4],
       [5]])

In [47]:
a.reshape((-1, 1))

array([[1],
       [2],
       [3],
       [4],
       [5]])

In [48]:
# np.newaxis before the existing axis adds a leading length-1 axis: (5,) -> (1, 5).
row_vector = a[np.newaxis, :]
print(row_vector)
print(f'ndim: {row_vector.ndim}')
print(f'shape: {row_vector.shape}')

[[1 2 3 4 5]]
ndim: 2
shape: (1, 5)


In [49]:
a.reshape((1, 5))

array([[1, 2, 3, 4, 5]])

In [50]:
a.reshape((1, -1))

array([[1, 2, 3, 4, 5]])

In [51]:
a = np.arange(1, 17).reshape((4, 4))
print(a)

[[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]]


In [52]:
a[:, :, np.newaxis]

array([[[ 1],
        [ 2],
        [ 3],
        [ 4]],

       [[ 5],
        [ 6],
        [ 7],
        [ 8]],

       [[ 9],
        [10],
        [11],
        [12]],

       [[13],
        [14],
        [15],
        [16]]])

In [53]:
a.reshape((4, 4, 1))

array([[[ 1],
        [ 2],
        [ 3],
        [ 4]],

       [[ 5],
        [ 6],
        [ 7],
        [ 8]],

       [[ 9],
        [10],
        [11],
        [12]],

       [[13],
        [14],
        [15],
        [16]]])

In [54]:
a[np.newaxis, :, :]

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [13, 14, 15, 16]]])

In [55]:
a.reshape((1, 4, 4))

array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 11, 12],
        [13, 14, 15, 16]]])

# concatenate

* `np.concatenate([a1, a2, ...], axis=0)`: 행 또는 열 방향으로 이어붙이기
* `np.r_[a1, a2]`: 행 이어붙이기(row concatenation)
* `np.c_[a1, a2]`: 열 이어붙이기(column concatenation)

In [56]:
# 2D array with shape (2, 3) holding 1..6.
a1 = np.reshape(np.arange(1, 7), (2, 3))
print(a1)

[[1 2 3]
 [4 5 6]]


In [57]:
# 2D array with shape (1, 3): ndmin=2 promotes the flat list to a single row.
a2 = np.array([7, 8, 9], ndmin=2)
print(a2)

[[7 8 9]]


In [58]:
np.concatenate([a1, a2])  # axis=0 is the default, so it can be omitted

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [59]:
np.r_[a1, a2]

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [60]:
a1

array([[1, 2, 3],
       [4, 5, 6]])

In [61]:
a3 = np.array([[10],
               [20]])
a3

array([[10],
       [20]])

In [62]:
np.concatenate([a1, a3], axis=1)

array([[ 1,  2,  3, 10],
       [ 4,  5,  6, 20]])

In [63]:
np.c_[a1, a3]

array([[ 1,  2,  3, 10],
       [ 4,  5,  6, 20]])

# Broadcasting

* `np.ndarray`의 산술연산(+, -, *, /, //, %, ...)은 같은 위치(인덱스)의 원소들끼리(element-wise) 계산됨.
* 서로 shape이 다른 배열들 사이에서 산술연산이 가능하도록 shape을 맞춰서 연산하는 방법 -> broadcast

In [64]:
a = np.array([1, 2, 3])     # 1D array with shape (3,)
b = np.array([1, 2, 3, 4])  # 1D array with shape (4,)
# Shapes (3,) and (4,) cannot be brought to a common shape, so a + b raises
# ValueError ("operands could not be broadcast together ...").
# The error is caught and printed so the cell demonstrates it without
# interrupting a Restart & Run All.
try:
    a + b
except ValueError as err:
    print('ValueError:', err)

## ndarray와 scalar(숫자 한개)의 broadcast 연산

In [65]:
a

array([1, 2, 3])

In [66]:
a + 10

array([11, 12, 13])

In [67]:
a2 = np.array([[1, 2, 3],
               [4, 5, 6]])
a2

array([[1, 2, 3],
       [4, 5, 6]])

In [68]:
a2 + 10

array([[11, 12, 13],
       [14, 15, 16]])

## 2D array와 1D array, 또는 2D array와 2D array에서의 broadcast

In [69]:
# a: 2D array with shape (2, 3); b: 1D array with shape (3,).
a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([10, 20, 30])

In [70]:
a + b  # b is broadcast: (3,) 1D ---> (1, 3) 2D ---> (2, 3) 2D array

array([[11, 22, 33],
       [14, 25, 36]])

In [71]:
c = np.array([[1, 2, 3]])  # 2D array with shape (1, 3)
a + c

array([[2, 4, 6],
       [5, 7, 9]])

In [72]:
d = np.array([10, 20])  # 1D array with shape (2,)
# Shapes (2, 3) and (2,) are not broadcast-compatible, so a + d raises
# ValueError ("operands could not be broadcast together ...").
# The error is caught and printed so the cell demonstrates it without
# interrupting a Restart & Run All.
try:
    a + d
except ValueError as err:
    print('ValueError:', err)

In [73]:
# 2D array with shape (2, 1), i.e. a single column.
e = np.array([[10], [20]])

In [74]:
a + e

array([[11, 12, 13],
       [24, 25, 26]])

## broadcast의 활용

* **표준화(standardization)**: 평균이 0이 되고, 표준편차가 1이 되도록 변수들의 스케일을 변환하는 것.
* **정규화(normalization)**: 최솟값이 0이 되고, 최댓값이 1이 되도록 변수들의 스케일을 변환하는 것.

In [75]:
x = np.array([1, 2, 3, 4, 5])  # 1D array with shape (5,)

In [76]:
np.mean(x)

3.0

In [77]:
np.std(x)

1.4142135623730951

In [78]:
# Standardization: subtract the mean, then divide by the standard deviation.
# Both statistics are scalars and are broadcast over every element of x.
x_std = (x - x.mean()) / x.std()
x_std

array([-1.41421356, -0.70710678,  0.        ,  0.70710678,  1.41421356])

In [79]:
# Normalization (min-max scaling): shift by the minimum, then divide by the
# range; np.ptp(x) is the "peak to peak" value max(x) - min(x).
x_norm = (x - x.min()) / np.ptp(x)
x_norm

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [80]:
# 2D array with shape (3, 2): three rows of two values each.
x = np.array([[1, 2], [3, 4], [5, 6]])

In [81]:
x

array([[1, 2],
       [3, 4],
       [5, 6]])

In [82]:
np.mean(x)  # mean of all elements

3.5

In [83]:
np.mean(x, axis=0)  #> 1D array with shape (2,)

array([3., 4.])

In [84]:
np.mean(x, axis=1)  #> 1D array with shape (3,)

array([1.5, 3.5, 5.5])

In [85]:
np.mean(x, axis=0, keepdims=True)  #> 2D array with shape (1, 2)

array([[3., 4.]])

In [86]:
np.mean(x, axis=1, keepdims=True)  #> 2D array with shape (3, 1)

array([[1.5],
       [3.5],
       [5.5]])

In [87]:
# Column-wise standardization: the (1, 2) statistics are broadcast down the rows of x.
(x - x.mean(axis=0, keepdims=True)) / x.std(axis=0, keepdims=True)

array([[-1.22474487, -1.22474487],
       [ 0.        ,  0.        ],
       [ 1.22474487,  1.22474487]])

In [88]:
# Column-wise min-max normalization; np.ptp gives max - min for each column.
(x - x.min(axis=0, keepdims=True)) / np.ptp(x, axis=0, keepdims=True)

array([[0. , 0. ],
       [0.5, 0.5],
       [1. , 1. ]])

* seaborn 패키지의 iris 예제 데이터 셋을 로드
* iris 데이터프레임에서 species 변수를 제거
* iris 각 변수들을 표준화(standardization)
* iris 각 변수들을 정규화(normalization)

> (Hint) `pd.DataFrame.values`: 데이터 프레임의 값들로만 이루어진 2D ndarray를 리턴.

> (Hint) `pd.DataFrame`의 통계 관련 함수들을 직접 사용

In [89]:
iris = sns.load_dataset('iris')

In [90]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [91]:
iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [97]:
iris.iloc[:, :-1]

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [100]:
# Drop the species column from the iris DataFrame and convert what remains to a
# NumPy ndarray; to_numpy() yields the same array as the .values attribute.
# Naming convention: arrays with 2 or more dimensions start with an uppercase
# letter, 1D arrays start with a lowercase letter.
X = iris.drop(columns='species').to_numpy()

In [102]:
X[-5:, :]

array([[6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]])

In [129]:
np.mean(X, axis=0, keepdims=True)  #> should match the mean row of iris.describe()

array([[5.84333333, 3.05733333, 3.758     , 1.19933333]])

In [109]:
# NOTE: this does NOT exactly match the std row of iris.describe()
# (e.g. 0.8253 here vs 0.8281 for sepal_length): np.std defaults to the
# population standard deviation (ddof=0), whereas pandas reports ddof=1.
np.std(X, axis=0)

array([0.82530129, 0.43441097, 1.75940407, 0.75969263])

In [110]:
np.max(X, axis=0)

array([7.9, 4.4, 6.9, 2.5])

In [111]:
np.min(X, axis=0)

array([4.3, 2. , 1. , 0.1])

In [113]:
# Standardize every column: the per-column mean and standard deviation
# (1D arrays) are broadcast across all rows of X.
X_standardized = (X - X.mean(axis=0)) / X.std(axis=0)
X_standardized[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

In [114]:
# Check: mean == 0 and standard deviation == 1
# (the means print as values around 1e-16 because of floating-point rounding).
np.mean(X_standardized, axis=0)

array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])

In [115]:
np.std(X_standardized, axis=0)

array([1., 1., 1., 1.])

In [126]:
# Min-max normalize every column; np.ptp(X, axis=0) is the per-column max - min.
X_normed = (X - np.min(X, axis=0)) / np.ptp(X, axis=0)
X_normed[:5]

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667]])

In [127]:
# Check: minimum == 0 and maximum == 1
X_normed.min(axis=0)

array([0., 0., 0., 0.])

In [128]:
X_normed.max(axis=0)

array([1., 1., 1., 1.])

`pandas.DataFrame`의 통계 메서드 이용

In [135]:
features = iris.drop(columns='species')  # remove only the species column from the iris DataFrame
features

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [137]:
features.mean()  # axis=0 is the default and is omitted

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [139]:
# In a DataFrame, arithmetic is carried out between elements that share the same index.
# NOTE(review): these values differ slightly from X_standardized computed with
# np.std (e.g. -0.897674 vs -0.90068117 for the first sepal_length) — presumably
# because DataFrame.std() defaults to the sample standard deviation (ddof=1)
# while np.std defaults to ddof=0; confirm against the pandas/NumPy docs.
features_std = (features - features.mean()) / features.std()
features_std.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,-0.897674,1.015602,-1.335752,-1.311052
1,-1.1392,-0.131539,-1.335752,-1.311052
2,-1.380727,0.327318,-1.392399,-1.311052
3,-1.50149,0.097889,-1.279104,-1.311052
4,-1.018437,1.24503,-1.335752,-1.311052


In [140]:
features_std.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,-1.457168e-15,-1.638319e-15,-1.2923e-15,-5.543714e-16
std,1.0,1.0,1.0,1.0
min,-1.86378,-2.42582,-1.562342,-1.442245
25%,-0.8976739,-0.5903951,-1.222456,-1.179859
50%,-0.05233076,-0.1315388,0.3353541,0.1320673
75%,0.672249,0.5567457,0.7602115,0.7880307
max,2.483699,3.080455,1.779869,1.706379


In [141]:
# Min-max normalization using DataFrame statistics: subtract each column's
# minimum, then divide by that column's range (max - min).
features_norm = features.sub(features.min()).div(features.max() - features.min())
features_norm.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


In [142]:
features_norm.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0
