# 개요

1. numpy
2. pandas
3. torch로 vector, matrix 등 만들어보기

## 참고자료
- Official reference: https://numpy.org/doc/stable/reference/
- Stanford Univ. CS231n tutorial: https://cs231n.github.io/python-numpy-tutorial/

# 1. numpy, array, matrix 등

In [5]:
import warnings
# Suppress every warning so the cell outputs below stay uncluttered.
# NOTE(review): this blanket filter also hides DeprecationWarning/FutureWarning
# messages from numpy and pandas; consider narrowing it by category.
warnings.filterwarnings("ignore")

In [1]:
import numpy as np

## 리스트, 행렬 선언

In [2]:
# Python list syntax: list_name = ["item-1", "item-2", ..., "item-n"]

a = [1, 2, 3]            # list of integers
b = ["a", "b", "c"]      # list of strings
# A list may hold other lists, and the inner lists may differ in length.
c = [
    ["a"],
    ["a", "b"],
    ["a", "c"],
    ["a", "b", "c"],
]

# One print call with a newline separator emits each list on its own line.
print(a, b, c, sep="\n")

[1, 2, 3]
['a', 'b', 'c']
[['a'], ['a', 'b'], ['a', 'c'], ['a', 'b', 'c']]


In [3]:
# Add items to a list with append(): each call adds exactly one element at the end.
c.append('new_item')          # appends a string element
c.append(['new_item_list'])   # appends a list as a single (nested) element

print(c)

[['a'], ['a', 'b'], ['a', 'c'], ['a', 'b', 'c'], 'new_item', ['new_item_list']]


In [6]:
# Build NumPy arrays from Python lists.
a = np.array([1, 2, 3])          # 1-D integer array
b = np.array(["a", "b", "c"])    # 1-D array of strings
# The inner lists have different lengths (ragged), so no rectangular array can
# be formed. dtype=object must be requested explicitly: NumPy 1.24+ raises
# ValueError for ragged input otherwise (older versions only warned). The
# result is a 1-D array holding four Python list objects.
c = np.array([["a"], ["a", "b"], ["a", "c"], ["a", "b", "c"]], dtype=object)

print(a)
print(b)
print(c)

[1 2 3]
['a' 'b' 'c']
[list(['a']) list(['a', 'b']) list(['a', 'c']) list(['a', 'b', 'c'])]


In [7]:
# Create arrays (or matrices) filled with ones, one per requested shape.
for shape in ((1, 5), (3, 2)):
    print(np.ones(shape))

[[1. 1. 1. 1. 1.]]
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [8]:
# Create arrays (or matrices) filled with zeros, one per requested shape.
for shape in ((1, 5), (3, 2)):
    print(np.zeros(shape))

[[0. 0. 0. 0. 0.]]
[[0. 0.]
 [0. 0.]
 [0. 0.]]


In [9]:
# Build evenly spaced sequences within a given range.
# Signature: np.arange([start,] stop[, step], dtype=None); stop is exclusive.

print(np.arange(10))                  # 0..9 with the default step of 1
# Use the builtin float as dtype: the np.float alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
print(np.arange(3, 7, dtype=float))
print(np.arange(3, 10, 2))            # start=3, stop=10 (exclusive), step=2

[0 1 2 3 4 5 6 7 8 9]
[3. 4. 5. 6.]
[3 5 7 9]


In [12]:
# Declare a 2x3 matrix explicitly from nested lists.
mat1 = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

# Random matrices: integers drawn from [1, 10) and floats drawn from [0, 1).
mat2 = np.random.randint(1, 10, size=(3, 2))
mat3 = np.random.rand(3, 2)

# A blank-line separator reproduces the empty print() calls between matrices.
print(mat1, mat2, mat3, sep="\n\n")

[[1 2 3]
 [4 5 6]]

[[3 6]
 [4 4]
 [4 7]]

[[0.67535138 0.15169741]
 [0.13050552 0.37230303]
 [0.9132264  0.62475987]]


## 리스트 인덱싱(Indexing) & 슬라이싱(Slicing)

In [13]:
# Indexing: positions are zero-based, and negative indices count from the end.
a = [1,3,5,7,9,11]
print(a[2], a[5], a[-1])  # third element, sixth (last) element, last element again

5 11 11


In [14]:
# Slicing: b[start:stop] takes elements from start up to (not including) stop.
b = [2,4,6,8,10]
print(b[2:])  # from index 2 to the end
print(b[:2])  # the first two elements
print(b[:])   # every element

[6, 8, 10]
[2, 4]
[2, 4, 6, 8, 10]


## numpy의 reshape (PyTorch의 view와 비교)


In [15]:
# rand(6, 3) takes (rows, columns) and returns a 6x3 matrix of random floats.
mat1 = np.random.rand(6,3)
print(mat1)

[[0.98137376 0.46013542 0.20802182]
 [0.78274541 0.53676745 0.30371796]
 [0.13353251 0.85752984 0.3428316 ]
 [0.49569291 0.96555494 0.44211489]
 [0.47495687 0.76687987 0.80760352]
 [0.6022605  0.68287025 0.2403448 ]]


In [16]:
# -1 is a placeholder: NumPy infers that dimension from the total element count
# (18 here), so (1, -1) becomes (1, 18) and (-1, 1) becomes (18, 1).
print(mat1.reshape(1, -1).shape)
print(mat1.reshape(1, -1))
print("=====")
print(mat1.reshape(-1, 1).shape)
print(mat1.reshape(-1, 1))

(1, 18)
[[0.98137376 0.46013542 0.20802182 0.78274541 0.53676745 0.30371796
  0.13353251 0.85752984 0.3428316  0.49569291 0.96555494 0.44211489
  0.47495687 0.76687987 0.80760352 0.6022605  0.68287025 0.2403448 ]]
=====
(18, 1)
[[0.98137376]
 [0.46013542]
 [0.20802182]
 [0.78274541]
 [0.53676745]
 [0.30371796]
 [0.13353251]
 [0.85752984]
 [0.3428316 ]
 [0.49569291]
 [0.96555494]
 [0.44211489]
 [0.47495687]
 [0.76687987]
 [0.80760352]
 [0.6022605 ]
 [0.68287025]
 [0.2403448 ]]


In [17]:
# Any shape whose dimensions multiply to 18 is valid for this array.
# Note that the first line reports the shape of the (2, 9) layout, while the
# second line prints the array in the (9, 2) layout.
print(mat1.reshape(2, 9).shape)
print(mat1.reshape(9, 2))

(2, 9)
[[0.98137376 0.46013542]
 [0.20802182 0.78274541]
 [0.53676745 0.30371796]
 [0.13353251 0.85752984]
 [0.3428316  0.49569291]
 [0.96555494 0.44211489]
 [0.47495687 0.76687987]
 [0.80760352 0.6022605 ]
 [0.68287025 0.2403448 ]]


In [18]:
# Intentionally invalid: 2 * 5 = 10 does not match the 18 elements of mat1,
# so reshape raises ValueError.
mat1.reshape(2,5)

ValueError: ignored

In [19]:
# Reshape into a 3-D tensor: 3 blocks, each a 2x3 matrix (3 * 2 * 3 = 18 elements).
print(mat1.reshape(3,2,3).shape)
print(mat1.reshape(3,2,3))

(3, 2, 3)
[[[0.98137376 0.46013542 0.20802182]
  [0.78274541 0.53676745 0.30371796]]

 [[0.13353251 0.85752984 0.3428316 ]
  [0.49569291 0.96555494 0.44211489]]

 [[0.47495687 0.76687987 0.80760352]
  [0.6022605  0.68287025 0.2403448 ]]]


## matrix 또는 tensor 형태에서도 slicing이 가능합니다!


In [20]:
# 24 consecutive integers arranged in 4 columns; -1 lets NumPy infer the 6 rows.
mat2 = np.arange(24).reshape(-1, 4)
print(mat2)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]


In [21]:
# Rows before index 1 and columns before index 3; slicing both axes keeps the result 2-D.
mat2[:1,:3]

array([[0, 1, 2]])

In [22]:
# Row 3, columns 0 and 1; an integer row index drops that axis, so the result is 1-D.
mat2[3, 0:2]

array([12, 13])

## Arithmetic Operations (사칙연산)


In [23]:
# Two 3x3 integer matrices used as operands in the arithmetic examples.
x = np.array([
    [1, 3, 5],
    [7, 9, 11],
    [13, 15, 17],
])
y = np.array([
    [2, 4, 6],
    [8, 10, 12],
    [14, 16, 18],
])

# A newline separator prints x first and y directly beneath it.
print(x, y, sep="\n")

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[[ 2  4  6]
 [ 8 10 12]
 [14 16 18]]


In [24]:
# Element-wise addition of two arrays with the same shape.
x + y

array([[ 3,  7, 11],
       [15, 19, 23],
       [27, 31, 35]])

In [25]:
# Element-wise subtraction.
x - y

array([[-1, -1, -1],
       [-1, -1, -1],
       [-1, -1, -1]])

In [26]:
# Element-wise multiplication (not the matrix product; np.dot computes that).
x * y

array([[  2,  12,  30],
       [ 56,  90, 132],
       [182, 240, 306]])

In [27]:
# Element-wise true division; integer inputs produce a float result.
x / y

array([[0.5       , 0.75      , 0.83333333],
       [0.875     , 0.9       , 0.91666667],
       [0.92857143, 0.9375    , 0.94444444]])

In [28]:
# Element-wise square: the ** operator and np.power produce the same result.
print(x ** 2)
print(np.power(x, 2))

[[  1   9  25]
 [ 49  81 121]
 [169 225 289]]
[[  1   9  25]
 [ 49  81 121]
 [169 225 289]]


In [29]:
# Matrix product of x and y (rows of x times columns of y), unlike the element-wise x * y.
np.dot(x, y)

array([[ 96, 114, 132],
       [240, 294, 348],
       [384, 474, 564]])

In [30]:
# Element-wise square root; the result is a float array.
np.sqrt(x)

array([[1.        , 1.73205081, 2.23606798],
       [2.64575131, 3.        , 3.31662479],
       [3.60555128, 3.87298335, 4.12310563]])

In [31]:
# More on matrix operations: broadcasting.
z1 = np.array([[2,2,2]])  # shape (1, 3)
z2 = np.array([[2,2]])    # shape (1, 2)

# z1's single row is broadcast across all three rows of x, doubling every element.
x * z1

array([[ 2,  6, 10],
       [14, 18, 22],
       [26, 30, 34]])

In [32]:
print(z2.shape)
print(x.shape)
# Intentionally invalid: shapes (3, 3) and (1, 2) cannot be broadcast together
# because the trailing dimensions (3 vs. 2) differ, so this raises ValueError.
print(x * z2)

(1, 2)
(3, 3)


ValueError: ignored

In [33]:
# Sum
print(x)
print(x.sum(axis = 0))  # axis=0 collapses the rows: one total per column
print(x.sum(axis = 1))  # axis=1 collapses the columns: one total per row

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[21 27 33]
[ 9 27 45]


In [34]:
# Mean
print(x)
print(x.mean(axis = 0))  # axis=0: mean of each column
print(x.mean(axis = 1))  # axis=1: mean of each row

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[ 7.  9. 11.]
[ 3.  9. 15.]


In [35]:
# Standard deviation (population form, i.e. NumPy's default ddof=0)
print(x)
print(x.std(axis = 0))  # axis=0: standard deviation of each column
print(x.std(axis = 1))  # axis=1: standard deviation of each row

[[ 1  3  5]
 [ 7  9 11]
 [13 15 17]]
[4.89897949 4.89897949 4.89897949]
[1.63299316 1.63299316 1.63299316]


In [36]:
print(x.T)              # transpose: rows and columns swapped
print(np.dot(x, z1.T))  # (3, 3) dot (3, 1) -> (3, 1): each row sum of x times 2

[[ 1  7 13]
 [ 3  9 15]
 [ 5 11 17]]
[[18]
 [54]
 [90]]


## Other operations


In [37]:
# A 1-D array of 15 random floats.
xx = np.random.rand(15)
print(xx)

[0.47690458 0.37269077 0.25605797 0.29518506 0.79173043 0.45224452
 0.07344784 0.47493753 0.0909362  0.07365072 0.82930456 0.99303679
 0.54166912 0.02624229 0.0039867 ]


In [38]:
print(xx)
print(xx.argsort()) # Indices that would sort xx; pass axis= to apply it to a matrix as well.
xx.sort()  # sorts xx in place, in ascending order
print(xx)

[0.47690458 0.37269077 0.25605797 0.29518506 0.79173043 0.45224452
 0.07344784 0.47493753 0.0909362  0.07365072 0.82930456 0.99303679
 0.54166912 0.02624229 0.0039867 ]
[14 13  6  9  8  2  3  1  5  7  0 12  4 10 11]
[0.0039867  0.02624229 0.07344784 0.07365072 0.0909362  0.25605797
 0.29518506 0.37269077 0.45224452 0.47493753 0.47690458 0.54166912
 0.79173043 0.82930456 0.99303679]


# 2. Pandas

- https://pandas.pydata.org/pandas-docs/stable/

In [39]:
import pandas as pd

## Pandas Series 만들기


- pandas series는 1차원 데이터 집합


In [40]:
# A Series pairs each value with an index label; building it from a dict uses
# the keys as labels (in insertion order) and the values as data.
pd_series = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5})
pd_series

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [41]:
# Basic information about a pandas Series
print('차원:', pd_series.ndim)         # number of dimensions
print('형태: ', pd_series.shape)       # shape tuple
print('총 원소의 수:', pd_series.size)  # total number of elements
print('값:', pd_series.values)         # underlying values as a NumPy array
print('인덱스:', pd_series.index)      # index labels

차원: 1
형태:  (5,)
총 원소의 수: 5
값: [1 2 3 4 5]
인덱스: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


## index를 활용한 데이터 탐색


In [42]:
# loc accesses by index label; iloc accesses by integer position.

print(pd_series.loc['a'])        # single label -> scalar
print(pd_series.iloc[0])         # single position -> scalar
print(pd_series.loc[['a','c']])  # list of labels -> Series
print(pd_series.iloc[[0,2]])     # list of positions -> Series

1
1
a    1
c    3
dtype: int64
a    1
c    3
dtype: int64


## 데이터 삭제


In [43]:
# drop() returns a new Series without label 'b'; pd_series itself is left unchanged.
pd_series.drop('b')

a    1
c    3
d    4
e    5
dtype: int64

In [44]:
print(pd_series)
# inplace=True removes label 'b' from pd_series itself instead of returning a new Series.
pd_series.drop('b', inplace=True)
print(pd_series)

a    1
b    2
c    3
d    4
e    5
dtype: int64
a    1
c    3
d    4
e    5
dtype: int64


## Pandas Dataframe 만들기


- 2차원 데이터 집합. 행렬과 비슷하게 row와 column을 갖고 있다


In [45]:
# Three 15-element columns: consecutive integers, random integers in [0, 15),
# and random floats in [0, 1).
data = dict(
    A=np.arange(15),
    B=np.random.randint(0, 15, size=15),
    C=np.random.rand(15),
)

# Each dict key becomes a column name of the DataFrame.
data_df = pd.DataFrame(data)

In [46]:
# First five rows (head() defaults to n=5).
data_df.head()

Unnamed: 0,A,B,C
0,0,6,0.924407
1,1,9,0.78665
2,2,4,0.267608
3,3,3,0.323856
4,4,6,0.649578


In [47]:
# Last five rows (tail() defaults to n=5).
data_df.tail()

Unnamed: 0,A,B,C
10,10,9,0.152762
11,11,6,0.01104
12,12,10,0.221307
13,13,8,0.234862
14,14,5,0.254979


In [48]:
# (number of rows, number of columns)
data_df.shape

(15, 3)

## Indexing and Slicing

In [49]:
# Slicing with [] selects rows by position: rows 1 and 2 (stop is exclusive).
data_df[1:3]

Unnamed: 0,A,B,C
1,1,9,0.78665
2,2,4,0.267608


In [50]:
# Select the row labelled 1 as a Series; the mixed int/float columns are upcast to float64.
data_df.loc[1]

A    1.00000
B    9.00000
C    0.78665
Name: 1, dtype: float64

In [51]:
# Select a single cell by row label and column name in one .loc call. This
# avoids chained indexing (.loc[1]['C']), which first materializes the whole
# row as a float64 Series before looking up the column.
data_df.loc[1, 'C']

0.7866501861792591

## Add, Remove, etc.

In [52]:
# Add a boolean column 'D' that is True where column 'A' is at least 5.
data_df['D'] = data_df['A'] >= 5
data_df

Unnamed: 0,A,B,C,D
0,0,6,0.924407,False
1,1,9,0.78665,False
2,2,4,0.267608,False
3,3,3,0.323856,False
4,4,6,0.649578,False
5,5,11,0.844697,True
6,6,14,0.512173,True
7,7,0,0.642425,True
8,8,4,0.049137,True
9,9,1,0.829374,True


In [54]:
# Remove column 'D' (axis=1 targets columns) from data_df itself.
data_df.drop('D', axis=1, inplace=True)

In [55]:
# Return the rows ordered by index label, descending; data_df itself is not modified.
data_df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C
14,14,5,0.254979
13,13,8,0.234862
12,12,10,0.221307
11,11,6,0.01104
10,10,9,0.152762
9,9,1,0.829374
8,8,4,0.049137
7,7,0,0.642425
6,6,14,0.512173
5,5,11,0.844697
