In [1]:
import pandas as pd
import numpy as np

#### 1. pandas Series
* python의 list나 numpy의 array가 인자로 입력된다.
* Series의 특징은 값과 함께, 우리가 원하는 index를 입력할 수 있다는 것
* 또 다른 특징은 Series의 이름과 index에 이름을 지정해줄 수 있다는 것

참고 : https://nittaku.tistory.com/110

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [5]:
s.values   # 값(values)만 따로 array로 확인

# array([ 1.,  3.,  5., nan,  6.,  8.])

array([ 1.,  3.,  5., nan,  6.,  8.])

In [6]:
s.index # index 범위값만 얻을 수 있다.

# RangeIndex(start=0, stop=6, step=1)

RangeIndex(start=0, stop=6, step=1)

In [7]:
s.dtypes # 데이터형을 확인

# dtype('float64')

dtype('float64')

In [8]:
# 원하는 인덱스 입력
s2 = pd.Series([1, 2, 3, 4, 5, 6], index=['a', 'b', 'c', 'd', 'e', 'f'])
s2

a    1
b    2
c    3
d    4
e    5
f    6
dtype: int64

Series는 python 에서 중괄호로 {"key": value, "key": value} 형식으로 만든 딕셔너리와 비슷

In [9]:
dic1 = {'홍길동' : 185, '강감찬' : 180, "이순신" : 175 }

s3 = pd.Series(dic1)

s3

홍길동    185
강감찬    180
이순신    175
dtype: int64

In [10]:
# 원하는 Series 이름과 index 이름 지정

s3.name = "인물들"
s3.index.name = '이름'

s3

이름
홍길동    185
강감찬    180
이순신    175
Name: 인물들, dtype: int64

In [11]:
# 직접 입력한 index 값을 변경
s3.index = ['아버지', '형', "동생"]

s3

아버지    185
형      180
동생     175
Name: 인물들, dtype: int64

#### 2. pandas DataFrame
* python 의 딕셔너리 형태로 Data를 정의해준 뒤 DataFrame을 정의한다.
* DataFrame 에서는 2종류의 index가 존재. 가로의 index를 columns
* 가로로는 키값(columns), 세로로는 index(리스트 성분의 갯수)

In [12]:
data = {"name" : ['hong', 'park', 'kim'],
        'age' : [29, 25, 20],
        "grade" : ['a', "b", 'c']}

df = pd.DataFrame(data)

In [13]:
df

Unnamed: 0,name,age,grade
0,hong,29,a
1,park,25,b
2,kim,20,c


In [14]:
df.index

# RangeIndex(start=0, stop=3, step=1)

RangeIndex(start=0, stop=3, step=1)

In [15]:
df. columns

# Index(['name', 'age', 'grade'], dtype='object')

Index(['name', 'age', 'grade'], dtype='object')

In [16]:
df.values

# array([['hong', 29, 'a'],
#       ['park', 25, 'b'],
#       ['kim', 20, 'c']], dtype=object)

array([['hong', 29, 'a'],
       ['park', 25, 'b'],
       ['kim', 20, 'c']], dtype=object)

In [17]:
# DataFrame 의 index와 columns에 이름 붙이기
df.index.name = 'Num'
df.columns.name = "Info"

In [18]:
df

Info,name,age,grade
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,hong,29,a
1,park,25,b
2,kim,20,c


In [19]:
data2 = {"name" : ['hong', 'park', 'kim', 'lee'],
        'age' : [29, 25, 20, 21],
        "grade" : ['a', "b", 'c', 'd']}
df2 = pd.DataFrame(data2)

In [20]:
df2

Unnamed: 0,name,age,grade
0,hong,29,a
1,park,25,b
2,kim,20,c
3,lee,21,d


* age 와 grade의 순서를 변경 
* NEW 라는 columns 생성
* NEW 라는 컬럼에 values를 지정하지 않았기 때문에 NaN 이라고 나온다.
* NaN (Not a Number)

In [23]:
df3 = pd.DataFrame(data2, columns=['name', 'grade', 'age', 'NEW'],
                   index = ['one', 'two', 'three', 'four'])

In [24]:
df3

Unnamed: 0,name,grade,age,NEW
one,hong,a,29,
two,park,b,25,
three,kim,c,20,
four,lee,d,21,


#### 3. pandas date_range
* 날짜형의 데이터인 date_range 가 있다.
* 기본 날짜를 지정하고 periods 옵션으로 기간을 지정

In [25]:
dates = pd.date_range('20190705', periods=7)

dates

# DatetimeIndex(['2019-07-05', '2019-07-06', '2019-07-07', '2019-07-08',
#               '2019-07-09', '2019-07-10', '2019-07-11'],
#               dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2019-07-05', '2019-07-06', '2019-07-07', '2019-07-08',
               '2019-07-09', '2019-07-10', '2019-07-11'],
              dtype='datetime64[ns]', freq='D')

#### 연습하기

* DataFrame 형태의 데이터 생성
* 7행 4열의 random 변수를 만들고, 컬럼에는 A, B, C, D로 지정
* index는 위에서 만든 dates 변수를 이용

In [26]:
ran = pd.DataFrame(np.random.randn(7,4), index=dates, columns=['A', 'B', 'C', 'D'])

ran

Unnamed: 0,A,B,C,D
2019-07-05,0.218174,-0.751474,-0.043165,0.496226
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-07,0.441978,1.180014,-0.823068,0.166877
2019-07-08,-1.458178,0.570124,2.230147,0.294385
2019-07-09,-1.518938,-0.10263,-0.586794,-0.884566
2019-07-10,0.997772,-1.422569,0.161246,-0.868626
2019-07-11,-0.914662,-0.101962,0.555409,-0.709635


In [27]:
ran.index

DatetimeIndex(['2019-07-05', '2019-07-06', '2019-07-07', '2019-07-08',
               '2019-07-09', '2019-07-10', '2019-07-11'],
              dtype='datetime64[ns]', freq='D')

In [28]:
ran.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [29]:
ran.values

array([[ 0.21817438, -0.75147357, -0.043165  ,  0.49622641],
       [ 0.69247654, -0.89686764, -2.41614855,  0.16708319],
       [ 0.44197812,  1.18001361, -0.82306843,  0.16687651],
       [-1.45817826,  0.5701241 ,  2.23014679,  0.29438487],
       [-1.51893763, -0.10263   , -0.58679418, -0.88456563],
       [ 0.99777191, -1.42256949,  0.1612459 , -0.86862559],
       [-0.91466161, -0.10196226,  0.55540879, -0.70963471]])

In [30]:
ran.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2019-07-05 to 2019-07-11
Freq: D
Data columns (total 4 columns):
A    7 non-null float64
B    7 non-null float64
C    7 non-null float64
D    7 non-null float64
dtypes: float64(4)
memory usage: 280.0 bytes


**.describe()** 메소드는 생성했던 DataFrame의 간단한 통계 정보를 보여준다.

데이터의 개수(count)

데이터의 평균 값(mean)

표준 편차(std) : standard deviation

최솟값(min)

4분위수(25%, 50%, 75%)

최댓값(max)

In [31]:
ran.describe()

Unnamed: 0,A,B,C,D
count,7.0,7.0,7.0,7.0
mean,-0.220197,-0.217909,-0.131768,-0.191179
std,1.052669,0.894315,1.417033,0.601854
min,-1.518938,-1.422569,-2.416149,-0.884566
25%,-1.18642,-0.824171,-0.704931,-0.78913
50%,0.218174,-0.10263,-0.043165,0.166877
75%,0.567227,0.234081,0.358327,0.230734
max,0.997772,1.180014,2.230147,0.496226


**sort_values** 는 **by** 로 지정된 컬럼을 기준으로 정렬한다

**ascending** 옵션으로 내림차순이나 오름차순으로 정렬할 수 있다.

**ascending** 을 지정하지 않으면 default 값으로 오름차순이 된다.

In [36]:
ran.sort_values(by='B', ascending=False)

Unnamed: 0,A,B,C,D
2019-07-07,0.441978,1.180014,-0.823068,0.166877
2019-07-08,-1.458178,0.570124,2.230147,0.294385
2019-07-11,-0.914662,-0.101962,0.555409,-0.709635
2019-07-09,-1.518938,-0.10263,-0.586794,-0.884566
2019-07-05,0.218174,-0.751474,-0.043165,0.496226
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-10,0.997772,-1.422569,0.161246,-0.868626


In [38]:
# 변수명 뒤에 ['columns명'] 을 하면 해당 컬럼만 Series로 보여준다

ran['A']

2019-07-05    0.218174
2019-07-06    0.692477
2019-07-07    0.441978
2019-07-08   -1.458178
2019-07-09   -1.518938
2019-07-10    0.997772
2019-07-11   -0.914662
Freq: D, Name: A, dtype: float64

In [41]:
ran[1:3]

Unnamed: 0,A,B,C,D
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-07,0.441978,1.180014,-0.823068,0.166877


In [51]:
ran['A'][0:3]

2019-07-05    0.218174
2019-07-06    0.692477
2019-07-07    0.441978
Freq: D, Name: A, dtype: float64

In [58]:
ran['20190705':'20190707']

Unnamed: 0,A,B,C,D
2019-07-05,0.218174,-0.751474,-0.043165,0.496226
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-07,0.441978,1.180014,-0.823068,0.166877


In [64]:
# 20190705 의 내용을 Series 형태로 출력

ran.loc[dates[0]]

A    0.218174
B   -0.751474
C   -0.043165
D    0.496226
Name: 2019-07-05 00:00:00, dtype: float64

In [60]:
ran.loc[:,['A','C']]

Unnamed: 0,A,C
2019-07-05,0.218174,-0.043165
2019-07-06,0.692477,-2.416149
2019-07-07,0.441978,-0.823068
2019-07-08,-1.458178,2.230147
2019-07-09,-1.518938,-0.586794
2019-07-10,0.997772,0.161246
2019-07-11,-0.914662,0.555409


In [65]:
ran.loc['20190705':'20190707',['A', 'C']]

Unnamed: 0,A,C
2019-07-05,0.218174,-0.043165
2019-07-06,0.692477,-2.416149
2019-07-07,0.441978,-0.823068


In [67]:
ran.loc['20190705',['A', 'C']]

A    0.218174
C   -0.043165
Name: 2019-07-05 00:00:00, dtype: float64

In [70]:
ran.loc[dates[0], ['A', 'C']]

A    0.218174
C   -0.043165
Name: 2019-07-05 00:00:00, dtype: float64

In [71]:
ran.loc[dates[0], 'A']

0.2181743788260634

* loc 와 달리 행과 열의 번호를 이용해서 데이터에 접근하는 방법은 iloc이다

In [72]:
ran.iloc[0]    # ran.loc[dates[0]] 와 결과가 같다.

A    0.218174
B   -0.751474
C   -0.043165
D    0.496226
Name: 2019-07-05 00:00:00, dtype: float64

In [73]:
ran.iloc[:,0:2]   # ran.loc[:,['A','B']] 와 결과가 같다.

Unnamed: 0,A,B
2019-07-05,0.218174,-0.751474
2019-07-06,0.692477,-0.896868
2019-07-07,0.441978,1.180014
2019-07-08,-1.458178,0.570124
2019-07-09,-1.518938,-0.10263
2019-07-10,0.997772,-1.422569
2019-07-11,-0.914662,-0.101962


In [74]:
ran.iloc[[0,1,3],[0,2]]

Unnamed: 0,A,C
2019-07-05,0.218174,-0.043165
2019-07-06,0.692477,-2.416149
2019-07-08,-1.458178,2.230147


In [78]:
# 행은 두 번째와 세 번째만 가져오고 열은 다 가져오기
ran.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-07,0.441978,1.180014,-0.823068,0.166877


In [79]:
ran.iloc[:, 1:3]

Unnamed: 0,B,C
2019-07-05,-0.751474,-0.043165
2019-07-06,-0.896868,-2.416149
2019-07-07,1.180014,-0.823068
2019-07-08,0.570124,2.230147
2019-07-09,-0.10263,-0.586794
2019-07-10,-1.422569,0.161246
2019-07-11,-0.101962,0.555409


In [80]:
ran

Unnamed: 0,A,B,C,D
2019-07-05,0.218174,-0.751474,-0.043165,0.496226
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-07,0.441978,1.180014,-0.823068,0.166877
2019-07-08,-1.458178,0.570124,2.230147,0.294385
2019-07-09,-1.518938,-0.10263,-0.586794,-0.884566
2019-07-10,0.997772,-1.422569,0.161246,-0.868626
2019-07-11,-0.914662,-0.101962,0.555409,-0.709635


In [83]:
ran.A   # ran['A'] 과 같다.

2019-07-05    0.218174
2019-07-06    0.692477
2019-07-07    0.441978
2019-07-08   -1.458178
2019-07-09   -1.518938
2019-07-10    0.997772
2019-07-11   -0.914662
Freq: D, Name: A, dtype: float64

In [84]:
# 컬럼 A 에서 0 보다 큰 행만 출력하기.
ran[ran.A > 0]

Unnamed: 0,A,B,C,D
2019-07-05,0.218174,-0.751474,-0.043165,0.496226
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-07,0.441978,1.180014,-0.823068,0.166877
2019-07-10,0.997772,-1.422569,0.161246,-0.868626


In [85]:
# 데이터 전체에 조건을 걸면 만족하지 않은 곳은 NaN 으로 처리가 된다.

ran[ran > 0]

Unnamed: 0,A,B,C,D
2019-07-05,0.218174,,,0.496226
2019-07-06,0.692477,,,0.167083
2019-07-07,0.441978,1.180014,,0.166877
2019-07-08,,0.570124,2.230147,0.294385
2019-07-09,,,,
2019-07-10,0.997772,,0.161246,
2019-07-11,,,0.555409,


* DataFrame 을 복사할 때 ' = ' 를 사용해서 복사하면 실제 데이터 내용이 복사되는 것이 아니라 데이터 위치만 복사하기 때문에 원본 데이터는 하나만 존재한다.

* 데이터의 내용까지 복사하려면 **copy()** 를 사용한다

In [86]:
ran2 = ran.copy()

In [90]:
# ran2['E'] = ['one', 'two', 'three', 'four', 'five', 'six', 'seven']
ran2['E'] = ['one', 'two', 'three', 'one', 'five', 'two', 'seven']

ran2

Unnamed: 0,A,B,C,D,E
2019-07-05,0.218174,-0.751474,-0.043165,0.496226,one
2019-07-06,0.692477,-0.896868,-2.416149,0.167083,two
2019-07-07,0.441978,1.180014,-0.823068,0.166877,three
2019-07-08,-1.458178,0.570124,2.230147,0.294385,one
2019-07-09,-1.518938,-0.10263,-0.586794,-0.884566,five
2019-07-10,0.997772,-1.422569,0.161246,-0.868626,two
2019-07-11,-0.914662,-0.101962,0.555409,-0.709635,seven


In [98]:
# E 컬럼에서 one 과 two 가 있는지 확인하기           .isin() 사용

ran2['E'].isin(['one', 'two'])


2019-07-05     True
2019-07-06     True
2019-07-07    False
2019-07-08     True
2019-07-09    False
2019-07-10     True
2019-07-11    False
Freq: D, Name: E, dtype: bool

In [99]:
# ran2 에서 E 컬럼에 one 이나 two가 있는 값만 출력

ran2[ran2['E'].isin(['one', 'two'])]

Unnamed: 0,A,B,C,D,E
2019-07-05,0.218174,-0.751474,-0.043165,0.496226,one
2019-07-06,0.692477,-0.896868,-2.416149,0.167083,two
2019-07-08,-1.458178,0.570124,2.230147,0.294385,one
2019-07-10,0.997772,-1.422569,0.161246,-0.868626,two


* 누적합을 알고 싶을 때는 **numpy** 의 **cumsum** 을 이용

In [100]:
ran2.apply(np.cumsum)

Unnamed: 0,A,B,C,D,E
2019-07-05,0.218174,-0.751474,-0.043165,0.496226,one
2019-07-06,0.910651,-1.648341,-2.459314,0.66331,onetwo
2019-07-07,1.352629,-0.468328,-3.282382,0.830186,onetwothree
2019-07-08,-0.105549,0.101797,-1.052235,1.124571,onetwothreeone
2019-07-09,-1.624487,-0.000833,-1.639029,0.240005,onetwothreeonefive
2019-07-10,-0.626715,-1.423403,-1.477783,-0.62862,onetwothreeonefivetwo
2019-07-11,-1.541377,-1.525365,-0.922375,-1.338255,onetwothreeonefivetwoseven


In [104]:
ran

Unnamed: 0,A,B,C,D
2019-07-05,0.218174,-0.751474,-0.043165,0.496226
2019-07-06,0.692477,-0.896868,-2.416149,0.167083
2019-07-07,0.441978,1.180014,-0.823068,0.166877
2019-07-08,-1.458178,0.570124,2.230147,0.294385
2019-07-09,-1.518938,-0.10263,-0.586794,-0.884566
2019-07-10,0.997772,-1.422569,0.161246,-0.868626
2019-07-11,-0.914662,-0.101962,0.555409,-0.709635


* 최대값과 최소값의 차이(혹은 거리)를 알고 싶다면 one-line 함수인 lambda 를 이용할 수 있다.

In [102]:
ran.apply(lambda x: x.max() - x.min())

A    2.516710
B    2.602583
C    4.646295
D    1.380792
dtype: float64