# Pandas

### Series

#### series의 index와 Value

In [25]:
import pandas as pd # pandas라이브러리를 pd라는 명칭으로 축약해 호출
import numpy as np  # numpy 라이브러리를 np라는 명칭으로 축약해 호출

# Create a Series whose index labels are date strings and bind it to pandas_series.
pandas_series = pd.Series({
    '2016-11-10': 3000,
    '2016-11-11': 3200,
    '2016-11-12': 2700,
})

print(type(pandas_series))  # print the type of pandas_series
pandas_series  # evaluate pandas_series so the notebook displays it

<class 'pandas.core.series.Series'>


2016-11-10    3000
2016-11-11    3200
2016-11-12    2700
dtype: int64

- series를 통해 원하는 위치의 값 출력

In [26]:
pandas_series[1:]  # slice the Series from its second element to the end

2016-11-11    3200
2016-11-12    2700
dtype: int64

### DataFrame

- DataFrame 예제

In [27]:
import numpy as np  # numpy라이브러리를 np라는 명칭으로 축약해 호출
import pandas as pd # pandas라이브러리를 pd라는 명칭으로 축약해 호출

# Build df: a single 'numbers' column with row labels 'a' through 'e'.
df = pd.DataFrame(
    {'numbers': [100, 150, 200, 250, 300]},
    index=['a', 'b', 'c', 'd', 'e'],
)
df

Unnamed: 0,numbers
a,100
b,150
c,200
d,250
e,300


- DataFrame 객체를 사용하는 예제

In [28]:
df.index # show the DataFrame's index (row labels)

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [29]:
df.columns # show the DataFrame's column labels

Index(['numbers'], dtype='object')

In [34]:
df.loc['c'] # select the row whose index label is 'c'

numbers    200
Name: c, dtype: int64

- DataFrame 연산

In [7]:
df.sum() # sum the values of each column

numbers    1000
dtype: int64

In [8]:
df.numbers**2 # square every value in the 'numbers' column

a    10000
b    22500
c    40000
d    62500
e    90000
Name: numbers, dtype: int64

- DataFrame의 Column을 추가하는 방법

In [9]:
# Add a column named 'values' to the DataFrame and fill it with five values
df['values'] = (10, 50, 40, 30, 60)
df

Unnamed: 0,numbers,values
a,100,10
b,150,50
c,200,40
d,250,30
e,300,60


- DataFrame의 Column 삭제

In [10]:
del df['values']  # remove the 'values' column from df
df

Unnamed: 0,numbers
a,100
b,150
c,200
d,250
e,300


- DataFrame에서 값의 개수를 인덱스 길이와 맞추지 않으면 다음과 같은 에러가 발생

In [11]:
df['values'] = (10,50,40) # raises an error: 3 values do not match the 5-row index
df

ValueError: Length of values (3) does not match length of index (5)

- DataFrame에서 Join 사용방법

In [12]:
# Left frame: column 'A' holding the strings '1', '2', '3'.
df_1 = pd.DataFrame({'A': ['1', '2', '3']})

# Right frame: column 'B' holding the strings '4', '5', '6', '7'.
df_2 = pd.DataFrame({'B': ['4', '5', '6', '7']})

# Outer-join the two frames on their index; the row missing from df_1 gets NaN in 'A'.
df = df_1.join(df_2, how='outer')
df  # display the joined DataFrame

Unnamed: 0,A,B
0,1.0,4
1,2.0,5
2,3.0,6
3,,7


- DataFrame에 난수를 이용한 임의의 값 생성

In [13]:
import pandas as pd
import numpy as np

# Create a 5x5 DataFrame of random values with columns 'A' through 'E'.
# np.random.rand draws uniform samples from [0, 1).
# Note: the values change every time this cell is re-run.
df = pd.DataFrame(np.random.rand(5, 5), columns=['A', 'B', 'C', 'D', 'E'])

df

Unnamed: 0,A,B,C,D,E
0,0.688685,0.666215,0.518273,0.207833,0.130843
1,0.250843,0.673491,0.340276,0.321386,0.390729
2,0.401853,0.899842,0.052629,0.218002,0.58772
3,0.198345,0.767877,0.728613,0.426518,0.838934
4,0.03377,0.685632,0.085801,0.626377,0.120613


In [14]:
df.max() # maximum value of each column

A    0.688685
B    0.899842
C    0.728613
D    0.626377
E    0.838934
dtype: float64

In [15]:
df.min() # minimum value of each column

A    0.033770
B    0.666215
C    0.052629
D    0.207833
E    0.120613
dtype: float64

In [16]:
df.mean() # mean of each column

A    0.314699
B    0.738611
C    0.345118
D    0.360023
E    0.413768
dtype: float64

In [17]:
df.std() # standard deviation of each column

A    0.246981
B    0.098920
C    0.287171
D    0.173343
E    0.307222
dtype: float64

In [18]:
df.cumsum() # cumulative sum down each column

Unnamed: 0,A,B,C,D,E
0,0.688685,0.666215,0.518273,0.207833,0.130843
1,0.939529,1.339706,0.858549,0.529219,0.521571
2,1.341382,2.239548,0.911177,0.747221,1.109292
3,1.539727,3.007425,1.63979,1.173738,1.948225
4,1.573497,3.693057,1.725591,1.800116,2.068838


In [19]:
# Use describe() to show summary statistics of the DataFrame's data
df.describe()

Unnamed: 0,A,B,C,D,E
count,5.0,5.0,5.0,5.0,5.0
mean,0.314699,0.738611,0.345118,0.360023,0.413768
std,0.246981,0.09892,0.287171,0.173343,0.307222
min,0.03377,0.666215,0.052629,0.207833,0.120613
25%,0.198345,0.673491,0.085801,0.218002,0.130843
50%,0.250843,0.685632,0.340276,0.321386,0.390729
75%,0.401853,0.767877,0.518273,0.426518,0.58772
max,0.688685,0.899842,0.728613,0.626377,0.838934


- Group by를 이용한 DataFrame 그룹화

In [20]:
# Before grouping, add a 'division' column that assigns each row to a group
df['division'] = ['X','Y','X','Y','Z']
df

Unnamed: 0,A,B,C,D,E,division
0,0.688685,0.666215,0.518273,0.207833,0.130843,X
1,0.250843,0.673491,0.340276,0.321386,0.390729,Y
2,0.401853,0.899842,0.052629,0.218002,0.58772,X
3,0.198345,0.767877,0.728613,0.426518,0.838934,Y
4,0.03377,0.685632,0.085801,0.626377,0.120613,Z


In [21]:
# Group the DataFrame by the 'division' column and compute the mean of each group
df.groupby(['division']).mean()

Unnamed: 0_level_0,A,B,C,D,E
division,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
X,0.545269,0.783029,0.285451,0.212918,0.359281
Y,0.224594,0.720684,0.534444,0.373952,0.614831
Z,0.03377,0.685632,0.085801,0.626377,0.120613
