In [1]:
import numpy as np
import pandas as pd

# 자료구조

## Series

In [2]:
s = pd.Series([4, 7, -5, 3])

In [3]:
s

0    4
1    7
2   -5
3    3
dtype: int64

In [4]:
s.values

array([ 4,  7, -5,  3], dtype=int64)

In [5]:
s.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
s.dtypes

dtype('int64')

index 설정

In [7]:
s2 = pd.Series([4, 7, -5, 3], index=['d','b','a','c'])

In [8]:
s2

d    4
b    7
a   -5
c    3
dtype: int64

dictionary로 생성

In [9]:
sdata = {'Kim': 35000, 'Ahn': 67000, 'Joan': 12000, 'Choi': 4000}
s3 = pd.Series(sdata)

In [10]:
s3

Kim     35000
Ahn     67000
Joan    12000
Choi     4000
dtype: int64

In [11]:
s3.name = 'Salary'
s3.index.name = 'Names'
s3

Names
Kim     35000
Ahn     67000
Joan    12000
Choi     4000
Name: Salary, dtype: int64

In [12]:
s3.index = ['A', 'B', 'C', 'D']
s3

A    35000
B    67000
C    12000
D     4000
Name: Salary, dtype: int64

## Data Frame

In [13]:
data = {'name': ['Beomwoo', 'Beomwoo', 'Beomwoo', 'Kim', 'Park'],
        'year': [2013, 2014, 2015, 2016, 2015],
        'points': [1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data)
df

Unnamed: 0,name,year,points
0,Beomwoo,2013,1.5
1,Beomwoo,2014,1.7
2,Beomwoo,2015,3.6
3,Kim,2016,2.4
4,Park,2015,2.9


In [14]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [15]:
df.columns

Index(['name', 'year', 'points'], dtype='object')

In [16]:
df.values

array([['Beomwoo', 2013, 1.5],
       ['Beomwoo', 2014, 1.7],
       ['Beomwoo', 2015, 3.6],
       ['Kim', 2016, 2.4],
       ['Park', 2015, 2.9]], dtype=object)

In [17]:
# 각 인덱스에 대한 이름 설정
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,name,year,points
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Beomwoo,2013,1.5
1,Beomwoo,2014,1.7
2,Beomwoo,2015,3.6
3,Kim,2016,2.4
4,Park,2015,2.9


column과 index를 설정하며 생성

In [18]:
# data dictionary와 columns의 순서가 달라도 알아서 매치
df2 = pd.DataFrame(data, columns=['year', 'name', 'points', 'penalty'],
                  index=['one', 'two', 'three', 'four', 'five'])
df2

Unnamed: 0,year,name,points,penalty
one,2013,Beomwoo,1.5,
two,2014,Beomwoo,1.7,
three,2015,Beomwoo,3.6,
four,2016,Kim,2.4,
five,2015,Park,2.9,


In [19]:
# 계산 가능한 다양한 값들을 계산해서 보여줌
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2014.6,2.42
std,1.140175,0.864292
min,2013.0,1.5
25%,2014.0,1.7
50%,2015.0,2.4
75%,2015.0,2.9
max,2016.0,3.6


# DataFrame indexing

In [20]:
data = {"names": ["Kilho", "Kilho", "Kilho", "Charles", "Charles"],
           "year": [2014, 2015, 2016, 2015, 2016],
           "points": [1.5, 1.7, 3.6, 2.4, 2.9]}
df = pd.DataFrame(data, columns=["year", "names", "points", "penalty"],
                          index=["one", "two", "three", "four", "five"])
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,
two,2015,Kilho,1.7,
three,2016,Kilho,3.6,
four,2015,Charles,2.4,
five,2016,Charles,2.9,


In [21]:
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [22]:
df.year

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [23]:
df[['year', 'points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [24]:
df['penalty'] = 0.5
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.5
two,2015,Kilho,1.7,0.5
three,2016,Kilho,3.6,0.5
four,2015,Charles,2.4,0.5
five,2016,Charles,2.9,0.5


In [25]:
df['penalty'] = [0.1, 0.2, 0.3, 0.4, 0.5]
df

Unnamed: 0,year,names,points,penalty
one,2014,Kilho,1.5,0.1
two,2015,Kilho,1.7,0.2
three,2016,Kilho,3.6,0.3
four,2015,Charles,2.4,0.4
five,2016,Charles,2.9,0.5


In [26]:
# Array 추가
df['zeros'] = np.arange(5)
df

Unnamed: 0,year,names,points,penalty,zeros
one,2014,Kilho,1.5,0.1,0
two,2015,Kilho,1.7,0.2,1
three,2016,Kilho,3.6,0.3,2
four,2015,Charles,2.4,0.4,3
five,2016,Charles,2.9,0.5,4


In [27]:
# Series 추가
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
df['dept'] = val
df

Unnamed: 0,year,names,points,penalty,zeros,dept
one,2014,Kilho,1.5,0.1,0,
two,2015,Kilho,1.7,0.2,1,-1.2
three,2016,Kilho,3.6,0.3,2,
four,2015,Charles,2.4,0.4,3,-1.5
five,2016,Charles,2.9,0.5,4,-1.7


In [28]:
df['net_points'] = df['points'] - df['penalty']
df['high_points'] = df['net_points'] > 2.0
df

Unnamed: 0,year,names,points,penalty,zeros,dept,net_points,high_points
one,2014,Kilho,1.5,0.1,0,,1.4,False
two,2015,Kilho,1.7,0.2,1,-1.2,1.5,False
three,2016,Kilho,3.6,0.3,2,,3.3,True
four,2015,Charles,2.4,0.4,3,-1.5,2.0,False
five,2016,Charles,2.9,0.5,4,-1.7,2.4,True


In [29]:
# 열 삭제
del df['high_points']
del df['net_points']
del df['zeros']
df

Unnamed: 0,year,names,points,penalty,dept
one,2014,Kilho,1.5,0.1,
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,
four,2015,Charles,2.4,0.4,-1.5
five,2016,Charles,2.9,0.5,-1.7


In [30]:
df.index.name = 'Order'
df.columns.name = 'Info'
df

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014,Kilho,1.5,0.1,
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,
four,2015,Charles,2.4,0.4,-1.5
five,2016,Charles,2.9,0.5,-1.7


### DataFrame의 행 조작

In [31]:
df[0:3]

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014,Kilho,1.5,0.1,
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,


In [32]:
# 마지막 이름을 빼지 않음
df['two':'four']

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,
four,2015,Charles,2.4,0.4,-1.5


In [33]:
df.loc['two']

Info
year        2015
names      Kilho
points       1.7
penalty      0.2
dept        -1.2
Name: two, dtype: object

In [34]:
df.loc['two':'four']

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015,Kilho,1.7,0.2,-1.2
three,2016,Kilho,3.6,0.3,
four,2015,Charles,2.4,0.4,-1.5


In [35]:
df.loc['two':'four', 'points']

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [36]:
df.loc['two':'four', ['points', 'penalty']]

Info,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
two,1.7,0.2
three,3.6,0.3
four,2.4,0.4


In [37]:
df.loc[:, 'year']

Order
one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [38]:
df.loc['three':'five', 'year':'penalty']

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
three,2016,Kilho,3.6,0.3
four,2015,Charles,2.4,0.4
five,2016,Charles,2.9,0.5


In [39]:
# 새로운 행 삽입
df.loc['six', :] = [2013, 'Jun', 4.0, 0.1, 2.1]
df

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,Kilho,1.5,0.1,
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.3,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.1,2.1


### df.iloc

In [40]:
df.iloc[3]

Info
year          2015
names      Charles
points         2.4
penalty        0.4
dept          -1.5
Name: four, dtype: object

In [41]:
df.iloc[3:5, 0:2]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2015.0,Charles
five,2016.0,Charles


In [42]:
df.iloc[[0,1,3], [1,2]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Kilho,1.5
two,Kilho,1.7
four,Charles,2.4


In [43]:
df.iloc[1,1]

'Kilho'

# DataFrame의 boolean Indexing

In [44]:
df

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,Kilho,1.5,0.1,
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.3,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.1,2.1


In [45]:
df['year'] > 2014

Order
one      False
two       True
three     True
four      True
five      True
six      False
Name: year, dtype: bool

In [46]:
df.loc[df['year'] > 2014, :]

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.3,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7


In [47]:
df.loc[ df['names'] == 'Kilho', ['names', 'points'] ]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,Kilho,1.5
two,Kilho,1.7
three,Kilho,3.6


In [48]:
# 논리연산 응용
df.loc[ (df.points > 2) & (df.points < 3), : ]

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7


In [49]:
df.loc[ df.points > 3, 'penalty' ] = 0
df

Info,year,names,points,penalty,dept
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2014.0,Kilho,1.5,0.1,
two,2015.0,Kilho,1.7,0.2,-1.2
three,2016.0,Kilho,3.6,0.0,
four,2015.0,Charles,2.4,0.4,-1.5
five,2016.0,Charles,2.9,0.5,-1.7
six,2013.0,Jun,4.0,0.0,2.1


# Data

In [50]:
# index, column 설정이 없으면 0부터 시작하는 정수로 설정
df = pd.DataFrame(np.random.randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,-2.492247,0.513396,-0.658412,-0.15262
1,0.900208,0.345713,0.479513,1.433618
2,1.56461,1.951519,0.557231,0.159452
3,-1.221901,-0.128396,-0.78017,1.011731
4,-0.550795,0.294254,1.565174,-0.855774
5,-0.449974,1.365115,0.707301,0.852321


In [51]:
df.columns = ['A', 'B', 'C', 'D']
df.index = pd.date_range('20160701', periods=6)
# pandas에서 제공하는 date range함수는 datetime 자료형으로 구성된, 
# 날짜 시각등을 알 수 있는 자료형을 만드는 함수

df.index

DatetimeIndex(['2016-07-01', '2016-07-02', '2016-07-03', '2016-07-04',
               '2016-07-05', '2016-07-06'],
              dtype='datetime64[ns]', freq='D')

In [52]:
df

Unnamed: 0,A,B,C,D
2016-07-01,-2.492247,0.513396,-0.658412,-0.15262
2016-07-02,0.900208,0.345713,0.479513,1.433618
2016-07-03,1.56461,1.951519,0.557231,0.159452
2016-07-04,-1.221901,-0.128396,-0.78017,1.011731
2016-07-05,-0.550795,0.294254,1.565174,-0.855774
2016-07-06,-0.449974,1.365115,0.707301,0.852321


In [54]:
df['F'] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2016-07-01,-2.492247,0.513396,-0.658412,-0.15262,1.0
2016-07-02,0.900208,0.345713,0.479513,1.433618,
2016-07-03,1.56461,1.951519,0.557231,0.159452,3.5
2016-07-04,-1.221901,-0.128396,-0.78017,1.011731,6.1
2016-07-05,-0.550795,0.294254,1.565174,-0.855774,
2016-07-06,-0.449974,1.365115,0.707301,0.852321,7.0


  

#### NaN 없애기

In [56]:
# 행의 값 중 하나라도 nan이면 행을 없앤다.
df.dropna(how='any')

Unnamed: 0,A,B,C,D,F
2016-07-01,-2.492247,0.513396,-0.658412,-0.15262,1.0
2016-07-03,1.56461,1.951519,0.557231,0.159452,3.5
2016-07-04,-1.221901,-0.128396,-0.78017,1.011731,6.1
2016-07-06,-0.449974,1.365115,0.707301,0.852321,7.0


In [57]:
# 행의 모든 값이 nan이면 행을 없앤다.
df.dropna(how='all')

Unnamed: 0,A,B,C,D,F
2016-07-01,-2.492247,0.513396,-0.658412,-0.15262,1.0
2016-07-02,0.900208,0.345713,0.479513,1.433618,
2016-07-03,1.56461,1.951519,0.557231,0.159452,3.5
2016-07-04,-1.221901,-0.128396,-0.78017,1.011731,6.1
2016-07-05,-0.550795,0.294254,1.565174,-0.855774,
2016-07-06,-0.449974,1.365115,0.707301,0.852321,7.0


In [58]:
# nan에 값 넣기
df.fillna(value=0.5)

Unnamed: 0,A,B,C,D,F
2016-07-01,-2.492247,0.513396,-0.658412,-0.15262,1.0
2016-07-02,0.900208,0.345713,0.479513,1.433618,0.5
2016-07-03,1.56461,1.951519,0.557231,0.159452,3.5
2016-07-04,-1.221901,-0.128396,-0.78017,1.011731,6.1
2016-07-05,-0.550795,0.294254,1.565174,-0.855774,0.5
2016-07-06,-0.449974,1.365115,0.707301,0.852321,7.0


In [59]:
# nan인지 확인
df.isnull()

Unnamed: 0,A,B,C,D,F
2016-07-01,False,False,False,False,False
2016-07-02,False,False,False,False,True
2016-07-03,False,False,False,False,False
2016-07-04,False,False,False,False,False
2016-07-05,False,False,False,False,True
2016-07-06,False,False,False,False,False
