In [1]:
# Series    - one-dimensional array
# DataFrame - two-dimensional array, in tabular form
import numpy as np
import pandas as pd

In [2]:
# Create a Series from a plain list; the index defaults to 0, 1, 2, ...
obj = pd.Series(data=[3, 6, 9, 12])
obj

0     3
1     6
2     9
3    12
dtype: int64

In [3]:
# Use a Series whenever the data is one-dimensional
s = pd.Series(data=[1.0, 3.0, 5.0, 7.0, 9.0])
print(s)

0    1.0
1    3.0
2    5.0
3    7.0
4    9.0
dtype: float64


In [4]:
# String labels can be used in place of the default integer index
obj = pd.Series(data=[3, 6, 9, 12], index=list('abcd'))
obj

a     3
b     6
c     9
d    12
dtype: int64

In [5]:
# Building a Series from a dict: check how the keys and values end up
# (keys become the index labels, values become the data)
emp = {
    '김철수': 5000,
    '김철호': 7000,
    '한상민': 4000,
    '문대용': 4500,
}
obj = pd.Series(data=emp)
obj

김철수    5000
김철호    7000
한상민    4000
문대용    4500
dtype: int64

In [6]:
# Passing a two-dimensional (nested) list as the data creates a DataFrame,
# one inner list per row
a = pd.DataFrame(data=[[10, 20, 30],
                       [40, 50, 60],
                       [70, 80, 90]])
print(a)

    0   1   2
0  10  20  30
1  40  50  60
2  70  80  90


In [25]:
# Extracting the data you want
import pandas as pd


def _report(title, result, end='\n\n'):
    """Print a section title, a blank line, then the result followed by `end`."""
    print(title, end='\n\n')
    print(result, end=end)


# Build a table of gender, height and weight
tbl = pd.DataFrame(dict(
    gender=['f', 'm', 'm', 'f', 'f'],
    height=[170, 180, 155, 143, 154],
    weight=[80.0, 70.4, 65.5, 45.9, 51.2],
))

# Single column: the list of weights
_report('몸무게 목록', tbl['weight'])

# Two columns: weights and heights, in the order requested
_report('몸무게와 키 목록', tbl[['weight', 'height']])

# Boolean mask: rows whose height is at least 160
_report('--- height가 160 이상', tbl[tbl['height'] >= 160])

# Boolean mask: rows whose gender is 'm'
_report('--- gender가 m', tbl[tbl['gender'] == 'm'])

# Sorted by height (ascending by default)
_report('--- 키로 정렬', tbl.sort_values(by='height'))

# Sorted by weight, largest first; the final section ends with a single newline
_report('--- 몸무게로 정렬', tbl.sort_values(by='weight', ascending=False), end='\n')

몸무게 목록

0    80.0
1    70.4
2    65.5
3    45.9
4    51.2
Name: weight, dtype: float64

몸무게와 키 목록

   weight  height
0    80.0     170
1    70.4     180
2    65.5     155
3    45.9     143
4    51.2     154

--- height가 160 이상

  gender  height  weight
0      f     170    80.0
1      m     180    70.4

--- gender가 m

  gender  height  weight
1      m     180    70.4
2      m     155    65.5

--- 키로 정렬

  gender  height  weight
3      f     143    45.9
4      f     154    51.2
2      m     155    65.5
0      f     170    80.0
1      m     180    70.4

--- 몸무게로 정렬

  gender  height  weight
0      f     170    80.0
1      m     180    70.4
2      m     155    65.5
4      f     154    51.2
3      f     143    45.9


In [26]:
# DataFrame
# Store the data as a dictionary: one key per column
data = dict(
    names=['김철수', '이철호', '김영희', '박민수', '송철호'],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
    year=[2014, 2015, 2016, 2017, 2018],
)

# Convert the dictionary to a DataFrame
df = pd.DataFrame(data=data)

# Displayed as tabular data
df

Unnamed: 0,names,points,year
0,김철수,1.5,2014
1,이철호,1.7,2015
2,김영희,3.6,2016
3,박민수,2.4,2017
4,송철호,2.9,2018


In [27]:
# Inspect the values (shown as a two-dimensional ndarray)
df.values

array([['김철수', 1.5, 2014],
       ['이철호', 1.7, 2015],
       ['김영희', 3.6, 2016],
       ['박민수', 2.4, 2017],
       ['송철호', 2.9, 2018]], dtype=object)

In [28]:
# The index and the column axis can each be given a name.
df.index.name = 'Num'
df.columns.name = 'Info'
df

Info,names,points,year
Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,김철수,1.5,2014
1,이철호,1.7,2015
2,김영희,3.6,2016
3,박민수,2.4,2017
4,송철호,2.9,2018


In [29]:
# Column order and row labels can be specified explicitly.
# 'penalty' is requested as a column although `data` has no such key.
df2 = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)
df2

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,
two,2015,이철호,1.7,
three,2016,김영희,3.6,
four,2017,박민수,2.4,
five,2018,송철호,2.9,


In [30]:
# penalty is a new field, so it is shown as NaN (Not a Number)
# NaN values cannot be used in calculations as-is, so a replacement value needs to be set
# NOTE: the filled frame is only displayed here; it is not assigned back to df2
df2.fillna(0)

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0
two,2015,이철호,1.7,0
three,2016,김영희,3.6,0
four,2017,박민수,2.4,0
five,2018,송철호,2.9,0


In [31]:
# Inspect the underlying values of df2 as an ndarray
df2.values

array([[2014, '김철수', 1.5, nan],
       [2015, '이철호', 1.7, nan],
       [2016, '김영희', 3.6, nan],
       [2017, '박민수', 2.4, nan],
       [2018, '송철호', 2.9, nan]], dtype=object)

In [32]:
# Compute and display basic summary statistics for the columns that can be calculated on
# Useful for getting an overall look at a dataset
df2.describe()

Unnamed: 0,year,points
count,5.0,5.0
mean,2016.0,2.42
std,1.581139,0.864292
min,2014.0,1.5
25%,2015.0,1.7
50%,2016.0,2.4
75%,2017.0,2.9
max,2018.0,3.6


In [33]:
import pandas as pd
import numpy as np

# Source records: one key per column
data = dict(
    names=['김철수', '이철호', '김영희', '박민수', '송철호'],
    year=[2014, 2015, 2016, 2017, 2018],
    points=[1.5, 1.7, 3.6, 2.4, 2.9],
)

# 'penalty' has no matching key in data, so that column starts out as NaN
df = pd.DataFrame(
    data=data,
    index=['one', 'two', 'three', 'four', 'five'],
    columns=['year', 'names', 'points', 'penalty'],
)
df

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,
two,2015,이철호,1.7,
three,2016,김영희,3.6,
four,2017,박민수,2.4,
five,2018,송철호,2.9,


In [34]:
# Select only the year column; it is displayed together with the index
# Selecting a single column or row gives back a Series
df['year']

one      2014
two      2015
three    2016
four     2017
five     2018
Name: year, dtype: int64

In [35]:
# Same selection as above, written with attribute access
df.year

one      2014
two      2015
three    2016
four     2017
five     2018
Name: year, dtype: int64

In [36]:
# Select several columns by passing a list of column names
df[['year','points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2017,2.4
five,2018,2.9


In [37]:
# Assign 0.5 to the NaN field (the scalar is applied to every row)
df['penalty'] = 0.5
df

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0.5
two,2015,이철호,1.7,0.5
three,2016,김영희,3.6,0.5
four,2017,박민수,2.4,0.5
five,2018,송철호,2.9,0.5


In [38]:
# To give each row a different value, put a list or ndarray on the right-hand side
df['penalty'] = [0.1, 0.2, 0.3, 0.4, 0.5]
df

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0.1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [39]:
# Adding a new column: assign to a column name that does not exist yet
df['ages'] = np.arange(10,15)
df

Unnamed: 0,year,names,points,penalty,ages
one,2014,김철수,1.5,0.1,10
two,2015,이철호,1.7,0.2,11
three,2016,김영희,3.6,0.3,12
four,2017,박민수,2.4,0.4,13
five,2018,송철호,2.9,0.5,14


In [40]:
# Delete a field (column)
del df['ages']
df

Unnamed: 0,year,names,points,penalty
one,2014,김철수,1.5,0.1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [41]:
# Set the names of the DataFrame's index and of its column axis
df.index.name = 'Order'
df.columns.name = 'Info'
df

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,2014,김철수,1.5,0.1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [42]:
# Slicing with integer positions: rows at positions 0 through 2
df[0:3]

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
one,2014,김철수,1.5,0.1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3


In [43]:
# Select a row by its index label
df.loc['two']

Info
year       2015
names       이철호
points      1.7
penalty     0.2
Name: two, dtype: object

In [44]:
# Label range 'two' through 'four' (the end label is included)
df.loc['two':'four']

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4


In [45]:
# loc[rows, columns]
# From rows 'two' through 'four', select the points column
df.loc['two':'four','points']

Order
two      1.7
three    3.6
four     2.4
Name: points, dtype: float64

In [46]:
# From all rows, select only the year and names fields
df.loc[:,['year','names']]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,2014,김철수
two,2015,이철호
three,2016,김영희
four,2017,박민수
five,2018,송철호


In [47]:
# Row at integer position 3 (the fourth row)
df.iloc[3]

Info
year       2017
names       박민수
points      2.4
penalty     0.4
Name: four, dtype: object

In [48]:
# Range indexing over rows and columns
# Rows 3-4, columns 0-1
df.iloc[3:5, 0:2]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2017,박민수
five,2018,송철호


In [49]:
# The wanted positions can also be listed explicitly
df.iloc[[0,1,3],[1,2]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,김철수,1.5
two,이철호,1.7
four,박민수,2.4


In [50]:
# Columns 1-3 of every row
df.iloc[:, 1:4]

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,김철수,1.5,0.1
two,이철호,1.7,0.2
three,김영희,3.6,0.3
four,박민수,2.4,0.4
five,송철호,2.9,0.5


In [51]:
# Value at row position 1, column position 1
df.iloc[1,1]

'이철호'

In [52]:
# Boolean indexing
# How do we select the data whose year is greater than 2014?
df['year'] > 2014

# The result is a boolean Series, called a mask

Order
one      False
two       True
three     True
four      True
five      True
Name: year, dtype: bool

In [53]:
# Select only the rows where the mask is True
df.loc[df['year'] > 2014, :]

Info,year,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
two,2015,이철호,1.7,0.2
three,2016,김영희,3.6,0.3
four,2017,박민수,2.4,0.4
five,2018,송철호,2.9,0.5


In [54]:
import numpy as np
import pandas as pd

# When no index or column information is given, both default to labels
# starting at 0.
# 6-row x 4-column DataFrame of normally distributed random numbers (randn).
# The draws come from a locally seeded generator so that the values are the
# same after every kernel restart; an unseeded np.random.randn() call yields
# different numbers on each run, which makes the notebook non-reproducible.
df = pd.DataFrame(np.random.RandomState(42).randn(6, 4))
df

Unnamed: 0,0,1,2,3
0,0.182713,-0.991025,0.160438,1.37967
1,-1.644131,1.167784,0.78138,0.123192
2,-2.129156,-0.085585,0.488583,0.243281
3,-0.294139,1.272434,0.069739,0.267108
4,-0.432394,-0.135413,-0.348894,-0.166488
5,0.132249,0.606543,-0.783371,-0.18708


In [55]:
# Set the column labels and the index
df.columns = ['A','B','C','D']
# Six consecutive dates starting at 2018-03-01 become the row index
df.index = pd.date_range('20180301',periods=6)
df.index

DatetimeIndex(['2018-03-01', '2018-03-02', '2018-03-03', '2018-03-04',
               '2018-03-05', '2018-03-06'],
              dtype='datetime64[ns]', freq='D')

In [56]:
# Display the frame with its A-D column labels and date index
df

Unnamed: 0,A,B,C,D
2018-03-01,0.182713,-0.991025,0.160438,1.37967
2018-03-02,-1.644131,1.167784,0.78138,0.123192
2018-03-03,-2.129156,-0.085585,0.488583,0.243281
2018-03-04,-0.294139,1.272434,0.069739,0.267108
2018-03-05,-0.432394,-0.135413,-0.348894,-0.166488
2018-03-06,0.132249,0.606543,-0.783371,-0.18708


In [57]:
# Deleting a column
# By default the drop function deletes rows.
# Passing axis=1 makes it delete columns instead.
df.drop('D', axis=1)
df

# The result must be assigned to a new DataFrame on the left-hand side;
# the return value is discarded above, so df still shows column D

Unnamed: 0,A,B,C,D
2018-03-01,0.182713,-0.991025,0.160438,1.37967
2018-03-02,-1.644131,1.167784,0.78138,0.123192
2018-03-03,-2.129156,-0.085585,0.488583,0.243281
2018-03-04,-0.294139,1.272434,0.069739,0.267108
2018-03-05,-0.432394,-0.135413,-0.348894,-0.166488
2018-03-06,0.132249,0.606543,-0.783371,-0.18708


In [58]:
# Keep the result of drop() in a new variable; df3 is df without column D
df3 = df.drop('D', axis=1)
df3

Unnamed: 0,A,B,C
2018-03-01,0.182713,-0.991025,0.160438
2018-03-02,-1.644131,1.167784,0.78138
2018-03-03,-2.129156,-0.085585,0.488583
2018-03-04,-0.294139,1.272434,0.069739
2018-03-05,-0.432394,-0.135413,-0.348894
2018-03-06,0.132249,0.606543,-0.783371


In [59]:
# Delete several columns at once by passing a list
df.drop(['B','C'], axis=1)

Unnamed: 0,A,D
2018-03-01,0.182713,1.37967
2018-03-02,-1.644131,0.123192
2018-03-03,-2.129156,0.243281
2018-03-04,-0.294139,0.267108
2018-03-05,-0.432394,-0.166488
2018-03-06,0.132249,-0.18708


In [60]:
import pandas as pd
import numpy as np

# Two numeric columns: weight (floats) and height (integers)
df = pd.DataFrame(data=dict(
    weight=[80.0, 70.4, 65.5, 45.9, 51.2],
    height=[170, 180, 155, 143, 154],
))
df

Unnamed: 0,weight,height
0,80.0,170
1,70.4,180
2,65.5,155
3,45.9,143
4,51.2,154


In [61]:
# Vertical sum (the sum of each column)
df.sum(axis=0)

weight    313.0
height    802.0
dtype: float64

In [62]:
# Horizontal sum (the sum of each row)
df.sum(axis=1)

0    250.0
1    250.4
2    220.5
3    188.9
4    205.2
dtype: float64

In [63]:
# Mean of the height column
df['height'].mean()

160.4

In [64]:
# Variance of the height column (how far the values lie from the mean)
# pandas computes the sample variance here (divides by n - 1 by default)
df['height'].var()

212.3

In [65]:
# Create a new DataFrame: 4 cities x 3 columns of normally distributed
# random numbers. The draws come from a locally seeded generator so the values
# are identical on every run; an unseeded np.random.randn() call would give
# different numbers after each kernel restart.
df3 = pd.DataFrame(np.random.RandomState(7).randn(4, 3), columns=['b','d','e'],
                  index=['서울','인천','대구','부산'])
df3

Unnamed: 0,b,d,e
서울,-0.451187,2.956801,-0.038995
인천,0.086852,1.273867,0.052349
대구,2.490909,-2.370749,1.020499
부산,0.313537,-1.732672,-1.294873


In [66]:
# Lambda definition: given x, return its maximum value minus its minimum value.
func = lambda x : x.max() - x.min()

# Apply func to each column of the DataFrame (vertical direction).
# apply(function name)
df3.apply(func, axis=0)

b    2.942096
d    5.327550
e    2.315372
dtype: float64

In [67]:
# Regular function definition: given x, return its maximum value minus its minimum value.
def func(x):
    """Return the spread of x: its largest value minus its smallest."""
    upper = x.max()
    lower = x.min()
    return upper - lower

# Apply func to each column of the DataFrame (vertical direction).
df3.apply(func, axis=0)

b    2.942096
d    5.327550
e    2.315372
dtype: float64

In [68]:
# Apply func to each row of the DataFrame (horizontal direction).
df3.apply(func, axis=1)

서울    3.407988
인천    1.221518
대구    4.861658
부산    2.046210
dtype: float64