## pandas란?
파이썬에서 사용하는 데이터분석 라이브러리로,행과 열로 이루어진 데이터 객체를 만들어 다룰수 있게 되며 보다 안정적으로 대용량의 데이터들을 처리하는데 매우 편리한 도구입니다.
<br> 출처: https://doorbw.tistory.com/172

위  출처에 있는 코드를 연습하며 살을 붙힌 것입니다.

In [1]:
import numpy as np
import pandas as pd

pandas에서는 기본적으로 정의되는 자료구조인 Series와 Data Frame을 사용한다.
<br>
이 자료구조들은 데이터 분석에 있어서 높은 수준의 성능을 보여줍니다.

### 2-1. Series

In [2]:
obj = pd.Series([4,7,-3,7])

In [3]:
obj

0    4
1    7
2   -3
3    7
dtype: int64

In [4]:
obj.values

array([ 4,  7, -3,  7])

In [5]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [6]:
obj.dtypes
# = obj.dtype

dtype('int64')

##### python의 dictionary 자료형을 Serie data로 만들 수 있다.
dictionary의 key가 Series의 index가 된다.

In [7]:
data = {'kim' :165, "An" : 170, "park" : 180}
data

{'An': 170, 'kim': 165, 'park': 180}

In [8]:
type(data)

dict

In [9]:
pdata = pd.Series(data)

In [10]:
pdata

An      170
kim     165
park    180
dtype: int64

In [11]:
pdata.values

array([170, 165, 180])

In [12]:
obj2 = pd.Series([3,6,2,7], index = ['e','sdg',2,'7'])

In [13]:
obj2

e      3
sdg    6
2      2
7      7
dtype: int64

In [14]:
obj2.index


Index(['e', 'sdg', 2, '7'], dtype='object')

In [15]:
obj2.dtype

dtype('int64')

In [16]:
type(obj2)

pandas.core.series.Series

In [17]:
pdata.index 

Index(['An', 'kim', 'park'], dtype='object')

In [18]:
# Series 내 index변경
pdata.index = ['a','b','c']

In [19]:
pdata.rename(index = {'a': 'aaa'},inplace = False)
#inplace는 원본 데이터에도 변경사항으로 대체할 것인지, 아닌지

aaa    170
b      165
c      180
dtype: int64

In [20]:
pdata

a    170
b    165
c    180
dtype: int64

In [21]:
pdata.index.name = "alpha"

In [22]:
pdata

alpha
a    170
b    165
c    180
dtype: int64

In [23]:
pdata.index

Index(['a', 'b', 'c'], dtype='object', name='alpha')

In [24]:
pdata.index.name

'alpha'

#### 2-2 data frame
DataFrame에 들어갈 데이터 정의 : 
###### python의 dictionary 또는 numpy의 array로 정의 할 수 있다.

In [25]:
ddata = pd.DataFrame(pdata)

In [26]:
ddata

Unnamed: 0_level_0,0
alpha,Unnamed: 1_level_1
a,170
b,165
c,180


In [27]:
#dictionary
data = {
    'name' : ['kim','park','An','nam'],
    'age' : [35,47,2,71],
    'grade' : [5,3,1,1]
}

In [28]:
data

{'age': [35, 47, 2, 71],
 'grade': [5, 3, 1, 1],
 'name': ['kim', 'park', 'An', 'nam']}

In [29]:
df = pd.DataFrame(data)

In [30]:
df

Unnamed: 0,age,grade,name
0,35,5,kim
1,47,3,park
2,2,1,An
3,71,1,nam


In [31]:
df.index

RangeIndex(start=0, stop=4, step=1)

In [32]:
df.values

array([[35, 5, 'kim'],
       [47, 3, 'park'],
       [2, 1, 'An'],
       [71, 1, 'nam']], dtype=object)

In [33]:
df.index.name = 'student'

In [34]:
df

Unnamed: 0_level_0,age,grade,name
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,35,5,kim
1,47,3,park
2,2,1,An
3,71,1,nam


In [35]:
df.columns.name = 'catogory'

In [36]:
df

catogory,age,grade,name
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,35,5,kim
1,47,3,park
2,2,1,An
3,71,1,nam


위 예시는 dictionary를 가지고 pandas의 dataframe을 만드는 것.
이제는 처음부터 DataFrame을 만드는 것을 해본다.


In [37]:
data = {
    'name' : ['kim','park','An','nam'],
    'age' : [35,47,2,71],
    'grade' : [5,3,4,1]
}

In [38]:
df2 = pd.DataFrame(data)

In [39]:
df2

Unnamed: 0,age,grade,name
0,35,5,kim
1,47,3,park
2,2,4,An
3,71,1,nam


In [40]:
# dataframe을 만들면서 columns와 index를 설정가능하다.
df2 = pd.DataFrame(data, columns = ['age','name','grade'],
                      index = ['one','two','three','four'])

In [41]:
df2

Unnamed: 0,age,name,grade
one,35,kim,5
two,47,park,3
three,2,An,4
four,71,nam,1


In [42]:
df2.rename(columns = {'age' : '나이'})

Unnamed: 0,나이,name,grade
one,35,kim,5
two,47,park,3
three,2,An,4
four,71,nam,1


In [43]:
df2

Unnamed: 0,age,name,grade
one,35,kim,5
two,47,park,3
three,2,An,4
four,71,nam,1


In [44]:
df2.columns = ['하나','둘','셋']

In [45]:
df2

Unnamed: 0,하나,둘,셋
one,35,kim,5
two,47,park,3
three,2,An,4
four,71,nam,1


In [46]:
df2.describe()
#dataframe중에서 계산 가능한 값들에

Unnamed: 0,하나,셋
count,4.0,4.0
mean,38.75,3.25
std,28.709754,1.707825
min,2.0,1.0
25%,26.75,2.5
50%,41.0,3.5
75%,53.0,4.25
max,71.0,5.0


In [47]:
data = {"names": ["Kilho", "Kilho", "Kilho", "Charles", "Charles"],
           "year": [2014, 2015, 2016, 2015, 2016],
           "points": [1.5, 1.7, 3.6, 2.4, 2.9]}


In [48]:
df = pd.DataFrame(data, index = ['one', "two", "three", "four", "five"])

In [49]:
#add column
df['penalty'] = 'NaN'

In [50]:
df['year']

one      2014
two      2015
three    2016
four     2015
five     2016
Name: year, dtype: int64

In [51]:
df[['year','points']]

Unnamed: 0,year,points
one,2014,1.5
two,2015,1.7
three,2016,3.6
four,2015,2.4
five,2016,2.9


In [52]:
len(df)

5

In [53]:
df['add'] = np.arange(len(df))

In [54]:
df

Unnamed: 0,names,points,year,penalty,add
one,Kilho,1.5,2014,,0
two,Kilho,1.7,2015,,1
three,Kilho,3.6,2016,,2
four,Charles,2.4,2015,,3
five,Charles,2.9,2016,,4


In [55]:
len(df.columns)

5

In [56]:
df.shape

(5, 5)

In [57]:
#series를 추가할수 있다.
# index, value

In [58]:
val = pd.Series([3,627,573],index = ['one','two', 'five'])

In [59]:
df['debt'] = val
#index가 없는 value는 NaN

In [60]:
df

Unnamed: 0,names,points,year,penalty,add,debt
one,Kilho,1.5,2014,,0,3.0
two,Kilho,1.7,2015,,1,627.0
three,Kilho,3.6,2016,,2,
four,Charles,2.4,2015,,3,
five,Charles,2.9,2016,,4,573.0


In [61]:
defaults = np.seterr(all = "ignore")

In [62]:
df['debt']-df['points']

one        1.5
two      625.3
three      NaN
four       NaN
five     570.1
dtype: float64

In [63]:
#DataFrame열 삭제하기
del df['points']

In [64]:
df

Unnamed: 0,names,year,penalty,add,debt
one,Kilho,2014,,0,3.0
two,Kilho,2015,,1,627.0
three,Kilho,2016,,2,
four,Charles,2015,,3,
five,Charles,2016,,4,573.0


In [65]:
#column,index의 이름정해주기
df.columns.name = 'cate'
df.index.name = 'student'

In [66]:
df

cate,names,year,penalty,add,debt
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,Kilho,2014,,0,3.0
two,Kilho,2015,,1,627.0
three,Kilho,2016,,2,
four,Charles,2015,,3,
five,Charles,2016,,4,573.0


 ###### DataFrame행 가지고 놀기
 pandas의 DataFrame에서는 행을 indexing하는 방법이 무수히 많다.


In [67]:
df[0:2]
#이거도 물론 뒤에 있는 index는 빼고 그 전까지만 가져오는것!

cate,names,year,penalty,add,debt
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,Kilho,2014,,0,3.0
two,Kilho,2015,,1,627.0


In [68]:
df

cate,names,year,penalty,add,debt
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,Kilho,2014,,0,3.0
two,Kilho,2015,,1,627.0
three,Kilho,2016,,2,
four,Charles,2015,,3,
five,Charles,2016,,4,573.0


In [69]:
#index이름으로 가져오고 싶다면?
#iloc 혹은 loc사용하기
df.loc['two']

cate
names      Kilho
year        2015
penalty      NaN
add            1
debt         627
Name: two, dtype: object

In [70]:
df.loc[["one",'five']]

cate,names,year,penalty,add,debt
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,Kilho,2014,,0,3.0
five,Charles,2016,,4,573.0


In [71]:
df.loc['one' : 'three']
#마지막 three까지 가져온다.

cate,names,year,penalty,add,debt
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,Kilho,2014,,0,3.0
two,Kilho,2015,,1,627.0
three,Kilho,2016,,2,


In [72]:
#특정 행, 특정열 가져오기
df.loc['four','add']

3

In [73]:
test = df.loc[['one','three'],'debt']

In [74]:
test   #결과는 Series

student
one      3.0
three    NaN
Name: debt, dtype: float64

In [75]:
type(test)

pandas.core.series.Series

python에서 모든 행, 모든 열은  : 로 표시한다.

In [76]:
df.loc[:,'penalty']

student
one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
Name: penalty, dtype: object

In [77]:
df.loc['one']

cate
names      Kilho
year        2014
penalty      NaN
add            0
debt           3
Name: one, dtype: object

In [78]:
#새로운 열추가 뭐라고 ?? 
df['new'] = 1

In [79]:
df

cate,names,year,penalty,add,debt,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Kilho,2014,,0,3.0,1
two,Kilho,2015,,1,627.0,1
three,Kilho,2016,,2,,1
four,Charles,2015,,3,,1
five,Charles,2016,,4,573.0,1


In [80]:
#그렇다면 새로운 행 추가는?
df.loc['six',:] = ['eden',2009,2,5,46,2]

In [81]:
df

cate,names,year,penalty,add,debt,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Kilho,2014.0,,0.0,3.0,1.0
two,Kilho,2015.0,,1.0,627.0,1.0
three,Kilho,2016.0,,2.0,,1.0
four,Charles,2015.0,,3.0,,1.0
five,Charles,2016.0,,4.0,573.0,1.0
six,eden,2009.0,2.0,5.0,46.0,2.0


loc : label을 이용해 값을 찾을 수 있다.<br>
iloc : integer position을 이용해 값을 찾을 수 있다.<br>
ix : integer position 과 label을 혼용해서 사용할 수 있다.

In [82]:
df.iloc[2,]

cate
names      Kilho
year        2016
penalty      NaN
add            2
debt         NaN
new            1
Name: three, dtype: object

In [83]:
df.iloc[0,1]
# label과 columns모두 이름이 아닌 index값으로 추출가능

2014.0

In [84]:
test = df.iloc [ [1,2,3],[2,0]]

In [85]:
type(test)

pandas.core.frame.DataFrame

In [86]:
test

cate,penalty,names
student,Unnamed: 1_level_1,Unnamed: 2_level_1
two,,Kilho
three,,Kilho
four,,Charles


In [87]:
test.names

student
two        Kilho
three      Kilho
four     Charles
Name: names, dtype: object

In [88]:
type(test.names)

pandas.core.series.Series

In [89]:
df['year'] == 2009

student
one      False
two      False
three    False
four     False
five     False
six       True
Name: year, dtype: bool

In [90]:
df.loc[df['year'] >2010,]

cate,names,year,penalty,add,debt,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Kilho,2014.0,,0.0,3.0,1.0
two,Kilho,2015.0,,1.0,627.0,1.0
three,Kilho,2016.0,,2.0,,1.0
four,Charles,2015.0,,3.0,,1.0
five,Charles,2016.0,,4.0,573.0,1.0


In [91]:
df.loc[df['year']>2010,:]

cate,names,year,penalty,add,debt,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Kilho,2014.0,,0.0,3.0,1.0
two,Kilho,2015.0,,1.0,627.0,1.0
three,Kilho,2016.0,,2.0,,1.0
four,Charles,2015.0,,3.0,,1.0
five,Charles,2016.0,,4.0,573.0,1.0


In [92]:
df.loc[df['add']>= 3,['year','new']]

cate,year,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2015.0,1.0
five,2016.0,1.0
six,2009.0,2.0


In [93]:
test = df.ix[1,['year','add','new']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [94]:
test  # Series

cate
year    2015
add        1
new        1
Name: two, dtype: object

In [95]:
type(test)

pandas.core.series.Series

In [96]:
test.index

Index(['year', 'add', 'new'], dtype='object', name='cate')

In [97]:
type(test.values)  #numpy.dnarray

numpy.ndarray

dataframe조건 여러개를 줘봅시다.

In [98]:
df

cate,names,year,penalty,add,debt,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Kilho,2014.0,,0.0,3.0,1.0
two,Kilho,2015.0,,1.0,627.0,1.0
three,Kilho,2016.0,,2.0,,1.0
four,Charles,2015.0,,3.0,,1.0
five,Charles,2016.0,,4.0,573.0,1.0
six,eden,2009.0,2.0,5.0,46.0,2.0


In [99]:
df[0:4]

cate,names,year,penalty,add,debt,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
one,Kilho,2014.0,,0.0,3.0,1.0
two,Kilho,2015.0,,1.0,627.0,1.0
three,Kilho,2016.0,,2.0,,1.0
four,Charles,2015.0,,3.0,,1.0


In [100]:
# 여러 조건으로 행 추출
df.loc[(df['new'] ==1)&(df['add']>2)]

cate,names,year,penalty,add,debt,new
student,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
four,Charles,2015.0,,3.0,,1.0
five,Charles,2016.0,,4.0,573.0,1.0


In [101]:
df[df['add']>3]['names']

student
five    Charles
six        eden
Name: names, dtype: object

In [102]:
df.loc['one']

cate
names      Kilho
year        2014
penalty      NaN
add            0
debt           3
new            1
Name: one, dtype: object

### remember!
행에 대한 조건 추출은 loc, iloc<br>
열에 대한 조건 추출은 바로 df[ ] 

In [103]:
data = pd.DataFrame(np.random.randint(1,20,size = (5,4)))

In [104]:
# column, index설정하지 않고 pd.DataFrame하면 0부터 숫자로 임의설정
data

Unnamed: 0,0,1,2,3
0,14,16,9,6
1,15,16,11,8
2,11,19,5,18
3,3,5,4,12
4,5,1,15,13


In [105]:
data.columns = ['kor','eng','math','fra']

In [106]:
data.index = ['K','A','T','E','R']

In [107]:
data

Unnamed: 0,kor,eng,math,fra
K,14,16,9,6
A,15,16,11,8
T,11,19,5,18
E,3,5,4,12
R,5,1,15,13


In [108]:
np.datetime64('today','s')

numpy.datetime64('2019-04-12T00:00:00')

In [109]:
np.arange(np.datetime64('today','M'),np.datetime64('today','M')+ 7,dtype = 'datetime64[M]')

array(['2019-04', '2019-05', '2019-06', '2019-07', '2019-08', '2019-09',
       '2019-10'], dtype='datetime64[M]')


freq 인수로 특정한 날짜만 생성되도록 할 수도 있다. 많이 사용되는 freq 인수값은 다음과 같다.

* s: 초
* T: 분
* H: 시간
* D: 일(day)
* B: 주말이 아닌 평일
* W: 주(일요일)
* W-MON: 주(월요일)
* M: 각 달(month)의 마지막 날
* MS: 각 달의 첫날
* BM: 주말이 아닌 평일 중에서 각 달의 마지막 날
* BMS: 주말이 아닌 평일 중에서 각 달의 첫날
* WOM-2THU: 각 달의 두번째 목요일
* Q-JAN: 각 분기의 첫달의 마지막 날
* Q-DEC: 각 분기의 마지막 달의 마지막 날

In [110]:
pd.date_range('today',periods=5, freq = 'D',dtype = 'datetime64[ns]')

DatetimeIndex(['2019-04-12 10:33:11.878273', '2019-04-13 10:33:11.878273',
               '2019-04-14 10:33:11.878273', '2019-04-15 10:33:11.878273',
               '2019-04-16 10:33:11.878273'],
              dtype='datetime64[ns]', freq='D')

In [111]:
import numpy as np

In [112]:
data

Unnamed: 0,kor,eng,math,fra
K,14,16,9,6
A,15,16,11,8
T,11,19,5,18
E,3,5,4,12
R,5,1,15,13


In [113]:
data.index = np.arange(np.datetime64('today','D'), np.datetime64('today','D') + 5, dtype = 'datetime64[D]')

In [114]:
data

Unnamed: 0,kor,eng,math,fra
2019-04-12,14,16,9,6
2019-04-13,15,16,11,8
2019-04-14,11,19,5,18
2019-04-15,3,5,4,12
2019-04-16,5,1,15,13


4월12일 103번째 실행라인부터 시작

In [115]:
np.arange(np.datetime64('today','D'),np.datetime64('today','D')+5,dtype = 'datetime64[D]')

array(['2019-04-12', '2019-04-13', '2019-04-14', '2019-04-15',
       '2019-04-16'], dtype='datetime64[D]')

In [116]:
len(data)

5

In [117]:
data.shape

(5, 4)

In [118]:
#column추가
data['F'] = [3,np.nan,5,np.nan,3.75]
#np.nan은 NaN난수를 의미한다.

In [119]:
data

Unnamed: 0,kor,eng,math,fra,F
2019-04-12,14,16,9,6,3.0
2019-04-13,15,16,11,8,
2019-04-14,11,19,5,18,5.0
2019-04-15,3,5,4,12,
2019-04-16,5,1,15,13,3.75


In [120]:
#NaN없애기
# 행의 값 중 하나라도 NaN일 경우, 해당 행을 삭제한다.
data.dropna(how = 'any')
# 'any' : 어떤거 하나라도!!

Unnamed: 0,kor,eng,math,fra,F
2019-04-12,14,16,9,6,3.0
2019-04-14,11,19,5,18,5.0
2019-04-16,5,1,15,13,3.75


In [121]:
data

Unnamed: 0,kor,eng,math,fra,F
2019-04-12,14,16,9,6,3.0
2019-04-13,15,16,11,8,
2019-04-14,11,19,5,18,5.0
2019-04-15,3,5,4,12,
2019-04-16,5,1,15,13,3.75


In [122]:
data.dropna(how = 'all')
#모든값이 NaN인 경우!!

# include = True라는 옵션을 추가하면 DataFrame에 적용됨.

Unnamed: 0,kor,eng,math,fra,F
2019-04-12,14,16,9,6,3.0
2019-04-13,15,16,11,8,
2019-04-14,11,19,5,18,5.0
2019-04-15,3,5,4,12,
2019-04-16,5,1,15,13,3.75


In [123]:
#NaN인 값에 값넣기
data.fillna(value = 0.5)

Unnamed: 0,kor,eng,math,fra,F
2019-04-12,14,16,9,6,3.0
2019-04-13,15,16,11,8,0.5
2019-04-14,11,19,5,18,5.0
2019-04-15,3,5,4,12,0.5
2019-04-16,5,1,15,13,3.75


In [124]:
data

Unnamed: 0,kor,eng,math,fra,F
2019-04-12,14,16,9,6,3.0
2019-04-13,15,16,11,8,
2019-04-14,11,19,5,18,5.0
2019-04-15,3,5,4,12,
2019-04-16,5,1,15,13,3.75


In [125]:
df = pd.DataFrame({
    'k1' : ['one']*3 + ['two']*4,
    'k2' : [1,1,4,4,3,6,4]
})

In [126]:
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,4
3,two,4
4,two,3
5,two,6
6,two,4


In [127]:
df.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
2,one,4
3,two,4
4,two,3
5,two,6


In [171]:
df.count()

k1    7
k2    7
dtype: int64

In [176]:
len(df.k2)

7

In [177]:
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,4
3,two,4
4,two,3
5,two,6
6,two,4


In [128]:
df.duplicated('k1')

0    False
1     True
2     True
3    False
4     True
5     True
6     True
dtype: bool

In [129]:
df.duplicated()

0    False
1     True
2    False
3    False
4    False
5    False
6     True
dtype: bool

.apply(lambda x: )이해하기

In [130]:
# map함수 이해하기
s1 = pd.Series(['K',"F","F","M"])
s1

0    K
1    F
2    F
3    M
dtype: object

In [131]:
import pandas as pd

In [132]:
gender = {"F":1,"M":0}

In [134]:
s2 = pd.Series(['M','M','F','E','F','M','F','E'])
s2.unique()

array(['M', 'F', 'E'], dtype=object)

In [135]:
sorted(s2.unique())

['E', 'F', 'M']

In [136]:
a = dict(enumerate(sorted(s2.unique())))


In [137]:
value = list(map(int,np.array(list(enumerate(s2.unique())))[:,0].tolist()))

* enumerate : 순서대로 index를 매기는 것


아래 출처는 https://wikidocs.net/64 입니다.

lambda 인자 :  표현식

In [138]:
def add_fn(a,b):
    return a+b

In [140]:
add_fn(1,5)

6

In [145]:
(lambda x,y : x+y)(1,5)
#lambda는 함수 이름도 필요없다.

6

* map(함수, 리스트)
<br>
* lambda 인자 : 표현식

In [149]:
list(map(lambda x : x**2 , range(5)))

[0, 1, 4, 9, 16]

In [150]:
from functools import reduce

In [153]:
reduce(lambda x,y : y + x, 'abcde')

'edcba'

In [161]:
list(filter(lambda x:x< 5,range(20) ))         
#             <해석해보기>
#             range(20)은 0에서부터 19까지 20개의 숫자
#             0부터 하나씩 x라고 하고 그것이 <5라면
#             True, 
#             그렇지 않으면,
#             False

[0, 1, 2, 3, 4]

In [170]:
list(filter(lambda x : x % 2 , range (20)))
# 1이면 참, 0이면 거짓이므로, 1인 경우, 즉 홀수만 출력한다.

[1, 3, 5, 7, 9, 11, 13, 15, 17, 19]

아래 예제는
https://nittaku.tistory.com/124
이곳을 참고했습니다.

In [193]:
df2 = pd.DataFrame({
    'food':['bacon','pulled pork','bacon','passitrami','corned beef',
           'bacon','passitrami','honey ham','nova lox'],
    'ounce' : [4,36,2,5,7.5,5,1,6,4]
})

In [179]:
df2

Unnamed: 0,food,ounce
0,bacon,4.0
1,pulled pork,36.0
2,bacon,2.0
3,passtrami,5.0
4,corned beef,7.5
5,bacon,5.0
6,passitrami,1.0
7,honey ham,6.0
8,nova lox,4.0


In [195]:
#중간 dictionary생성
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork' : 'pig',
    'passitrami':'cow',
    'honey ham' : 'pig',
    'nova lox' :'salmon',
    'corned beef':'cow'
}

사용자 정의함수<br>
.apply(lambda x:~)

food에 있는 값들이 mead_to_animal과 연결되서 하나의 열을 추가한다.<br>
특정열에 .apply(lambda x: )를 적용하면, 열의 각 성분들이 for문처럼 모든 성분들이 대입되어 들어간다.

In [191]:
#df2.map(meat_to_animal)

In [184]:
type(df2.food)

pandas.core.series.Series

In [196]:
df2['animal'] = df2['food'].apply(lambda x :meat_to_animal[x.lower()])

In [199]:
df2['food'].map(lambda x : meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [201]:
df2['food'].apply(lambda x : meat_to_animal[x])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

In [202]:
df2

Unnamed: 0,food,ounce,animal
0,bacon,4.0,pig
1,pulled pork,36.0,pig
2,bacon,2.0,pig
3,passitrami,5.0,cow
4,corned beef,7.5,cow
5,bacon,5.0,pig
6,passitrami,1.0,cow
7,honey ham,6.0,pig
8,nova lox,4.0,salmon


In [207]:
df2['food_cate'] = df2['food'].astype("category")

In [208]:
df2

Unnamed: 0,food,ounce,animal,food_cate
0,bacon,4.0,pig,bacon
1,pulled pork,36.0,pig,pulled pork
2,bacon,2.0,pig,bacon
3,passitrami,5.0,cow,passitrami
4,corned beef,7.5,cow,corned beef
5,bacon,5.0,pig,bacon
6,passitrami,1.0,cow,passitrami
7,honey ham,6.0,pig,honey ham
8,nova lox,4.0,salmon,nova lox


In [211]:
type(df2)

pandas.core.frame.DataFrame

In [212]:
df2['food_cate'].cat.categories

Index(['bacon', 'corned beef', 'honey ham', 'nova lox', 'passitrami',
       'pulled pork'],
      dtype='object')

정렬하기

In [214]:
df2.sort_values(by = 'food')

Unnamed: 0,food,ounce,animal,food_cate
0,bacon,4.0,pig,bacon
2,bacon,2.0,pig,bacon
5,bacon,5.0,pig,bacon
4,corned beef,7.5,cow,corned beef
7,honey ham,6.0,pig,honey ham
8,nova lox,4.0,salmon,nova lox
3,passitrami,5.0,cow,passitrami
6,passitrami,1.0,cow,passitrami
1,pulled pork,36.0,pig,pulled pork


ounce들을 범위를 정해줘서 카테고리화해보기

In [224]:
bins = [0,1,3,5,7,float('inf')]

In [238]:
cats = pd.cut(df2["ounce"],bins,labels = ['a','b','c','d','e'])

In [244]:
cats

0    c
1    e
2    b
3    c
4    e
5    c
6    a
7    d
8    c
Name: ounce, dtype: category
Categories (5, object): [a < b < c < d < e]

In [239]:
type(cats)

pandas.core.series.Series

In [240]:
tat = cats.value_counts()

In [241]:
type(tat)

pandas.core.series.Series

In [242]:
tat

c    4
e    2
d    1
b    1
a    1
Name: ounce, dtype: int64

In [243]:
tat.index

CategoricalIndex(['c', 'e', 'd', 'b', 'a'], categories=['a', 'b', 'c', 'd', 'e'], ordered=True, dtype='category')

In [235]:
tat.values

array([4, 2, 1, 1, 1])

In [236]:
tat.var

<bound method Series.var of (3.0, 5.0]    4
(7.0, inf]    2
(5.0, 7.0]    1
(1.0, 3.0]    1
(0.0, 1.0]    1
Name: ounce, dtype: int64>

In [245]:
# !!!!이렇게 하면안된다!!순서 다무시 해버림
## 범위를 index로 하고 싶지않으면 pd.cut할때 labels해주기
tat.index = ['aa','bb','cc','dd','ee']

In [246]:
tat

aa    4
bb    2
cc    1
dd    1
ee    1
Name: ounce, dtype: int64

pd.cut을 할때 내가 bins로 구간을 나눠주지 않고,
<br>
<br>
pandas가 알아서 구간을 나눠주게 할 수 있다.
* pd.cut : 최솟값과 최댓값으로 보고 구간을 나눈다.
* pd.qcut : 분위수를 보고 구간을 나눈다.

In [247]:
data = np.random.rand(30)

In [248]:
data

array([0.59023938, 0.91540759, 0.36906694, 0.26319445, 0.96090576,
       0.81128934, 0.02079221, 0.61768896, 0.94974508, 0.72534671,
       0.87117214, 0.06160723, 0.12519058, 0.46016128, 0.5298694 ,
       0.39226577, 0.95057079, 0.03181438, 0.90305783, 0.89546884,
       0.56970335, 0.89977502, 0.24687663, 0.95214585, 0.88867933,
       0.79429678, 0.25060035, 0.34716673, 0.53502392, 0.45429065])

In [250]:
pd.cut(data,5, precision = 3) 
# data를 5구간으로 나누고, precision은 소수점 셋째자리까지 나타낼 것을 의미한다.
# 반올림

[(0.585, 0.773], (0.773, 0.961], (0.209, 0.397], (0.209, 0.397], (0.773, 0.961], ..., (0.773, 0.961], (0.209, 0.397], (0.209, 0.397], (0.397, 0.585], (0.397, 0.585]]
Length: 30
Categories (5, interval[float64]): [(0.0199, 0.209] < (0.209, 0.397] < (0.397, 0.585] < (0.585, 0.773] < (0.773, 0.961]]

In [253]:
data.max(), data.min()

(0.9609057649845665, 0.02079220610514454)

In [254]:
pd.qcut(data,5, precision = 3)

[(0.502, 0.753], (0.9, 0.961], (0.261, 0.502], (0.261, 0.502], (0.9, 0.961], ..., (0.753, 0.9], (0.019799999999999998, 0.261], (0.261, 0.502], (0.502, 0.753], (0.261, 0.502]]
Length: 30
Categories (5, interval[float64]): [(0.019799999999999998, 0.261] < (0.261, 0.502] < (0.502, 0.753] < (0.753, 0.9] < (0.9, 0.961]]

qcut은 pd.cut과 달리 단순히 4등분을 하는 것이 아니라, 분포까지 고려해서 4분위로 나눈다음, 구간을 결정한다.
<br>
<br>
#### 따라서 각 구간의 길이가 동일하다고 말할 수 없다.
(pd.cut의 각 구간의 길이는 동일하다)


================================================================
다시 https://doorbw.tistory.com/172 로 돌아와 107번 실행코드부터 시작한다.

In [256]:
df = pd.DataFrame(np.random.randn(5,5))

In [257]:
df

Unnamed: 0,0,1,2,3,4
0,-0.321391,-0.318205,-0.17608,0.01905,1.243162
1,-0.886196,-1.406709,2.0424,-0.07445,-0.2232
2,-0.236388,-1.465904,0.067274,-0.899857,1.411438
3,-0.391929,1.156578,0.797989,-0.63216,-0.634249
4,-1.565628,-0.110661,-0.044798,0.908588,1.003347


In [259]:
#참고, 랜덤으로 randint를 할때는 표현식이 조금 다르다.
pd.DataFrame(np.random.randint(35, size = (5,7)))

Unnamed: 0,0,1,2,3,4,5,6
0,1,0,14,24,20,28,26
1,19,33,25,3,31,6,29
2,27,8,22,20,0,33,32
3,25,3,5,10,14,10,28
4,20,22,22,24,28,7,25


In [260]:
df.columns = ['A','B','C','D','E']

In [264]:
df.index = np.arange(np.datetime64('today','D'),np.datetime64('today','D')+5,
                    dtype = 'datetime64[D]')

In [265]:
df

Unnamed: 0,A,B,C,D,E
2019-04-12,-0.321391,-0.318205,-0.17608,0.01905,1.243162
2019-04-13,-0.886196,-1.406709,2.0424,-0.07445,-0.2232
2019-04-14,-0.236388,-1.465904,0.067274,-0.899857,1.411438
2019-04-15,-0.391929,1.156578,0.797989,-0.63216,-0.634249
2019-04-16,-1.565628,-0.110661,-0.044798,0.908588,1.003347


In [266]:
df['F']=[2,6,np.nan,np.nan,4]

In [269]:
df.isnull()

Unnamed: 0,A,B,C,D,E,F
2019-04-12,False,False,False,False,False,False
2019-04-13,False,False,False,False,False,False
2019-04-14,False,False,False,False,False,True
2019-04-15,False,False,False,False,False,True
2019-04-16,False,False,False,False,False,False


In [270]:
df.dropna()

Unnamed: 0,A,B,C,D,E,F
2019-04-12,-0.321391,-0.318205,-0.17608,0.01905,1.243162,2.0
2019-04-13,-0.886196,-1.406709,2.0424,-0.07445,-0.2232,6.0
2019-04-16,-1.565628,-0.110661,-0.044798,0.908588,1.003347,4.0


In [271]:
df.dropna(how = 'all')

Unnamed: 0,A,B,C,D,E,F
2019-04-12,-0.321391,-0.318205,-0.17608,0.01905,1.243162,2.0
2019-04-13,-0.886196,-1.406709,2.0424,-0.07445,-0.2232,6.0
2019-04-14,-0.236388,-1.465904,0.067274,-0.899857,1.411438,
2019-04-15,-0.391929,1.156578,0.797989,-0.63216,-0.634249,
2019-04-16,-1.565628,-0.110661,-0.044798,0.908588,1.003347,4.0


In [275]:
df.loc[df.isnull()['F'],:]
#행추출이니까 loc임을 한번 더 확인하자
# 해석
# 1) df.isnull()['F']   : F column에서 nan의 여부를 bool타입으로
# 2) 그 bool타입을 index로 해서 행을 추출
# 3) 행을 추출해야 하니까 df.loc

Unnamed: 0,A,B,C,D,E,F
2019-04-14,-0.236388,-1.465904,0.067274,-0.899857,1.411438,
2019-04-15,-0.391929,1.156578,0.797989,-0.63216,-0.634249,


### 날짜형 변수 datetime

In [276]:
pd.to_datetime('19961007')

Timestamp('1996-10-07 00:00:00')

In [281]:
#특정항 drop하기
df.drop([pd.to_datetime('20190415')],axis = 0)
#default axis = 0 행기준

Unnamed: 0,A,B,C,D,E,F
2019-04-12,-0.321391,-0.318205,-0.17608,0.01905,1.243162,2.0
2019-04-13,-0.886196,-1.406709,2.0424,-0.07445,-0.2232,6.0
2019-04-14,-0.236388,-1.465904,0.067274,-0.899857,1.411438,
2019-04-16,-1.565628,-0.110661,-0.044798,0.908588,1.003347,4.0


In [283]:
df.drop('E',axis = 1)
#axis = 0은 행, axis = 1은 열

Unnamed: 0,A,B,C,D,F
2019-04-12,-0.321391,-0.318205,-0.17608,0.01905,2.0
2019-04-13,-0.886196,-1.406709,2.0424,-0.07445,6.0
2019-04-14,-0.236388,-1.465904,0.067274,-0.899857,
2019-04-15,-0.391929,1.156578,0.797989,-0.63216,
2019-04-16,-1.565628,-0.110661,-0.044798,0.908588,4.0


In [279]:
df

Unnamed: 0,A,B,C,D,E,F
2019-04-12,-0.321391,-0.318205,-0.17608,0.01905,1.243162,2.0
2019-04-13,-0.886196,-1.406709,2.0424,-0.07445,-0.2232,6.0
2019-04-14,-0.236388,-1.465904,0.067274,-0.899857,1.411438,
2019-04-15,-0.391929,1.156578,0.797989,-0.63216,-0.634249,
2019-04-16,-1.565628,-0.110661,-0.044798,0.908588,1.003347,4.0


In [284]:
data = [
        [1.4, np.nan],
        [7.1, -4.5],
        [np.nan, np.nan],
        [0.75, -1.3]
]

In [285]:
data

[[1.4, nan], [7.1, -4.5], [nan, nan], [0.75, -1.3]]

In [289]:
df = pd.DataFrame(data, columns = ('one','two'), index = np.arange(1,5))

In [290]:
df

Unnamed: 0,one,two
1,1.4,
2,7.1,-4.5
3,,
4,0.75,-1.3


In [291]:
df.sum()

one    9.25
two   -5.80
dtype: float64

In [293]:
df.sum(axis = 1)
#nan 값은 없다고 생각하고 계산한다.

1    1.40
2    2.60
3    0.00
4   -0.55
dtype: float64

In [294]:
#nan를 무시하지 않고 sum을 하고 싶다면?
df.count()

one    3
two    2
dtype: int64

In [298]:
df.sum(skipna = False, axis = 1)

1     NaN
2    2.60
3     NaN
4   -0.55
dtype: float64

In [299]:
# 특정 행 혹은 열에 대해서 연산
df

Unnamed: 0,one,two
1,1.4,
2,7.1,-4.5
3,,
4,0.75,-1.3


In [303]:
df.loc[3].sum(skipna = False)

nan

In [304]:
df['one'].sum()

9.25

In [305]:
df.sum()

one    9.25
two   -5.80
dtype: float64

#### pandas에서 DataFrame에 적용되는 함수들
* sum()
* count( ): 전체 성분의 값의 갯수를 확인(NaN이 아닌)
* min, max  :전체 성분의 최대, 최솟값
* argmin, argmax : 전체 성분의 최솟,최댓값의 위치 인덱스(정수)
* idxmin, idxmax : 전체 인덱스중에서 최대 최솟값반환
* quantile  : 전체 성분의 특정 사분위수에 해당하는 값을 반환(0~1)
* sum, mean, median, std, var...
* mad : 전체성분의 평균값으로부터의  절대편차의 평균(편차의 평균)
* cumsum : 누적합(0부터 더해진다)
* cumprod : 누적곱(1부터 곱해진다)

In [308]:
pd.date_range('20090109', periods= 7)

DatetimeIndex(['2009-01-09', '2009-01-10', '2009-01-11', '2009-01-12',
               '2009-01-13', '2009-01-14', '2009-01-15'],
              dtype='datetime64[ns]', freq='D')

In [314]:
df2 = pd.DataFrame(np.random.randn(6,4),
                  columns = ['A','B','C','D'], 
                  index = pd.date_range('20190412',periods = 6))

In [315]:
df2

Unnamed: 0,A,B,C,D
2019-04-12,1.174492,0.53151,0.529841,-1.802785
2019-04-13,1.222929,0.000983,0.896098,0.463882
2019-04-14,-0.023699,-0.581864,-0.286028,0.216099
2019-04-15,0.186998,1.631595,0.148456,-0.461948
2019-04-16,0.901235,0.624281,0.664708,-1.583178
2019-04-17,0.447845,1.421113,0.428165,-0.21623


In [316]:
df2['A'].corr(df2['B'])

-0.10048810827616568

In [317]:
#B열과 C열의 공분산 구하기
df2['B'].cov(df2['C'])

0.05497524946393895

정렬함수/ 기타함수


In [319]:
date = df2.index

In [325]:
random_date = np.random.permutation(date)

In [329]:
random_col = np.random.permutation(df2.columns)

In [336]:
random_df2 = df2.reindex(index = random_date, columns= random_col)

In [337]:
random_df2.sort_index()

Unnamed: 0,C,B,A,D
2019-04-12,0.529841,0.53151,1.174492,-1.802785
2019-04-13,0.896098,0.000983,1.222929,0.463882
2019-04-14,-0.286028,-0.581864,-0.023699,0.216099
2019-04-15,0.148456,1.631595,0.186998,-0.461948
2019-04-16,0.664708,0.624281,0.901235,-1.583178
2019-04-17,0.428165,1.421113,0.447845,-0.21623


In [339]:
random_df2.sort_index(axis = 1, ascending= False)

Unnamed: 0,D,C,B,A
2019-04-12,-1.802785,0.529841,0.53151,1.174492
2019-04-13,0.463882,0.896098,0.000983,1.222929
2019-04-17,-0.21623,0.428165,1.421113,0.447845
2019-04-16,-1.583178,0.664708,0.624281,0.901235
2019-04-15,-0.461948,0.148456,1.631595,0.186998
2019-04-14,0.216099,-0.286028,-0.581864,-0.023699


In [341]:
random_df2.sort_values(by = 'A')

Unnamed: 0,C,B,A,D
2019-04-14,-0.286028,-0.581864,-0.023699,0.216099
2019-04-15,0.148456,1.631595,0.186998,-0.461948
2019-04-17,0.428165,1.421113,0.447845,-0.21623
2019-04-16,0.664708,0.624281,0.901235,-1.583178
2019-04-12,0.529841,0.53151,1.174492,-1.802785
2019-04-13,0.896098,0.000983,1.222929,0.463882


In [367]:
df = pd.DataFrame({
    'A' : ['alpha'] * 5+ ['beta']*3+ ['gamma']*1,
    'B' : [3,5,2,6,7,5,7,7,7]
               
                  })

In [359]:
df

Unnamed: 0,A,B
0,aplha,3
1,aplha,5
2,aplha,2
3,aplha,6
4,aplha,7
5,beta,5
6,beta,7
7,beta,7
8,gamma,7


In [360]:
df.duplicated('A')

0    False
1     True
2     True
3     True
4     True
5    False
6     True
7     True
8    False
dtype: bool

In [362]:
df['A'].unique()

array(['aplha', 'beta', 'gamma'], dtype=object)

In [364]:
df['A'].value_counts()

aplha    5
beta     3
gamma    1
Name: A, dtype: int64

In [370]:
df.isin(['alpha','beta'])

Unnamed: 0,A,B
0,True,False
1,True,False
2,True,False
3,True,False
4,True,False
5,True,False
6,True,False
7,True,False
8,False,False


In [366]:
df

Unnamed: 0,A,B
0,aplha,3
1,aplha,5
2,aplha,2
3,aplha,6
4,aplha,7
5,beta,5
6,beta,7
7,beta,7
8,gamma,7


Quiz . A열의 값이 alpha혹은 beta인 모든 행을 구하라

In [372]:
df.loc[df["A"].isin(['alpha','beta']),:]

Unnamed: 0,A,B
0,alpha,3
1,alpha,5
2,alpha,2
3,alpha,6
4,alpha,7
5,beta,5
6,beta,7
7,beta,7


## 궁금한점
R에서 grep, grepl과 같은 함수는?
<br>
#### answer
df.str.contains('blahblah')

In [381]:
df['A'].__contains__('gamma')

False

In [374]:
[i for i in df['A'] if 'al' in i]

['alpha', 'alpha', 'alpha', 'alpha', 'alpha']

In [380]:
df['A'].str.contains('al')

0     True
1     True
2     True
3     True
4     True
5    False
6    False
7    False
8    False
Name: A, dtype: bool