In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

class disp(object):
    """Show several objects side by side in a notebook output cell.

    Each positional argument is a string holding the name of (or an expression
    for) an object. The string is evaluated when the output is rendered, and the
    result is placed in its own floating <div>, labelled with that string.
    """
    template = '<div style="float: left;padding:10px;"> <b>[{0}]</b> {1}</div>'

    def __init__(self, *args):
        # Keep the strings, not the objects: evaluation is deferred to render time.
        self.args = args

    def _render_one(self, expr):
        """Return the HTML panel for a single name/expression string."""
        # NOTE(review): eval() executes arbitrary code; only pass trusted, hand-written names.
        # Evaluate against module globals only, so this helper's locals cannot shadow a name.
        obj = eval(expr, globals())
        to_html = getattr(obj, '_repr_html_', None)
        body = to_html() if callable(to_html) else None
        if body is None:
            # Objects such as Series or ndarray have no HTML repr (and an HTML repr
            # may return None); fall back to the escaped plain-text repr instead of
            # failing with AttributeError.
            from html import escape
            body = '<pre>{0}</pre>'.format(escape(repr(obj)))
        return self.template.format(expr, body)

    def _repr_html_(self):
        """Build the HTML shown by IPython: one panel per argument, newline-joined."""
        return '\n'.join(self._render_one(a) for a in self.args)

import pandas as pd
import numpy as np

### [예제1] 조건에 맞는 데이터 indexing

* Boolean indexing : Boolean vector로 필터링하여 추출하는 것

In [None]:
# [1] Boolean indexing with a plain Python list of booleans.

df = pd.DataFrame({'name':['kim','lee','park','song'],'age':[20,35,25,40]})

# One flag per row; only the rows flagged True are kept.
df1 = df[[False, True, False, True]]
disp('df', 'df1')

In [None]:
# [2] Boolean indexing with a mask produced by a comparison.

df = pd.DataFrame({'name':['kim','lee','park','song'],'age':[20,35,25,40]})

# df['age'] >= 30 is a boolean Series; indexing df with it keeps the True rows.
df2 = df[ df['age'] >= 30 ]
print(df['age'] >= 30)  # show the mask itself
disp('df', 'df2')

In [None]:
# [3] Boolean indexing with a callable passed to .loc.

df = pd.DataFrame({'name':['kim','lee','park','song'],'age':[20,35,25,40]})

# The lambda is called with the DataFrame being indexed and returns the boolean mask.
df3 = df.loc[ lambda x : x['age'] >=30 ]
disp('df', 'df3')

### [예제2] Boolean vector : list, ndarray

In [None]:
# [1] A boolean list used as a row mask through [], .loc and .iloc.

d = {'name':['kim','lee','park','song'],'sex':list('WMWM'),'age':[20, 40, 35, 25]}
df = pd.DataFrame(d)

row = [False, True, False, True]  # one flag per row
r1 = df[row]
r2 = df.loc[row]
r3 = df.iloc[row]
disp('df', 'r1', 'r2', 'r3')

In [None]:
# [2] Filter rows with a boolean list and pick the 'name' column.

d = {'name':['kim','lee','park','song'],'sex':list('WMWM'),'age':[20, 40, 35, 25]}
df = pd.DataFrame(d)

row = [False, True, False, True]
r4 = df[row]['name']      # avoid: chained indexing (two separate lookups)
r5 = df['name'][row]      # avoid: chained indexing (two separate lookups)
r6 = df.loc[row, 'name']  # label-based indexing
r7 = df.iloc[row, 0]      # position-based indexing
r4; r5; r6; r7

In [None]:
# [3] Boolean masks on both axes: a list for rows, an ndarray for columns.

d = {'name':['kim','lee','park','song'],'sex':list('WMWM'),'age':[20, 40, 35, 25]}
df = pd.DataFrame(d)

row = [False, True, False, True]
col = np.array([True, False, True])  # one flag per column: name, sex, age
r8 = df.loc[row, ['name', 'age']]    # columns by label
r9 = df.loc[row, col]                # columns by boolean mask
r10 = df.iloc[row, [0, 2]]           # columns by position
r11 = df.iloc[row, col]              # columns by boolean mask
disp('df', 'r8', 'r9', 'r10','r11')

### [예제3] DataFrame / Series의 비교 및 논리연산

In [None]:
# [1-1] Comparison operators and comparison methods produce boolean masks.

df = pd.DataFrame({'A':[50,60,70],'B':[100,80,40]})

m1 = df['A'] >= 60      # operator form -> boolean Series
m2 = ~df['A'].lt(60)    # method form: negation of "less than 60"
m3 = df >= 60           # applied to the whole frame -> boolean DataFrame
m1; m2; disp('df', 'm3')

In [None]:
# [2] Combining boolean masks.
# Element-wise logic uses ~, & and |; the Python keywords not / and / or do not work on a Series.

df = pd.DataFrame({'A':[50,60,70],'B':[100,80,40]})

# Parenthesise each comparison: & binds tighter than >= and <=.
m4 = (df['A']>=60) & (df['A']<=80)

# `and` needs a single truth value for its left operand, which a multi-element
# Series cannot provide, so this raises ValueError. Catch and print it so the
# rest of the cell (and Restart & Run All) still executes.
try:
    m5 = (df['A']>=60) and (df['A']<=80)
except ValueError as err:
    print('ValueError:', err)
df; m4

### [예제4] Boolean vector : Series, DataFrame 

In [None]:
# [1-1] A boolean Series (which carries an index) used as a row mask.

df = pd.DataFrame({'kor':[50,60,70],'eng':[100,90,40]}, index=['kim','lee','park'])

m1 = df['kor']>=60  # boolean Series mask (has an index)
r1 = df[m1]
r2 = df.loc[m1]
# .iloc is positional and rejects a boolean Series that carries an index.
# Catch and print the error so the remaining lines of the cell still run.
try:
    r3 = df.iloc[m1]
except (ValueError, NotImplementedError) as err:
    print(type(err).__name__ + ':', err)
r4 = df.iloc[m1.to_list()]  # to_list() turns the Series into a plain list [F, T, T] (no index)
r5 = df['kor'][m1]
r6 = df.loc[m1, 'kor']
m1
disp('df', 'r1', 'r2', 'r4'); r5; r6

In [None]:
# [1-2] A boolean Series mask is aligned to the target by index label.

df = pd.DataFrame({'kor':[50,60,70],'eng':[100,90,40]}, index=['kim','lee','park'])

m2 = pd.Series([True,False,True],index=['park','lee','kim'])  # same labels as df, different order
m3 = pd.Series([True,False,True],index=['kim','lee','song'])  # 'song' is not in df and 'park' is absent
r7 = df['kor'][m2]  # works: every label of df is present in m2

# m3 cannot be aligned because it has no entry for 'park'. The error is caught
# broadly and its type printed, so the demonstration does not abort the cell.
try:
    r8 = df['kor'][m3]
except Exception as err:
    print(type(err).__name__ + ':', err)

r9 = df[m2]  # works, but pandas may warn that the key is reindexed to match df's index

try:
    r10 = df[m3]
except Exception as err:
    print(type(err).__name__ + ':', err)
m2; m3; r7; r9

In [None]:
# [2] A boolean DataFrame used as a mask.

df = pd.DataFrame({'kor':[50,60,70],'eng':[100,90,40]}, index=['kim','lee','park'])

m4 = df >= 60          # boolean DataFrame with the same shape as df
m5 = m4.drop('park')   # the same mask with the 'park' row removed
r11 = df[m4]           # index df with the full mask
r12 = df[m5]           # index df with the mask that lacks the 'park' row

disp('df', 'm4', 'r11')
disp('df', 'm5', 'r12')

In [None]:
# [3] A mask built by comparing the index labels.

df = pd.DataFrame({'kor':[50,60,70],'eng':[100,90,40]}, index=['kim','lee','park'])

m6 = df.index == 'kim'  # element-wise comparison of the index labels with 'kim'
r13 = df[m6]
m6; disp('df', 'r13')

### [예제5] Series의 .str accessor

In [None]:
# Build masks from string tests on the 'name' column, then filter with each mask.
df = pd.DataFrame({'name':['kim','lee','ki'], 'age':[20,30,25]})

m1 = df['name'] != 'lee'                      # plain inequality
m2 = df['name'].str.contains('a|i')           # pattern: contains 'a' or 'i'
m3 = df['name'].str.endswith(('m','i'))       # a tuple of suffixes is passed
m4 = df['name'].str.startswith('k')           # prefix test
r1 = df[m1]
r2 = df[m2]
r3 = df[m3]
r4 = df[m4]

m1; m2; m3; m4
disp('df', 'r1', 'r2', 'r3', 'r4')

### [예제6] 사용자 함수에 의한 Indexing

In [None]:
# [1-1] Indexing a Series with a callable.

df = pd.DataFrame({'kor':[50,60,70],'eng':[100,90,40]}, index=['kim','lee','park'])
df

def f1(x): 
    """Print the type and value of the argument it is called with, then return the key 'kor'."""
    print(type(x))
    print(x)
    return 'kor'

sr = df.loc['kim']  # the 'kim' row as a Series
r1 = sr[f1]         # the prints inside f1 show what the callable receives
r1

In [None]:
# [1-2] Indexing a DataFrame with a callable.

df = pd.DataFrame({'kor':[50,60,70],'eng':[100,90,40]}, index=['kim','lee','park'])
df

def f1(x): 
    """Print the type and value of the argument it is called with, then return the key 'kor'."""
    print(type(x))
    print(x)
    return 'kor'

r2 = df[f1]  # the prints inside f1 show what the callable receives
r2

In [None]:
# [2] A callable that returns a boolean mask.

df = pd.DataFrame({'kor':[50,60,70],'eng':[100,90,40]}, index=['kim','lee','park'])

def f2(x) :
    """Return a mask that is True where the row-wise mean of x is at least 60."""
    return x.mean(axis=1) >=60

r3 = df[f2] 

disp('df', 'r3')
print(df.mean(axis=1))      # row means
print(df.mean(axis=1)>=60)  # the same expression f2 evaluates

### [예제7] 조건에 따른 집계 및 편집

In [None]:
# [1] Count the rows scoring 60 or more: in total (r1) and per class (r2).

d = {'class': list('ABAB'), 'exam': [90, 60, 50, 80]}
df = pd.DataFrame(d, index=['kim', 'lee', 'park', 'song'])

# .ge(60) is the method form of >= 60; the 'class' column of matching rows is aggregated.
r1 = df.loc[df['exam'].ge(60), 'class'].count()
r2 = df.loc[df['exam'].ge(60), 'class'].value_counts()
df; r1; r2

In [None]:
# [2] Fill a new 'pass' column by condition: 'fail' below 70, otherwise the score itself.

d = {'class': list('ABAB'), 'exam': [90, 60, 50, 80]}
df = pd.DataFrame(d, index=['kim', 'lee', 'park', 'song'])

# .lt / .ge are the method forms of < and >=.
df.loc[df['exam'].lt(70), 'pass'] = 'fail'
df.loc[df['exam'].ge(70), 'pass'] = df['exam']
df

In [None]:
# [3] Replace every score of 60 or below with the string 'fail'.

d = {'kor':[40,90,50],'eng':[80,100,30]}
df2 = pd.DataFrame(d,index=['kim','lee','park'])
df2

# The columns are int64; writing a string into them relies on silent upcasting,
# which pandas has deprecated. Convert to object dtype first so the mixed
# int/str content is explicit.
df2 = df2.astype(object)
df2[ df2 <= 60 ] = 'fail'
df2

In [None]:
# [4] Select the rows that contain at least one 'fail'.

d = {'kor':[40,90,50],'eng':[80,100,30]}
df2 = pd.DataFrame(d,index=['kim','lee','park'])
df2

# The columns are int64; writing a string into them relies on silent upcasting,
# which pandas has deprecated. Convert to object dtype first so the mixed
# int/str content is explicit.
df2 = df2.astype(object)
df2[ df2 <= 60 ] = 'fail'
# eq('fail') gives a boolean frame; any(axis=1) reduces it to one flag per row.
r3 = df2[df2.eq('fail').any(axis=1)]
r3

print(df2.eq('fail').any(axis=1) )

### [예제8] isin() 메서드

In [None]:
# [m1, r1] Series.isin: membership test on one column, used as a row mask.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})

m1 = df['A'].isin([100,90])  # True where A is 100 or 90
r1 = df[m1]
m1; disp('df', 'r1')

In [None]:
# [m2, r2] DataFrame.isin: membership test on every cell.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})

m2 = df.isin([100,90])  # boolean DataFrame: True where the cell is 100 or 90
# A bare scalar is rejected by DataFrame.isin. Catch and print the TypeError so
# the rest of the cell still runs.
try:
    m3 = df.isin(100)
except TypeError as err:
    print('TypeError:', err)
r2 = df[m2]
disp('df', 'm2', 'r2')

### [예제9] all(), any() 메서드

In [None]:
# [m1, m2, m3] Reducing a boolean DataFrame with all() and any().

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})

m1 = df.isin([100, 90])  # cell-wise membership mask
m2 = m1.all(axis=1)      # per row: True only if every cell in the row is True
m3 = m1.any()            # per column (default axis): True if any cell in the column is True
disp('df', 'm1'); m2; m3

In [None]:
# [r1, r2] Using the reduced masks to filter rows and columns.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})

m1 = df.isin([100, 90])
m2 = m1.all(axis=1)  # one flag per row
m3 = m1.any()        # one flag per column

r1 = df[m2]          # row filter
r2 = df.loc[:, m3]   # column filter: all rows, columns flagged True
disp('df', 'r1', 'r2')

In [None]:
# [m4] any() on a boolean Series collapses it to a single True/False.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})

m4 = df['A'].isin([100]).any()  # does column A contain 100 anywhere?
m4

### [예제10] isin() 메서드 동작 이해

In [None]:
# [m1] isin with a dict: each column is tested against the list stored under its own key.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})
dic = {'A':[100,50],'B':[100,70]}
m1 = df.isin(dic)  # column A vs [100, 50], column B vs [100, 70]
disp('df', 'm1')

In [None]:
# [m2] isin with a Series: values are matched per index label, not as a flat set.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})
sr1 = pd.Series([50,90,70])
m2 = df.isin(sr1)  # a cell is True when it equals the sr1 value at the same index label
sr1; disp('df', 'm2')

In [None]:
# [m3] isin with a DataFrame: a cell matches only at the same index and column label.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})
df1 = pd.DataFrame({'A':[100,80],'B':[50,100]})  # only has rows 0 and 1
m3 = df.isin(df1)
disp('df','df1','m3')

In [None]:
# [m4] Keep the rows of df that have at least one cell matching df1.

df = pd.DataFrame({'A':[100,90,80],'B':[50,100,70]})
df1 = pd.DataFrame({'A':[100,80],'B':[50,100]})
m3 = df.isin(df1)  # cell-wise match against df1


m4 = m3.any(axis=1)  # one flag per row: True if any cell in the row matched
r1 = df[m4]

m4; disp('df', 'df1', 'm3', 'r1')

### [예제11] duplicated() 메서드

In [None]:
# [1] duplicated(): flag rows that repeat an earlier row, then select them.

df = pd.DataFrame({'name':['kim','lee','park','kim'],'id':[10, 15, 20, 10]})
m1 = df.duplicated()  # default keep='first': later repeats are flagged True
df1 = df[m1]          # only the duplicate rows
m1; disp('df', 'df1')

In [None]:
# [2] Negate the duplicated() mask to drop the repeated rows.

df = pd.DataFrame({'name':['kim','lee','park','kim'],'id':[10, 15, 20, 10]})
m2 = ~df.duplicated()  # True for rows that are not flagged as duplicates
df2 = df[m2]
m2; disp('df', 'df2')

### [예제12] duplicated() 메서드 동작 이해

In [None]:
# [1] keep='last': the last occurrence is treated as the original, earlier ones are flagged.

df = pd.DataFrame({'name':['kim','lee','park','kim'],'id':[10, 15, 20, 10]})
m1 = df.duplicated(keep = 'last')
df1 = df[~m1]  # rows not flagged by m1
m1; disp('df', 'df1')

In [None]:
# [2] keep=False: every member of a duplicate group is flagged.

df = pd.DataFrame({'name':['kim','lee','park','kim'],'id':[10, 15, 20, 10]})
m2 = df.duplicated(keep = False)
df2 = df[~m2]  # only rows that occur exactly once remain
m2; disp('df', 'df2')

In [None]:
# [3] Judge duplicates on the 'name' column only, in two ways.

df = pd.DataFrame({'name':['kim','lee','park','kim'],'id':[10, 15, 20, 10]})
m3 = df.duplicated(['name'])    # DataFrame method restricted to the 'name' column
m4 = df['name'].duplicated()    # Series method on the 'name' column
df3, df4 = df[~m3], df[~m4]
m3; m4; disp('df', 'df3', 'df4')

In [None]:
# [4] Duplicates among the index labels.

df = pd.DataFrame({'name':['kim','lee','park','kim'],'id':[10, 15, 20, 10]})
ndf = df.set_index('name')       # 'name' becomes the index, so 'kim' appears twice in it
m5 = ndf.index.duplicated()      # mask of repeated index labels
df5 = ndf[~m5]                   # rows whose index label is not flagged
m5; disp('ndf', 'df5')

### [예제13] 조건에 따른 필터링 활용

In [None]:
# [1] Combine an equality test and a membership test with &.

d = {'grade':[1,1,2,1,2],'major':['math','kor','com','kor','eng'],'exam':[80, np.nan, 90, 100,np.nan]}
df = pd.DataFrame(d, index=['kim','lee','park','song','lew'])

# Rows where grade is 1 AND major is 'kor' or 'math'.
r1 = df[(df['grade']==1) & df['major'].isin(['kor','math'])]
disp('df', 'r1')

In [2]:
# [2] Select the rows whose 'exam' value is missing, in two ways.

d = {'grade':[1,1,2,1,2],'major':['math','kor','com','kor','eng'],'exam':[80, np.nan, 90, 100,np.nan]}
df = pd.DataFrame(d, index=['kim','lee','park','song','lew'])

r2 = df[ df['exam'].isna()]             # explicit missing-value test
r3 = df[ df['exam'].isin([np.nan]) ]    # membership test against NaN; the output below shows the same rows
disp('df', 'r2', 'r3')

Unnamed: 0,grade,major,exam
kim,1,math,80.0
lee,1,kor,
park,2,com,90.0
song,1,kor,100.0
lew,2,eng,

Unnamed: 0,grade,major,exam
lee,1,kor,
lew,2,eng,

Unnamed: 0,grade,major,exam
lee,1,kor,
lew,2,eng,


In [4]:
# [3] Add a 'YN' column: 'NO' where the exam score is missing, 'YES' where it is present.

d = {
    'grade': [1, 1, 2, 1, 2],
    'major': ['math', 'kor', 'com', 'kor', 'eng'],
    'exam': [80, np.nan, 90, 100, np.nan],
}
df = pd.DataFrame(d, index=['kim', 'lee', 'park', 'song', 'lew'])

# isnull / notnull are aliases of isna / notna.
df.loc[df['exam'].isnull(), 'YN'] = 'NO'
df.loc[df['exam'].notnull(), 'YN'] = 'YES'

df

Unnamed: 0,grade,major,exam,YN
kim,1,math,80.0,YES
lee,1,kor,,NO
park,2,com,90.0,YES
song,1,kor,100.0,YES
lew,2,eng,,NO
