In [1]:
import numpy as np
import pandas as pd

# 7.1 누락된 데이터 처리하기



In [2]:
##pandas의 설계 목표 중 하나는 누락 데이터를 가능한 한 쉽게 처리할 수 있도록 하는 것
##산술 데이터에 한해 pandas는 누락된 데이터를 실숫값 NaN으로 취급함

string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
print(string_data)
print(string_data.isnull())

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
0    False
1    False
2     True
3    False
dtype: bool


In [3]:
#파이썬 내장 None 값도 NA 값으로 취급
string_data[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

NA 처리 메서드
*   dropna: 누락된 데이터가 있는 축(로우, 컬럼) 제외, 어느 정도의 누락 데이터까지 용인할 것인지 지정 가능
*   fillna: 누락된 데이터를 대신할 값을 채워줌, ffill이면 결측값을 바로 위 값과 동일하게, bfill이면 바로 아래 값과 동일하게 채움
*   isnull: 결측치를 알려 주는 불리언값이 저장된 같은 형의 객체 반환
*   notnull: isnull의 반대 메서드

## 7.1.1 누락된 데이터 골라내기

In [4]:
from numpy import nan as NA

In [5]:
data = pd.Series([1, NA, 3.5, NA, 7])
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
#동일
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],[NA, NA, NA], [NA, 6.5, 3.]])
cleaned = data.dropna()
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [9]:
#how = 'all'옵션을 넘기면 전부 NA인 행만 제외
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [10]:
data[4] = NA
data


Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [11]:
data.dropna(axis = 1, how= 'all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [12]:
df = pd.DataFrame(np.random.randn(7,3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df


Unnamed: 0,0,1,2
0,-0.739638,,
1,0.697106,,
2,0.55644,,0.078912
3,1.14372,,0.887767
4,0.110493,-1.03032,1.286908
5,-0.753449,-0.276157,0.439767
6,0.562137,-1.0654,0.233949


In [13]:
df.dropna()

Unnamed: 0,0,1,2
4,0.110493,-1.03032,1.286908
5,-0.753449,-0.276157,0.439767
6,0.562137,-1.0654,0.233949


In [14]:
df.dropna(thresh=2) ##2개 이상의 값이 있는 행만 살펴봄

Unnamed: 0,0,1,2
2,0.55644,,0.078912
3,1.14372,,0.887767
4,0.110493,-1.03032,1.286908
5,-0.753449,-0.276157,0.439767
6,0.562137,-1.0654,0.233949


##7.1.2 결측치 채우기

In [15]:
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.739638,0.0,0.0
1,0.697106,0.0,0.0
2,0.55644,0.0,0.078912
3,1.14372,0.0,0.887767
4,0.110493,-1.03032,1.286908
5,-0.753449,-0.276157,0.439767
6,0.562137,-1.0654,0.233949


In [16]:
df.fillna({1:0.5, 2:0})

Unnamed: 0,0,1,2
0,-0.739638,0.5,0.0
1,0.697106,0.5,0.0
2,0.55644,0.5,0.078912
3,1.14372,0.5,0.887767
4,0.110493,-1.03032,1.286908
5,-0.753449,-0.276157,0.439767
6,0.562137,-1.0654,0.233949


In [17]:
_ = df.fillna(0, inplace = True) #fillna()로 새 객체를 반환하는 대신 기존의 객체를 변경
df

Unnamed: 0,0,1,2
0,-0.739638,0.0,0.0
1,0.697106,0.0,0.0
2,0.55644,0.0,0.078912
3,1.14372,0.0,0.887767
4,0.110493,-1.03032,1.286908
5,-0.753449,-0.276157,0.439767
6,0.562137,-1.0654,0.233949


In [18]:
df=pd.DataFrame(np.random.randn(6,3))
df.iloc[2:,1] = NA
df.iloc[4:,2] = NA
df

Unnamed: 0,0,1,2
0,1.561908,-0.174672,-0.0158
1,0.678694,-0.008552,0.789082
2,-0.431197,,-1.580461
3,-1.41783,,0.053433
4,0.320052,,
5,0.501058,,


In [19]:
df.fillna(method = 'ffill')

Unnamed: 0,0,1,2
0,1.561908,-0.174672,-0.0158
1,0.678694,-0.008552,0.789082
2,-0.431197,-0.008552,-1.580461
3,-1.41783,-0.008552,0.053433
4,0.320052,-0.008552,0.053433
5,0.501058,-0.008552,0.053433


In [20]:
df.fillna(method = 'ffill', limit=2)

Unnamed: 0,0,1,2
0,1.561908,-0.174672,-0.0158
1,0.678694,-0.008552,0.789082
2,-0.431197,-0.008552,-1.580461
3,-1.41783,-0.008552,0.053433
4,0.320052,,0.053433
5,0.501058,,0.053433


In [21]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

fillna
*   value: 비어 있는 값을 채울 객체
*   method: 보간 방식, 기본적으로 'ffill'을 사용
*   axis: 값을 채워 넣을 축. 기본값은 axis=0(칼럼)이다
*   inplace: 복사본을 생성하지 않고 객체 자체를 변경
*   limit: 값을 앞 혹은 뒤에서부터 몇 개까지 채울지 결정 

#7.2 데이터 변형

##7.2.1 중복 제거하기

In [22]:
data = pd.DataFrame({'k1':['one','two']*3+['two'],'k2': [1,1,2,3,3,4,4]})
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [23]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [24]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [25]:
data['v1']=range(7)
data.drop_duplicates(['k1']) ## 'k1'칼럼에 기반하여 중복 제거

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [26]:
data.drop_duplicates(['k1', 'k2'], keep = 'last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


##7.2.2 함수나 매핑을 이용해서 데이터 변형하기


In [27]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon','Pastrami','corned beef','Bacon', 'pastrami', 'honey ham', 'nova lox'],
                     'ounces': [4,3,12,6,7.5,8,3,5,6]})
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [28]:
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham' : 'pig',
    'nova lox' : 'salmon'
}

In [29]:
lowercased = data['food'].str.lower()
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [30]:
data['animal'] = lowercased.map(meat_to_animal)
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [31]:
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

##7.2.3 값 치환하기

In [32]:
##replace 메서드 사용
data = pd.Series([1., -999, 2., -999, -1000., 3.]) ##누락된 데이터 표기를 위해 -999를 사용한 경우
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [33]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [34]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [35]:
data.replace({-999:np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

##7.2.4 축 색인 이름 바꾸기


In [36]:
data = pd.DataFrame(np.arange(12).reshape((3,4)),
                    index=['Ohio', 'Colorado', 'New York'],
                    columns=['one','two','three','four'])
transform = lambda x: x[:4].upper()
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [37]:
data.index = data.index.map(transform)
data

Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [38]:
##새로운 객체를 생성하려면 rename 메서드 사용
data.rename(index=str.title, columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


In [39]:
##축 이름 중 일부만 변경하는 것도 가능
data.rename(index={'OHIO':'INDIANA'}, columns={'three':'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [40]:
##inplace를 이용하면 원본 변경 가능
data.rename(index={'OHIO':'INDIANA'}, inplace=True)
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


##7.2.5 개별화와 양자화
연속적인 데이터를 개별로 분할하거나, 분석을 위해 그룹별로 나누기도 함



In [41]:
ages = [20,22,25,27,21,23,37,61,45,41,32]
bins=[18,25,35,60,100]
cats=pd.cut(ages,bins)

In [42]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (35, 60], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 11
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [43]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 3, 2, 2, 1], dtype=int8)

In [44]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [45]:
pd.value_counts(cats)

(18, 25]     5
(35, 60]     3
(25, 35]     2
(60, 100]    1
dtype: int64

In [46]:
##중괄호 쪽의 값은 포함하지 않고, 대괄호 쪽의 값은 포함하는 간격(열림/닫힘)
pd.cut(ages, [18,26,36,61,100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [36, 61), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 11
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

In [47]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']
pd.cut(ages,bins,labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'MiddleAged', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 11
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

In [48]:
data = np.random.rand(20)
pd.cut(data, 4, precision=2) ##precision: 소수점 자릿수 제한


[(0.71, 0.94], (0.48, 0.71], (0.71, 0.94], (0.025, 0.26], (0.26, 0.48], ..., (0.71, 0.94], (0.26, 0.48], (0.48, 0.71], (0.48, 0.71], (0.26, 0.48]]
Length: 20
Categories (4, interval[float64, right]): [(0.025, 0.26] < (0.26, 0.48] < (0.48, 0.71] <
                                           (0.71, 0.94]]

In [49]:
##qcut은 정규화한 표준 변위차를 바탕으로 

data = np.random.randn(1000) #정규분포
cats = pd.qcut(data,4)
cats

[(-0.671, 0.0576], (0.0576, 0.715], (0.0576, 0.715], (-3.138, -0.671], (-0.671, 0.0576], ..., (0.715, 3.011], (-3.138, -0.671], (-0.671, 0.0576], (0.715, 3.011], (-3.138, -0.671]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.138, -0.671] < (-0.671, 0.0576] < (0.0576, 0.715] <
                                           (0.715, 3.011]]

In [50]:
pd.value_counts(cats)

(-3.138, -0.671]    250
(-0.671, 0.0576]    250
(0.0576, 0.715]     250
(0.715, 3.011]      250
dtype: int64

In [51]:
pd.qcut(data,[0,0.1,0.5,0.9,1.])

[(-1.241, 0.0576], (0.0576, 1.233], (0.0576, 1.233], (-1.241, 0.0576], (-1.241, 0.0576], ..., (1.233, 3.011], (-3.138, -1.241], (-1.241, 0.0576], (0.0576, 1.233], (-3.138, -1.241]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.138, -1.241] < (-1.241, 0.0576] < (0.0576, 1.233] <
                                           (1.233, 3.011]]

##7.2.6 특잇값을 찾고 제외하기
배열 연산에서는 아웃라이어를 제외하거나 적당한 값으로 대체하는 것이 중요

In [52]:
data = pd.DataFrame(np.random.randn(1000,4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.019059,0.022662,-0.019243,0.071531
std,1.007392,0.976147,0.981749,0.937397
min,-2.954506,-2.804135,-2.90575,-3.355393
25%,-0.673385,-0.637427,-0.704298,-0.562642
50%,0.043821,0.062097,0.015585,0.061969
75%,0.668256,0.67935,0.664745,0.686645
max,3.230509,2.912418,3.700349,3.287704


In [53]:
##한 컬럼에서 절댓값이 3 초과하는 값 찾아내기
col = data[2]
col[np.abs(col)>3]


721    3.700349
Name: 2, dtype: float64

In [54]:
data[(np.abs(data) > 3).any(1)]

  data[(np.abs(data) > 3).any(1)]


Unnamed: 0,0,1,2,3
239,-0.225113,2.635817,-0.765633,3.287704
329,0.838882,-0.619663,-0.044909,-3.355393
698,0.508583,0.323504,0.42721,-3.174014
721,0.845181,0.840077,3.700349,0.855631
999,3.230509,1.125893,-0.640857,0.352232


In [55]:
data[np.abs(data) > 3] = np.sign(data)*3
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.018828,0.022662,-0.019943,0.071772
std,1.006683,0.976147,0.97934,0.934629
min,-2.954506,-2.804135,-2.90575,-3.0
25%,-0.673385,-0.637427,-0.704298,-0.562642
50%,0.043821,0.062097,0.015585,0.061969
75%,0.668256,0.67935,0.664745,0.686645
max,3.0,2.912418,3.0,3.0


In [56]:
##np.sign()은 data의 값이 양수인지 음수인지에 따라 1이나 -1이 담긴 배열 반환
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,-1.0,-1.0,1.0,1.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,-1.0
3,-1.0,1.0,-1.0,-1.0
4,-1.0,1.0,-1.0,1.0


##7.2.7 치환과 임의 샘플링




In [57]:
##로우를 쉽게 임의 순서로 재배치 가능
df = pd.DataFrame(np.arange(5 * 4).reshape((5,4)))
sampler = np.random.permutation(5)
sampler

array([2, 4, 0, 1, 3])

In [58]:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [59]:
df.take(sampler)

Unnamed: 0,0,1,2,3
2,8,9,10,11
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15


In [60]:
#일부만 임의로 선택하려면 Series나 DataFrame의 sample 메서드 사용
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
0,0,1,2,3


In [61]:
##표본을 치환을 통해 생성해내려면 sample에 replace=True
choices = pd.Series([5,7,-1,6,4])
draws = choices.sample(n=10, replace=True)
draws

4    4
2   -1
1    7
1    7
0    5
2   -1
4    4
4    4
4    4
3    6
dtype: int64

## 7.2.8 표시자/더미 변수 계산하기
분류값을 더미나 표시자 행렬로 전환


In [63]:
df = pd.DataFrame({'key': ['b','b','a','c','a','b'], 'data1': range(6)})
pd.get_dummies(df['key'])
                  

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [64]:
dummies = pd.get_dummies(df['key'], prefix = 'key')
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [67]:
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('movies.csv', sep = ',',
                       header = None, names = mnames)
movies[:10]

Unnamed: 0,movie_id,title,genres
0,movieId,title,genres
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,2,Jumanji (1995),Adventure|Children|Fantasy
3,3,Grumpier Old Men (1995),Comedy|Romance
4,4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,5,Father of the Bride Part II (1995),Comedy
6,6,Heat (1995),Action|Crime|Thriller
7,7,Sabrina (1995),Comedy|Romance
8,8,Tom and Huck (1995),Adventure|Children
9,9,Sudden Death (1995),Action


In [69]:
all_genres = []
for x in movies.genres:
  all_genres.extend(x.split('|'))
genres = pd.unique(all_genres)
genres

array(['genres', 'Adventure', 'Animation', 'Children', 'Comedy',
       'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller',
       'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary',
       'IMAX', 'Western', 'Film-Noir', '(no genres listed)'], dtype=object)

In [72]:
zero_matrix = np.zeros((len(movies), len(genres)))
dummies = pd.DataFrame(zero_matrix, columns=genres)

In [73]:
gen = movies.genres[0]
gen.split('|')

['genres']

In [77]:
dummies.columns.get_indexer(gen.split('|'))

array([4])

In [78]:
for i, gen in enumerate(movies.genres):
  indices = dummies.columns.get_indexer(gen.split('|'))
  dummies.iloc[i,indices] = 1

In [79]:
movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]

movie_id                    movieId
title                         title
genres                       genres
Genre_genres                    1.0
Genre_Adventure                 0.0
Genre_Animation                 0.0
Genre_Children                  0.0
Genre_Comedy                    0.0
Genre_Fantasy                   0.0
Genre_Romance                   0.0
Genre_Drama                     0.0
Genre_Action                    0.0
Genre_Crime                     0.0
Genre_Thriller                  0.0
Genre_Horror                    0.0
Genre_Mystery                   0.0
Genre_Sci-Fi                    0.0
Genre_War                       0.0
Genre_Musical                   0.0
Genre_Documentary               0.0
Genre_IMAX                      0.0
Genre_Western                   0.0
Genre_Film-Noir                 0.0
Genre_(no genres listed)        0.0
Name: 0, dtype: object

In [80]:
np.random.seed(12345)
values = np.random.rand(10)
values

array([0.92961609, 0.31637555, 0.18391881, 0.20456028, 0.56772503,
       0.5955447 , 0.96451452, 0.6531771 , 0.74890664, 0.65356987])

In [81]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,1
1,0,1,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,0,1,0,0
5,0,0,1,0,0
6,0,0,0,0,1
7,0,0,0,1,0
8,0,0,0,1,0
9,0,0,0,1,0


#7.3 문자열 다루기
pandas는 배열 데이터 전체에 쉽게 정규 표현식을 적용하고, 누락된 데이터를 편리하게 처리할 수 있는 기능을 포함하고 있음

##7.3.1 문자열 객체 메서드

In [82]:
## 대부분의 경우 내장 문자열 메서드만으로도 충분함

val = 'a, b, guido'
val.split(',')

['a', ' b', ' guido']

In [83]:
pieces = [x.strip() for x in val.split(',')]
pieces

['a', 'b', 'guido']

In [84]:
first, second, third = pieces
##first + '::' + second + '::' third
'::'.join(pieces)

'a::b::guido'

In [85]:
'guido' in val

True

In [86]:
val.index(',') # 찾지 못하면 예외 발생

1

In [87]:
val.find(':')

-1

In [88]:
val.count(',')

2

In [90]:
val.replace(',', ':')

'a: b: guido'

## 7.3.2 정규 표현식
re 모듈 함수는 패턴 매칭, 치환, 분리 3가지로 나눌 수 있음

텍스트 내에 존재하는 패턴을 표현하고 이를 여러 목적으로 사용할 수 있도록 함

In [91]:
import re

In [93]:
text = "foo bar\t baz \tqux"
re.split('\s+', text) ## 1개 이상의 whitespace 문자와 매치
##먼저 정규 표현식이 컴파일되고 그 다음에 메서드가 실행됨

['foo', 'bar', 'baz', 'qux']

In [95]:
regex = re.compile('\s+') ## 같은 정규 표현식을 여러 번 적용해야 할 때 유용
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [96]:
regex.findall(text)

[' ', '\t ', ' \t']

In [98]:
##이메일 주소 검사
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com
"""
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'

#re.IGNORECASE로 대소문자를 가리지 않도록 하기
regex = re.compile(pattern, flags = re.IGNORECASE)

In [99]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [100]:
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [101]:
text[m.start():m.end()]

'dave@google.com'

In [102]:
print(regex.match(text))

None


In [103]:
print(regex.sub('REDACTED', text))

Dave REDACTED
Steve REDACTED
Rob REDACTED
Ryan REDACTED



In [104]:
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex = re.compile(pattern, flags = re.IGNORECASE)

In [105]:
m = regex.match('wesm@bright.net')
m.groups() ##각 패턴 요소로 이루어진 튜플 얻을 수 있음

('wesm', 'bright', 'net')

In [106]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [107]:
print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text))
## \n은 n번째로 찾은 그룹을 의미

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com



In [108]:
data = {'Dave': 'dave@google.com', 'Steve':'steve@gmail.com',
        'Rob': 'rob@gmail.com', 'Wes': np.nan}
data = pd.Series(data)
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [109]:
data.isnull()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [111]:
## data.map을 사용해서 메서드를 각 값에 적용할 수 있으나 NA 값을 만나면 실패함
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [112]:
pattern

'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'

In [113]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [114]:
## 벡터화된 요소 꺼내오기

matches = data.str.match(pattern, flags = re.IGNORECASE)
matches

Dave     True
Steve    True
Rob      True
Wes       NaN
dtype: object

In [124]:
##matches.str.get(1)

True

In [127]:
##matches.str[0]

True

In [118]:
## 문자열 잘라내기
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object