# Data Pre-Processing
- 결측치 확인
- 결측치 처리 (결측치가 포함된 행 제거, 결측치 값을 대체(기본값/평균값))
- 중복값 처리

In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv('data/contacts.csv')
df

Unnamed: 0,Name,Phone,Email
0,김민수,010-1234-5678,minsu.kim@gmail.com
1,이지은,010-2345-6789,jieun.lee@naver.com
2,박철수,010-3456-7890,chulsoo.park@hotmail.com
3,홍길동,010-4567-8901,gildong.hong@daum.net
4,김영희,010-5678-9012,younghee.kim@gmail.com
...,...,...,...
72,범수정,010-3456-7892,soojeong.beom@gmail.com
73,이호진,010-4567-8904,hojin.lee@daum.net
74,정지윤,010-5678-9015,jungzee@naver.com
75,김지현,010-6789-0126,jh.kim@gmail.com


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    77 non-null     object
 1   Phone   71 non-null     object
 2   Email   76 non-null     object
dtypes: object(3)
memory usage: 1.9+ KB


In [4]:
df.describe()

Unnamed: 0,Name,Phone,Email
count,77,71,76
unique,75,63,75
top,박철수,010-2345-6789,dahyun.jung@naver.com
freq,2,2,2


In [6]:
df[df['Name'] == '박철수']

Unnamed: 0,Name,Phone,Email
2,박철수,010-3456-7890,chulsoo.park@hotmail.com
31,박철수,010-1234-5681,chulsoo.park@gmail.com


In [9]:
df[df['Phone'] == '010-2345-6789']

Unnamed: 0,Name,Phone,Email
1,이지은,010-2345-6789,jieun.lee@naver.com
71,엄정희,010-2345-6789,jeonghee.eom@hotmail.com


In [12]:
df[df['Email'] == 'dahyun.jung@naver.com']

Unnamed: 0,Name,Phone,Email
32,정다현,010-2345-6782,dahyun.jung@naver.com
67,정다현,010-2345-6782,dahyun.jung@naver.com


## 중복데이터 처리
# dupliated()는 기본적으로 모든 컬럼값이 같을 때 중복으로 간주한다.
# 결과가 1이라면, 2개 데이터가 중복이다.
# 결과가 2라면, a데이터가 3번 중복된 것 일 수도 있고, a와 b데이터가 각각 2번 중복일 수 있다.

In [14]:
df[df.duplicated()]

Unnamed: 0,Name,Phone,Email
67,정다현,010-2345-6782,dahyun.jung@naver.com


In [16]:
df.duplicated().sum()

np.int64(1)

In [17]:
df[df['Name'] == '정다현']

Unnamed: 0,Name,Phone,Email
32,정다현,010-2345-6782,dahyun.jung@naver.com
67,정다현,010-2345-6782,dahyun.jung@naver.com


In [18]:
# 특정 컬럼을 지정해서 중복 데이터 찾기
df[df.duplicated(['Name'])]

Unnamed: 0,Name,Phone,Email
31,박철수,010-1234-5681,chulsoo.park@gmail.com
67,정다현,010-2345-6782,dahyun.jung@naver.com


In [19]:
# 중복 데이터 제거
df.drop_duplicates()

Unnamed: 0,Name,Phone,Email
0,김민수,010-1234-5678,minsu.kim@gmail.com
1,이지은,010-2345-6789,jieun.lee@naver.com
2,박철수,010-3456-7890,chulsoo.park@hotmail.com
3,홍길동,010-4567-8901,gildong.hong@daum.net
4,김영희,010-5678-9012,younghee.kim@gmail.com
...,...,...,...
72,범수정,010-3456-7892,soojeong.beom@gmail.com
73,이호진,010-4567-8904,hojin.lee@daum.net
74,정지윤,010-5678-9015,jungzee@naver.com
75,김지현,010-6789-0126,jh.kim@gmail.com


In [22]:
# 공백이 생기게 되면 처리에 좋지 않다. index 초기화, 지워진 행 다시 채우기
df.drop_duplicates(). reset_index(drop = True) # index 지우기까지
df

Unnamed: 0,Name,Phone,Email
0,김민수,010-1234-5678,minsu.kim@gmail.com
1,이지은,010-2345-6789,jieun.lee@naver.com
2,박철수,010-3456-7890,chulsoo.park@hotmail.com
3,홍길동,010-4567-8901,gildong.hong@daum.net
4,김영희,010-5678-9012,younghee.kim@gmail.com
...,...,...,...
72,범수정,010-3456-7892,soojeong.beom@gmail.com
73,이호진,010-4567-8904,hojin.lee@daum.net
74,정지윤,010-5678-9015,jungzee@naver.com
75,김지현,010-6789-0126,jh.kim@gmail.com


## 결측치 처리



In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    77 non-null     object
 1   Phone   71 non-null     object
 2   Email   76 non-null     object
dtypes: object(3)
memory usage: 1.9+ KB


In [24]:
# 1. isna() - 결측치가 있는지
print(df.isna().sum())

Name     0
Phone    6
Email    1
dtype: int64


In [25]:
# 2. isnull()
df.isnull()
print(df.isnull().sum())

Name     0
Phone    6
Email    1
dtype: int64


In [39]:
# 대표값으로 NaN 값 대체하기
# - 평균값
# - 기본값 010-0000-0000
# - 최빈값
df['Phone'].fillna('010-0000-0000') # 빈 부분을 기본값으로 채워 넣겠다.

df['Phone'].value_counts()
df['Phone'] = df['Phone'].fillna('010-2345-6789') # 최빈값
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 71 entries, 0 to 76
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    71 non-null     object
 1   Phone   71 non-null     object
 2   Email   71 non-null     object
 3   phone   71 non-null     object
dtypes: object(4)
memory usage: 2.8+ KB


In [36]:
# dropna() : 결측치가 포함된 행을 제거
# - how='any' : 하나만 비어있으면 삭제
# - how='all' : 다 비어있어야 삭제
df = df.dropna(how='any')
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 71 entries, 0 to 76
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    71 non-null     object
 1   Phone   71 non-null     object
 2   Email   71 non-null     object
 3   phone   71 non-null     object
dtypes: object(4)
memory usage: 2.8+ KB
