# DataFrame정제
#### NaN으로 빠진 값이나 정상적이지 않은 값(결측치(missing value), 이상치)의 정제

In [1]:
# 이거 두개는 가장 기본적으로 쓰임
import numpy as np
import pandas as pd 

In [2]:
# Build a DataFrame from NumPy random numbers.
# np.random.rand samples uniformly from the half-open interval [0, 1):
# 0.0 is possible, 1.0 is never produced.
df = pd.DataFrame(data=np.random.rand(6, 4))
df

Unnamed: 0,0,1,2,3
0,0.011618,0.513256,0.176513,0.755374
1,0.789125,0.756622,0.413726,0.046385
2,0.306606,0.962386,0.63755,0.046896
3,0.223853,0.986955,0.19249,0.716674
4,0.230998,0.31273,0.366025,0.043137
5,0.190612,0.730766,0.034221,0.0911


In [3]:
# Give the frame column names and a date index.
# date_range: start at 2022-07-01 and generate periods=6 consecutive days.
df.index = pd.date_range(start="20220701", periods=6)
df.columns = list("ABCD")
df

Unnamed: 0,A,B,C,D
2022-07-01,0.011618,0.513256,0.176513,0.755374
2022-07-02,0.789125,0.756622,0.413726,0.046385
2022-07-03,0.306606,0.962386,0.63755,0.046896
2022-07-04,0.223853,0.986955,0.19249,0.716674
2022-07-05,0.230998,0.31273,0.366025,0.043137
2022-07-06,0.190612,0.730766,0.034221,0.0911


In [4]:
# Inspect the index: it is a DatetimeIndex (dtype datetime64[ns], daily frequency).
df.index

DatetimeIndex(['2022-07-01', '2022-07-02', '2022-07-03', '2022-07-04',
               '2022-07-05', '2022-07-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Add a new column F; two of its six values are NaN so later cells have
# missing data to clean.
df['F'] = [1.0, np.nan, 3.5, 6.1 ,np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2022-07-01,0.011618,0.513256,0.176513,0.755374,1.0
2022-07-02,0.789125,0.756622,0.413726,0.046385,
2022-07-03,0.306606,0.962386,0.63755,0.046896,3.5
2022-07-04,0.223853,0.986955,0.19249,0.716674,6.1
2022-07-05,0.230998,0.31273,0.366025,0.043137,
2022-07-06,0.190612,0.730766,0.034221,0.0911,7.0


### NaN 값 처리하기

In [6]:
# Drop every row that contains at least one NaN (how="any").
# dropna returns a new frame; df itself is not modified.
df.dropna(axis="index", how="any")

Unnamed: 0,A,B,C,D,F
2022-07-01,0.011618,0.513256,0.176513,0.755374,1.0
2022-07-03,0.306606,0.962386,0.63755,0.046896,3.5
2022-07-04,0.223853,0.986955,0.19249,0.716674,6.1
2022-07-06,0.190612,0.730766,0.034221,0.0911,7.0


In [7]:
# Drop a row only when all of its columns are NaN (how="all").
# No row here is entirely NaN, so nothing is removed.
df.dropna(axis="index", how="all")

Unnamed: 0,A,B,C,D,F
2022-07-01,0.011618,0.513256,0.176513,0.755374,1.0
2022-07-02,0.789125,0.756622,0.413726,0.046385,
2022-07-03,0.306606,0.962386,0.63755,0.046896,3.5
2022-07-04,0.223853,0.986955,0.19249,0.716674,6.1
2022-07-05,0.230998,0.31273,0.366025,0.043137,
2022-07-06,0.190612,0.730766,0.034221,0.0911,7.0


In [8]:
# Replace every NaN with a fixed value (5.0); returns a new frame.
df.fillna(5.0)

Unnamed: 0,A,B,C,D,F
2022-07-01,0.011618,0.513256,0.176513,0.755374,1.0
2022-07-02,0.789125,0.756622,0.413726,0.046385,5.0
2022-07-03,0.306606,0.962386,0.63755,0.046896,3.5
2022-07-04,0.223853,0.986955,0.19249,0.716674,6.1
2022-07-05,0.230998,0.31273,0.366025,0.043137,5.0
2022-07-06,0.190612,0.730766,0.034221,0.0911,7.0


In [9]:
# Boolean mask of missing values, usable for lookup and replacement.
df.isna()  # df.isnull() is an alias

Unnamed: 0,A,B,C,D,F
2022-07-01,False,False,False,False,False
2022-07-02,False,False,False,False,True
2022-07-03,False,False,False,False,False
2022-07-04,False,False,False,False,False
2022-07-05,False,False,False,False,True
2022-07-06,False,False,False,False,False


In [10]:
# Select the rows whose F value is NaN; .loc with a boolean mask is
# convenient for this kind of lookup.
# Equivalent: df.loc[df.isnull()['F'], :]
df.loc[df["F"].isna()]

Unnamed: 0,A,B,C,D,F
2022-07-02,0.789125,0.756622,0.413726,0.046385,
2022-07-05,0.230998,0.31273,0.366025,0.043137,


In [11]:
# Remove a row by its index label (drop returns a new frame).
# Only the last expression of the cell is displayed.
df.drop(index="2022-07-01")
# With a DatetimeIndex, passing a real Timestamp matches the label type
# explicitly instead of relying on string parsing.
df.drop(index=pd.to_datetime("20220701"))

Unnamed: 0,A,B,C,D,F
2022-07-02,0.789125,0.756622,0.413726,0.046385,
2022-07-03,0.306606,0.962386,0.63755,0.046896,3.5
2022-07-04,0.223853,0.986955,0.19249,0.716674,6.1
2022-07-05,0.230998,0.31273,0.366025,0.043137,
2022-07-06,0.190612,0.730766,0.034221,0.0911,7.0


In [12]:
# Remove two or more rows by index label in a single call.
# Equivalent: df.drop(["2022-07-01", "2022-07-02"])
df.drop(index=pd.to_datetime(["20220701", "20220702"]))

Unnamed: 0,A,B,C,D,F
2022-07-03,0.306606,0.962386,0.63755,0.046896,3.5
2022-07-04,0.223853,0.986955,0.19249,0.716674,6.1
2022-07-05,0.230998,0.31273,0.366025,0.043137,
2022-07-06,0.190612,0.730766,0.034221,0.0911,7.0


In [13]:
# Column deletion: work on an independent copy so df keeps its F column.
df2 = df.copy(deep=True)

In [14]:
# del removes the column from df2 immediately, in place (no new frame is returned).
del df2['F']
df2

Unnamed: 0,A,B,C,D
2022-07-01,0.011618,0.513256,0.176513,0.755374
2022-07-02,0.789125,0.756622,0.413726,0.046385
2022-07-03,0.306606,0.962386,0.63755,0.046896
2022-07-04,0.223853,0.986955,0.19249,0.716674
2022-07-05,0.230998,0.31273,0.366025,0.043137
2022-07-06,0.190612,0.730766,0.034221,0.0911


In [15]:
# Drop column F by name; drop returns a new frame and leaves df unchanged.
df.drop("F", axis='columns')

Unnamed: 0,A,B,C,D
2022-07-01,0.011618,0.513256,0.176513,0.755374
2022-07-02,0.789125,0.756622,0.413726,0.046385
2022-07-03,0.306606,0.962386,0.63755,0.046896
2022-07-04,0.223853,0.986955,0.19249,0.716674
2022-07-05,0.230998,0.31273,0.366025,0.043137
2022-07-06,0.190612,0.730766,0.034221,0.0911


In [16]:
# axis='columns' is equivalent to axis=1; a list of names drops several columns at once.
df.drop(["B","F"],axis=1)

Unnamed: 0,A,C,D
2022-07-01,0.011618,0.176513,0.755374
2022-07-02,0.789125,0.413726,0.046385
2022-07-03,0.306606,0.63755,0.046896
2022-07-04,0.223853,0.19249,0.716674
2022-07-05,0.230998,0.366025,0.043137
2022-07-06,0.190612,0.034221,0.0911


### df 만들기

In [17]:
# Raw 4x2 values with several missing entries; row three is entirely NaN.
data = [[1.4, np.nan], [7.0, -4.5], [np.nan, np.nan], [0.75, -1.3]]


In [18]:
# Wrap the raw values in a frame with row labels a-d and columns one/two.
df = pd.DataFrame(data, columns=["one", "two"], index=list("abcd"))
df

Unnamed: 0,one,two
a,1.4,
b,7.0,-4.5
c,,
d,0.75,-1.3


In [19]:
# Sum down the rows, giving one total per column (NaN is skipped).
df.sum(axis=0)

one    9.15
two   -5.80
dtype: float64

In [20]:
# Sum across the columns, giving one total per row.
# The all-NaN row 'c' sums to 0.0 because NaN values are skipped.
df.sum(axis=1)

a    1.40
b    2.50
c    0.00
d   -0.55
dtype: float64

In [21]:
# Total of row 'b', selected by label.
df.loc["b", :].sum()

2.5

In [22]:
# Same total, selecting the row by position (position 1 is label 'b').
df.iloc[1].sum()

2.5

In [23]:
# Mean of each column, computed over the non-NaN values only.
df.mean(axis=0)

one    3.05
two   -2.90
dtype: float64

In [24]:
# Variance of each column.
df.var(axis=0)

one    11.8075
two     5.1200
dtype: float64

In [25]:
# Mean of each row; a row with no valid values at all ('c') yields NaN.
df.mean(axis=1)

a    1.400
b    1.250
c      NaN
d   -0.275
dtype: float64

In [26]:
# pandas reductions skip NaN by default (skipna=True).
# skipna=False keeps NaN in the calculation, so any row that contains a NaN
# produces NaN.
df.mean(axis=1, skipna=False)

a      NaN
b    1.250
c      NaN
d   -0.275
dtype: float64

one의 NaN은 남은 값들의 평균으로 대체하고, two의 NaN은 가장 작은 값으로 대체한다.

In [27]:
# Work on a copy so the original df keeps its NaN values for the next example.
df2 = df.copy()

In [28]:
# Replace NaN with a per-column statistic: the mean for 'one', the minimum
# for 'two'.
# The result is assigned back instead of calling fillna(inplace=True) on
# df2['one']: that form operates on a chained selection, emits a
# FutureWarning in pandas 2.x and no longer updates df2 under Copy-on-Write.
df2['one'] = df2['one'].fillna(value=df2['one'].mean())
df2['two'] = df2['two'].fillna(value=df2['two'].min())
df2

Unnamed: 0,one,two
a,1.4,-4.5
b,7.0,-4.5
c,3.05,-4.5
d,0.75,-1.3


In [29]:
# Mean of column 'one' (NaN skipped).
one_mean = df["one"].mean()
# Minimum of column 'two'. NOTE: despite its name, two_mean holds the
# minimum, not a mean; the name is kept because the next cell uses it.
two_mean = df["two"].min()

In [30]:
# Fill each column's NaN with its precomputed statistic in one call:
# 'one' gets one_mean, 'two' gets two_mean (which holds the column minimum).
df = df.fillna(value={"one": one_mean, "two": two_mean})
df

Unnamed: 0,one,two
a,1.4,-4.5
b,7.0,-4.5
c,3.05,-4.5
d,0.75,-1.3


--- 
# DataFrame Merging(병합)

In [31]:
# Left frame for the merge examples: keys b/a/c repeat, data is 0..6.
df1 = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "a", "b"],
        "data": range(7),
    }
)
df1

Unnamed: 0,key,data
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [32]:
# Right frame for the merge examples: one row per key a/b/d.
df2 = pd.DataFrame(
    {
        "key": ["a", "b", "d"],
        "data2": range(3),
    }
)
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [33]:
# Merge on a shared column: on= names the join key; how= defaults to
# "inner", i.e. only keys present in both frames are kept.
df1.merge(df2, on="key", how="inner")

Unnamed: 0,key,data,data2
0,b,0,1
1,b,1,1
2,b,6,1
3,a,2,0
4,a,4,0
5,a,5,0


> key c와 d는 서로 match 되지 않으므로 출력되지 않음

In [34]:
# Outer join: keep the keys of both frames; values missing on one side
# become NaN.
df1.merge(df2, on="key", how="outer")

Unnamed: 0,key,data,data2
0,b,0.0,1.0
1,b,1.0,1.0
2,b,6.0,1.0
3,a,2.0,0.0
4,a,4.0,0.0
5,a,5.0,0.0
6,c,3.0,
7,d,,2.0


In [35]:
# Left join: keep every row of df1; keys with no match in df2 (here 'c')
# get NaN in data2. This is a join, not a set difference.
df1.merge(df2, on="key", how="left")

Unnamed: 0,key,data,data2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [36]:
# Right join: keep every key of df2; keys with no match in df1 (here 'd')
# get NaN in data. This is a join, not a set difference.
df1.merge(df2, on="key", how="right")

Unnamed: 0,key,data,data2
0,a,2.0,0
1,a,4.0,0
2,a,5.0,0
3,b,0.0,1
4,b,1.0,1
5,b,6.0,1
6,d,,2


### 두개의 dataframe에 중복된 값이 있을경우

In [37]:
# Left frame whose key column contains repeated keys.
df1 = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "b"],
        "data": range(6),
    }
)
df1

Unnamed: 0,key,data
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [38]:
# Right frame that also repeats keys ('a' and 'b' appear twice each).
df2 = pd.DataFrame(
    {
        "key": ["a", "b", "a", "b", "d"],
        "data2": range(5),
    }
)
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [39]:
# Many-to-many inner join: each left row is paired with every matching
# right row (3 'b' rows x 2 'b' rows = 6 rows, 2 'a' x 2 'a' = 4 rows).
df1.merge(df2, on="key", how="inner")

Unnamed: 0,key,data,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,b,5,1
5,b,5,3
6,a,2,0
7,a,2,2
8,a,4,0
9,a,4,2


### key name 이 다른경우

In [40]:
# Two frames whose join columns have different names (lkey vs rkey).
df1 = pd.DataFrame({"lkey": ["b", "b", "a", "c", "a", "b"], "data": range(6)})
df2 = pd.DataFrame({"rkey": ["a", "b", "a", "b", "d"], "data2": range(5)})
print(df1)
print(df2)

  lkey  data
0    b     0
1    b     1
2    a     2
3    c     3
4    a     4
5    b     5
  rkey  data2
0    a      0
1    b      1
2    a      2
3    b      3
4    d      4


In [41]:
# When the key columns are named differently, name each side explicitly;
# both key columns are kept in the result.
df1.merge(df2, how="inner", left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,data,rkey,data2
0,b,0,b,1
1,b,0,b,3
2,b,1,b,1
3,b,1,b,3
4,b,5,b,1
5,b,5,b,3
6,a,2,a,0
7,a,2,a,2
8,a,4,a,0
9,a,4,a,2


### 한쪽은 key column, 다른 쪽은 index를 기준으로 병합하는 경우

In [42]:
# Left frame: the join key is an ordinary column.
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"], "data": range(6)})
# Right frame: the join key lives in the index ('a', 'b').
df2 = pd.DataFrame({"group_val": [3.5, 7]}, index=list("ab"))
print(df1)
print(df2)

  key  data
0   b     0
1   b     1
2   a     2
3   c     3
4   a     4
5   b     5
   group_val
a        3.5
b        7.0


In [43]:
# Join df1's 'key' column against df2's index (right_index=True).
# The default inner join drops key 'c', which is absent from df2's index.
df1.merge(df2, left_on="key", right_index=True)

Unnamed: 0,key,data,group_val
0,b,0,7.0
1,b,1,7.0
5,b,5,7.0
2,a,2,3.5
4,a,4,3.5


---
### Data Concatenating(연결) : append 개념

In [44]:
# First series: labels a, b.
s1 = pd.Series(data=[0, 1], index=list("ab"))
s1

a    0
b    1
dtype: int64

In [45]:
# Second series: labels c, d, e.
s2 = pd.Series(data=[2, 3, 4], index=list("cde"))
s2

c    2
d    3
e    4
dtype: int64

In [46]:
# Third series: labels f, g.
s3 = pd.Series(data=[5, 6], index=list("fg"))
s3

f    5
g    6
dtype: int64

In [47]:
# Stack s1, s2 and s3 end to end into one longer Series.
pd.concat([s1, s2, s3], axis=0)

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: int64

In [48]:
# Concatenate the Series side by side to build a DataFrame (a common idiom).
# axis="columns" places each Series in its own column; labels missing from
# a Series become NaN. sort=True additionally sorts the combined row index.
pd.concat([s1, s2, s3], axis="columns", sort=True)

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [49]:
# Name the resulting columns: concat takes the names through keys=
# (there is no columns= argument when concatenating Series).
pd.concat([s1, s2, s3], axis="columns", keys=["s1", "s2", "s3"], sort=True)

Unnamed: 0,s1,s2,s3
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


---
## DataFrame의 Concatenation

In [50]:
# 3x2 frame holding 0..5, rows a/b/c, columns one/two.
df1 = pd.DataFrame(
    np.arange(6).reshape(-1, 2),
    columns=["one", "two"],
    index=list("abc"),
)
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [51]:
# 2x2 frame holding 5..8, rows a/c only, columns three/four.
df2 = pd.DataFrame(
    np.arange(5, 9).reshape(2, 2),
    columns=["three", "four"],
    index=list("ac"),
)
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [52]:
# Put df1 and df2 side by side, aligning on their row labels.
# Row 'b' is absent from df2, so its three/four cells are NaN.
pd.concat([df1, df2], axis="columns", sort=True)

Unnamed: 0,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [53]:
# ignore_index=True discards the labels along the concatenation axis and
# renumbers them; with axis=1 that means the column names become 0..3.
pd.concat([df1, df2], axis="columns", ignore_index=True, sort=True)

Unnamed: 0,0,1,2,3
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [54]:
# Stack the frames vertically; columns missing from one frame are NaN, and
# sort=True orders the combined column names alphabetically.
pd.concat([df1, df2], axis="index", sort=True)

Unnamed: 0,four,one,three,two
a,,0.0,,1.0
b,,2.0,,3.0
c,,4.0,,5.0
a,6.0,,5.0,
c,8.0,,7.0,


In [55]:
# Same vertical stack, but the repeated row labels are replaced with a
# fresh 0..n-1 index.
pd.concat([df1, df2], axis="index", ignore_index=True, sort=True)

Unnamed: 0,four,one,three,two
0,,0.0,,1.0
1,,2.0,,3.0
2,,4.0,,5.0
3,6.0,,5.0,
4,8.0,,7.0,


---
### 데이터프레임 중복값 제거

In [56]:
# Frame that contains fully duplicated rows.
df = pd.DataFrame(
    {
        "k1": ["one", "one", "one", "two", "two", "two", "two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)
df

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [57]:
# Flag duplicated rows.
# With keep="first" (the default) the first occurrence is not marked as a
# duplicate; only the later repeats are True.
df.duplicated(keep="first")

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [58]:
# Remove duplicates; a row counts as a duplicate only when every column
# matches an earlier row.
df.drop_duplicates(keep="first")

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [59]:
# Add a column v1 holding 0..6, a distinct value on every row.
df["v1"] = np.arange(len(df))
df

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [60]:
# v1 differs on every row, so no full-row duplicates remain and nothing is dropped.
df.drop_duplicates()

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6


In [61]:
# De-duplicate a single column; the result is a Series of k1's distinct values.
df['k1'].drop_duplicates()

0    one
3    two
Name: k1, dtype: object

In [62]:
# De-duplicate using k1 only as the criterion: duplicates are detected on
# k1 alone, while the other columns of the kept rows are shown as they are.
df.drop_duplicates(subset=["k1"])

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3


In [63]:
# Same k1-based de-duplication, keeping the last occurrence of each key.
# keep accepts "first" (default), "last", or False (drop every duplicated row).
df.drop_duplicates(subset=["k1"], keep="last")

Unnamed: 0,k1,k2,v1
2,one,2,2
6,two,4,6


## Category 사용하기 (이것도 많이 사용!)

In [64]:
# Six records with a raw letter grade each.
df3 = pd.DataFrame(
    {
        "id": [1, 2, 3, 4, 5, 6],
        "raw_grade": list("abbaae"),
    }
)
df3

Unnamed: 0,id,raw_grade
0,1,a
1,2,b
2,3,b
3,4,a
4,5,a
5,6,e


In [65]:
# Convert the raw grades to the category dtype.
# The displayed values look unchanged, but the dtype changes from object to
# category.
df3["grade"] = df3["raw_grade"].astype(pd.CategoricalDtype())
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,a
1,2,b,b
2,3,b,b
3,4,a,a
4,5,a,a
5,6,e,e


In [66]:
# The original column is still plain object dtype.
df3['raw_grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: raw_grade, dtype: object

In [67]:
# The converted column has dtype category with three categories: 'a', 'b', 'e'.
df3['grade']

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): ['a', 'b', 'e']

In [68]:
# Relabel the categories with descriptive names.
# Assigning to `.cat.categories` was deprecated in pandas 1.x and removed in
# pandas 2.0; rename_categories returns a relabelled Series, which is
# assigned back to the column. The new labels map positionally onto the
# existing categories ['a', 'b', 'e'].
# NOTE(review): 'very bed' looks like a typo for 'very bad'; it is kept
# unchanged so the recorded outputs in this notebook still match.
df3['grade'] = df3['grade'].cat.rename_categories(['very good','good','very bed'])
df3

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
1,2,b,good
2,3,b,good
3,4,a,very good
4,5,a,very good
5,6,e,very bed


In [69]:
# Sort by the grade column. A categorical column sorts in category order
# (very good, good, very bed), not alphabetically.
df3.sort_values("grade")

Unnamed: 0,id,raw_grade,grade
0,1,a,very good
3,4,a,very good
4,5,a,very good
1,2,b,good
2,3,b,good
5,6,e,very bed


In [70]:
# The column still has exactly three categories, now carrying the new labels.
df3['grade']

0    very good
1         good
2         good
3    very good
4    very good
5     very bed
Name: grade, dtype: category
Categories (3, object): ['very good', 'good', 'very bed']

## 데이터의 범위 정하기

In [71]:
# Sample ages to be grouped into ranges.
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

# Bin edges. Passed to pd.cut in the next cell they produce the
# right-closed intervals (18, 25] < (25, 35] < (35, 60] < (60, 100].
bins = [18, 25, 35, 60, 100]

In [72]:
# Assign each age to its interval; right=True (the default) makes every
# bin include its right edge.
cats = pd.cut(ages, bins=bins, right=True)
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [73]:
# Integer code of the bin each age fell into (0 is the first bin, (18, 25]).
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [74]:
# Number of ages that fell into each bin.
cats.value_counts()

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

In [75]:
# Give the bins readable names: labels= supplies one name per interval,
# in bin order.
group_names = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
cat2 = pd.cut(ages, bins=bins, labels=group_names)
cat2.value_counts()

Youth         5
YoungAdult    3
MiddleAged    3
Senior        1
dtype: int64