#### 1. 열(column) 단위 서브셋 추출

* loc : 라벨(인덱스 이름) 기반으로 행과 열을 선택한다.
* iloc : 정수 위치(숫자 인덱스) 기반으로 행과 열을 선택한다.

In [122]:
import pandas as pd
import seaborn as sns

In [123]:
# Load the "iris" dataset through seaborn and preview its first 10 rows.
df = sns.load_dataset("iris")
df.head(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [124]:
# Select several columns by passing a list of labels; the result follows the
# order of the list, not the order of the columns in df.
columns = ['sepal_width','sepal_length','species']
df[columns].head()

Unnamed: 0,sepal_width,sepal_length,species
0,3.5,5.1,setosa
1,3.0,4.9,setosa
2,3.2,4.7,setosa
3,3.1,4.6,setosa
4,3.6,5.0,setosa


In [125]:
# Indexing with a single column label returns just that column.
df['sepal_width']

Unnamed: 0,sepal_width
0,3.5
1,3.0
2,3.2
3,3.1
4,3.6
...,...
145,3.0
146,2.5
147,3.0
148,3.4


In [126]:
# loc slices by label and includes both endpoints: rows 2 through 5 and
# columns 'sepal_width' through 'petal_width'.
df.loc[2:5,'sepal_width':'petal_width']

Unnamed: 0,sepal_width,petal_length,petal_width
2,3.2,1.3,0.2
3,3.1,1.5,0.2
4,3.6,1.4,0.2
5,3.9,1.7,0.4


In [127]:
# Select the first 3 rows and the 2nd and 4th columns (integer positions 1 and 3).
df.iloc[:3,[1,3]]

Unnamed: 0,sepal_width,petal_width
0,3.5,0.2
1,3.0,0.2
2,3.2,0.2


In [128]:
# Select only the rows and columns that match a condition, then preview the top 5.

# 1) Keep only the rows whose 'sepal_length' value is greater than 5
# 2) From those rows, keep only the 'sepal_length' and 'sepal_width' columns
# 3) Show only the first 5 rows of the result
df.loc[df['sepal_length'] > 5, ['sepal_length', 'sepal_width']].head()

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
5,5.4,3.9
10,5.4,3.7
14,5.8,4.0
15,5.7,4.4


#### 2. Summarize Data

* value_counts(): 각 고유값의 출현 빈도를 계산하여 내림차순으로 반환한다.
* unique(): 중복을 제거한 고유값들을 NumPy 배열 형태로 반환한다.
* describe(): 기초 통계 요약(개수, 평균, 표준편차, 최소·최대값, 사분위수)을 제공한다.

In [129]:
import pandas as pd
import seaborn as sns
import numpy as np

In [130]:
# Reload the iris dataset and check its shape as (rows, columns).
df = sns.load_dataset('iris')
df.shape

(150, 5)

In [131]:
# Preview the first 2 rows.
df.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa


In [132]:
# Count how many rows belong to each species.
df['species'].value_counts()

Unnamed: 0_level_0,count
species,Unnamed: 1_level_1
setosa,50
versicolor,50
virginica,50


In [133]:
# len() of a DataFrame is its number of rows.
len(df)

150

In [134]:
# Confirm that len(df) matches the row count stored in df.shape[0].
len(df) == df.shape[0]

True

In [135]:
# List the distinct species values as a NumPy array.
df['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [136]:
# Count the distinct species values.
df['species'].nunique()

3

In [137]:
# include='all' summarizes every column, so the non-numeric species column
# gets unique/top/freq while the numeric columns get mean/std/quantiles.
df.describe(include='all')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,setosa
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [138]:
# Sum of the petal_width column.
df['petal_width'].sum()

np.float64(179.90000000000003)

In [139]:
# Number of non-missing values in petal_width.
df['petal_width'].count()

np.int64(150)

In [140]:
# Median of the petal_width column.
df['petal_width'].median()

1.3

In [141]:
# 25th and 75th percentiles (first and third quartiles) of petal_width.
df['petal_width'].quantile([0.25, 0.75])

Unnamed: 0,petal_width
0.25,0.3
0.75,1.8


In [142]:
# Per-column median, restricted to the numeric columns.
df.median(numeric_only=True)

Unnamed: 0,0
sepal_length,5.8
sepal_width,3.0
petal_length,4.35
petal_width,1.3


In [143]:
# Per-column mean, restricted to the numeric columns.
df.mean(numeric_only=True)

Unnamed: 0,0
sepal_length,5.843333
sepal_width,3.057333
petal_length,3.758
petal_width,1.199333


In [144]:
# Per-column maximum. numeric_only is not set here, so the string column
# species is included as well and reports its largest string value.
df.max()

Unnamed: 0,0
sepal_length,7.9
sepal_width,4.4
petal_length,6.9
petal_width,2.5
species,virginica


In [145]:
# Per-column variance of the numeric columns.
df.var(numeric_only=True)

Unnamed: 0,0
sepal_length,0.685694
sepal_width,0.189979
petal_length,3.116278
petal_width,0.581006


In [146]:
# Per-column standard deviation of the numeric columns.
df.std(numeric_only=True)

Unnamed: 0,0
sepal_length,0.828066
sepal_width,0.435866
petal_length,1.765298
petal_width,0.762238


#### 3. Pandas Handling Missing Data

* dropna(): 결측값이 있는 행(또는 axis에 따라 열)을 삭제한다.
* fillna(): 결측값을 지정한 상수, 평균·중앙값, 전방·후방 채우기 등 원하는 값이나 방법으로 대체한다.

In [147]:
import pandas as pd
import numpy as np

In [148]:
# Build a small 3x4 demo frame with missing values: column C is entirely
# NaN, column D has no NaN, and A and B are partially missing.
df = pd.DataFrame(
    data=[
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
    ],
    columns=['A', 'B', 'C', 'D'],
)
# Leave the frame as the last expression so the notebook displays it.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [149]:
# Drop every column (axis=1) that contains at least one missing value;
# with this data only column D remains.
df.dropna(axis=1, how='any')

Unnamed: 0,D
0,0
1,1
2,5


In [150]:
# Drop a row (axis=0) only when all of its values are missing; no row
# qualifies here, so the result has the same rows as df.
df.dropna(axis=0, how='all')

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [151]:
# Replace every missing value with 0.
df.fillna(0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5


In [152]:
# Use a different replacement per column: the dict maps column label -> fill value.
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
df.fillna(value=values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5


In [153]:
# Display df again: the result was never assigned back, so it still has its NaNs.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [154]:
# Take the largest value in column D to use as the replacement value.
fill_na_value = df['D'].max()
fill_na_value

5

In [155]:
# Fill every missing value in the frame with fill_na_value.
df.fillna(fill_na_value)

Unnamed: 0,A,B,C,D
0,5.0,2.0,5.0,0
1,3.0,4.0,5.0,1
2,5.0,5.0,5.0,5


In [156]:
# Count the missing values in each column.
df.isnull().sum()

Unnamed: 0,0
A,2
B,1
C,3
D,0


In [157]:
# Count the non-missing values in each column.
df.notnull().sum()

Unnamed: 0,0
A,1
B,2
C,0
D,3
