# 데이터프레임 고급 인덱싱

## loc 인덱서

In [6]:
import numpy as np
import pandas as pd

In [7]:
# 3x4 frame holding the integers 10..21, with letter labels on both axes.
df = pd.DataFrame(
    data=np.arange(10, 22).reshape(3, 4),
    index=list("abc"),
    columns=list("ABCD"),
)
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [8]:
df.loc["a"]

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [9]:
df.loc["b":"c"]

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [10]:
# Unlike the .loc slice above, plain [] given a list looks the labels up in
# the columns, so passing row labels raises KeyError.
df[["b","c"]]

KeyError: "None of [Index(['b', 'c'], dtype='object')] are in the [columns]"

In [11]:
df.A > 15

a    False
b    False
c     True
Name: A, dtype: bool

In [12]:
df.loc[df.A > 15]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [13]:
def select_rows(df, column="A", threshold=15):
    """Return a boolean mask of the rows whose ``column`` value exceeds ``threshold``.

    The defaults reproduce the original hard-coded condition ``df.A > 15``,
    so ``df.loc[select_rows(df)]`` and ``df.loc[select_rows]`` both keep working.

    Args:
        df: DataFrame that contains ``column``.
        column: Label of the column to compare.
        threshold: Rows are selected when their value is strictly greater
            than this.

    Returns:
        A boolean Series aligned with ``df.index``.

    Raises:
        KeyError: If ``column`` is not a column of ``df``.
    """
    # Bracket access works for any column label; attribute access fails for
    # labels that are not valid identifiers or that clash with DataFrame
    # attributes.
    return df[column] > threshold

In [15]:
df.loc[select_rows(df)]

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [16]:
# 4x4 frame holding 10..25; rows keep the default integer index 0..3.
df2 = pd.DataFrame(
    data=np.arange(10, 26).reshape(4, 4),
    columns=list("ABCD"),
)
df2

Unnamed: 0,A,B,C,D
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [18]:
df2.loc[1:2]

Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


In [24]:
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [20]:
df.loc["a","A"]

10

In [22]:
df.loc["b":,"A"]

b    14
c    18
Name: A, dtype: int32

In [23]:
df.loc["a",:]

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [25]:
df.loc[["a","b"],["B","D"]]

Unnamed: 0,B,D
a,11,13
b,15,17


In [26]:
df.loc[df.A > 10, ["C","D"]]

Unnamed: 0,C,D
b,16,17
c,20,21


## iloc 인덱서

몇 번째 행, 몇 번째 열인지 정수 위치로 지정하므로 쓰기 더 쉬움

교수님은 iloc를 추천한다

In [28]:
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [29]:
df.iloc[0,1]

11

In [30]:
df.iloc[:2,2]

a    12
b    16
Name: C, dtype: int32

In [31]:
df.iloc[0, -2:]

C    12
D    13
Name: a, dtype: int32

In [32]:
df.iloc[2:3, 1:3]

Unnamed: 0,B,C
c,19,20


In [33]:
# A single indexer passed to iloc selects a row; -1 picks the last one.
df.iloc[-1]

A    18
B    19
C    20
D    21
Name: c, dtype: int32

In [34]:
# Overwrite the last row with twice its current values, then show the frame.
df.iloc[-1] = df.iloc[-1].mul(2)
df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,36,38,40,42


# 데이터프레임의 데이터 조작

## 데이터 개수 세기

In [39]:
# Create the Series as float up front: NaN cannot be stored in an integer
# Series, and assigning it to one relies on an implicit int -> float upcast
# that recent pandas versions deprecate.
s = pd.Series(range(10), dtype=float)
# Blank out the element labelled 3 to create one missing value.
s[3] = np.nan
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [40]:
s.count()

9

In [42]:
# Fix the seed so the random integers below are reproducible.
np.random.seed(2)
# 4x4 grid of random integers in [0, 5), stored as floats so NaN fits.
df = pd.DataFrame(np.random.randint(0, 5, size=(4, 4)), dtype=float)
# Blank out the cell at row 2, column 3 to create one missing value.
df.iloc[2, 3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [43]:
df.count()

0    4
1    4
2    4
3    3
dtype: int64

In [44]:
import seaborn as sns

In [46]:
# seaborn is re-imported here although the previous cell already did so;
# the repeat is harmless and keeps this cell runnable on its own.
import seaborn as sns
# Load seaborn's "titanic" example dataset and preview its first rows.
titanic = sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## 카테고리 값 세기

In [56]:
# Seed the generator so everyone running this cell gets the same values.
np.random.seed(1)
# 100 random integers drawn from 0..5, used below as category values.
s2 = pd.Series(np.random.randint(0, 6, size=100))
s2.tail()

95    4
96    5
97    2
98    4
99    3
dtype: int32

In [57]:
s2.value_counts()

1    22
0    18
4    17
5    16
3    14
2    13
dtype: int64

In [58]:
df[0].value_counts()

3.0    2
0.0    1
4.0    1
Name: 0, dtype: int64

## 정렬

In [65]:
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [66]:
s2

0     5
1     3
2     4
3     0
4     1
     ..
95    4
96    5
97    2
98    4
99    3
Length: 100, dtype: int32

In [59]:
s2.value_counts().sort_index()

0    18
1    22
2    13
3    14
4    17
5    16
dtype: int64

In [60]:
# Sort by value in ascending order; NaN values, if present, are placed last.
s.sort_values()

0    0.0
1    1.0
2    2.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
3    NaN
dtype: float64

In [62]:
# ascending=False reverses the order (largest first); NaN still comes last.
s.sort_values(ascending=False)

9    9.0
8    8.0
7    7.0
6    6.0
5    5.0
4    4.0
2    2.0
1    1.0
0    0.0
3    NaN
dtype: float64

In [67]:
df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [63]:
df.sort_values(by=1)

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0


In [68]:
df.sort_values(by=3)

Unnamed: 0,0,1,2,3
1,3.0,0.0,2.0,1.0
0,0.0,0.0,3.0,2.0
3,4.0,3.0,4.0,2.0
2,3.0,2.0,4.0,


In [64]:
df.sort_values(by=[1,2])

Unnamed: 0,0,1,2,3
1,3.0,0.0,2.0,1.0
0,0.0,0.0,3.0,2.0
2,3.0,2.0,4.0,
3,4.0,3.0,4.0,2.0
