## 인덱싱, 데이터 조작, 인덱스 조작

- loc() : 라벨값 기반의 2차원 인덱싱
- iloc() : 순서를 나타내는 정수 기반의 2차원 인덱싱

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [246]:
# df loc[행 인덱싱값] -> 데이터 프레임을 만들때 존재하는 인덱스를 써야 사용가능하다
# df loc[행 인덱싱값, 열 인덱싱 ]

sample_df = pd.DataFrame(np.arange(10,22).reshape(3,4),
                        index=['a','b','c'],
                        columns = ['A','B','C','D'])

sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [3]:
sample_df.loc['a']

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [4]:
type(sample_df.loc['a']) # Series

pandas.core.series.Series

In [6]:
type(sample_df.loc['a'].values) # numpy이의 vector 형식 (1차원 배열)

numpy.ndarray

In [11]:
# sample.df.loc['b':'c'] 데이터프레임의 행은 인덱싱을 해줘야함 그러므로 같은 결과

sample_df['b':'c']

sample_df.loc[['b','c']] # loc 를 빼면 열 인덱스 처리가 되므로 error 그러므로 loc필요

Unnamed: 0,A,B,C,D
b,14,15,16,17
c,18,19,20,21


In [12]:
# 열에 대한 접근
sample_df.A 

# sample_df['A']

a    10
b    14
c    18
Name: A, dtype: int32

In [13]:
type(sample_df.A) # Series

pandas.core.series.Series

In [14]:
sample_df.A>15 # type - Series

a    False
b    False
c     True
Name: A, dtype: bool

In [250]:
sample_df.loc[sample_df.A>15] 
# bool이라 bool 인덱스 가능 -> 18만 True여서 18이 들어가 있는 행에 대한 정보를 불러옴

Unnamed: 0,A,B,C,D
c,18,19,20,21


In [16]:
sample_df2 = pd.DataFrame(np.arange(10,26).reshape(4,4),
                        columns = ['A','B','C','D'])

sample_df2

Unnamed: 0,A,B,C,D
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21
3,22,23,24,25


In [17]:
sample_df2.loc[1:2] # loc는 1:2 면 -1을 안하고 값을 불러옴

Unnamed: 0,A,B,C,D
1,14,15,16,17
2,18,19,20,21


In [22]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [251]:
sample_df.loc['a','A']

10

In [27]:
#sample_df.loc['b':'c']['A']  모두 가능하다

#sample_df.loc['b':'c','A']

sample_df.loc['b':,'A']

b    14
c    18
Name: A, dtype: int32

In [252]:
sample_df.loc['a']
# sample_df.loc['a',:]

A    10
B    11
C    12
D    13
Name: a, dtype: int32

In [47]:
sample_df.loc['b':,'C':]
# sample_df.loc[['b','c'],['C','D']]
# sample_df.loc[sample_df.A>10,['C','D']]

Unnamed: 0,C,D
b,16,17
c,20,21


- iloc() : 정수 인덱스 사용

In [33]:
sample_df.iloc[0,1] # 0번째 행의 1번째 열

11

In [36]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,18,19,20,21


In [38]:
sample_df.iloc[:,1]

a    11
b    15
c    19
Name: B, dtype: int32

In [42]:
sample_df.iloc[0,2:4]

C    12
D    13
Name: a, dtype: int32

In [49]:
sample_df.iloc[2,1:3]

B    19
C    20
Name: c, dtype: int32

In [50]:
sample_df.iloc[-1]

A    18
B    19
C    20
D    21
Name: c, dtype: int32

In [51]:
sample_df.iloc[-1] = sample_df.iloc[-1]*2 # 마지막 행에 대해서 연산

In [52]:
sample_df

Unnamed: 0,A,B,C,D
a,10,11,12,13
b,14,15,16,17
c,36,38,40,42


- 데이터 갯수를 세어보기
- count

In [54]:
s = pd.Series(range(10))
s

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64

In [56]:
s[5] = np.NaN
s[2] = np.NAN
s.count()  # 결측값을 제외하고 8개 나옴

8

In [57]:
s

0    0.0
1    1.0
2    NaN
3    3.0
4    4.0
5    NaN
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [60]:
np.random.seed(2)
count_df = pd.DataFrame(np.random.randint(5,size=(4,4)),dtype=np.float)
count_df

Unnamed: 0,0,1,2,3
0,0.0,0.0,3.0,2.0
1,3.0,0.0,2.0,1.0
2,3.0,2.0,4.0,4.0
3,4.0,3.0,4.0,2.0


In [61]:
count_df.count() # 각 열에 대한 수를 의미한다 

0    4
1    4
2    4
3    4
dtype: int64

In [66]:
count_df.iloc[1,0] = np.NaN
count_df.iloc[3,0] = np.NaN
count_df.iloc[2,3] = np.NaN
count_df
count_df.count()

0    2
1    4
2    4
3    3
dtype: int64

In [67]:
import seaborn as sns
titanic = sns.load_dataset('titanic') # type => 데이터프레임
titanic.describe() # 수치형 컬럼에 대한 요약을 볼 수 있음

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [68]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [69]:
titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [70]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [71]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

- value_counts() : 특정 열에 대한 count가 가능하다

In [72]:
titanic.columns # 열에 대한 키를 가져올 수 있다

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [73]:
# value_counts는 특정 Series의 값들을 count 해준다
titanic['pclass'].value_counts() # 데이터의 수를 보여준다

3    491
1    216
2    184
Name: pclass, dtype: int64

In [74]:
titanic['survived'].value_counts()

0    549
1    342
Name: survived, dtype: int64

In [75]:
titanic['pclass'].value_counts().values # 값들만 가져올 수 있음

array([491, 216, 184], dtype=int64)

In [76]:
# 새로운 열 추가 age_0 일괄적으로 0 할당
titanic['age_0'] = 0

In [78]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0


In [None]:
# age의 각 값에 10을 곱한 age_10 컬럼 생성

In [79]:
titanic['age_10'] = titanic['age']*10

In [83]:
titanic['age_10'] = titanic['age']+100

In [80]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_10
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0


In [None]:
# parch와 sibsp의 값과 1을 더한 family_no 컬럼생성

In [81]:
titanic['family_no'] = titanic['parch']+titanic['sibsp']+1

In [82]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_0,age_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,0,220.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,0,380.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,0,260.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,0,350.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,0,350.0,1


### 데이터 프레임 데이터 삭제

- drop()

In [84]:
# age_0 열을 삭제하고자 한다면?
titanic_drop_df = titanic.drop('age_0',axis=1) # inplace=True 하면 원본자체에서 삭제하게 됨

In [85]:
titanic_drop_df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,age_10,family_no
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False,122.0,2
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,138.0,2
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True,126.0,1
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False,135.0,2
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True,135.0,1


In [89]:
titanic.drop(['age_0','age_10','family_no'],axis=1,inplace=True)

In [91]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [98]:
# 0,1,2 번째 행을 삭제하여 원본 프레임에 반영
titanic.drop([0,1,2],axis=0,inplace=True)

In [100]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [107]:
# 인덱스를 조작하려면 int형식이 아니라 array 형식이 되어야함
titanic.index # int
print(type(titanic.index.values)) # array
print(titanic.index.shape)

<class 'numpy.ndarray'>
(888,)


- 인덱스에 대한 슬라이싱 및 인덱싱

In [108]:
# 인덱스 5개를 꺼내오고 싶다면?
titanic.index.values[:5] #titanic.index[:5].values

array([3, 4, 5, 6, 7], dtype=int64)

In [109]:
# 인덱스의 6번째를 꺼내고 싶으면?
titanic.index[6] # titanic.index[6]=10 이런식은 불가능 인덱스는 조작이 불가능하기 때문이다

9

In [111]:
series_fair = titanic['fare']
# print('series',series_fair) #Series는 인덱스와 value로 이루어져있다
print('type', type(series_fair))

type <class 'pandas.core.series.Series'>


In [114]:
# max , min , sum
print('max : ', series_fair.max())
print('min : ', series_fair.min())
print('sum : ', series_fair.sum())
print('sum : ', np.sum(series_fair)) # 가능하다

print('*'*50)

print('DC 10% : ', series_fair*0.9) 


max :  512.3292
min :  0.0
sum :  28607.491
sum :  28607.491
**************************************************
DC 10% :  3       47.79000
4        7.24500
5        7.61247
6       46.67625
7       18.96750
8       10.01997
9       27.06372
10      15.03000
11      23.89500
12       7.24500
13      28.14750
14       7.06878
15      14.40000
16      26.21250
17      11.70000
18      16.20000
19       6.50250
20      23.40000
21      11.70000
22       7.22628
23      31.95000
24      18.96750
25      28.24875
26       6.50250
27     236.70000
28       7.09128
29       7.10622
30      24.94872
31     131.86872
32       6.97500
         ...    
861     10.35000
862     23.33628
863     62.59500
864     11.70000
865     11.70000
866     12.47247
867     45.44622
868      8.55000
869     10.01997
870      7.10622
871     47.29878
872      4.50000
873      8.10000
874     21.60000
875      6.50250
876      8.86122
877      7.10622
878      7.10622
879     74.84247
880     23.40000
881      7.

- reset_index() : 새로운 인덱스를 할당하고, 기존 인덱스는 인덱스라는 새로운 컬럼명을 추가

In [115]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [116]:
titanic_reset_index_df = titanic.reset_index(inplace=False)
titanic_reset_index_df.head() # 기존 인덱스는 새로운 변수가 되고 새로운 인덱스가 생김

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
1,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
3,6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
4,7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [118]:
titanic_reset_index_df[['pclass','fare']].head()

Unnamed: 0,pclass,fare
0,1,53.1
1,3,8.05
2,3,8.4583
3,1,51.8625
4,3,21.075


In [120]:
# titanic_reset_index_df['pclass']==3
titanic_reset_index_df[titanic_reset_index_df['pclass']==3].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
2,5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
4,7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
5,8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
7,10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False


In [121]:
titanic_reset_index_df.iloc[0:7,2:4]

Unnamed: 0,pclass,sex
0,1,female
1,3,male
2,3,male
3,1,male
4,3,male
5,3,female
6,2,female


In [122]:
titanic_reset_index_df.iloc[[4,6,8],[2,4,6]]

Unnamed: 0,pclass,age,parch
4,3,2.0,1
6,2,14.0,0
8,1,58.0,0


In [123]:
# age -> 60 이상인 정보만 추출하고 싶다면?

titanic_reset_index_df[titanic_reset_index_df['age']>=60].head()

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
30,33,0,2,male,66.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
51,54,0,1,male,65.0,0,1,61.9792,C,First,man,True,B,Cherbourg,no,False
93,96,0,1,male,71.0,0,0,34.6542,C,First,man,True,A,Cherbourg,no,True
113,116,0,3,male,70.5,0,0,7.75,Q,Third,man,True,,Queenstown,no,True
167,170,0,1,male,61.0,0,0,33.5,S,First,man,True,B,Southampton,no,True


In [None]:
# age -> 60 이상인 pclass,survived,who만 추출하고 싶다면?

In [128]:
titanic_reset_index_df.loc[titanic_reset_index_df['age']>=60,['pclass','survived','who']].head()

Unnamed: 0,pclass,survived,who
30,2,0,man
51,1,0,man
93,1,0,man
113,3,0,man
167,1,0,man


In [130]:
titanic_reset_index_df[titanic_reset_index_df['age']>=60][['pclass','survived','who']].head()

Unnamed: 0,pclass,survived,who
30,2,0,man
51,1,0,man
93,1,0,man
113,3,0,man
167,1,0,man


- 여러개의 복합 조건을 이용해서 불링인덱스를 만드는 것도 가능하다
- and -> &
- or -> |
- not -> !, ~

In [135]:
# 나이가 60보다 크고 선실등급이 1등급이고 성별이 여자인 데이터를 추출하는 방법

titanic_reset_index_df[(titanic_reset_index_df['age']>60)&
                       (titanic_reset_index_df['pclass']==1)&
                      (titanic_reset_index_df['sex']=='female')]

Unnamed: 0,index,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
272,275,1,1,female,63.0,1,0,77.9583,S,First,woman,False,D,Southampton,yes,False
826,829,1,1,female,62.0,0,0,80.0,,First,woman,False,B,,yes,True


### 정렬
- sort_index
- sort_values

In [261]:
np.random.seed(100)
sort_df=pd.DataFrame(np.random.randint(0,10,(6,4)))
sort_df

Unnamed: 0,0,1,2,3
0,8,8,3,7
1,7,0,4,2
2,5,2,2,2
3,1,0,8,4
4,0,9,6,2
5,4,1,5,3


In [184]:
sort_df.columns = ['A', 'B', 'C', 'D']
sort_df.index = pd.date_range('20201014', periods = 6)
sort_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


In [174]:
# error 
# np.random.shuffle(sort_df.index)

#순열 랜덤 치환
random_date = np.random.permutation(sort_df.index)
random_date

array(['2020-10-18T00:00:00.000000000', '2020-10-19T00:00:00.000000000',
       '2020-10-16T00:00:00.000000000', '2020-10-14T00:00:00.000000000',
       '2020-10-17T00:00:00.000000000', '2020-10-15T00:00:00.000000000'],
      dtype='datetime64[ns]')

In [212]:
# index를 재할당
sort_df2 = sort_df.reindex(index = random_date, columns = ['B', 'A', 'D', 'C'])
sort_df2

Unnamed: 0,B,A,D,C
2020-10-18,9,0,2,6
2020-10-19,1,4,3,5
2020-10-16,2,5,2,2
2020-10-14,8,8,7,3
2020-10-17,0,1,4,8
2020-10-15,0,7,2,4


In [256]:
# axis = 0 : row, axis = 1 : col
sort_df2.sort_index(axis = 1, ascending = False)
sort_df2.sort_index(axis = 0, ascending = True)

Unnamed: 0,B,A,D,C
2020-10-14,8,8,7,3
2020-10-15,0,7,2,4
2020-10-16,2,5,2,2
2020-10-17,0,1,4,8
2020-10-18,9,0,2,6
2020-10-19,1,4,3,5


In [179]:
# 특정 컬럼 값을 기준으로 행 정렬
sort_df2.sort_values(by = 'B', ascending = False)


Unnamed: 0,B,A,D,C
2020-10-18,9,0,2,6
2020-10-14,8,8,7,3
2020-10-16,2,5,2,2
2020-10-19,1,4,3,5
2020-10-17,0,1,4,8
2020-10-15,0,7,2,4


- 행 / 열의 합을 구할 때는 sum(axis = )

In [180]:
sort_df2.sum(axis=1)

2020-10-18    17
2020-10-19    13
2020-10-16    11
2020-10-14    26
2020-10-17    13
2020-10-15    13
dtype: int64

In [206]:
sort_df2['row_sum'] = sort_df2.sum(axis=1)
sort_df2

Unnamed: 0,B,A,D,C,row_sum
2020-10-18 00:00:00,9.0,0.0,2.0,6.0,85.0
2020-10-19 00:00:00,1.0,4.0,3.0,5.0,65.0
2020-10-16 00:00:00,2.0,5.0,2.0,2.0,55.0
2020-10-14 00:00:00,8.0,8.0,7.0,3.0,130.0
2020-10-17 00:00:00,0.0,1.0,4.0,8.0,65.0
2020-10-15 00:00:00,0.0,7.0,2.0,4.0,65.0
col_sum,20.0,25.0,20.0,28.0,465.0


In [185]:
sort_df2.sum(axis=0)

B          20.0
A          25.0
D          20.0
C          28.0
row_sum    93.0
dtype: float64

In [208]:
sort_df2.loc['col_sum',:] = sort_df2.sum(axis=0)
sort_df2

Unnamed: 0,B,A,D,C,row_sum
2020-10-18 00:00:00,9.0,0.0,2.0,6.0,85.0
2020-10-19 00:00:00,1.0,4.0,3.0,5.0,65.0
2020-10-16 00:00:00,2.0,5.0,2.0,2.0,55.0
2020-10-14 00:00:00,8.0,8.0,7.0,3.0,130.0
2020-10-17 00:00:00,0.0,1.0,4.0,8.0,65.0
2020-10-15 00:00:00,0.0,7.0,2.0,4.0,65.0
col_sum,60.0,75.0,60.0,84.0,1395.0


In [200]:
# 타이타닉호 승객의 평균 나이를 구하라
print(titanic_reset_index_df['age'].mean())
# 타이타닉호 승객중 여성 승객의 평균 나이를 구하라
print(titanic_reset_index_df.loc[titanic_reset_index_df['sex']=='female','age'].mean())
# 타이타닉호 승객중 1등실 선실의 여성 승객의 평균 나이를 구하라
titanic_reset_index_df.loc[(titanic_reset_index_df['pclass']==1)&
                      (titanic_reset_index_df['sex']=='female'),'age'].mean()

29.703473980309422
27.884169884169886


34.57142857142857

### apply 변환
- 행이나 열 단위로 복잡한 데이터 가공이 필요한 경우 사용하는 함수이다
- lambda 식
- apply함수는 인자로 함수를 넘겨 받을 수 있다

In [210]:
def get_square(a):
    return a**2

In [217]:
print('제곱근 : ', get_square(3))

제곱근 :  9


In [218]:
# 위 코드를 람다식으로 바꾼다면?
lambda_square = lambda a : a**2
print('제곱근 : ', lambda_square(3))

제곱근 :  9


In [220]:
np.random.seed(100)
apply_df = pd.DataFrame(np.random.randint(0,10,(6,4)))
apply_df.columns = ['A','B','C','D']
apply_df.index = pd.date_range('20201014',periods=6)
apply_df

Unnamed: 0,A,B,C,D
2020-10-14,8,8,3,7
2020-10-15,7,0,4,2
2020-10-16,5,2,2,2
2020-10-17,1,0,8,4
2020-10-18,0,9,6,2
2020-10-19,4,1,5,3


In [221]:
# 각 행의 column에 대해서 최대값 - 최소값을 구해 새로운 column 추가
# 각 column안에서 최대값 - 최소값을 구해 출력 
func = lambda x : x.max() - x.min()

In [225]:
apply_df['row 최대-최소'] = apply_df.apply(func,axis=1) # axis=1 열

In [227]:
apply_df

Unnamed: 0,A,B,C,D,row 최대-최소
2020-10-14,8,8,3,7,5
2020-10-15,7,0,4,2,7
2020-10-16,5,2,2,2,3
2020-10-17,1,0,8,4,8
2020-10-18,0,9,6,2,9
2020-10-19,4,1,5,3,4


In [244]:
# embark_town의 문자열 개수를 별도의 컬럼인 embark_len 컬럼을 추가
titanic_reset_index_df['embark_len'] = titanic_reset_index_df['embark_town'].apply(lambda x : len(str(x)))
titanic_reset_index_df[['embark_town','embark_len']].head(3)


Unnamed: 0,embark_town,embark_len
0,Southampton,11
1,Southampton,11
2,Queenstown,10


In [243]:
# if ~ else 절을 활용하여 나이가 15세 이하면 child 그렇지 않으면 adult로 구분하는 child_adult를 구하라

titanic_reset_index_df['child_ault'] = titanic_reset_index_df['age'].apply(lambda x : 'child' if x <=15 else 'adult')
titanic_reset_index_df[['age','child_ault']].head()

Unnamed: 0,age,child_ault
0,35.0,adult
1,35.0,adult
2,,adult
3,54.0,adult
4,2.0,child
