In [1]:
import pandas as pd
from pandas import Series, DataFrame

import numpy as np

## Built-in functions

In [2]:
# Load the wages dataset (path is relative to the notebook's working directory).
df = pd.read_csv("data/wages.csv")
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,male,white,16,49
1,96396.988643,66.23,female,white,16,62
2,48710.666947,63.77,female,white,16,33
3,80478.096153,63.22,female,other,16,95
4,82089.345498,63.08,female,white,17,43


### Describe
- Numeric type 데이터의 요약 정보

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,earn,height,ed,age
count,1379.0,1379.0,1379.0,1379.0
mean,32446.292622,66.59264,13.354605,45.328499
std,31257.070006,3.818108,2.438741,15.789715
min,-98.580489,57.34,3.0,22.0
25%,10538.790721,63.72,12.0,33.0
50%,26877.870178,66.05,13.0,42.0
75%,44506.215336,69.315,15.0,55.0
max,317949.127955,77.21,18.0,95.0


### Unique
- series data의 유일한 값을 numpy array(ndarray)로 반환함
- data에 category형이 몇 개인지 모를 때 사용

In [4]:
# Distinct values found in the race column
df["race"].unique()

array(['white', 'other', 'hispanic', 'black'], dtype=object)

In [5]:
# Build an {index: label} dict from the unique race labels.
# NOTE: wrapping the dict in np.array() yields a 0-d object array that holds
# the dict itself, not an array of its items.
np.array(dict(enumerate(df["race"].unique())))

array({0: 'white', 1: 'other', 2: 'hispanic', 3: 'black'}, dtype=object)

In [6]:
# Same {index: label} mapping, but numbered after sorting the labels
{idx: label for idx, label in enumerate(sorted(df["race"].unique()))}

{0: 'black', 1: 'hispanic', 2: 'other', 3: 'white'}

In [7]:
# Extract the label strings (key) and their integer codes (value) separately.
# Codes are assigned in the order Series.unique() returns the labels.
# Taking the labels straight from unique() avoids building a mixed int/str
# NumPy array, which coerces the integer codes to strings and forces them to
# be parsed back with int().
key = df["race"].unique().tolist()
value = list(range(len(key)))

value, key

([0, 1, 2, 3], ['white', 'other', 'hispanic', 'black'])

In [8]:
# Convert the label strings in the race column to their integer codes.
# The result is assigned back to the column rather than calling an inplace
# method on the df["race"] selection: that form is chained assignment, which
# warns on recent pandas and leaves df unchanged when Copy-on-Write is active.
# astype pins the integer dtype instead of relying on replace() to downcast
# the object column implicitly.
df["race"] = df["race"].replace(dict(zip(key, value))).astype("int64")

In [9]:
# Inspect the race column after the replacement
df.loc[:, "race"]

0       0
1       0
2       0
3       1
4       0
       ..
1374    0
1375    0
1376    0
1377    0
1378    0
Name: race, Length: 1379, dtype: int64

In [10]:
# Apply the same extraction to the sex column: label strings (key) and their
# integer codes (value), numbered in the order Series.unique() returns them.
# Taking the labels straight from unique() avoids building a mixed int/str
# NumPy array, which coerces the integer codes to strings and forces them to
# be parsed back with int().
key = df["sex"].unique().tolist()
value = list(range(len(key)))

value, key

([0, 1], ['male', 'female'])

In [11]:
# Index-label the sex column (race was converted in an earlier cell).
# The result is assigned back to the column rather than calling an inplace
# method on the df["sex"] selection: that form is chained assignment, which
# warns on recent pandas and leaves df unchanged when Copy-on-Write is active.
# astype pins the integer dtype instead of relying on replace() to downcast
# the object column implicitly.
df["sex"] = df["sex"].replace(dict(zip(key, value))).astype("int64")
df.head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,0,0,16,49
1,96396.988643,66.23,1,0,16,62
2,48710.666947,63.77,1,0,16,33
3,80478.096153,63.22,1,1,16,95
4,82089.345498,63.08,1,0,17,43


### Sum
- 기본적인 column 또는 row 값의 연산을 지원
- sub, mean, min, max, count, median, mad(pandas 2.0에서 제거됨), var 등

In [12]:
# Total of each column (summing down the rows)
df.sum(axis="index")

earn      4.474344e+07
height    9.183125e+04
sex       8.590000e+02
race      5.610000e+02
ed        1.841600e+04
age       6.250800e+04
dtype: float64

In [13]:
# Total of each row (summing across the columns)
df.sum(axis="columns")

0       79710.189011
1       96542.218643
2       48824.436947
3       80654.316153
4       82213.425498
            ...     
1374    30290.060363
1375    25019.829514
1376    13824.311312
1377    95563.664410
1378     9686.681857
Length: 1379, dtype: float64

### Isnull
- 각 값이 NaN(null)인지 여부를 나타내는 boolean mask(True/False)를 반환함

In [14]:
# Element-wise boolean mask with the same shape as df: True where a value is missing.
df.isnull()

Unnamed: 0,earn,height,sex,race,ed,age
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
1374,False,False,False,False,False,False
1375,False,False,False,False,False,False
1376,False,False,False,False,False,False
1377,False,False,False,False,False,False


In [15]:
# Number of missing values in each column
df.isna().sum(axis=0)

earn      0
height    0
sex       0
race      0
ed        0
age       0
dtype: int64

In [16]:
# Select a subset of columns by passing a list of column names
df.loc[:, ["age", "earn"]]

Unnamed: 0,age,earn
0,49,79571.299011
1,62,96396.988643
2,33,48710.666947
3,95,80478.096153
4,43,82089.345498
...,...,...
1374,33,30173.380363
1375,86,24853.519514
1376,37,13710.671312
1377,54,95426.014410


### Sort_values
- column 값을 기준으로 데이터를 sorting

In [17]:
# Sort by age, breaking ties by earn. ascending=False means DESCENDING order,
# applied here to both sort keys.
df.sort_values(by=["age", "earn"], ascending=[False, False]).head(n=10)

Unnamed: 0,earn,height,sex,race,ed,age
3,80478.096153,63.22,1,1,16,95
809,42963.362005,72.94,0,0,12,95
331,39169.750135,64.79,1,0,12,95
102,39751.19403,67.14,0,0,12,93
993,32809.632677,59.61,1,1,16,92
1017,8942.806716,62.97,1,0,10,91
1192,39757.94721,64.79,0,0,16,90
952,8162.682672,58.09,1,0,5,89
827,55712.348432,70.13,0,0,9,88
939,40744.874765,59.15,1,0,15,87


In [18]:
# Cumulative (running) sum down each column; show the first 5 rows
df.cumsum(axis=0).head()

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,0,0,16,49
1,175968.287654,140.12,1,0,32,111
2,224678.954602,203.89,2,0,48,144
3,305157.050754,267.11,3,1,64,239
4,387246.396253,330.19,4,1,81,282


In [19]:
# Cumulative maximum: the largest value seen so far, computed down each column
df.cummax(axis=0).head(n=10)

Unnamed: 0,earn,height,sex,race,ed,age
0,79571.299011,73.89,0,0,16,49
1,96396.988643,73.89,1,0,16,62
2,96396.988643,73.89,1,0,16,62
3,96396.988643,73.89,1,1,16,95
4,96396.988643,73.89,1,1,17,95
5,96396.988643,73.89,1,1,17,95
6,96396.988643,73.89,1,1,17,95
7,96396.988643,73.89,1,1,17,95
8,96396.988643,73.89,1,2,17,95
9,96396.988643,73.89,1,2,17,95


In [20]:
# Ten oldest rows: sort by age in descending order
df.sort_values(by="age", ascending=False).head(n=10)

Unnamed: 0,earn,height,sex,race,ed,age
3,80478.096153,63.22,1,1,16,95
331,39169.750135,64.79,1,0,12,95
809,42963.362005,72.94,0,0,12,95
102,39751.19403,67.14,0,0,12,93
993,32809.632677,59.61,1,1,16,92
1017,8942.806716,62.97,1,0,10,91
1192,39757.94721,64.79,0,0,16,90
952,8162.682672,58.09,1,0,5,89
827,55712.348432,70.13,0,0,9,88
1068,10861.092284,64.03,1,0,13,87


### Correlation & Covariance
- 상관계수와 공분산을 구하는 함수
- corr, cov, corrwith

In [21]:
# 상관계수 반환
df.age.corr(df.earn)

0.07400349177836058

In [22]:
# Correlation between age and earn, restricted to ages strictly between 15 and 45.
# corr() aligns the two Series on their index, so only the rows kept by the
# age filter are paired with earn.
df["age"].loc[lambda ages: (ages > 15) & (ages < 45)].corr(df["earn"])

0.3141178872518904

In [23]:
# Covariance between age and earn
df["age"].cov(df["earn"])

36523.6992104089

In [24]:
# Pairwise correlation matrix between the columns of df.
df.corr()

Unnamed: 0,earn,height,sex,race,ed,age
earn,1.0,0.2916,-0.337328,-0.063977,0.350374,0.074003
height,0.2916,1.0,-0.703672,-0.045974,0.114047,-0.133727
sex,-0.337328,-0.703672,1.0,0.000858,-0.061747,0.070036
race,-0.063977,-0.045974,0.000858,1.0,-0.049487,-0.056879
ed,0.350374,0.114047,-0.061747,-0.049487,1.0,-0.129802
age,0.074003,-0.133727,0.070036,-0.056879,-0.129802,1.0


In [25]:
# Correlation of every feature column with the target (earn),
# returned as a single Series indexed by column name
df.corrwith(df["earn"])

earn      1.000000
height    0.291600
sex      -0.337328
race     -0.063977
ed        0.350374
age       0.074003
dtype: float64

In [26]:
# Frequency of each distinct value in the sex column, most frequent first
df["sex"].value_counts(sort=True)

1    859
0    520
Name: sex, dtype: int64