## 정규화

In [None]:
"""
정규화 : 0~1사이의 범위로 데이터를 표준화

ex) 5 3 1 7 9 => (각 데이터 값 - 최소값) / (최대값 - 최소값)
    범위는  0~1사이가 된다.

"""

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [3]:
data=np.array([[10,-10,2],[5,0,6],[0,7,4]])
data


array([[ 10, -10,   2],
       [  5,   0,   6],
       [  0,   7,   4]])

In [6]:
data.max()
data.max(axis=0) #각 열 단위의 최댓값
data.max(axis=1) #각 행 단위의 최댓값

array([10,  7,  6])

### 1) 수식으로 정규화

In [9]:
data_mm=(data-data.min(axis=0))/(data.max(axis=0)-data.min(axis=0))  #정규화
data_mm

array([[1.        , 0.        , 0.        ],
       [0.5       , 0.58823529, 1.        ],
       [0.        , 1.        , 0.5       ]])

### 2) sklearn-MinMaxScaler() 으로 정규화

In [10]:
mms=MinMaxScaler()
data_mms=mms.fit_transform(data)
data_mms

array([[1.        , 0.        , 0.        ],
       [0.5       , 0.58823529, 1.        ],
       [0.        , 1.        , 0.5       ]])

### 3) sklearn-minmax_scale()으로 정규화 

In [11]:
from sklearn.preprocessing import minmax_scale

In [12]:
datamm=minmax_scale(data,axis=0)
datamm

array([[1.        , 0.        , 0.        ],
       [0.5       , 0.58823529, 1.        ],
       [0.        , 1.        , 0.5       ]])

## 이항변수화 

In [None]:
#이항변수화 : 연속형변수를 가준이하=> 0, 기준초과 => 1

#확률변수 x가 이항분포를 따른다고 가정했을때, 0 또는 1 값을 갖는 이항변수가 필요

#베르누이 시행 : 성공(1) 실패(0) 두가지 경우만 

#성공확룰이 p 인 베르누이 시행을 n 번 수행

#성공하는 횟수를  x라고 하면 , 확률변수 x는 모수 n과 p인 이항분포를 따른다.



"""
주사위 3이 나올 확률(성공) 1/6
안나올 확률(실패) 5/6

ex) A B C D 사람중 한사람만 주사위 3이 나올수 있는 확률
A= 1/6*5/6*5/6*5/6
B= 1/6*5/6*5/6*5/6
C= 1/6*5/6*5/6*5/6
D= 1/6*5/6*5/6*5/6
0.09(성공확률p)*4(횟수n)=0.36


"""

### 1) Binarizer (연속형 변수를 이항변수화) 

In [13]:
from sklearn.preprocessing import Binarizer

In [14]:
data=np.array([[10,-10,2],[5,0,6],[0,7,4]])
data


array([[ 10, -10,   2],
       [  5,   0,   6],
       [  0,   7,   4]])

In [19]:
bina=Binarizer(5).fit(data)  #threshold=5 (기준값)에 따라 기준이하면 0, 기준초과면 1
bina

Binarizer(copy=True, threshold=5)

In [20]:
bina.transform(data)

array([[1, 0, 0],
       [0, 0, 1],
       [0, 1, 0]])

In [23]:
#바이너라이저 : 연속형 변수를 이항변수화
#원핫인코더 : 범주형 변수를 이항변수화


"""
연령대 : 20대 => 0 , 30대 => 1 , 40대 => 2
성별 : 남 =>0 , 여 =>1
학점 : A:0, B:1, C:2, D:3 , E: 4 

-연령대(3개 범주)
100(20대)
010(30대)
001(40대)
.
.
.
-학점(5개의 범주) 
01234 
10000 (A)
01000 (B)
00100 (C)
00010 (D)
00001 (E)


ex) 20대 여 학점A : 100 01 10000

"""

### 2) OneHotEncoder (범주형 변수를 이항변수화)

In [161]:
from sklearn.preprocessing import OneHotEncoder

In [162]:
#[성별 , 연령대 , 학점]
data=np.array([[0,0,0],
               [0,1,1],
               [0,2,2],
               [1,0,3],
               [1,1,4]                          
              ])
data

array([[0, 0, 0],
       [0, 1, 1],
       [0, 2, 2],
       [1, 0, 3],
       [1, 1, 4]])

In [163]:
ohe=OneHotEncoder()
ohe.fit(data)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


OneHotEncoder(categorical_features=None, categories=None, drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [33]:
ohe.active_features_ #총 범주 #[남,여,20,30,40,A,B,C,D,E]


ohe.n_values_ #각각 범주의 갯수

ohe.feature_indices_ #성별 : 0이상 2미만
                     #연령대: 2이상 5미만
                     #학점 : 5이상 10미만


array([ 0,  2,  5, 10], dtype=int32)

## 범주형 데이터 => 이항변수화 

In [35]:
data=np.array([[1,2,3]]) #여성, 40대, 학점 D
ohe.transform(data).toarray() #반드시 배열로 변형해주어야한다.

array([[0., 1., 0., 0., 1., 0., 0., 0., 1., 0.]])

In [37]:
df = pd.DataFrame({'C1': np.random.randn(20),

   'C2': ['a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a', 'a',

   'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b']})
df

Unnamed: 0,C1,C2
0,-0.302457,a
1,0.250573,a
2,0.132706,a
3,2.892304,a
4,-1.38965,a
5,-2.481422,a
6,0.112273,a
7,1.348299,a
8,-0.529223,a
9,-0.333602,a


## 이산화 

In [None]:
"""
np.digitize함수, np.where 함수 : 연속형 변수 => 이산화 변수

이항변수화 : 0 또는 1 값을 갖는 변수를 만드는 것
이산(형)화 : 연속형 변수를 2개이상의 범주를 갖는 변수로 변환

ex) 점수 : 75 =>A/B/C/D/E (category)
"""

In [38]:
np.random.seed(10)  #동일한 난수 발생  #주로 쓰는 씨드: seed(42) seed(777) seed(1004)
np.random.randn(1)

array([1.3315865])

In [42]:
"""
df의 c1열 20개 data구성
10개 bin 으로 균등하게 나눔

"""

df.C1.min() 
df.C1.max()
bins=np.linspace(df.C1.min(),df.C1.max(),10)
bins


array([-2.48142156, -1.88434097, -1.28726038, -0.69017979, -0.0930992 ,
        0.50398139,  1.10106198,  1.69814257,  2.29522316,  2.89230375])

### 1) digitize (연속형 변수를 이산형 변수화)

In [41]:
df["C1_bin"]=np.digitize(df["C1"],bins)
df

Unnamed: 0,C1,C2,C1_bin
0,-0.302457,a,4
1,0.250573,a,5
2,0.132706,a,5
3,2.892304,a,10
4,-1.38965,a,2
5,-2.481422,a,1
6,0.112273,a,5
7,1.348299,a,7
8,-0.529223,a,4
9,-0.333602,a,4


In [48]:
#groupby() : 구간별로 요약통계, 범주가 평균 차이
#독립성검정 , indexing
df.groupby("C1_bin")["C1"].size() #각 그룹안 값의 갯수
df.groupby("C1_bin")["C1"].mean() #각 그룹안 값의 평균
df.groupby("C1_bin")["C1"].std() #각 그룹안 값의 표준편차
df.groupby("C1_bin")["C2"].value_counts() #각 그룹(C1)안 데이터(C2)들 갯수

C1_bin  C2
1       a     1
2       a     1
        b     1
3       b     1
4       a     3
        b     3
5       a     3
        b     2
6       b     3
7       a     1
10      a     1
Name: C2, dtype: int64

In [50]:
df[df["C1_bin"]==4]  #C1_bin이 4인 것들 추출

Unnamed: 0,C1,C2,C1_bin
0,-0.302457,a,4
8,-0.529223,a,4
9,-0.333602,a,4
11,-0.401119,b,4
12,-0.112263,b,4
17,-0.137799,b,4


### 추가 ** get_dummies (이산화 변수를 이항변수화)

In [52]:
#pd.get_dummies(): 가변수 생성
pd.get_dummies(df["C1_bin"])  #원핫인코딩

Unnamed: 0,1,2,3,4,5,6,7,10
0,0,0,0,1,0,0,0,0
1,0,0,0,0,1,0,0,0
2,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1
4,0,1,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0
6,0,0,0,0,1,0,0,0
7,0,0,0,0,0,0,1,0
8,0,0,0,1,0,0,0,0
9,0,0,0,1,0,0,0,0


### 2) where(연속형 변수를 이산형화) 

In [53]:
#np.where(조건 , 참, 거짓)

df["h_l"]=np.where(df["C1"]>=df.C1.mean(), "high","low")
df

Unnamed: 0,C1,C2,C1_bin,h_l
0,-0.302457,a,4,low
1,0.250573,a,5,high
2,0.132706,a,5,high
3,2.892304,a,10,high
4,-1.38965,a,2,low
5,-2.481422,a,1,low
6,0.112273,a,5,high
7,1.348299,a,7,high
8,-0.529223,a,4,low
9,-0.333602,a,4,low


In [55]:
df.groupby("h_l")["C1"].size()
df.groupby("h_l")["C1"].mean()

h_l
high    0.753262
low    -0.781272
Name: C1, dtype: float64

In [64]:
Q1=np.percentile(df["C1"],25)
Q3=np.percentile(df["C1"],75)

c1=df["C1"]>Q1
c2=df["C1"]<=Q3

df["h_m_l"]=np.where(df["C1"]<=Q1,"l", np.where(c1&c2, "m","h"))
df



Unnamed: 0,C1,C2,C1_bin,h_l,h_m_l
0,-0.302457,a,4,low,m
1,0.250573,a,5,high,m
2,0.132706,a,5,high,m
3,2.892304,a,10,high,h
4,-1.38965,a,2,low,l
5,-2.481422,a,1,low,l
6,0.112273,a,5,high,m
7,1.348299,a,7,high,h
8,-0.529223,a,4,low,l
9,-0.333602,a,4,low,m


## 데이터 재구조화 

In [None]:
"""
데이터 재구조화
1) 피벗테이블 (pivot , pivot_table)
2) 스택 (stack)
3) melt 
"""

### 1) pivot 

In [67]:
data =pd.DataFrame({'cust_id': ['c1', 'c1', 'c1', 'c2', 'c2', 'c2', 'c3', 'c3', 'c3'],

   'prod_cd': ['p1', 'p2', 'p3', 'p1', 'p2', 'p3', 'p1', 'p2', 'p3'],
'grade' : ['A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B'],

   'pch_amt': [30, 10, 0, 40, 15, 30, 0, 0, 10]})

data.pivot(index="cust_id", columns="prod_cd", values="pch_amt") #dataframe안 pivot


prod_cd,p1,p2,p3
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c1,30,10,0
c2,40,15,30
c3,0,0,10


In [71]:
data.pivot(index=["cust_id","grade"], columns="prod_cd", values="pch_amt" ) #에러

ValueError: Length of passed values is 9, index implies 2

### 1-1) pivot_table

In [70]:
pd.pivot_table(data,index="cust_id", columns="prod_cd", values="pch_amt") #pandas 안 pivot_table 

prod_cd,p1,p2,p3
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
c1,30,10,0
c2,40,15,30
c3,0,0,10


In [73]:
pd.pivot_table(data, index=["cust_id","grade"], columns="prod_cd", values="pch_amt") #pivot_table은 index,columns,values를 2개이상 지정가능

Unnamed: 0_level_0,prod_cd,p1,p2,p3
cust_id,grade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
c1,A,30,10,0
c2,A,40,15,30
c3,B,0,0,10


In [75]:
#pivot_table  : 집계함수 제공

pd.pivot_table(data,index="grade", columns="prod_cd", values="pch_amt")  #각 prod_cd 열들과 grade에 대한 평균 출력

pd.pivot_table(data,index="grade", columns="prod_cd", values="pch_amt", aggfunc=np.mean) #디폴트 : aggfunc=np.mean

pd.pivot_table(data,index="grade", columns="prod_cd", values="pch_amt", aggfunc=np.sum) # 각 prod_cd 열들과 grade에 대한 합계 출력

prod_cd,p1,p2,p3
grade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,70,25,30
B,0,0,10


### 2) stack

In [None]:
#stack() : 위에서 아래로 쌓는것
#unstack() : 쌓여있는 것을 옆으로 놓는것(왼->오)


In [78]:
mul_index = pd.MultiIndex.from_tuples([('cust_1', '2020'), ('cust_1', '2021'),

   ('cust_2', '2020'), ('cust_2', '2021')])  



data = pd.DataFrame(data=np.arange(16).reshape(4, 4),

   index=mul_index,

   columns=['prd_1', 'prd_2', 'prd_3', 'prd_4'],

   dtype='int')

data

Unnamed: 0,Unnamed: 1,prd_1,prd_2,prd_3,prd_4
cust_1,2020,0,1,2,3
cust_1,2021,4,5,6,7
cust_2,2020,8,9,10,11
cust_2,2021,12,13,14,15


In [79]:
datastd=data.stack()
datastd

cust_1  2020  prd_1     0
              prd_2     1
              prd_3     2
              prd_4     3
        2021  prd_1     4
              prd_2     5
              prd_3     6
              prd_4     7
cust_2  2020  prd_1     8
              prd_2     9
              prd_3    10
              prd_4    11
        2021  prd_1    12
              prd_2    13
              prd_3    14
              prd_4    15
dtype: int32

In [81]:
datastd["cust_2"]["2020"]["prd_1"]

8

In [82]:
data

Unnamed: 0,Unnamed: 1,prd_1,prd_2,prd_3,prd_4
cust_1,2020,0,1,2,3
cust_1,2021,4,5,6,7
cust_2,2020,8,9,10,11
cust_2,2021,12,13,14,15


In [90]:
data.ix["cust_2","prd_4"]=None
data

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0,Unnamed: 1,prd_1,prd_2,prd_3,prd_4
cust_1,2020,0,1,2,3.0
cust_1,2021,4,5,6,7.0
cust_2,2020,8,9,10,
cust_2,2021,12,13,14,


In [91]:
data.stack() #NaN값이 빠져있음 => (dropna=False)설정
data.stack(dropna=False)  

cust_1  2020  prd_1     0.0
              prd_2     1.0
              prd_3     2.0
              prd_4     3.0
        2021  prd_1     4.0
              prd_2     5.0
              prd_3     6.0
              prd_4     7.0
cust_2  2020  prd_1     8.0
              prd_2     9.0
              prd_3    10.0
              prd_4     NaN
        2021  prd_1    12.0
              prd_2    13.0
              prd_3    14.0
              prd_4     NaN
dtype: float64

### 3) melt

In [92]:
data=pd.DataFrame({'cust_ID' : ['C_001', 'C_001', 'C_002', 'C_002'],

   'prd_CD' : ['P_001', 'P_002', 'P_001', 'P_002'],

   'pch_cnt' : [1, 2, 3, 4],
'pch_amt' : [100, 200, 300, 400]})

In [93]:
data

Unnamed: 0,cust_ID,prd_CD,pch_cnt,pch_amt
0,C_001,P_001,1,100
1,C_001,P_002,2,200
2,C_002,P_001,3,300
3,C_002,P_002,4,400


In [96]:
pd.melt(data, id_vars=["cust_ID","prd_CD"],var_name="CD", value_name="VAL") #기존 dataframe 변경 


Unnamed: 0,cust_ID,prd_CD,CD,VAL
0,C_001,P_001,pch_cnt,1
1,C_001,P_002,pch_cnt,2
2,C_002,P_001,pch_cnt,3
3,C_002,P_002,pch_cnt,4
4,C_001,P_001,pch_amt,100
5,C_001,P_002,pch_amt,200
6,C_002,P_001,pch_amt,300
7,C_002,P_002,pch_amt,400


In [99]:
pew=pd.read_csv("pew.csv")
pew.head()

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k,$50-75k,$75-100k,$100-150k,>150k,Don't know/refused
0,Agnostic,27,34,60,81,76,137,122,109,84,96
1,Atheist,12,27,37,52,35,70,73,59,74,76
2,Buddhist,27,21,30,34,33,58,62,39,53,54
3,Catholic,418,617,732,670,638,1116,949,792,633,1489
4,Don’t know/refused,15,14,15,11,10,35,21,17,18,116


In [100]:
pew.iloc[:,0:6]

Unnamed: 0,religion,<$10k,$10-20k,$20-30k,$30-40k,$40-50k
0,Agnostic,27,34,60,81,76
1,Atheist,12,27,37,52,35
2,Buddhist,27,21,30,34,33
3,Catholic,418,617,732,670,638
4,Don’t know/refused,15,14,15,11,10
5,Evangelical Prot,575,869,1064,982,881
6,Hindu,1,9,7,9,11
7,Historically Black Prot,228,244,236,238,197
8,Jehovah's Witness,20,27,24,24,21
9,Jewish,19,19,25,25,30


In [102]:
pd.melt(pew, id_vars=["religion"], var_name="income", value_name="count")

Unnamed: 0,religion,income,count
0,Agnostic,<$10k,27
1,Atheist,<$10k,12
2,Buddhist,<$10k,27
3,Catholic,<$10k,418
4,Don’t know/refused,<$10k,15
...,...,...,...
175,Orthodox,Don't know/refused,73
176,Other Christian,Don't know/refused,18
177,Other Faiths,Don't know/refused,71
178,Other World Religions,Don't know/refused,8


In [103]:
billboard=pd.read_csv("billboard.csv")
billboard

Unnamed: 0,year,artist,track,time,date.entered,wk1,wk2,wk3,wk4,wk5,...,wk67,wk68,wk69,wk70,wk71,wk72,wk73,wk74,wk75,wk76
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,87,82.0,72.0,77.0,87.0,...,,,,,,,,,,
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,91,87.0,92.0,,,...,,,,,,,,,,
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,81,70.0,68.0,67.0,66.0,...,,,,,,,,,,
3,2000,3 Doors Down,Loser,4:24,2000-10-21,76,76.0,72.0,69.0,67.0,...,,,,,,,,,,
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,57,34.0,25.0,17.0,17.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,86,83.0,77.0,74.0,83.0,...,,,,,,,,,,
313,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,85,83.0,83.0,82.0,81.0,...,,,,,,,,,,
314,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,95,94.0,91.0,85.0,84.0,...,,,,,,,,,,
315,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,99,99.0,,,,...,,,,,,,,,,


In [112]:
pd.melt(billboard, id_vars=["year","artist","track","time","date.entered"],var_name="week", value_name="rating")

Unnamed: 0,year,artist,track,time,date.entered,week,rating
0,2000,2 Pac,Baby Don't Cry (Keep...,4:22,2000-02-26,wk1,87.0
1,2000,2Ge+her,The Hardest Part Of ...,3:15,2000-09-02,wk1,91.0
2,2000,3 Doors Down,Kryptonite,3:53,2000-04-08,wk1,81.0
3,2000,3 Doors Down,Loser,4:24,2000-10-21,wk1,76.0
4,2000,504 Boyz,Wobble Wobble,3:35,2000-04-15,wk1,57.0
...,...,...,...,...,...,...,...
24087,2000,Yankee Grey,Another Nine Minutes,3:10,2000-04-29,wk76,
24088,2000,"Yearwood, Trisha",Real Live Woman,3:55,2000-04-01,wk76,
24089,2000,Ying Yang Twins,Whistle While You Tw...,4:19,2000-03-18,wk76,
24090,2000,Zombie Nation,Kernkraft 400,3:30,2000-09-02,wk76,


In [117]:
s=pd.Series(range(10))
s[3]=None
s.count()  #NaN은 제외


TypeError: Unsupported dtype "float64" for randint

In [124]:
df=pd.DataFrame(np.random.randint(5,size=(4,4)),dtype=float)
df
df.iloc[2,3]=None
df

Unnamed: 0,0,1,2,3
0,0.0,1.0,1.0,4.0
1,0.0,2.0,1.0,3.0
2,1.0,2.0,0.0,
3,1.0,0.0,2.0,3.0


In [123]:
df.count() #각 열의 데이터 갯수

0    4
1    4
2    4
3    3
dtype: int64

In [125]:
import seaborn as sns
titanic=sns.load_dataset("titanic")
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [126]:
titanic.count() #대략 각 columns의 NaN 갯수 여부 확인가능

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [130]:
np.random.seed(777)
s2=pd.Series(np.random.randint(6,size=100))
s2.value_counts() #종류별로 데이터의 갯수 카운터 
                    #갯수를 기준으로 정렬 (내림차순)
                      #value_counts()는 Series에 대해서만 가능, DataFrame X

5    20
0    20
4    19
3    15
2    15
1    11
dtype: int64

In [133]:
df[0].value_counts() #가능

1.0    2
0.0    2
Name: 0, dtype: int64

In [135]:
s2.value_counts().sort_index()  #sort_index(): 갯수 기준이 아닌 index 기준으로 정렬(오름차순)

0    20
1    11
2    15
3    15
4    19
5    20
dtype: int64

In [145]:
s.sort_values() #값을 기준으로 정렬 (NaN은 맨 마지막으로 정렬)
s.sort_values(ascending=False) #내림차순

df.sort_values(by=2) #by=2 열의 값 정렬

df.sort_values(by=1)

Unnamed: 0,0,1,2,3
3,1.0,0.0,2.0,3.0
0,0.0,1.0,1.0,4.0
1,0.0,2.0,1.0,3.0
2,1.0,2.0,0.0,


In [147]:
df=pd.DataFrame(np.random.randint(10,size=(4,8)))
df
df["MySum"]=df.sum(axis=1)
df

Unnamed: 0,0,1,2,3,4,5,6,7,MySum
0,9,0,2,3,2,6,6,4,32
1,2,5,3,4,3,5,0,9,31
2,5,4,0,1,3,8,5,6,32
3,8,7,8,0,4,8,5,1,41


In [148]:
df.loc["total",:]=df.sum()
df

Unnamed: 0,0,1,2,3,4,5,6,7,MySum
0,9.0,0.0,2.0,3.0,2.0,6.0,6.0,4.0,32.0
1,2.0,5.0,3.0,4.0,3.0,5.0,0.0,9.0,31.0
2,5.0,4.0,0.0,1.0,3.0,8.0,5.0,6.0,32.0
3,8.0,7.0,8.0,0.0,4.0,8.0,5.0,1.0,41.0
total,24.0,16.0,13.0,8.0,12.0,27.0,16.0,20.0,136.0


## cut / qcut (실수 데이터를 범주화)

In [None]:
#실수 데이터를 범주화
#cut(실수 값의 경계 지정)
#qcut(똑같은 구간으로 지정)

In [150]:
#cut

ages=[0,2,10,21,23,37,31,61,20,42,32,100]
bins=[1,15,25,35,60,99]
labels=["미성년자","청년","중년","장년","노년"]

cat=pd.cut(ages,bins,labels=labels)
cat

[NaN, 미성년자, 미성년자, 청년, 청년, ..., 노년, 청년, 장년, 중년, NaN]
Length: 12
Categories (5, object): [미성년자 < 청년 < 중년 < 장년 < 노년]

In [151]:
cat.categories
cat.codes  #NaN은 -1로 출력됨

array([-1,  0,  0,  1,  1,  3,  2,  4,  1,  3,  2, -1], dtype=int8)

In [156]:
#qcut

df=pd.DataFrame(ages, columns=["ages"])
df["age_cat"]=pd.cut(ages,bins,labels=labels)  #category로 만든 age_cat을 기존 df에 추가
df

Unnamed: 0,ages,age_cat
0,0,
1,2,미성년자
2,10,미성년자
3,21,청년
4,23,청년
5,37,장년
6,31,중년
7,61,노년
8,20,청년
9,42,장년


In [158]:
#qcut 
data=np.random.rand(100)
data

cat=pd.qcut(data,4,labels=["Q1","Q2","Q3","Q4"])
cat

[Q2, Q4, Q3, Q2, Q3, ..., Q4, Q3, Q3, Q2, Q1]
Length: 100
Categories (4, object): [Q1 < Q2 < Q3 < Q4]