In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import scipy as sp
import scipy.stats as stats
from sklearn.preprocessing import *

In [None]:
"""
np.digitize(), np.where() 를 이용한 연속형 변수의 이산형화(discretization)

np.digitize(data, bins)
np.where(condition, x, y)

이항변수화(binarization)는 '0'과 '1'의 값만을 가지는 
가변수(dummy variable)를 만드는 것 

이산형화(discretization)는 연속형 변수를 
2개 이상의 범주(category)를 가지는 변수로 변환하는 것

ex) 성적    등급
    >=90    A
    >=80    B
    >=70    C
    >=60    D
    <60     F

"""

In [39]:
# Seed NumPy's global RNG so the random scores below are reproducible.
np.random.seed(10)

# Build the demo frame: 20 integer scores drawn from 1..100 (the upper
# bound 101 is exclusive); the first ten rows are labelled "red" and the
# last ten "blue".
df = pd.DataFrame(
    {
        "score": np.random.randint(low=1, high=101, size=20),
        "group": ["red"] * 10 + ["blue"] * 10,
    }
)
df

Unnamed: 0,group,score
0,red,10
1,red,16
2,red,65
3,red,29
4,red,90
5,red,94
6,red,30
7,red,9
8,red,74
9,red,1


In [59]:
# (1) Discretize a continuous variable with np.digitize(data, bins).

# Nine evenly spaced, descending bin edges that start at 100 and step toward
# the lowest observed score; endpoint=False keeps that minimum itself out of
# the edge list. Nine edges split the number line into ten bins (0-9).
bins = np.linspace(start=100, stop=df["score"].min(), num=9, endpoint=False)
bins

array([ 100.,   89.,   78.,   67.,   56.,   45.,   34.,   23.,   12.])

In [60]:
# Store, for every score, the index of the bin np.digitize places it in.
# The edges in `bins` descend, so higher scores get lower grade numbers.
df["grade"] = np.digitize(df["score"], bins=bins)
df

Unnamed: 0,group,score,grade
0,red,10,9
1,red,16,8
2,red,65,4
3,red,29,7
4,red,90,1
5,red,94,1
6,red,30,7
7,red,9,9
8,red,74,3
9,red,1,9


In [61]:
# Aggregate with groupby(): number of students in each grade.
# count() tallies non-null scores, size() tallies rows; both are printed
# one after the other.
print(
    df.groupby("grade")["score"].count(),
    df.groupby("grade")["score"].size(),
    sep="\n",
)

grade
1    3
2    1
3    2
4    2
5    1
6    3
7    2
8    3
9    3
Name: score, dtype: int64
grade
1    3
2    1
3    2
4    2
5    1
6    3
7    2
8    3
9    3
Name: score, dtype: int64


In [62]:
# Average score within each grade.
df.groupby("grade")["score"].agg("mean")

grade
1    91.000000
2    79.000000
3    73.500000
4    64.000000
5    55.000000
6    37.333333
7    29.500000
8    15.000000
9     6.666667
Name: score, dtype: float64

In [63]:
# Sample standard deviation of the scores within each grade
# (a grade containing a single score yields NaN).
df["score"].groupby(df["grade"]).std()

grade
1    2.645751
2         NaN
3    0.707107
4    1.414214
5         NaN
6    3.511885
7    0.707107
8    2.645751
9    4.932883
Name: score, dtype: float64

In [64]:
# Frequency of each grade within each group; sort=False skips the default
# sort by count.
(
    df
    .groupby("group")["grade"]
    .value_counts(sort=False)
)

group  grade
blue   1        1
       2        1
       3        1
       4        1
       5        1
       6        3
       8        2
red    1        2
       3        1
       4        1
       7        2
       8        1
       9        3
Name: grade, dtype: int64

In [66]:
# Average score for each group.
df.groupby("group")["score"].agg("mean")

group
blue    50.0
red     41.8
Name: score, dtype: float64

In [68]:
# Boolean-mask indexing: show only the students whose grade is 1.
df.loc[df["grade"].eq(1)]

Unnamed: 0,group,score,grade
4,red,90,1
5,red,94,1
15,blue,89,1


In [None]:
# pd.get_dummies() 를 이용해 가변수(dummy var) 만들기
# prefix : 접두사 추가
# drop_first=True : k개 범주 중 첫 번째 범주의 가변수를 제거해 k-1개만 생성.
#                   dummy variable trap(가변수 함정)을 피할 수 있게 해준다

In [71]:
# One-hot encode the grade column; every dummy column name is prefixed
# with "grade" (grade_1, grade_2, ...).
pd.get_dummies(data=df["grade"], prefix="grade")

Unnamed: 0,grade_1,grade_2,grade_3,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9
0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,1,0
2,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,0,1
8,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1


In [70]:
# drop_first=True drops the first level, giving k-1 dummy columns for k
# categorical levels, which avoids the dummy variable trap.
pd.get_dummies(
    data=df["grade"],
    prefix="grade",
    drop_first=True,
)

Unnamed: 0,grade_2,grade_3,grade_4,grade_5,grade_6,grade_7,grade_8,grade_9
0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0
7,0,0,0,0,0,0,0,1
8,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,1


In [105]:
# (2) Discretize a continuous variable with nested np.where calls.
#     np.where(condition, x, y) picks x where the condition holds and y
#     elsewhere, so placing another np.where in the y slot chains cut-offs.
#
#     score   grade
#     >= 90   A
#     >= 80   B
#     >= 70   C
#     >= 60   D
#     <  60   F

df["grade2"] = np.where(
    df["score"] >= 90, "A",
    np.where(
        df["score"] >= 80, "B",
        np.where(
            df["score"] >= 70, "C",
            np.where(df["score"] >= 60, "D", "F"),
        ),
    ),
)

# List the rows from highest to lowest score to check the letter grades.
df.sort_values(by="score", ascending=False)

Unnamed: 0,group,score,grade,grade2
5,red,94,1,A
4,red,90,1,A
15,blue,89,1,B
19,blue,79,2,C
8,red,74,3,C
18,blue,73,3,C
2,red,65,4,D
16,blue,63,4,D
14,blue,55,5,F
10,blue,41,6,F


In [104]:
# Number of students in each letter grade.
print(df["score"].groupby(df["grade2"]).size())

# Average score in each letter grade.
print(df["score"].groupby(df["grade2"]).mean())

grade2
A     2
B     1
C     3
D     2
F    12
Name: score, dtype: int64
grade2
A    92.000000
B    89.000000
C    75.333333
D    64.000000
F    24.250000
Name: score, dtype: float64
