<a href="https://colab.research.google.com/github/JakeOh/202105_itw_bd26/blob/main/lab_da/da17_categorical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Categorical Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Synthetic ratings table: one row per user, half 'M' / half 'F', ratings 1-5.
# A seeded Generator makes the random ratings identical on every
# Restart & Run All. The outputs saved in this notebook came from an unseeded
# run, so the sampled values shown will differ once the notebook is re-executed.
rating_rng = np.random.default_rng(42)
movie_rating = pd.DataFrame({'user_id': np.arange(1, 1_000_001),
                             'gender': ['M'] * 500_000 + ['F'] * 500_000,
                             # integers() excludes the upper bound: values are 1..5
                             'rating': rating_rng.integers(1, 6, size=1_000_000)})

In [3]:
# Display the frame; pandas elides the middle rows of a frame this long.
movie_rating

Unnamed: 0,user_id,gender,rating
0,1,M,3
1,2,M,2
2,3,M,4
3,4,M,5
4,5,M,3
...,...,...,...
999995,999996,F,5
999996,999997,F,4
999997,999998,F,5
999998,999999,F,4


In [4]:
# Summarise the column dtypes and the frame's reported memory usage.
movie_rating.info()
#> the gender column's dtype is object (strings)
#> the rating column's dtype is int64 (integers)
#> memory usage: 22.9+ MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   user_id  1000000 non-null  int64 
 1   gender   1000000 non-null  object
 2   rating   1000000 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 22.9+ MB


In [5]:
# Count how many rows carry each gender value.
movie_rating['gender'].value_counts()

M    500000
F    500000
Name: gender, dtype: int64

In [6]:
# Count how many rows carry each rating value.
movie_rating['rating'].value_counts()

3    200527
1    200254
2    199854
4    199844
5    199521
Name: rating, dtype: int64

In [7]:
# Re-type the gender column from object to category, expressed as a
# column -> dtype mapping so the other columns are left as they are.
movie_rating = movie_rating.astype({'gender': 'category'})

In [8]:
# Re-check dtypes and memory usage now that gender is a category column.
movie_rating.info()
#> memory usage: 16.2 MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype   
---  ------   --------------    -----   
 0   user_id  1000000 non-null  int64   
 1   gender   1000000 non-null  category
 2   rating   1000000 non-null  int64   
dtypes: category(1), int64(2)
memory usage: 16.2 MB


In [9]:
# Re-type the rating column from int64 to category, expressed as a
# column -> dtype mapping so the other columns are left as they are.
movie_rating = movie_rating.astype({'rating': 'category'})

In [10]:
# Re-check dtypes and memory usage now that rating is a category column too.
movie_rating.info()
#> memory usage: 9.5 MB

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype   
---  ------   --------------    -----   
 0   user_id  1000000 non-null  int64   
 1   gender   1000000 non-null  category
 2   rating   1000000 non-null  category
dtypes: category(2), int64(1)
memory usage: 9.5 MB


In [11]:
# Display the frame again: the values look the same, only the dtypes changed.
movie_rating

Unnamed: 0,user_id,gender,rating
0,1,M,3
1,2,M,2
2,3,M,4
3,4,M,5
4,5,M,3
...,...,...,...
999995,999996,F,5
999996,999997,F,4
999997,999998,F,5
999998,999999,F,4


# 연속형 변수에서 파생된 카테고리 변수 생성

In [12]:
# Small demo frame: ids 1..10, each with a random age in 10..89.
# A seeded Generator gives the same ages on every Restart & Run All. The
# outputs saved in this notebook came from an unseeded run, so the ages and
# the derived bin counts shown will differ once the notebook is re-executed.
age_rng = np.random.default_rng(42)
df = pd.DataFrame({'id': np.arange(1, 11),
                   # integers() excludes the upper bound: ages are 10..89
                   'age': age_rng.integers(10, 90, size=10)})

In [13]:
# Display the demo frame.
df

Unnamed: 0,id,age
0,1,61
1,2,83
2,3,87
3,4,69
4,5,74
5,6,64
6,7,52
7,8,14
8,9,36
9,10,52


In [19]:
# Bin each age into decade-wide intervals and store the result as a new column.
df['ages'] = pd.cut(x=df['age'], bins=np.arange(10, 100, 10), right=False)
# bins: array of the interval boundary values
# right: whether each interval includes its right boundary; the default is True.
#        With right=False the intervals are closed on the left, e.g. [10, 20).

In [20]:
# Display the frame with the new 'ages' interval column.
df

Unnamed: 0,id,age,ages
0,1,61,"[60, 70)"
1,2,83,"[80, 90)"
2,3,87,"[80, 90)"
3,4,69,"[60, 70)"
4,5,74,"[70, 80)"
5,6,64,"[60, 70)"
6,7,52,"[50, 60)"
7,8,14,"[10, 20)"
8,9,36,"[30, 40)"
9,10,52,"[50, 60)"


In [21]:
# Inspect the dtypes: the binned column is reported as category.
df.info()
#> pd.cut() returns a Categorical (category dtype) result.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   id      10 non-null     int64   
 1   age     10 non-null     int64   
 2   ages    10 non-null     category
dtypes: category(1), int64(2)
memory usage: 746.0 bytes


In [22]:
# Count rows per age bin, keeping bins with no rows as 0.
# observed=False is spelled out because 'ages' is categorical: newer pandas
# versions warn when it is omitted and change the default to True, which
# would drop the empty bins from the result.
df.groupby('ages', observed=False).size()  # similar: df['ages'].value_counts() (sorted by count)

ages
[10, 20)    1
[20, 30)    0
[30, 40)    1
[40, 50)    0
[50, 60)    2
[60, 70)    3
[70, 80)    1
[80, 90)    2
dtype: int64

pd.cut() 함수에서 bins 파라미터의 구간 경계값들은 일정한 간격일 필요는 없다.

In [24]:
# Bin ages into three named groups of unequal width, then display the frame.
df['age_level'] = pd.cut(x=df['age'],
                         bins=[0, 20, 60, 100],
                         labels=['young', 'middle', 'old'])
#> bins: array (list) of the interval boundary values
#> labels: labels (names) for the intervals
df

Unnamed: 0,id,age,ages,age_level
0,1,61,"[60, 70)",old
1,2,83,"[80, 90)",old
2,3,87,"[80, 90)",old
3,4,69,"[60, 70)",old
4,5,74,"[70, 80)",old
5,6,64,"[60, 70)",old
6,7,52,"[50, 60)",middle
7,8,14,"[10, 20)",young
8,9,36,"[30, 40)",middle
9,10,52,"[50, 60)",middle


In [25]:
# Inspect the dtypes after adding the labelled 'age_level' column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   id         10 non-null     int64   
 1   age        10 non-null     int64   
 2   ages       10 non-null     category
 3   age_level  10 non-null     category
dtypes: category(2), int64(2)
memory usage: 860.0 bytes


In [26]:
# Count rows per age level.
# observed=False is spelled out because 'age_level' is categorical: newer
# pandas versions warn when it is omitted and change the default to True,
# which would drop any level that has no rows.
df.groupby('age_level', observed=False).size()

age_level
young     1
middle    3
old       6
dtype: int64

In [29]:
# Mean age within each age level.
# observed=False is spelled out because 'age_level' is categorical: newer
# pandas versions warn when it is omitted and change the default to True.
# With observed=False a level that has no rows stays in the result (as NaN).
df.groupby('age_level', observed=False)['age'].mean()

age_level
young     14.000000
middle    46.666667
old       73.000000
Name: age, dtype: float64