# 자료형 변환하기

> seaborn 라이브러리의 tips 데이터 집합 확인

In [1]:
import pandas as pd
import seaborn as sns


# Load the 'tips' example dataset through seaborn.
tips = sns.load_dataset('tips')
# Show the column labels of the loaded frame.
tips.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [2]:
# Add a string version of the categorical 'smoker' column as 'smoker_str'.
tips['smoker_str'] = tips['smoker'].astype(str)
print(tips.dtypes) # pandas reports string columns as the object dtype

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [5]:
# Repeat the str conversion of 'smoker' into 'smoker_str'.
tips['smoker_str'] = tips['smoker'].astype(str)
# A bare expression shows the dtypes as the cell result instead of printing them.
tips.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object

In [4]:
# Preview the first five rows, including the added 'smoker_str' column.
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,smoker_str
0,16.99,1.01,Female,No,Sun,Dinner,2,No
1,10.34,1.66,Male,No,Sun,Dinner,3,No
2,21.01,3.5,Male,No,Sun,Dinner,3,No
3,23.68,3.31,Male,No,Sun,Dinner,2,No
4,24.59,3.61,Female,No,Sun,Dinner,4,No


In [6]:
# Cast the numeric 'total_bill' column to strings; its dtype becomes object.
tips['total_bill'] = tips['total_bill'].astype(str)
print(tips.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [7]:
# Parse the string values of 'total_bill' back to float64.
tips['total_bill'] = tips['total_bill'].astype(float)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object


In [9]:
# Cast to object first so the column can hold text. Writing a string straight
# into a float64 column relies on an implicit upcast to object, which pandas
# has deprecated (FutureWarning since 2.1).
tips['total_bill'] = tips['total_bill'].astype(object)
# Overwrite four rows with a sentinel string to simulate badly entered data.
tips.loc[[1, 3, 5, 7], ['total_bill']] = 'missing'
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,smoker_str
0,16.99,1.01,Female,No,Sun,Dinner,2,No
1,missing,1.66,Male,No,Sun,Dinner,3,No
2,21.01,3.50,Male,No,Sun,Dinner,3,No
3,missing,3.31,Male,No,Sun,Dinner,2,No
4,24.59,3.61,Female,No,Sun,Dinner,4,No
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,No
240,27.18,2.00,Female,Yes,Sat,Dinner,2,Yes
241,22.67,2.00,Male,Yes,Sat,Dinner,2,Yes
242,17.82,1.75,Male,No,Sat,Dinner,2,No


In [10]:
# Check the column dtypes after the 'missing' strings were written into 'total_bill'.
tips.dtypes 

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
smoker_str      object
dtype: object

In [11]:
# Intentional failure: the 'missing' strings cannot be parsed, so this raises ValueError.
tips['total_bill'].astype(float) # cannot convert to float because of 'missing'

ValueError: could not convert string to float: 'missing'

> 잘못 입력한 데이터 처리하기 (to_numeric)

In [12]:
# Mark rows 1, 3, 5 and 7 of 'total_bill' with the 'missing' sentinel string.
tips.loc[[1,3,5,7],['total_bill']] = 'missing'

In [13]:
# Keep only the first ten rows, which include the four 'missing' entries.
tips_sub_miss = tips.head(10)

In [14]:
# Display the ten-row subset.
tips_sub_miss

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,smoker_str
0,16.99,1.01,Female,No,Sun,Dinner,2,No
1,missing,1.66,Male,No,Sun,Dinner,3,No
2,21.01,3.5,Male,No,Sun,Dinner,3,No
3,missing,3.31,Male,No,Sun,Dinner,2,No
4,24.59,3.61,Female,No,Sun,Dinner,4,No
5,missing,4.71,Male,No,Sun,Dinner,4,No
6,8.77,2.0,Male,No,Sun,Dinner,2,No
7,missing,3.12,Male,No,Sun,Dinner,4,No
8,15.04,1.96,Male,No,Sun,Dinner,2,No
9,14.78,3.23,Male,No,Sun,Dinner,2,No


In [15]:
# errors='coerce' turns values that cannot be parsed into NaN, giving a float64 result.
pd.to_numeric(tips_sub_miss['total_bill'], errors ='coerce' )

0    16.99
1      NaN
2    21.01
3      NaN
4    24.59
5      NaN
6     8.77
7      NaN
8    15.04
9    14.78
Name: total_bill, dtype: float64

In [16]:
# Keep the original column when it cannot be parsed as numbers. This is the
# explicit form of errors='ignore', which pandas deprecated in 2.2.
try:
    converted = pd.to_numeric(tips_sub_miss['total_bill'])
except (ValueError, TypeError):
    # Parsing failed, so fall back to the unconverted column.
    converted = tips_sub_miss['total_bill']
converted

0      16.99
1    missing
2      21.01
3    missing
4      24.59
5    missing
6       8.77
7    missing
8      15.04
9      14.78
Name: total_bill, dtype: object

#### 카테고리 자료형의 장점과 특징 

용량과 속도 면에서 효율적이며, 주로 동일한 문자열이 반복되어 데이터를 구성하는 범주형 데이터에 사용한다.

In [19]:
# Cast 'smoker' to the category dtype (the earlier dtype listings already show it as category).
tips['smoker'] = tips['smoker'].astype('category')
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    object  
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
 7   smoker_str  244 non-null    object  
dtypes: category(4), float64(1), int64(1), object(2)
memory usage: 9.3+ KB
