# 1.자료형 변환
## 1-1.자료형을 자유자재로 변환하기 ─ astype 메서드

In [36]:
import warnings
# Suppress every warning for the rest of the session: the 'ignore' action is
# installed without any category or module restriction.
warnings.simplefilter(action='ignore')


In [23]:
import pandas as pd
import seaborn as sns
# Load seaborn's 'tips' example dataset; later cells treat it as a pandas DataFrame.
tips = sns.load_dataset('tips')

### 1-1-1.여러 가지 자료형을 문자열로 변환하기

In [24]:
# Use the astype method to convert a column to another dtype.
# The sex column is categorical; convert it to strings and store the result
# in a new sex_str column.
tips['sex_str'] = tips['sex'].astype(str)
# The string column is reported with dtype object in the listing.
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


### 1-1-2. 자료형을 변환한 데이터 다시 원래대로 만들기

In [27]:
# Convert total_bill to strings; the column's dtype becomes object.
tips['total_bill'] = tips['total_bill'].astype(str)
print(tips.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


In [26]:
# Convert total_bill back to float, restoring the float64 dtype.
tips['total_bill'] = tips['total_bill'].astype(float)
print(tips.dtypes)

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


## 1-2.잘못 입력한 문자열 처리하기 ─ to_numeric 메서드

In [28]:
# Take the first 10 rows as an independent copy so the edits below modify only
# tips_sub_miss and never a slice that still refers to tips (avoids pandas'
# SettingWithCopyWarning).
tips_sub_miss = tips.head(10).copy()
# Switch total_bill to object dtype up front: writing a string into a float64
# column relies on implicit upcasting, which recent pandas releases deprecate.
tips_sub_miss['total_bill'] = tips_sub_miss['total_bill'].astype(object)
# Replace total_bill in rows 1, 3, 5 and 7 with the string 'missing'.
tips_sub_miss.loc[[1, 3, 5, 7], 'total_bill'] = 'missing'
tips_sub_miss

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_str
0,16.99,1.01,Female,No,Sun,Dinner,2,Female
1,missing,1.66,Male,No,Sun,Dinner,3,Male
2,21.01,3.5,Male,No,Sun,Dinner,3,Male
3,missing,3.31,Male,No,Sun,Dinner,2,Male
4,24.59,3.61,Female,No,Sun,Dinner,4,Female
5,missing,4.71,Male,No,Sun,Dinner,4,Male
6,8.77,2.0,Male,No,Sun,Dinner,2,Male
7,missing,3.12,Male,No,Sun,Dinner,4,Male
8,15.04,1.96,Male,No,Sun,Dinner,2,Male
9,14.78,3.23,Male,No,Sun,Dinner,2,Male


In [29]:
# total_bill was float, but the 'missing' strings turned the column into object dtype.
print(tips_sub_miss.dtypes)

total_bill      object
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


In [31]:
# 'missing' cannot be converted to float, so this raises an error.
tips_sub_miss['total_bill'] = tips_sub_miss['total_bill'].astype(float)

ValueError: could not convert string to float: 'missing'

In [38]:
# With errors='ignore', values that cannot be parsed do not raise an error;
# the input is returned as-is, so the column's dtype does not change.
# NOTE(review): pandas 2.2+ documents errors='ignore' as deprecated — confirm
# the target pandas version before reusing this cell.
# NOTE(review): the recorded output shows float64 for total_bill, which does
# not match the statement above; presumably the cell was re-run after the
# coerce step — confirm.
tips_sub_miss['total_bill'] = pd.to_numeric(
    tips_sub_miss['total_bill'],
    errors='ignore',
)
tips_sub_miss.dtypes

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object

#### coerce
- 숫자로 변환할 수 없는 값을 누락값으로 지정

In [40]:
# errors='coerce' turns every value that cannot be parsed as a number into a
# missing value (NaN), so total_bill is reported as float64.
tips_sub_miss['total_bill'] = pd.to_numeric(
    tips_sub_miss['total_bill'],
    errors='coerce',
)

print(tips_sub_miss.dtypes)
tips_sub_miss

total_bill     float64
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_str
0,16.99,1.01,Female,No,Sun,Dinner,2,Female
1,,1.66,Male,No,Sun,Dinner,3,Male
2,21.01,3.5,Male,No,Sun,Dinner,3,Male
3,,3.31,Male,No,Sun,Dinner,2,Male
4,24.59,3.61,Female,No,Sun,Dinner,4,Female
5,,4.71,Male,No,Sun,Dinner,4,Male
6,8.77,2.0,Male,No,Sun,Dinner,2,Male
7,,3.12,Male,No,Sun,Dinner,4,Male
8,15.04,1.96,Male,No,Sun,Dinner,2,Male
9,14.78,3.23,Male,No,Sun,Dinner,2,Male


#### downcast = float
- float64 -> float32

float64는 float32보다 더 넓은 범위의 실수를 더 정밀하게 표현할 수 있지만, 메모리 공간을 2배 더 차지한다.

In [41]:
# downcast='float' asks to_numeric for the smallest float dtype that can hold
# the values; total_bill is reported as float32 afterwards.
tips_sub_miss['total_bill'] = pd.to_numeric(
    tips_sub_miss['total_bill'],
    errors='coerce',
    downcast='float',
)

print(tips_sub_miss.dtypes)

total_bill     float32
tip            float64
sex           category
smoker        category
day           category
time          category
size             int64
sex_str         object
dtype: object


## 1-3. 문자열을 카테고리로 변환하기

In [48]:
# Convert sex to plain strings (object dtype); info() reports the DataFrame's
# memory usage in this state.
tips['sex'] = tips['sex'].astype('str')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
total_bill    244 non-null object
tip           244 non-null float64
sex           244 non-null object
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
sex_str       244 non-null object
dtypes: category(3), float64(1), int64(1), object(3)
memory usage: 10.7+ KB


In [50]:
# Convert the sex column to category; info() also shows the DataFrame's
# memory usage going down.
tips['sex'] = tips['sex'].astype('category')
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 8 columns):
total_bill    244 non-null object
tip           244 non-null float64
sex           244 non-null category
smoker        244 non-null category
day           244 non-null category
time          244 non-null category
size          244 non-null int64
sex_str       244 non-null object
dtypes: category(4), float64(1), int64(1), object(2)
memory usage: 9.2+ KB
