### 03 판다스 자료구조 살펴보기

#### [03 - 1] 나만의 데이터 만들기

##### 1. 시리즈와 데이터프레임 만들기

##### 1) 시리즈 만들기

In [95]:
import pandas as pd

In [96]:
# Build a Series from a list of mixed values; pandas supplies the default
# integer index (0, 1) and the printed dtype is object.
s = pd.Series(
    data=['banana', 42],
)
print(s)

0    banana
1        42
dtype: object


In [97]:
# A Series with explicit string labels instead of the default integer index.
# NOTE(review): 'Wed McKinney' looks like a typo for 'Wes McKinney'; it is kept
# as-is so the recorded output still matches.
s = pd.Series(
    ['Wed McKinney', 'Creator of Pandas'],
    index=['Person', 'Who'],
)
print(s)

Person         Wed McKinney
Who       Creator of Pandas
dtype: object


##### 2) 데이터프레임 만들기

In [98]:
# Create a DataFrame from a mapping of column name -> list of values.
# Each key becomes a column (in the order given) and rows get a default
# integer index.
scientists = pd.DataFrame(
    data=dict(
        Name=["Rosaline Franklin", "William Gosset"],
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    )
)
print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61


In [99]:
# Use the scientists' names as the row labels (the index) instead of storing
# them in a Name column; `columns` states the column order explicitly.
scientists = pd.DataFrame(
    dict(
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    ),
    index=["Rosaline Franklin", "William Gosset"],
    columns=["Occupation", "Born", "Died", "Age"],
)
print(scientists)

                     Occupation        Born        Died  Age
Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
William Gosset     Statistician  1876-06-13  1937-10-16   61


#### [03 - 2] 시리즈 다루기

##### 1) 시리즈 추출하기

In [100]:
# Select one row by its index label with .loc; the result is a Series whose
# index holds the DataFrame's column names.
# NOTE(review): despite the name `first_row`, 'William Gosset' is the second
# row of the frame built in the previous cell.
first_row = scientists.loc['William Gosset']
print(first_row, "\n", type(first_row))

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object 
 <class 'pandas.core.series.Series'>


In [102]:
# Extract the index labels and the values of the Series
## Both are attributes, so they are accessed without parentheses
print(first_row.index)
print(first_row.values)

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')
['Statistician' '1876-06-13' '1937-10-16' 61]


##### 1. 시리즈의 keys() 메서드

In [103]:
## keys() is a method (called with parentheses); it prints the same Index
## as the .index attribute
print(first_row.keys())

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')


In [104]:
# Syntax difference when extracting an element via the attribute vs. the method:
# the attribute is indexed directly, the method must be called first
print(first_row.index[0])
print(first_row.keys()[0])

Occupation
Occupation


##### 2. 시리즈와 ndarray

##### 1) 시리즈의 메서드 사용하기

In [105]:
# Extract the Age column of the scientists DataFrame as a Series
ages = scientists['Age']
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64


In [106]:
# Mean
print(ages.mean())

# Minimum
print(ages.min())

# Maximum
print(ages.max())

# Standard deviation
print(ages.std())

49.0
37
61
16.97056274847714


##### 3. 시리즈와 불리언

In [107]:
# Load the scientists dataset from CSV; this rebinds `scientists`, replacing
# the two-row frame built earlier in the notebook
scientists = pd.read_csv('../data/scientists.csv')

##### 1) 기술 통계량 계산하기

In [108]:
# Descriptive statistics of the Age column, followed by the mean on its own
ages = scientists['Age']
print(ages.describe())
print("평균값 => ", ages.mean())

count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64
평균값 =>  59.125


In [109]:
# Keep only the scientists who are older than the mean age
print(ages[ages > ages.mean()])     ## the condition goes inside the square brackets

1    61
2    90
3    66
7    77
Name: Age, dtype: int64


In [110]:
# The comparison itself yields one True/False value per row
print(ages > ages.mean())

# Check the type of the comparison result (it is a Series as well)
print(type(ages > ages.mean()))

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool
<class 'pandas.core.series.Series'>


In [111]:
# Boolean indexing with a hand-written mask instead of a comparison result.
# Row position:       0     1     2      3      4     5     6      7
manual_bool_values = [True, True, False, False, True, True, False, True]

# Only the rows flagged True (0, 1, 4, 5 and 7) are printed.
print(ages[manual_bool_values])

0    37
1    61
4    56
5    45
7    77
Name: Age, dtype: int64


##### 4. 시리즈와 브로드캐스팅

##### 1) 벡터와 벡터, 벡터와 스칼라 계산하기

In [112]:
# Addition of two Series of the same length (element by element)
print("덧셈 => \n", ages + ages, "\n")

# Multiplication of two Series of the same length (element by element)
print("곱셈 => \n", ages * ages, "\n")

덧셈 => 
 0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64 

곱셈 => 
 0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64 



In [113]:
# Scalar-with-vector operation: the scalar is applied to every element
print(ages + 100)

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64


##### 2) 길이가 서로 다른 벡터 연산하기

In [114]:
# Adding a shorter Series: values are matched by index label, so only labels
# 0 and 1 get a sum and every other row is printed as NaN
print(ages + pd.Series([1, 100]))

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


In [115]:
import numpy as np

# Left commented out: the same addition with a NumPy array of a different
# length instead of a Series.
# NOTE(review): presumably this raises an error about mismatched lengths,
# since an array has no index to align on — confirm before uncommenting.
# print(ages + np.array([1, 100]))

##### 3) 인덱스가 같은 벡터 자동 정렬하기

In [116]:
# Sort the ages Series by its index in descending order
rev_ages = ages.sort_index(ascending = False)
print(rev_ages)

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64


In [117]:
# Multiply ages by a scalar
print(ages * 2)

# Add ages and rev_ages.
# Although the index of rev_ages is in descending order, values are matched by
# index label, so the result is printed in ascending index order and equals
# ages * 2.
print(ages + rev_ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64
0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


#### [03 - 3] 데이터프레임 다루기

##### 1. 데이터프레임의 구성

In [118]:
# Row labels (index)
print("행 => ", scientists.index)

# Column labels
print("열 => ", scientists.columns)

# Values as an array
print("값 => ", scientists.values)

행 =>  RangeIndex(start=0, stop=8, step=1)
열 =>  Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')
값 =>  [['Rosaline Franklin' '1920-07-25' '1958-04-16' 37 'Chemist']
 ['William Gosset' '1876-06-13' '1937-10-16' 61 'Statistician']
 ['Florence Nightingale' '1820-05-12' '1910-08-13' 90 'Nurse']
 ['Marie Curie' '1867-11-07' '1934-07-04' 66 'Chemist']
 ['Rachel Carson' '1907-05-27' '1964-04-14' 56 'Biologist']
 ['John Snow' '1813-03-15' '1858-06-16' 45 'Physician']
 ['Alan Turing' '1912-06-23' '1954-06-07' 41 'Computer Scientist']
 ['Johann Gauss' '1777-04-30' '1855-02-23' 77 'Mathematician']]


##### 2. 데이터프레임과 불리언 추출

In [119]:
# Boolean extraction on a DataFrame: keep the rows whose Age is above the mean Age
print(scientists.loc[scientists['Age'] > scientists['Age'].mean()])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


##### 3. 데이터프레임과 브로드캐스팅

##### 1) 데이터프레임을 대상으로 연산하기

In [120]:
# Split the DataFrame into two halves by row slicing (rows 0-3 and rows 4-7)
first_half = scientists[:4]
second_half = scientists[4:]
print(first_half, "\n")
print(second_half)

                   Name        Born        Died  Age    Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist
1        William Gosset  1876-06-13  1937-10-16   61  Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist 

            Name        Born        Died  Age          Occupation
4  Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5      John Snow  1813-03-15  1858-06-16   45           Physician
6    Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7   Johann Gauss  1777-04-30  1855-02-23   77       Mathematician


In [121]:
# Multiply the whole DataFrame by a scalar: string cells are repeated twice,
# while the numeric Age column is doubled
print(scientists * 2)

                                       Name                  Born  \
0        Rosaline FranklinRosaline Franklin  1920-07-251920-07-25   
1              William GossetWilliam Gosset  1876-06-131876-06-13   
2  Florence NightingaleFlorence Nightingale  1820-05-121820-05-12   
3                    Marie CurieMarie Curie  1867-11-071867-11-07   
4                Rachel CarsonRachel Carson  1907-05-271907-05-27   
5                        John SnowJohn Snow  1813-03-151813-03-15   
6                    Alan TuringAlan Turing  1912-06-231912-06-23   
7                  Johann GaussJohann Gauss  1777-04-301777-04-30   

                   Died  Age                            Occupation  
0  1958-04-161958-04-16   74                        ChemistChemist  
1  1937-10-161937-10-16  122              StatisticianStatistician  
2  1910-08-131910-08-13  180                            NurseNurse  
3  1934-07-041934-07-04  132                        ChemistChemist  
4  1964-04-141964-04-14  112     

In [122]:
# Add two frames cell by cell with DataFrame.add().
# df1 and df2 are two names for the same object, so every cell is doubled.
df1 = pd.DataFrame(
    data=[
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ]
)
df2 = df1
df_added = df1.add(df2)
print(df_added)

    0   1   2
0   2   4   6
1   8  10  12
2  14  16  18


#### [03 - 4] 시리즈와 데이터프레임 데이터 변환

##### 1) 열 추가하기

In [123]:
# Inspect the dtype of every column; Born and Died are still plain strings (object)
print(scientists.dtypes)

Name          object
Born          object
Died          object
Age            int64
Occupation    object
dtype: object


In [124]:
# Born column: convert from string (object) to datetime using the
# year-month-day format
born_datetime = pd.to_datetime(scientists['Born'], format = '%Y-%m-%d')
print(born_datetime)

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]


In [125]:
# Died column: convert from string (object) to datetime using the
# year-month-day format
died_datetime = pd.to_datetime(scientists['Died'], format = '%Y-%m-%d')

In [126]:
# Attach the parsed datetime Series as two new columns, one assignment each,
# then show the first rows.
scientists['born_dt'] = born_datetime
scientists['died_dt'] = died_datetime
print(scientists.head())

                   Name        Born        Died  Age    Occupation    born_dt  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist 1920-07-25   
1        William Gosset  1876-06-13  1937-10-16   61  Statistician 1876-06-13   
2  Florence Nightingale  1820-05-12  1910-08-13   90         Nurse 1820-05-12   
3           Marie Curie  1867-11-07  1934-07-04   66       Chemist 1867-11-07   
4         Rachel Carson  1907-05-27  1964-04-14   56     Biologist 1907-05-27   

     died_dt  
0 1958-04-16  
1 1937-10-16  
2 1910-08-13  
3 1934-07-04  
4 1964-04-14  


In [127]:
# Check the shape of the DataFrame and the dtype of each column
# (the two new columns are datetime64)
print(scientists.shape)
print(scientists.dtypes)

(8, 7)
Name                  object
Born                  object
Died                  object
Age                    int64
Occupation            object
born_dt       datetime64[ns]
died_dt       datetime64[ns]
dtype: object


##### 2) 열 내용 변환하기

In [128]:
# Age column in its current order
print(scientists['Age'])

# Shuffle the column's values: frac = 1 samples every row, and random_state
# fixes the seed so the shuffled order is reproducible
print(scientists["Age"].sample(frac = 1, random_state = 42))

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
1    61
5    45
0    37
7    77
2    90
4    56
3    66
6    41
Name: Age, dtype: int64


In [129]:
# Assign the shuffled Series back to the Age column.
# Assignment matches values by index label, so each value returns to its
# original row and the printed column is unchanged.
scientists["Age"] = scientists["Age"].sample(frac = 1, random_state = 42)
print(scientists['Age'])

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


In [130]:
# Assign only the values, without index information.
# .values drops the index, so the shuffled order is stored as-is.
scientists["Age"] = scientists["Age"].sample(frac = 1, random_state = 42).values
print(scientists["Age"])

0    61
1    45
2    37
3    77
4    90
5    56
6    66
7    41
Name: Age, dtype: int64


In [131]:
# Actual lifespan of each scientist: died_dt minus born_dt (printed as a number of days)
scientists['age_days'] = (scientists['died_dt'] - scientists['born_dt'])
print(scientists, "\n\n")

# Convert to years: divide the day count by 365 and round down
# (a fixed 365-day year, so leap days are not accounted for)
scientists['age_years'] = (scientists['age_days'].dt.days / 365).apply(np.floor)
print(scientists)

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   61             Chemist   
1        William Gosset  1876-06-13  1937-10-16   45        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   37               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   77             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   90           Biologist   
5             John Snow  1813-03-15  1858-06-16   56           Physician   
6           Alan Turing  1912-06-23  1954-06-07   66  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   41       Mathematician   

     born_dt    died_dt   age_days  
0 1920-07-25 1958-04-16 13779 days  
1 1876-06-13 1937-10-16 22404 days  
2 1820-05-12 1910-08-13 32964 days  
3 1867-11-07 1934-07-04 24345 days  
4 1907-05-27 1964-04-14 20777 days  
5 1813-03-15 1858-06-16 16529 days  
6 1912-06-23 1954-06-07 15324 days  
7 1777-04-30 1855-0

##### 3) assign()으로 열 수정하기

In [132]:
# Add columns with assign(); the result is bound back to `scientists`.
# NOTE(review): age_year_assign is computed from the existing 'age_days' column
# created in the previous cell, not from the new 'age_days_assign' column.
scientists = scientists.assign(
    age_days_assign = scientists['died_dt'] - scientists['born_dt'],
    age_year_assign = (scientists['age_days'].dt.days / 365).apply(np.floor))

print(scientists)

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   61             Chemist   
1        William Gosset  1876-06-13  1937-10-16   45        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   37               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   77             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   90           Biologist   
5             John Snow  1813-03-15  1858-06-16   56           Physician   
6           Alan Turing  1912-06-23  1954-06-07   66  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   41       Mathematician   

     born_dt    died_dt   age_days  age_years age_days_assign  age_year_assign  
0 1920-07-25 1958-04-16 13779 days       37.0      13779 days             37.0  
1 1876-06-13 1937-10-16 22404 days       61.0      22404 days             61.0  
2 1820-05-12 1910-08-13 32964 days       90.0      32964 days           

##### 4) 열 삭제하기

In [133]:
# List every column of scientists
print(scientists.columns)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'age_days', 'age_years', 'age_days_assign', 'age_year_assign'],
      dtype='object')


In [134]:
# Drop the Age column; the result is stored under a new name and
# `scientists` itself is not reassigned
scientists_dropped = scientists.drop(['Age'], axis = "columns")

In [135]:
# Confirm that Age is no longer among the columns
print(scientists_dropped.columns)

Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt', 'age_days',
       'age_years', 'age_days_assign', 'age_year_assign'],
      dtype='object')


#### [03 - 5] 데이터 저장하고 불러오기

##### 1. 피클로 저장하고 불러오기

##### 1) 시리즈와 데이터프레임 저장하기

In [136]:
# Store the Name column in `names`
names = scientists['Name']
print(names)

# Save the Series as a pickle file by calling to_pickle()
names.to_pickle('../output/scientists_names_series.pickle')

# The same method is available on a DataFrame
scientists.to_pickle('../output/scientists_df.pickle')

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object


##### 2) 피클 데이터 읽어 오기

In [137]:
# Read the pickle files written by the previous cell back into memory.
# NOTE: unpickling can execute arbitrary code, so only load pickle files
# that come from a trusted source.
series_pickle = pd.read_pickle('../output/scientists_names_series.pickle')
dataframe_pickle = pd.read_pickle('../output/scientists_df.pickle')
print(series_pickle, "\n\n")
print(dataframe_pickle)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object 


                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   61             Chemist   
1        William Gosset  1876-06-13  1937-10-16   45        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   37               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   77             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   90           Biologist   
5             John Snow  1813-03-15  1858-06-16   56           Physician   
6           Alan Turing  1912-06-23  1954-06-07   66  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   41       Mathematician   

     born_dt    died_dt   age_days  age_years age_days_assign  age_year_assign

##### 2. 엑셀로 저장하기

##### 1) 시리즈와 데이터프레임 저장하기

In [138]:
# Check the Name column of the scientists DataFrame
names = scientists['Name']
print(names)

# Convert the Series to a one-column DataFrame before writing it to Excel
names_df = names.to_frame()

# Save to an Excel workbook. The openpyxl engine writes OOXML workbooks, so
# the file carries an .xlsx extension (an .xls name would mislabel the format
# and make spreadsheet applications warn about the mismatch).
names_df.to_excel('../output/scientists_names_series_df.xlsx', engine = 'openpyxl')

# Save the full frame to a named sheet, without writing the index column
scientists.to_excel('../output/scientists_df.xlsx', sheet_name = "scientists", index = False)


0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object


##### 3. 다양한 형식으로 저장

##### 1) 딕셔너리로 변환하기

In [139]:
# Take only the first two rows of the scientists dataset
sci_sub_dict = scientists.head(2)

# Convert the DataFrame to a dictionary (column name -> {index: value})
sci_dict = sci_sub_dict.to_dict()

# Pretty-print the resulting dictionary
import pprint
pprint.pprint(sci_dict)

{'Age': {0: 61, 1: 45},
 'Born': {0: '1920-07-25', 1: '1876-06-13'},
 'Died': {0: '1958-04-16', 1: '1937-10-16'},
 'Name': {0: 'Rosaline Franklin', 1: 'William Gosset'},
 'Occupation': {0: 'Chemist', 1: 'Statistician'},
 'age_days': {0: Timedelta('13779 days 00:00:00'),
              1: Timedelta('22404 days 00:00:00')},
 'age_days_assign': {0: Timedelta('13779 days 00:00:00'),
                     1: Timedelta('22404 days 00:00:00')},
 'age_year_assign': {0: 37.0, 1: 61.0},
 'age_years': {0: 37.0, 1: 61.0},
 'born_dt': {0: Timestamp('1920-07-25 00:00:00'),
             1: Timestamp('1876-06-13 00:00:00')},
 'died_dt': {0: Timestamp('1958-04-16 00:00:00'),
             1: Timestamp('1937-10-16 00:00:00')}}


In [140]:
# Rebuild a pandas DataFrame from the dictionary
sci_dict_df = pd.DataFrame.from_dict(sci_dict)
print(sci_dict_df)

                Name        Born        Died  Age    Occupation    born_dt  \
0  Rosaline Franklin  1920-07-25  1958-04-16   61       Chemist 1920-07-25   
1     William Gosset  1876-06-13  1937-10-16   45  Statistician 1876-06-13   

     died_dt   age_days  age_years age_days_assign  age_year_assign  
0 1958-04-16 13779 days       37.0      13779 days             37.0  
1 1937-10-16 22404 days       61.0      22404 days             61.0  


##### 2) JSON으로 저장하기

In [141]:
# Serialize the two-row frame to a JSON string: one object per row
# (orient = 'records'), indented by 2 spaces, dates written in ISO format
sci_json = sci_sub_dict.to_json(orient = 'records', indent = 2, date_format = 'iso')
pprint.pprint(sci_json)

('[\n'
 '  {\n'
 '    "Name":"Rosaline Franklin",\n'
 '    "Born":"1920-07-25",\n'
 '    "Died":"1958-04-16",\n'
 '    "Age":61,\n'
 '    "Occupation":"Chemist",\n'
 '    "born_dt":"1920-07-25T00:00:00.000",\n'
 '    "died_dt":"1958-04-16T00:00:00.000",\n'
 '    "age_days":"P13779DT0H0M0S",\n'
 '    "age_years":37.0,\n'
 '    "age_days_assign":"P13779DT0H0M0S",\n'
 '    "age_year_assign":37.0\n'
 '  },\n'
 '  {\n'
 '    "Name":"William Gosset",\n'
 '    "Born":"1876-06-13",\n'
 '    "Died":"1937-10-16",\n'
 '    "Age":45,\n'
 '    "Occupation":"Statistician",\n'
 '    "born_dt":"1876-06-13T00:00:00.000",\n'
 '    "died_dt":"1937-10-16T00:00:00.000",\n'
 '    "age_days":"P22404DT0H0M0S",\n'
 '    "age_years":61.0,\n'
 '    "age_days_assign":"P22404DT0H0M0S",\n'
 '    "age_year_assign":61.0\n'
 '  }\n'
 ']')


In [142]:
# Parse the JSON text (the string printed by the to_json() cell) back into a
# DataFrame. The text is wrapped in StringIO because passing a literal JSON
# string directly to read_json() is deprecated in recent pandas releases;
# read_json() expects a path or a file-like object.
from io import StringIO

sci_json_df = pd.read_json(
    StringIO(
        '[\n'
        '  {\n'
        '    "Name":"Rosaline Franklin",\n'
        '    "Born":"1920-07-25",\n'
        '    "Died":"1958-04-16",\n'
        '    "Age":61,\n'
        '    "Occupation":"Chemist",\n'
        '    "born_dt":"1920-07-25T00:00:00.000",\n'
        '    "died_dt":"1958-04-16T00:00:00.000",\n'
        '    "age_days":"P13779DT0H0M0S",\n'
        '    "age_years":37.0,\n'
        '    "age_days_assign":"P13779DT0H0M0S",\n'
        '    "age_year_assign":37.0\n'
        '  },\n'
        '  {\n'
        '    "Name":"William Gosset",\n'
        '    "Born":"1876-06-13",\n'
        '    "Died":"1937-10-16",\n'
        '    "Age":45,\n'
        '    "Occupation":"Statistician",\n'
        '    "born_dt":"1876-06-13T00:00:00.000",\n'
        '    "died_dt":"1937-10-16T00:00:00.000",\n'
        '    "age_days":"P22404DT0H0M0S",\n'
        '    "age_years":61.0,\n'
        '    "age_days_assign":"P22404DT0H0M0S",\n'
        '    "age_year_assign":61.0\n'
        '  }\n'
        ']'
    ),
    orient = "records",
)
print(sci_json_df)

                Name        Born        Died  Age    Occupation  \
0  Rosaline Franklin  1920-07-25  1958-04-16   61       Chemist   
1     William Gosset  1876-06-13  1937-10-16   45  Statistician   

                   born_dt                  died_dt        age_days  \
0  1920-07-25T00:00:00.000  1958-04-16T00:00:00.000  P13779DT0H0M0S   
1  1876-06-13T00:00:00.000  1937-10-16T00:00:00.000  P22404DT0H0M0S   

   age_years age_days_assign  age_year_assign  
0         37  P13779DT0H0M0S               37  
1         61  P22404DT0H0M0S               61  


  sci_json_df = pd.read_json(


In [143]:
# Check the dtypes: born_dt, died_dt and the other date/time columns come
# back as object, unlike in the original frame
print(sci_json_df.dtypes)

Name               object
Born               object
Died               object
Age                 int64
Occupation         object
born_dt            object
died_dt            object
age_days           object
age_years           int64
age_days_assign    object
age_year_assign     int64
dtype: object


In [144]:
# Convert the died_dt strings to datetimes and store them in a new column,
# then print the frame and its dtypes to verify the conversion
sci_json_df["died_dt_json"] = pd.to_datetime(sci_json_df["died_dt"])
print(sci_json_df, "\n")
print("자료형 확인 => \n", sci_json_df.dtypes)

                Name        Born        Died  Age    Occupation  \
0  Rosaline Franklin  1920-07-25  1958-04-16   61       Chemist   
1     William Gosset  1876-06-13  1937-10-16   45  Statistician   

                   born_dt                  died_dt        age_days  \
0  1920-07-25T00:00:00.000  1958-04-16T00:00:00.000  P13779DT0H0M0S   
1  1876-06-13T00:00:00.000  1937-10-16T00:00:00.000  P22404DT0H0M0S   

   age_years age_days_assign  age_year_assign died_dt_json  
0         37  P13779DT0H0M0S               37   1958-04-16  
1         61  P22404DT0H0M0S               61   1937-10-16   

자료형 확인 => 
 Name                       object
Born                       object
Died                       object
Age                         int64
Occupation                 object
born_dt                    object
died_dt                    object
age_days                   object
age_years                   int64
age_days_assign            object
age_year_assign             int64
died_dt_json 