# 시리즈 만들기

In [20]:
import pandas as pd
# Build the same two-element Series twice: first with the default integer
# index, then with explicit string labels, and show the resulting type.
values = ['Wes McKinney', 'Creator of Pandas']

s = pd.Series(values)
print(s)
print('-' * 30)

s = pd.Series(values, index=['Person', 'Who'])
print(s)
print(type(s))

0         Wes McKinney
1    Creator of Pandas
dtype: object
------------------------------
Person         Wes McKinney
Who       Creator of Pandas
dtype: object
<class 'pandas.core.series.Series'>


# 데이터프레임 만들기

In [9]:
# Column data shared by both frames built in this cell.
scientist_data = {
    'Name': ['Rosaline Franklin', 'William Gosset'],
    'Occupation': ['chemist', 'Statistician'],
    'Born': ['1920-07-25', '1876-06-06'],
    'Died': ['1958-04-16', '1937-10-16'],
    'Age': [37, 68],
}

# Default construction: integer row index, columns in dict order.
scientists = pd.DataFrame(scientist_data)
print(scientists)
print('-' * 60)

# Same data with explicit row labels and an explicit column selection/order.
# 'Name' is not listed in `columns`, so it does not appear in the result.
scientists = pd.DataFrame(
    data=scientist_data,
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Age', 'Died'],
)
print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-06  1937-10-16   68
------------------------------------------------------------
                     Occupation        Born  Age        Died
Rosaline Franklin       chemist  1920-07-25   37  1958-04-16
William Gosset     Statistician  1876-06-06   68  1937-10-16


# 파이썬 3.6 이하에서는 딕셔너리로 만든 데이터프레임의 열 순서가 보장되지 않는다.
## 이런 환경에서 순서가 보장된 딕셔너리를 전달하려면 "OrderedDict"를 사용해야 한다. (파이썬 3.7 이상에서는 일반 딕셔너리도 입력 순서를 유지하므로 위 예제처럼 열 순서가 그대로 유지된다.)

In [11]:
from collections import OrderedDict

# Fill an OrderedDict one column at a time so the column order of the frame
# follows the insertion order explicitly.
ordered_columns = OrderedDict()
ordered_columns['Name'] = ['Rosaline Franklin', 'William Gosset']
ordered_columns['Occupation'] = ['chemist', 'Statistician']
ordered_columns['Born'] = ['1920-07-25', '1876-06-06']
ordered_columns['Died'] = ['1958-04-16', '1937-10-16']
ordered_columns['Age'] = [37, 68]

scientists = pd.DataFrame(ordered_columns)
print(scientists)

                Name    Occupation        Born        Died  Age
0  Rosaline Franklin       chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-06  1937-10-16   68


# 데이터프레임에서 시리즈 선택하기

In [19]:
# Small labelled frame used to demonstrate column and row selection.
scientists = pd.DataFrame(
    {
        'Name': ['Rosaline Franklin', 'William Gosset'],
        'Occupation': ['chemist', 'Statistician'],
        'Born': ['1920-07-25', '1876-06-06'],
        'Died': ['1958-04-16', '1937-10-16'],
        'Age': [37, 68],
    },
    index=['Rosaline Franklin', 'William Gosset'],
    columns=['Occupation', 'Born', 'Age', 'Died'],
)
separator = '-' * 60

print(scientists)
print(separator)

# Selecting one column by name.
born_column = scientists['Born']
print(born_column)
print(separator)

# Selecting one row by its index label.
gosset_by_label = scientists.loc['William Gosset']
print(gosset_by_label)
print(type(gosset_by_label))
print(separator)

# Selecting the same row by integer position.
gosset_by_position = scientists.iloc[1]
print(gosset_by_position)
print(type(gosset_by_position))

                     Occupation        Born  Age        Died
Rosaline Franklin       chemist  1920-07-25   37  1958-04-16
William Gosset     Statistician  1876-06-06   68  1937-10-16
------------------------------------------------------------
Rosaline Franklin    1920-07-25
William Gosset       1876-06-06
Name: Born, dtype: object
------------------------------------------------------------
Occupation    Statistician
Born            1876-06-06
Age                     68
Died            1937-10-16
Name: William Gosset, dtype: object
<class 'pandas.core.series.Series'>
------------------------------------------------------------
Occupation    Statistician
Born            1876-06-06
Age                     68
Died            1937-10-16
Name: William Gosset, dtype: object
<class 'pandas.core.series.Series'>


# index, values, keys 사용하기

In [25]:
# Inspect the labels and values of a single row taken from the frame.
# NOTE: despite its name, `first_row` holds the 'William Gosset' row, which is
# the second row of the frame built in the preceding cell.
first_row = scientists.loc['William Gosset']

row_labels = first_row.index
row_keys = first_row.keys()

print(row_labels)
print(first_row.values)
print(row_keys)
# The first label is reachable through either accessor.
print(row_labels[0])
print(row_keys[0])

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')
['Statistician' '1876-06-06' 68 '1937-10-16']
Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')
Occupation
Occupation


# 시리즈의 mean, min, max, std 메서드 사용하기

In [32]:
print(scientists.columns)
print(scientists.index)

ages = scientists['Age']
print(ages)

# Print each summary statistic of the Age column after a separator line,
# in the order mean, min, max, std.
for age_stat in (ages.mean, ages.min, ages.max, ages.std):
    print('-' * 60)
    print(age_stat())

Index(['Occupation', 'Born', 'Age', 'Died'], dtype='object')
Index(['Rosaline Franklin', 'William Gosset'], dtype='object')
Rosaline Franklin    37
William Gosset       68
Name: Age, dtype: int64
------------------------------------------------------------
52.5
------------------------------------------------------------
37
------------------------------------------------------------
68
------------------------------------------------------------
21.920310216782973


# 시리즈와 불린 추출 사용하기

In [43]:
# Load the scientists table from CSV and report basic statistics of 'Age'.
scientists = pd.read_csv('../data/scientists.csv')
print(scientists.shape)
print(scientists)

ages = scientists['Age']
age_mean = ages.mean()
age_max = ages.max()
age_min = ages.min()
print('평균나이는', age_mean, '살 입니다.')
print('최대나이는', age_max, '살 입니다.')
print('최소나이는', age_min, '살 입니다.')

(8, 5)
                   Name        Born        Died  Age          Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist
1        William Gosset  1876-06-13  1937-10-16   61        Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5             John Snow  1813-03-15  1858-06-16   45           Physician
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician
평균나이는 59.125 살 입니다.
최대나이는 90 살 입니다.
최소나이는 37 살 입니다.


# 불린 추출을 사용하는 법
### 평균 나이보다 많은 사람의 데이터를 추출할 때 

In [47]:
divider = '-' * 60

print(ages)
print(divider)

# Element-wise comparison against the mean yields a boolean Series, which is
# used below as a mask to select the matching entries.
above_mean_mask = ages > ages.mean()
print(ages[above_mean_mask])
print(divider)
print(above_mean_mask)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
------------------------------------------------------------
1    61
2    90
3    66
7    77
Name: Age, dtype: int64
------------------------------------------------------------
0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool


## 즉 불린 추출은 리스트에 boolean값을 담아 index에 적용하면 원하는 곳만 나오게 할 수 있다.

In [50]:
# Hand-written boolean mask, one flag per entry of `ages`.
manual_bool = [
    True, True, False, True,
    False, False, False, True,
]
manually_selected = ages[manual_bool]
print(manually_selected)

0    37
1    61
3    66
7    77
Name: Age, dtype: int64


# 시리즈와 브로드캐스팅

- 모든 데이터에 대해 한 번에 연산하는 것을 브로드 캐스팅이라고 한다.
- 시리즈 처럼 여러 개의 값을 가진 데이터를 "벡터"라고 하고 단순 크기를 나타내는 데이터를 "스칼라"라고 한다.

### 예시로 같은 길이의 벡터로 더하기 곱하기 연산을 수행하고 결과값으로 같은 길이의 벡터가 출력된다.

In [51]:
# Element-wise addition of two Series of the same length.
ages_plus_ages = ages + ages
print(ages_plus_ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


In [53]:
# Element-wise multiplication of two Series of the same length.
ages_times_ages = ages * ages
print(ages_times_ages)

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64


### 벡터에 스칼라를 연산하면 벡터의 모든 값에 스칼라가 적용되어 브로드캐스팅된다.

In [55]:
print(ages)
print('-' * 60)
# Multiplying by a scalar applies it to every element of the Series.
ages_times_100 = ages * 100
print(ages_times_100)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
------------------------------------------------------------
0    3700
1    6100
2    9000
3    6600
4    5600
5    4500
6    4100
7    7700
Name: Age, dtype: int64


In [67]:
# A Series built from a list plus explicit index labels ...
ss = pd.Series(data=[1, 100], index=['a', 'b'])
print(ss)
print('-' * 60)

# ... and the same Series built from a dict, whose keys become the index.
ss = pd.Series(dict(a=1, b=100))
print(ss)
print('-' * 90)

# A DataFrame with explicit row labels and an explicit column order.
frame_columns = {
    'a': ['abc', 'def'],
    'b': ['fed', 'cba'],
    'c': ['123', '456'],
}
ss = pd.DataFrame(
    frame_columns,
    index=['first', 'second'],
    columns=['b', 'c', 'a'],
)
print(ss)

a      1
b    100
dtype: int64
------------------------------------------------------------
a      1
b    100
dtype: int64
------------------------------------------------------------------------------------------
          b    c    a
first   fed  123  abc
second  cba  456  def


### 길이가 서로 다른 벡터를 연산하기
- 시리즈와 시리즈를 연산하는 경우 같은 인덱스의 값만 계산한다.
- 데이터 개수가 2개인 시리즈와 8인 시리즈를 더하는 예시 (아래)
- 결과값을 살펴보면 인덱스가 일치한 0, 1만 계산한 걸 알 수 있다.
- 나머지 인덱스 (2~7)은 계산할 수 없기 때문에 누락값(NaN)으로 처리한다.

In [75]:
rule = '-' * 60

# Two-element Series with the default index (0, 1).
print(pd.Series([1, 100]))
print(rule)
print(ages)
print(rule)

# Adding Series of different lengths aligns on index labels; labels present
# in only one operand come out as NaN (see the recorded output).
mismatched_sum = ages + pd.Series([1, 100])
print(mismatched_sum)

0      1
1    100
dtype: int64
------------------------------------------------------------
0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
------------------------------------------------------------
0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64


# sort_index를 사용하여 인덱스를 역순으로 정렬할 수 있다.
## 이때 ascending 인자로 False를 전달하여 인덱스 역순으로 데이터를 정렬한다.

In [79]:
print(ages)
print('-' * 60)

# Same data, ordered by index label from highest to lowest.
reverse_age = ages.sort_index(ascending=False)
print(reverse_age)
print('-' * 30)

# Compare doubling with adding the reversed Series.
ages_doubled = ages * 2
ages_plus_reversed = ages + reverse_age
print(ages_doubled)
print(ages_plus_reversed)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64
------------------------------------------------------------
7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64
------------------------------
0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64
0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64


### ages * 2와 ages + reverse_age의 값은 같다. 시리즈끼리의 연산은 데이터의 위치가 아니라 같은 인덱스끼리 수행되기 때문이다.

# 데이터프레임과 불린 추출

In [83]:
# Show the frame followed by its shape, column labels and row index.
for frame_part in (
    scientists,
    scientists.shape,
    scientists.columns,
    scientists.index,
):
    print(frame_part)

                   Name        Born        Died  Age          Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist
1        William Gosset  1876-06-13  1937-10-16   61        Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist
5             John Snow  1813-03-15  1858-06-16   45           Physician
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician
(8, 5)
Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')
RangeIndex(start=0, stop=8, step=1)


### 데이터프레임도 불린 추출이 가능하다.
- 데이터프레임의 Age 열에서 Age열의 평균보다 높은 행만 출력합니다.

In [85]:
# Keep only the rows whose Age is above the column mean.
age_threshold = scientists['Age'].mean()
older_than_mean = scientists['Age'] > age_threshold
print(scientists[older_than_mean])

                   Name        Born        Died  Age     Occupation
1        William Gosset  1876-06-13  1937-10-16   61   Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90          Nurse
3           Marie Curie  1867-11-07  1934-07-04   66        Chemist
7          Johann Gauss  1777-04-30  1855-02-23   77  Mathematician


### 시리즈에 리스트로 참, 거짓을 전달하여 데이터를 추출했던 것처럼 데이터프레임도 가능하다.
- 참, 거짓을 담은 리스트를 bool 벡터라고 부른다.
- bool 벡터의 길이는 데이터프레임의 행 개수와 같아야 한다. (예전 판다스는 bool 벡터가 더 짧으면 그 길이만큼만 연산했지만, 최신 판다스에서는 길이가 다르면 오류가 발생한다.)
- 아래 예시는 데이터프레임의 loc 속성에 행 개수와 같은 길이 8의 bool 벡터를 전달한 것이다.

In [110]:
# One boolean flag per row; rows flagged True are selected by .loc.
row_mask = [True, True, True, False, True, False, True, True]
print(scientists.loc[row_mask])

                   Name        Born        Died  Age          Occupation
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist
1        William Gosset  1876-06-13  1937-10-16   61        Statistician
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician


# 데이터프레임과 브로드캐스팅

In [112]:
# Multiply every cell of the frame by the scalar 3.
scientists_tripled = scientists * 3
print(scientists_tripled)

                                                Name  \
0  Rosaline FranklinRosaline FranklinRosaline Fra...   
1         William GossetWilliam GossetWilliam Gosset   
2  Florence NightingaleFlorence NightingaleFloren...   
3                  Marie CurieMarie CurieMarie Curie   
4            Rachel CarsonRachel CarsonRachel Carson   
5                        John SnowJohn SnowJohn Snow   
6                  Alan TuringAlan TuringAlan Turing   
7               Johann GaussJohann GaussJohann Gauss   

                             Born                            Died  Age  \
0  1920-07-251920-07-251920-07-25  1958-04-161958-04-161958-04-16  111   
1  1876-06-131876-06-131876-06-13  1937-10-161937-10-161937-10-16  183   
2  1820-05-121820-05-121820-05-12  1910-08-131910-08-131910-08-13  270   
3  1867-11-071867-11-071867-11-07  1934-07-041934-07-041934-07-04  198   
4  1907-05-271907-05-271907-05-27  1964-04-141964-04-141964-04-14  168   
5  1813-03-151813-03-151813-03-15  1858-06-161858-0

### 데이터프레임도 마찬가지로 스칼라를 곱하면 모든 값에 브로드캐스팅된다. 이때 문자열은 곱한 횟수만큼 반복되고(2배, 3배), 정수인 Age 열은 값이 곱해진다.

# 열의 자료형 바꾸기와 새로운 열 추가하기

In [116]:
print(scientists.columns)
# Both date columns are stored as strings (dtype object) at this point.
for date_column in ('Born', 'Died'):
    print(scientists[date_column].dtype)

Index(['Name', 'Born', 'Died', 'Age', 'Occupation'], dtype='object')
object
object


In [118]:
# Show the raw (string) birth and death date columns.
born_raw = scientists['Born']
died_raw = scientists['Died']
print(born_raw)
print('-' * 60)
print(died_raw)

0    1920-07-25
1    1876-06-13
2    1820-05-12
3    1867-11-07
4    1907-05-27
5    1813-03-15
6    1912-06-23
7    1777-04-30
Name: Born, dtype: object
------------------------------------------------------------
0    1958-04-16
1    1937-10-16
2    1910-08-13
3    1934-07-04
4    1964-04-14
5    1858-06-16
6    1954-06-07
7    1855-02-23
Name: Died, dtype: object


### 날짜를 문자열로 저장한 데이터는 시간 관련 작업을 할 수 있도록 datetime 자료형으로 바꾸는게 좋다.
- 다음예시로 Born died 열의 자료형을 datetime이라는 자료형으로 바꾼다
- to_datetime 함수의 format 인자를 '%Y-%m-%d'로 지정하여 날짜 형식을 지정한다.

In [124]:
# Parse the 'Born' strings (format YYYY-MM-DD) into datetime values.
born_datetime = pd.to_datetime(scientists['Born'], format='%Y-%m-%d')
print(born_datetime)
print(type(born_datetime))
print(born_datetime.dtype)
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
born_datetime.info()

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]
<class 'pandas.core.series.Series'>
datetime64[ns]
<class 'pandas.core.series.Series'>
RangeIndex: 8 entries, 0 to 7
Series name: Born
Non-Null Count  Dtype         
--------------  -----         
8 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 192.0 bytes
None


In [125]:
# Parse the 'Died' strings (format YYYY-MM-DD) into datetime values.
died_datetime = pd.to_datetime(scientists['Died'], format='%Y-%m-%d')
print(died_datetime)
print(type(died_datetime))
print(died_datetime.dtype)
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
died_datetime.info()

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]
<class 'pandas.core.series.Series'>
datetime64[ns]
<class 'pandas.core.series.Series'>
RangeIndex: 8 entries, 0 to 7
Series name: Died
Non-Null Count  Dtype         
--------------  -----         
8 non-null      datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 192.0 bytes
None


In [134]:
print('추가 전 :', scientists.shape)

# Attach the parsed datetime Series to the frame as two new columns.
# Re-running this cell overwrites the same columns, so the "before" shape
# only differs from the "after" shape on the first run.
scientists['born_at'] = born_datetime
scientists['died_at'] = died_datetime

print('추가 후 :', scientists.shape)
print('-' * 60)
print(scientists)

추가 전 : (8, 7)
추가 후 : (8, 7)
------------------------------------------------------------
                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist   
1        William Gosset  1876-06-13  1937-10-16   61        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist   
5             John Snow  1813-03-15  1858-06-16   45           Physician   
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician   

     born_at    died_at  
0 1920-07-25 1958-04-16  
1 1876-06-13 1937-10-16  
2 1820-05-12 1910-08-13  
3 1867-11-07 1934-07-04  
4 1907-05-27 1964-04-14  
5 1813-03-15 1858-06-16  
6 1912-06-23 1954-06-07  
7 1777-04-30 1855-

In [139]:
print(scientists.head(n=2))

# Subtracting the two datetime columns gives the elapsed time between birth
# and death (shown in days in the recorded output).
lifespan = scientists['died_at'] - scientists['born_at']
scientists['age_day_at'] = lifespan
print(scientists)

                Name        Born        Died  Age    Occupation    born_at  \
0  Rosaline Franklin  1920-07-25  1958-04-16   37       Chemist 1920-07-25   
1     William Gosset  1876-06-13  1937-10-16   61  Statistician 1876-06-13   

     died_at age_day_at  
0 1958-04-16 13779 days  
1 1937-10-16 22404 days  
                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   37             Chemist   
1        William Gosset  1876-06-13  1937-10-16   61        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   90               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist   
5             John Snow  1813-03-15  1858-06-16   45           Physician   
6           Alan Turing  1912-06-23  1954-06-07   41  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   77       Mathematician   

  

# 시리즈, 데이터프레임의 데이터 섞어보기

In [141]:
# Show the Age column before it is shuffled.
age_column = scientists['Age']
print(age_column)

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64


### Age 열의 데이터를 섞으려면 random 라이브러리를 불러야한다.
- random 라이브러리에는 데이터를 섞어주는 shuffle 메서드가 있다.
- shuffle 메서드에 Age 열을 전달하여 데이터를 섞는다.
- Age 열을 출력해 보면 인덱스 0~7에 해당하는 값이 잘 섞여 있음을 알 수 있다.
- seed 메서드는 컴퓨터가 생성하는 난수의 기준값을 정하기 위해 사용한다.

In [145]:
import random
print(scientists['Age'])
print('-'*60)

# Fix the seed so the shuffle is reproducible.
random.seed(42)

# Shuffle a plain list of the values and assign it back to the column.
# Calling random.shuffle() directly on scientists['Age'] mutates a Series
# obtained by indexing the frame (chained assignment): pandas may warn about
# it, and with Copy-on-Write enabled the frame itself is left unchanged.
# random.shuffle() swaps elements by position, so for the same seed the list
# ends up in the same order the in-place shuffle produced on a 0..n-1 index.
shuffled_ages = scientists['Age'].tolist()
random.shuffle(shuffled_ages)
scientists['Age'] = shuffled_ages

print(scientists['Age'])

0    77
1    41
2    56
3    61
4    45
5    90
6    37
7    66
Name: Age, dtype: int64
------------------------------------------------------------
0    61
1    45
2    37
3    66
4    56
5    90
6    77
7    41
Name: Age, dtype: int64


# 데이터프레임의 열 삭제하기 (drop)
- 열을 통째로 삭제
- 데이터프레임의 drop 메서드를 사용하여 열을 삭제한다.
- drop 메서드의 첫 번째  인자에 열 이름을 리스트에 담아 전달하고 두 번째 인자에는 axis=1을 전달하면 Age열을 삭제할 수 있다.

In [149]:
# drop() returns a new frame; the printed `scientists` still has 'Age'.
columns_to_drop = ['Age']
scientists_dropped = scientists.drop(columns_to_drop, axis=1)

print(scientists)
print(scientists_dropped.columns)
# With drop(), axis=0 (index) treats the labels as row labels and removes rows.
# With drop(), axis=1 (columns) treats the labels as column names and removes columns.

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   61             Chemist   
1        William Gosset  1876-06-13  1937-10-16   45        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   37               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist   
5             John Snow  1813-03-15  1858-06-16   90           Physician   
6           Alan Turing  1912-06-23  1954-06-07   77  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   41       Mathematician   

     born_at    died_at age_day_at  
0 1920-07-25 1958-04-16 13779 days  
1 1876-06-13 1937-10-16 22404 days  
2 1820-05-12 1910-08-13 32964 days  
3 1867-11-07 1934-07-04 24345 days  
4 1907-05-27 1964-04-14 20777 days  
5 1813-03-15 1858-06-16 16529 days  
6 1912-06-23 1954-06-07 15324 days  
7 1777-04-30 1855-0

# 피클 형식으로 저장하기
- 판다스는 데이터를 저장하는 다양한 방법을 제공한다.
- 가공한 데이터를 피클, CSV, TSV 파일로 저장하고 다시 불러와본다.
- 데이터를 오래 보관한다는 뜻으로 피클이라는 이름이 붙여진 것이다.

In [155]:
from pathlib import Path

# to_pickle() does not create missing folders, so make sure the output
# directory exists before writing; a fresh run then no longer depends on the
# folder having been created by hand.
Path('../output').mkdir(parents=True, exist_ok=True)

# Save the 'Name' column (a Series) as a pickle file.
names = scientists['Name']
names.to_pickle('../output/scientists_names_series.pickle')

# 데이터프레임도 피클로 저장할 수 있다.

In [156]:
# Save the whole frame as a pickle file (the target folder must already exist).
scientists_pickle_path = '../output/scientists_df.pickle'
scientists.to_pickle(scientists_pickle_path)

## 피클은 바이너리 형태의 오브젝트이기 때문에 저장된 피클 데이터를 편집기와 같은 프로그램으로 열어보면 이상한 문자가 나온다. 
## 즉 피클 데이터는 반드시 read_pickle 메서드로 읽어 들여야 한다.

In [157]:
# Load the Series pickle written earlier in this notebook.
# NOTE: unpickling can execute arbitrary code; only read pickle files you trust.
names_pickle_path = '../output/scientists_names_series.pickle'
scientists_names_from_pickle = pd.read_pickle(names_pickle_path)
print(scientists_names_from_pickle)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object


In [158]:
# Load the DataFrame pickle written earlier in this notebook.
# NOTE: unpickling can execute arbitrary code; only read pickle files you trust.
frame_pickle_path = '../output/scientists_df.pickle'
scientists_from_pickle = pd.read_pickle(frame_pickle_path)
print(scientists_from_pickle)

                   Name        Born        Died  Age          Occupation  \
0     Rosaline Franklin  1920-07-25  1958-04-16   61             Chemist   
1        William Gosset  1876-06-13  1937-10-16   45        Statistician   
2  Florence Nightingale  1820-05-12  1910-08-13   37               Nurse   
3           Marie Curie  1867-11-07  1934-07-04   66             Chemist   
4         Rachel Carson  1907-05-27  1964-04-14   56           Biologist   
5             John Snow  1813-03-15  1858-06-16   90           Physician   
6           Alan Turing  1912-06-23  1954-06-07   77  Computer Scientist   
7          Johann Gauss  1777-04-30  1855-02-23   41       Mathematician   

     born_at    died_at age_day_at  
0 1920-07-25 1958-04-16 13779 days  
1 1876-06-13 1937-10-16 22404 days  
2 1820-05-12 1910-08-13 32964 days  
3 1867-11-07 1934-07-04 24345 days  
4 1907-05-27 1964-04-14 20777 days  
5 1813-03-15 1858-06-16 16529 days  
6 1912-06-23 1954-06-07 15324 days  
7 1777-04-30 1855-0

In [160]:
# Comma-separated output for the Series and for the full frame.
names.to_csv(path_or_buf='../output/scientists_names_series.csv')
scientists.to_csv(path_or_buf='../output/scientists_df.csv')

# For TSV, pass sep='\t' to to_csv and use the .tsv file extension.
scientists.to_csv(path_or_buf='../output/scientists_df.tsv', sep='\t')

### 시리즈는 엑셀 구조와 맞지 않기 때문에 엑셀 파일로 저장할 수 없다.
### 엑셀 파일로 저장할 수 있는 데이터프레임으로 변환해야한다.
### xls파일로 저장하려면 xlwt 라이브러리가 필요하다.
### xlsx 파일로 저장하려면 openpyxl 라이브러리가 필요하다.
### pip install xlwt
### pip install openpyxl

# 엑셀 파일로 저장하고 불러오기

In [166]:
# A Series is converted to a one-column DataFrame before writing it to Excel.
name_df = names.to_frame()

# `xlwt` is not referenced below; the import only verifies that the package
# is installed before the .xls file is written.
# NOTE(review): the recorded output of this cell shows a warning on the .xls
# write; .xls writing via xlwt appears to be unsupported in newer pandas
# releases — confirm against the installed version.
import xlwt
name_df.to_excel('../output/scientists_names_series_df.xls')

# `openpyxl` is likewise imported only as an availability check; the .xlsx
# file is written by to_excel.
import openpyxl
name_df.to_excel('../output/scientists_names_series_df.xlsx')

  name_df.to_excel('../output/scientists_names_series_df.xls')


In [172]:
# Read back the .xls file written above. The index that to_excel() saved
# shows up as the 'Unnamed: 0' column in the recorded output.
excel_file = pd.read_excel('../output/scientists_names_series_df.xls')
print(excel_file)
print(type(excel_file))
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
excel_file.info()

   Unnamed: 0                  Name
0           0     Rosaline Franklin
1           1        William Gosset
2           2  Florence Nightingale
3           3           Marie Curie
4           4         Rachel Carson
5           5             John Snow
6           6           Alan Turing
7           7          Johann Gauss
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  8 non-null      int64 
 1   Name        8 non-null      object
dtypes: int64(1), object(1)
memory usage: 256.0+ bytes
None


In [175]:
# Read back the .xlsx file, explicitly selecting the openpyxl engine.
excel_file = pd.read_excel('../output/scientists_names_series_df.xlsx', engine = 'openpyxl')
print(excel_file)
print(type(excel_file))
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
excel_file.info()

   Unnamed: 0                  Name
0           0     Rosaline Franklin
1           1        William Gosset
2           2  Florence Nightingale
3           3           Marie Curie
4           4         Rachel Carson
5           5             John Snow
6           6           Alan Turing
7           7          Johann Gauss
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  8 non-null      int64 
 1   Name        8 non-null      object
dtypes: int64(1), object(1)
memory usage: 256.0+ bytes
None


In [176]:
# Read back the .xlsx file, letting pandas choose the engine itself.
excel_file = pd.read_excel('../output/scientists_names_series_df.xlsx')
print(excel_file)
print(type(excel_file))
# info() prints its own report and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
excel_file.info()

   Unnamed: 0                  Name
0           0     Rosaline Franklin
1           1        William Gosset
2           2  Florence Nightingale
3           3           Marie Curie
4           4         Rachel Carson
5           5             John Snow
6           6           Alan Turing
7           7          Johann Gauss
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  8 non-null      int64 
 1   Name        8 non-null      object
dtypes: int64(1), object(1)
memory usage: 256.0+ bytes
None


In [178]:
# Column labels of the frame read back from Excel.
excel_columns = excel_file.columns
print(excel_columns)

Index(['Unnamed: 0', 'Name'], dtype='object')


In [179]:
# The 'Name' column of the frame read back from Excel.
excel_names = excel_file['Name']
print(excel_names)

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object
