# Pandas 데이터 입출력

## Pandas 데이터 입출력 종류

- CSV
- Clipboard
- Excel
- JSON
- HTML
- Python Pickling
- HDF5
- SAS
- STATA
- SQL
- Google BigQuery

## CSV 파일 입력

- Comma Separated Values
- Microsoft Excel에서 export 가능
- pandas.read_csv() : csv file -> DataFrame

In [1]:
!cat pd_ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [2]:
import pandas as pd

df = pd.read_csv('pd_ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


- 컬럼 이름이 없는 경우에는 names 인수로 설정 가능

In [3]:
!cat pd_ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [4]:
pd.read_csv('pd_ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


- 특정한 컬럼을 인덱스로 지정하고 싶으면 index_col 인수 사용

In [5]:
!cat pd_csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16

In [6]:
pd.read_csv('pd_csv_mindex.csv', index_col=['key1', 'key2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


- 구분자가 comma가 아닌 경우에는 sep 인수 사용

In [7]:
!cat 'pd_ex3.csv'

            A         B         C
aaa -0.264438 -1.026059 -0.619500
bbb  0.927272  0.302904 -0.032399
ccc -0.264273 -0.386314 -0.217601
ddd -0.871858 -0.348382  1.100491

In [8]:
pd.read_table('pd_ex3.csv', sep='\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


- 건너 뛰어야 할 행이 있으면 skiprows 사용

In [9]:
!cat pd_ex4.csv

# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [10]:
pd.read_csv('pd_ex4.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


- 특정한 값을 NA로 취급하고 싶으면 na_values 인수 사용

In [11]:
!cat pd_ex5.csv

something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [12]:
sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv('pd_ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


- 일부 행만 읽고 싶다면 nrows 인수 사용

In [13]:
!head pd_ex6.csv

one,two,three,four,key
0.467976300189,-0.0386485396255,-0.295344251987,-1.82472622729,L
-0.358893469543,1.40445260007,0.704964644926,-0.200638304015,B
-0.50184039929,0.659253707223,-0.421690619312,-0.0576883018364,G
0.204886212202,1.07413396504,1.38836131252,-0.982404023494,R
0.354627914484,-0.133115852296,0.283762637978,-0.837062961653,Q
1.81748001608,0.742272722638,0.419394843928,-2.25103520513,Q
-0.776764319165,0.935517747061,-0.332871759623,-1.87564085416,U
-0.913134961617,1.53062351168,-0.572656719239,0.477252252981,K
0.358479538224,-0.49757199147,-0.367016188009,0.507701778685,S

In [14]:
pd.read_csv('pd_ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


## CSV 파일 출력

- DataFrame.to_csv() : DataFrame -> CSV file

In [15]:
df.to_csv('pd_out.csv')

In [16]:
!cat pd_out.csv

,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


- sep 인수로 구분자 변경 가능

In [17]:
import sys
df.to_csv(sys.stdout, sep='|')

|a|b|c|d|message
0|1|2|3|4|hello
1|5|6|7|8|world
2|9|10|11|12|foo


- na_rep 인수로 NA 표시 변경 가능

In [18]:
df.to_csv(sys.stdout, na_rep='NULL')

,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


- index, header 인수로 인덱스 및 헤더 출력 여부 결정 가능

In [19]:
df.to_csv(sys.stdout, index=False, header=False)

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


## 인터넷 상의 CSV 파일 입력

- 파일 path 대신 URL을 지정하면 다운로드하여 import
- 단, 로그인이 필요한 주소처럼 CSV가 아닌 응답(예: HTML 페이지)을 돌려주는 URL을 지정하면 아래 예와 같이 파싱 오류가 발생할 수 있음

In [20]:
titanic = pd.read_csv('https://www.kaggle.com/c/titanic/download/test.csv')

CParserError: Error tokenizing data. C error: Expected 1 fields in line 13, saw 6


## 인터넷 상의 데이터베이스 자료 입력

다음과 같은 인터넷 상의 자료는 pandas_datareader 패키지의 DataReader를 써서 바로 pandas로 입력 가능

- Yahoo! Finance
- Google Finance
- St. Louis Fed (FRED)
- Kenneth French’s data library
- World Bank
- Google Analytics

In [21]:
import pandas_datareader.data as web

In [22]:
import datetime
start = datetime.datetime(2015, 1, 1)
end = datetime.datetime(2016, 6, 30)

- http://finance.yahoo.com/q?s=005930.ks

In [23]:
df = web.DataReader("005930.KS", 'yahoo', start, end)
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2016-06-24,1445000.0,1445000.0,1360000.0,1400000.0,411000,1377613.81
2016-06-27,1400000.0,1405000.0,1385000.0,1398000.0,236700,1375645.79
2016-06-28,1390000.0,1404000.0,1379000.0,1399000.0,214300,1376629.8
2016-06-29,1408000.0,1412000.0,1391000.0,1396000.0,210500,1374660.37
2016-06-30,1408000.0,1445000.0,1397000.0,1425000.0,273400,1403217.07


- https://www.google.com/finance?cid=151610035517112

In [24]:
df = web.DataReader("KRX:005930", 'google', start, end)
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-06-24,1445000.0,1445000.0,1360000.0,1400000.0,408920
2016-06-27,1400000.0,1405000.0,1385000.0,1398000.0,236573
2016-06-28,1390000.0,1404000.0,1379000.0,1399000.0,213829
2016-06-29,1408000.0,1412000.0,1391000.0,1396000.0,208090
2016-06-30,1408000.0,1445000.0,1397000.0,1425000.0,272883


- https://fred.stlouisfed.org/series/GDP
- https://fred.stlouisfed.org/series/CPIAUCSL
- https://fred.stlouisfed.org/series/CPILFESL

In [25]:
gdp = web.DataReader("GDP", "fred", start, end)
gdp

Unnamed: 0_level_0,GDP
DATE,Unnamed: 1_level_1
2015-01-01,17783.6
2015-04-01,17998.3
2015-07-01,18141.9
2015-10-01,18222.8
2016-01-01,18281.6
2016-04-01,18450.1


In [26]:
inflation = web.DataReader(["CPIAUCSL", "CPILFESL"], "fred", start, end)
inflation

Unnamed: 0_level_0,CPIAUCSL,CPILFESL
DATE,Unnamed: 1_level_1,Unnamed: 2_level_1
2015-01-01,234.913,239.91
2015-02-01,235.489,240.236
2015-03-01,235.989,240.783
2015-04-01,236.201,241.366
2015-05-01,236.891,241.662
2015-06-01,237.419,242.021
2015-07-01,237.876,242.48
2015-08-01,237.811,242.754
2015-09-01,237.467,243.249
2015-10-01,237.792,243.719
