### Pandas的解析函数
- read_csv: 从文件、URL或文件型对象读取分隔好的数据，逗号是默认分隔符
- read_table: 从文件、URL或文件型对象读取分隔好的数据，制表符(\t)是默认分隔符
- read_html: 从HTML文件中读取所有表格数据
- read_excel: 从Excel的XLS或XLSX文件中读取表格数据
- read_json: 从JSON字符串或JSON文件中读取数据
- read_sql: 将SQL查询的结果读取为DataFrame

In [1]:
import pandas as pd
from pandas import Series, DataFrame
# Base directory of the sample datasets; later cells build file paths by
# concatenating a file name onto it. NOTE(review): machine-specific absolute path.
data_dir = 'D:/python/np_pd_mat/datasets/'

In [2]:
# Read the raw lines inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(data_dir + 'ex1.csv') as f:
    raw_lines = f.readlines()
raw_lines

['a,b,c,d,message\n', '1,2,3,4,hello\n', '5,6,7,8,world\n', '9,10,11,12,foo']

In [3]:
# Equivalent to: pd.read_table(data_dir + 'ex1.csv', sep=',')
df = pd.read_csv(data_dir + 'ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [4]:
# Read the raw lines inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(data_dir + 'ex2.csv') as f:
    raw_lines = f.readlines()
raw_lines

['1,2,3,4,hello\n', '5,6,7,8,world\n', '9,10,11,12,foo']

In [5]:
# Some files have no header row: pandas can assign default column names
# automatically, or the names can be supplied explicitly via names=.
pd.read_csv(data_dir + 'ex2.csv', names=['a', 'b', 'c', 'd', 'message'])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [6]:
# Use the message column as the index.
pd.read_csv(data_dir + 'ex2.csv', names=['a', 'b', 'c', 'd', 'message'], index_col='message')

Unnamed: 0_level_0,a,b,c,d
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


### 分层索引

In [7]:
# Read the raw lines inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(data_dir + "csv_mindex.csv") as f:
    raw_lines = f.readlines()
raw_lines

['key1,key2,value1,value2\n',
 'one,a,1,2\n',
 'one,b,3,4\n',
 'one,c,5,6\n',
 'one,d,7,8\n',
 'two,a,9,10\n',
 'two,b,11,12\n',
 'two,c,13,14\n',
 'two,d,15,16\n']

In [8]:
# Passing a list of column names to index_col builds a hierarchical index
# from key1 and key2.
parsed = pd.read_csv(data_dir + "csv_mindex.csv", index_col=['key1', 'key2'])
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


当字段是以多种不同数量的空格分开时，可以向read_csv传入一个正则表达式作为分隔符:

In [9]:
# Read the raw lines inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(data_dir + 'ex3.csv') as f:
    raw_lines = f.readlines()
raw_lines

['            A         B         C\n',
 'aaa -0.264438 -1.026059 -0.619500\n',
 'bbb  0.927272  0.302904 -0.032399\n',
 'ccc -0.264273 -0.386314 -0.217601\n',
 'ddd -0.871858 -0.348382  1.100491']

In [10]:
# The regex \s+ treats tabs and runs of spaces alike as a single delimiter.
# A raw string keeps the backslash literal; a plain '\s' is an invalid escape
# sequence that newer Python versions warn about.
pd.read_csv(data_dir + 'ex3.csv', sep=r'\s+')

Unnamed: 0,A,B,C
aaa,-0.264438,-1.026059,-0.6195
bbb,0.927272,0.302904,-0.032399
ccc,-0.264273,-0.386314,-0.217601
ddd,-0.871858,-0.348382,1.100491


读取文件时跳过指定的行:

In [11]:
# Read the raw lines inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(data_dir + 'ex4.csv') as f:
    raw_lines = f.readlines()
raw_lines

['# hey!\n',
 'a,b,c,d,message\n',
 '# just wanted to make things more difficult for you\n',
 '# who reads CSV files with computers, anyway?\n',
 '1,2,3,4,hello\n',
 '5,6,7,8,world\n',
 '9,10,11,12,foo']

In [12]:
# Skip the 1st, 3rd and 4th lines of the file (zero-based positions 0, 2, 3).
pd.read_csv(data_dir + 'ex4.csv', skiprows=[0, 2, 3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


### 缺失值处理

In [13]:
# Load the file, then show a boolean frame marking which cells are missing.
obj5 = pd.read_csv(data_dir + 'ex5.csv')
obj5.isnull()

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,False,True,False,False
2,False,False,False,False,False,False


na_values选项可以传入一个列表或一组字符串来处理缺失值:

In [14]:
# Treat the string 'NULL' as a missing-value marker while parsing.
pd.read_csv(data_dir + 'ex5.csv', na_values=['NULL'])

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


在字典中，每列可以指定不同的缺失值标记:

In [15]:
# Per-column missing-value markers: 'foo' and 'NA' in the message column and
# 'two' in the something column are read as NaN.
sen = {'message': ['foo', 'NA'], 'something': ['two']}
pd.read_csv(data_dir + 'ex5.csv', na_values=sen)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,,5,6,,8,world
2,three,9,10,11.0,12,


### 分块读取文本文件

In [16]:
# Limit the number of rows pandas shows when displaying a DataFrame.
pd.options.display.max_rows = 6

result = pd.read_csv(data_dir + "ex6.csv")
result

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.501840,0.659254,-0.421691,-0.057688,G
...,...,...,...,...,...
9997,0.523331,0.787112,0.486066,1.093156,K
9998,-0.362559,0.598894,-1.843201,0.887292,G
9999,-0.096376,-1.012999,-0.657431,-0.573315,0


In [17]:
# Read only a small number of rows (here the first 5).
pd.read_csv(data_dir + "ex6.csv", nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.467976,-0.038649,-0.295344,-1.824726,L
1,-0.358893,1.404453,0.704965,-0.200638,B
2,-0.50184,0.659254,-0.421691,-0.057688,G
3,0.204886,1.074134,1.388361,-0.982404,R
4,0.354628,-0.133116,0.283763,-0.837063,Q


分块读入文件：read_csv返回的TextFileReader对象允许根据chunksize遍历文件:

In [18]:
# With chunksize set, read_csv returns a reader object that is iterated over
# in a later cell, rather than a DataFrame.
chunker = pd.read_csv(data_dir + "ex6.csv", chunksize=1000)
chunker

<pandas.io.parsers.TextFileReader at 0x15276dbb070>

In [19]:
import numpy as np

# Accumulate the per-chunk counts of the 'key' column into one running total;
# fill_value=0 covers keys that are absent from either side of the addition.
tot = Series([], dtype=np.float64)
for chunk in chunker:
    chunk_counts = chunk['key'].value_counts()
    tot = tot.add(chunk_counts, fill_value=0)

# Most frequent keys first, then show the top ten.
tot = tot.sort_values(ascending=False)
tot[:10]

E    368.0
X    364.0
L    346.0
     ...  
F    335.0
K    334.0
H    330.0
Length: 10, dtype: float64

### 将数据写入文本

In [20]:
# Round-trip: load ex5.csv, write it back out as out5.csv, then show the
# written file's raw lines.
data = pd.read_csv(data_dir + "ex5.csv")
data.to_csv(data_dir + 'out5.csv')
# Read inside a context manager so the file handle is closed deterministically.
with open(data_dir + 'out5.csv') as f:
    raw_lines = f.readlines()
raw_lines

[',something,a,b,c,d,message\n',
 '0,one,1,2,3.0,4,\n',
 '1,two,5,6,,8,world\n',
 '2,three,9,10,11.0,12,foo\n']

禁止写入行和列的标签:

In [21]:
import sys

# Write to stdout without the index column and without the header row.
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


写入列的子集:

In [22]:
# Write only columns a, b and c, without the index column.
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


### 使用分隔格式

In [23]:
# Read the raw lines inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(data_dir + 'ex7.csv') as f:
    raw_lines = f.readlines()
raw_lines

['"a","b","c"\n', '"1","2","3"\n', '"1","2","3"\n']

对于任何带有单字符分隔符的文件，可以使用Python内建的csv模块:

In [24]:
import csv

# The csv module expects its file to be opened with newline='' so that
# newlines embedded in quoted fields reach the parser untouched.
with open(data_dir + 'ex7.csv', newline='') as f:
    # For another single-character separator, pass e.g. delimiter='|'.
    lines = list(csv.reader(f))

# The first parsed row holds the column names; the rest are data rows.
header, values = lines[0], lines[1:]
print(header)
print(values)

['a', 'b', 'c']
[['1', '2', '3'], ['1', '2', '3']]


In [25]:
# Transpose the data rows into per-column tuples and pair each tuple with
# its header name.
column_values = zip(*values)
data_dict = dict(zip(header, column_values))
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

### JSON文件的读写

In [26]:
import json

# Note: JSON strings must use double quotes.
obj = """
{"name": "Faye",
 "city": ["Shanghai", "Shenzhen", "Hefei"],
 "pet": null,
 "friends": [{"name": "lf", "age": 28, "pets": "cat"}, 
             {"name": "yx", "age": 29, "pets": ["cat", "dog"]}]
}
"""

# json.loads converts a JSON string into Python objects.
result = json.loads(obj)
result

{'name': 'Faye',
 'city': ['Shanghai', 'Shenzhen', 'Hefei'],
 'pet': None,
 'friends': [{'name': 'lf', 'age': 28, 'pets': 'cat'},
  {'name': 'yx', 'age': 29, 'pets': ['cat', 'dog']}]}

In [27]:
# json.dumps converts a Python object back into a JSON string.
asjson = json.dumps(result)
asjson

'{"name": "Faye", "city": ["Shanghai", "Shenzhen", "Hefei"], "pet": null, "friends": [{"name": "lf", "age": 28, "pets": "cat"}, {"name": "yx", "age": 29, "pets": ["cat", "dog"]}]}'

pd.read_json可以自动将JSON数据集按照指定次序转化为Series或者DataFrame:

In [28]:
# Build a DataFrame from the list of friend dicts, keeping only the
# name and age fields as columns.
friends = DataFrame(result['friends'], columns=['name', 'age'])
friends

Unnamed: 0,name,age
0,lf,28
1,yx,29


In [29]:
# Load the JSON file into a DataFrame.
data = pd.read_json(data_dir + "example.json")
data

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6
2,7,8,9


In [30]:
# Serialize the DataFrame to a JSON string; with no arguments the output maps
# each column name to an {index label: value} object.
data.to_json()

'{"a":{"0":1,"1":4,"2":7},"b":{"0":2,"1":5,"2":8},"c":{"0":3,"1":6,"2":9}}'

In [31]:
# orient="records" emits a JSON list with one object per row.
data.to_json(orient="records")

'[{"a":1,"b":2,"c":3},{"a":4,"b":5,"c":6},{"a":7,"b":8,"c":9}]'