In [65]:
# Sample CSV with comment lines mixed in, written to data/ex4.csv.
# The backslash right after the opening quotes suppresses the leading newline,
# so the file does not start with an empty line and "# hey!" is physical
# line 0.  read_csv's `skiprows` counts 0-based physical lines, so the three
# comment lines sit exactly at positions 0, 2 and 3.
s = """\
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo
"""
# Plain 'w' is sufficient: the handle is only written to, never read back.
with open('data/ex4.csv', 'w') as f:
    f.write(s)
# Report success only after the `with` block has flushed and closed the file.
print('finish')

finish


In [1]:
import pandas as pd
import numpy as np

![](https://camo.githubusercontent.com/36984721f22772758cf5a184ec79073ce95ccd81/687474703a2f2f75706c6f61642d696d616765732e6a69616e7368752e696f2f75706c6f61645f696d616765732f373137383639312d393538663834396536303637623139622e706e673f696d6167654d6f6772322f6175746f2d6f7269656e742f7374726970253743696d61676556696577322f322f772f31323430)

## 读取文件

In [14]:
!type data\ex1.csv

a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


### `pd.read_csv()`

In [17]:
# Read the comma-separated file; its first line (a,b,c,d,message) supplies the column names.
df = pd.read_csv('data/ex1.csv')
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [25]:
!type data\ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


#### 指定列名

In [37]:
# header=None: the file has no header line, so the columns get integer labels 0..4.
pd.read_csv('data/ex2.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [42]:
# header selects which row supplies the COLUMN NAMES (it does not set the row index).
# With header=[0], the first line "1,2,3,4,hello" becomes the header and only the remaining rows are data.
pd.read_csv('data/ex2.csv', header=[0])

Unnamed: 0,1,2,3,4,hello
0,5,6,7,8,world
1,9,10,11,12,foo


In [45]:
# names supplies the column labels for a file that has no header line.
pd.read_csv('data/ex2.csv', names=list('abcde'))

Unnamed: 0,a,b,c,d,e
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [49]:
# Name the columns, then use column 'e' as the row index via index_col.
pd.read_csv('data/ex2.csv', names=list('abcde'), index_col=['e'])

Unnamed: 0_level_0,a,b,c,d
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
hello,1,2,3,4
world,5,6,7,8
foo,9,10,11,12


#### 层级索引

In [59]:
# Build the two-key sample table in code instead of reading the OS clipboard,
# so this cell produces the same file on a fresh kernel (Restart & Run All).
df = pd.DataFrame({
    'key1': ['one'] * 4 + ['two'] * 4,
    'key2': list('abcd') * 2,
    'value1': list(range(1, 16, 2)),   # 1, 3, ..., 15
    'value2': list(range(2, 17, 2)),   # 2, 4, ..., 16
})
# index=False keeps the default integer row labels out of the written file.
df.to_csv('data/csv_mindex.csv', index=False)

In [60]:
!type data\csv_mindex.csv

key1,key2,value1,value2
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


In [63]:
# Passing two column names to index_col builds a two-level (hierarchical) row index from key1 and key2.
parsed = pd.read_csv('data/csv_mindex.csv', index_col=['key1', 'key2'])
parsed

Unnamed: 0_level_0,Unnamed: 1_level_0,value1,value2
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16


#### 跳过文本

In [67]:
!type data\ex4.csv


# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computers, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


In [69]:
# skiprows drops the lines at 0-based positions 0, 2 and 3 before parsing;
# the first line that remains supplies the column names.
pd.read_csv('data/ex4.csv', skiprows=[0,2,3])

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### 缺失符处理


In [74]:
# Build the missing-value sample table in code instead of reading the OS
# clipboard, so this cell produces the same file on a fresh kernel.
df = pd.DataFrame({
    'something': ['one', 'two', 'three'],
    'a': [1, 5, 9],
    'b': [2, 6, 10],
    'c': [3.0, np.nan, 11.0],           # one missing numeric value
    'd': [4, 8, 12],
    'message': [np.nan, 'world', 'foo'],  # one missing text value
})
# index=False keeps the default integer row labels out of the written file;
# missing values are written as empty fields.
df.to_csv('data/ex5.csv', index=False)

In [75]:
!type data\ex5.csv

something,a,b,c,d,message
one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


In [77]:
# Default parsing: the empty fields (c in row "two", message in row "one") are read as missing values.
pd.read_csv('data/ex5.csv')

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


通过 `na_values` 指定哪些值应被当作缺失值：例如 `na_values=['a', 'b']` 会把表格中**等于** `a` 或 `b` 的单元格（而不是“包含”这些字符的单元格）替换成 NaN  
`pd.read_csv(path, na_values=[''])`

In [84]:
# Additionally treat the value 1 as missing: column a then contains a missing value
# and its remaining entries are shown as floats (5.0, 9.0).
pd.read_csv('data/ex5.csv', na_values=['1'])

Unnamed: 0,something,a,b,c,d,message
0,one,,2,3.0,4,
1,two,5.0,6,,8,world
2,three,9.0,10,11.0,12,foo


用字典为不同的列分别指定 na_values（键是列名，值是该列中应视为缺失的值）

In [85]:
# Per-column missing-value markers: 'foo' and 'NA' count as missing in the
# message column, 'three' counts as missing in the something column.
sentinels = dict(message=['foo', 'NA'], something='three')
pd.read_csv('data/ex5.csv', na_values=sentinels)

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,,9,10,11.0,12,


In [88]:
# comment='#' marks '#' as the comment character, so the '#' lines in ex4.csv are not parsed as data.
pd.read_csv('data/ex4.csv', comment='#')

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


#### 更多常用参数信息

![](https://camo.githubusercontent.com/df64e7c2dd1582121c2942fdf02411a582d148a1/687474703a2f2f75706c6f61642d696d616765732e6a69616e7368752e696f2f75706c6f61645f696d616765732f373137383639312d303832646166346130306564393439342e706e673f696d6167654d6f6772322f6175746f2d6f7269656e742f7374726970253743696d61676556696577322f322f772f31323430)

![](https://camo.githubusercontent.com/68641d6dbc7958d5e1abcf501a888f788e1699b1/687474703a2f2f75706c6f61642d696d616765732e6a69616e7368752e696f2f75706c6f61645f696d616765732f373137383639312d663262636330613730336337323336662e706e673f696d6167654d6f6772322f6175746f2d6f7269656e742f7374726970253743696d61676556696577322f322f772f31323430)

![](https://camo.githubusercontent.com/a229339992e060003769fbb690af90d9436c158b/687474703a2f2f75706c6f61642d696d616765732e6a69616e7368752e696f2f75706c6f61645f696d616765732f373137383639312d353937333237616465336539346337612e706e673f696d6167654d6f6772322f6175746f2d6f7269656e742f7374726970253743696d61676556696577322f322f772f31323430)

### 逐块读取文本文件

在处理很大的文件时，或者需要先摸索出正确的参数组合以便后续正确处理大文件时，你可能只想读取文件的一小部分，或逐块对文件进行迭代。

在查看大文件之前，我们先把 pandas 的显示设置得更紧凑一些（最多显示 10 行）：

In [95]:
# Cap DataFrame/Series display at 10 rows so large frames are shown truncated.
pd.set_option('display.max_rows', 10)

In [96]:
# Load the entire file into a single DataFrame.
result = pd.read_csv('data/ex6.csv')

In [97]:
result

Unnamed: 0,one,two,three,four,key
0,0.387555,0.104824,0.261377,0.986381,E
1,0.922591,0.269760,0.608944,0.135650,A
2,0.152968,0.647628,0.114709,0.204878,C
3,0.702921,0.189018,0.005071,0.879412,B
4,0.102917,0.928880,0.594324,0.435483,A
...,...,...,...,...,...
9995,0.634468,0.852927,0.251010,0.962021,E
9996,0.090700,0.682432,0.679442,0.049422,G
9997,0.274815,0.108452,0.759533,0.668607,C
9998,0.230151,0.382808,0.115648,0.248200,C


#### 指定读取的行数， 避免将整个文件载入

In [98]:
# nrows=5 parses only the first 5 data rows instead of the whole file.
pd.read_csv('data/ex6.csv', nrows=5)

Unnamed: 0,one,two,three,four,key
0,0.387555,0.104824,0.261377,0.986381,E
1,0.922591,0.26976,0.608944,0.13565,A
2,0.152968,0.647628,0.114709,0.204878,C
3,0.702921,0.189018,0.005071,0.879412,B
4,0.102917,0.92888,0.594324,0.435483,A


#### 逐块读取文件，指定`chunksize`(行数)
指定 `chunksize` 后，`read_csv` 返回的是一个迭代器，按顺序每次产出最多 `chunksize` 行的一块数据（总共约 ⌈数据行数 / chunksize⌉ 块）；迭代器遍历一次后即被耗尽，想再次读取需要重新调用 `read_csv`

In [127]:
# With chunksize set, read_csv returns an iterator that yields the file in
# pieces of up to 200 rows each instead of one DataFrame.
chunker = pd.read_csv('data/ex6.csv', chunksize=200)


In [128]:
# Accumulate the per-chunk counts of the 'key' column into one running total.
# The empty accumulator gets an explicit float64 dtype: a dtype-less
# pd.Series([]) emits a warning on pandas 1.x and becomes object dtype on
# pandas 2.x, which would no longer give the float64 totals shown below.
tot = pd.Series([], dtype='float64')
for piece in chunker:
    # fill_value=0 treats a key that is absent on one side as a zero count
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# Most frequent keys first.
tot = tot.sort_values(ascending=False)
tot

D    1311.0
F    1261.0
H    1253.0
E    1251.0
G    1240.0
C    1237.0
B    1237.0
A    1210.0
dtype: float64

### pd.read_table()
`read_table` 默认以制表符 `\t` 作为分隔符；这里通过 `sep=','` 指定逗号，使其可以读取 CSV 文件

In [18]:
# read_table with sep=',' parses this CSV file into the same table that read_csv produced above.
pd.read_table('data/ex1.csv', sep=',')

  """Entry point for launching an IPython kernel.


Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [129]:
!type data\ex2.csv

1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo


## 写入数据

In [130]:
# Reload ex5.csv (it contains missing values) as the frame used by the writing examples.
data = pd.read_csv('data/ex5.csv')

In [131]:
data

Unnamed: 0,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


### 输出csv文件

In [132]:
# Default to_csv: the row index is written as an unnamed first column and
# missing values are written as empty fields.
data.to_csv('data/ex5_copy.csv')

In [133]:
!type data\ex5_copy.csv

,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo


### 指定分隔符

In [134]:
# sep sets the field delimiter used in the written file ('-' instead of ',').
data.to_csv('data/ex5_sep.csv', sep='-')

In [135]:
!type data\ex5_sep.csv

-something-a-b-c-d-message
0-one-1-2-3.0-4-
1-two-5-6--8-world
2-three-9-10-11.0-12-foo


In [136]:
import sys
# Writing to sys.stdout prints the CSV text in the cell output instead of creating a file.
data.to_csv(sys.stdout, sep='|')

|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo


In [137]:
# na_rep sets the text written for missing values (they are written as empty fields otherwise).
data.to_csv(sys.stdout, na_rep='NULL')

,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo


### 取消行列标签输出

In [138]:
# Suppress both the row labels and the column header line.
# `index` and `header` are documented as booleans, so pass False explicitly
# rather than relying on None being falsy (consistent with index=False below).
data.to_csv(sys.stdout, index=False, header=False)

one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo


### 只输出部分列

In [139]:
# columns restricts the output to the listed columns, in the given order; index=False drops the row labels.
data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])

a,b,c
1,2,3.0
5,6,
9,10,11.0


### 层级索引输出

In [143]:
# Row labels: every (letter, number) pair -> ('a', 1), ('a', 2), ('b', 1), ('b', 2).
index = pd.MultiIndex.from_product((list('ab'), [1, 2]))
# Column labels: every (letter, number) pair -> ('A', 1) ... ('B', 3), six in total.
columns = pd.MultiIndex.from_product((list('AB'), [1, 2, 3]))

In [146]:
# Draw the demo values from a locally seeded generator so the table is the
# same on every run; a private RandomState leaves NumPy's global random state
# untouched.
rng = np.random.RandomState(42)
# 4 rows x 6 columns of integers in [1, 100), matching the 4 row labels and
# 6 column labels built in the previous cell.
data = rng.randint(1, 100, size=(4, 6))
df = pd.DataFrame(data, index=index, columns=columns)

In [148]:
# Hierarchical labels are written level by level: two header rows for the
# column levels and two leading fields per row for the index levels.
df.to_csv(sys.stdout)

,,A,A,A,B,B,B
,,1,2,3,1,2,3
a,1,16,69,61,59,71,60
a,2,51,67,71,42,59,6
b,1,43,56,53,7,32,59
b,2,38,14,63,72,94,11


### Series也可以转出csv

### 处理分隔符格式

In [156]:
!type data\ex7.csv

"a","b","c"
"1","2","3"
"1","2","3"


In [160]:
import csv
# The csv module documents that files handed to csv.reader should be opened
# with newline='' so the reader does its own newline handling (needed for
# quoted fields that contain line breaks).
# The handle is left open here because `reader` pulls rows from it lazily;
# call f.close() once iteration is finished.
f = open('data/ex7.csv', newline='')
reader = csv.reader(f)

In [162]:
# Print each parsed row as the reader yields it.
for line in reader:
    print(line)
# `f` was opened without a `with` block, so release the handle explicitly
# once every row has been consumed.
f.close()

### 首先读取文件到一个多行列表中

In [163]:
# Read every row into a list while the file is open; the `with` block closes
# the handle afterwards.  newline='' is the mode the csv module documents for
# files passed to csv.reader.
with open('data/ex7.csv', newline='') as f:
    lines = list(csv.reader(f))

### 然后，将这些数据分为标题行和数据行

In [164]:
# The first parsed row holds the column names; every row after it is data.
header = lines[0]
values = lines[1:]

In [166]:
# Show the header row and the data rows as two separate outputs.
display(header, values)

['a', 'b', 'c']

[['1', '2', '3'], ['1', '2', '3']]

### 利用字典推导式和 `zip` 创建“列名 → 该列数据”的字典（`zip(*values)` 把按行的数据转置为按列）

In [167]:
# zip(*values) transposes the row lists into per-column tuples; pairing them
# with the header names gives a {column name: column values} mapping.
data_dict = dict(zip(header, zip(*values)))

In [171]:
data_dict

{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}