# Pandas-read_csv参数解析

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
from io import StringIO

In [5]:
from contextlib import suppress

## 参数

In [6]:
# 直接可以返回参考文档
pd.read_csv?

pd.read_csv(

    filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]',
    sep=<no_default>,
    delimiter=None,
    header='infer',
    names=<no_default>,
    index_col=None,
    usecols=None,
    squeeze=None,
    prefix=<no_default>,
    mangle_dupe_cols=True,
    dtype: 'DtypeArg | None' = None,
    engine: 'CSVEngine | None' = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace=False,
    skiprows=None,
    skipfooter=0,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    skip_blank_lines=True,
    parse_dates=None,
    infer_datetime_format=False,
    keep_date_col=False,
    date_parser=None,
    dayfirst=False,
    cache_dates=True,
    iterator=False,
    chunksize=None,
    compression: 'CompressionOptions' = 'infer',
    thousands=None,
    decimal: 'str' = '.',
    lineterminator=None,
    quotechar='"',
    quoting=0,
    doublequote=True,
    escapechar=None,
    comment=None,
    encoding=None,
    encoding_errors: 'str | None' = 'strict',
    dialect=None,
    error_bad_lines=None,
    warn_bad_lines=None,
    on_bad_lines=None,
    delim_whitespace=False,
    low_memory=True,
    memory_map=False,
    float_precision=None,
    storage_options: 'StorageOptions' = None,
)

### filepath_or_buffer

数据的输入来源: 可以是本地文件路径、URL, 也可以是类文件对象(例如 `StringIO`)

In [8]:
# 一个采用;;分割的csv, 没有表头
file = './test_data/test.csv'

In [9]:
# 直接读取
a = pd.read_csv(file)

In [16]:
# stringio
pd.read_csv(StringIO('a,b,c\n1,2,3\n4,5,6'))

Unnamed: 0,a,b,c
0,1,2,3
1,4,5,6


In [17]:
# 直接读取GitHub的文件
pd.read_csv('https://raw.githubusercontent.com/Kyouichirou/NoteBook/main/Pandas/read_csv_%E4%B8%93%E9%A1%B9/test_data/test_2.csv')

Unnamed: 0,header_a,header_b,header_c,header_d
0,,true,1,1.2
1,True,b,2,1.235
2,,c,3,1.258469


In [10]:
a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   2021-09-06;;1;;7.9;;2;;6  2 non-null      object
dtypes: object(1)
memory usage: 144.0+ bytes


In [11]:
a

Unnamed: 0,2021-09-06;;1;;7.9;;2;;6
0,2021-09-07;;1;;8.5;;2;;7
1,2021-09-08;;2;;8;;1;;8.1


In [12]:
a.size

2

In [13]:
a.shape

(2, 1)

### 分隔符

- sep

- delimiter

In [14]:
# Split on the two-character separator ';;'.
b = pd.read_csv(file, sep=';;')
# No engine is specified here, which triggers a warning (see the output below).

  b = pd.read_csv(file, sep=';;')


In [9]:
b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   2021-09-06  2 non-null      object 
 1   1           2 non-null      int64  
 2   7.9         2 non-null      float64
 3   2           2 non-null      int64  
 4   6           2 non-null      float64
dtypes: float64(2), int64(2), object(1)
memory usage: 208.0+ bytes


In [10]:
# 准确拆分出了内容, 但是因为没有表头, 所以第一行默认为表头
b

Unnamed: 0,2021-09-06,1,7.9,2,6
0,2021-09-07,1,8.5,2,7.0
1,2021-09-08,2,8.0,1,8.1


In [11]:
# 准确读取
pd.read_csv(file, engine='python', sep=';;')

Unnamed: 0,2021-09-06,1,7.9,2,6
0,2021-09-07,1,8.5,2,7.0
1,2021-09-08,2,8.0,1,8.1


In [13]:
# 增加表头
c = pd.read_csv(file, engine='python', sep=';;', names=[*'abcde'])

In [14]:
c

Unnamed: 0,a,b,c,d,e
0,2021-09-06,1,7.9,2,6.0
1,2021-09-07,1,8.5,2,7.0
2,2021-09-08,2,8.0,1,8.1


In [15]:
c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      object 
 1   b       3 non-null      int64  
 2   c       3 non-null      float64
 3   d       3 non-null      int64  
 4   e       3 non-null      float64
dtypes: float64(2), int64(2), object(1)
memory usage: 248.0+ bytes


In [37]:
pd.read_csv(file, engine='python', sep=';;', names=[*'abcde'], true_values=[1])

Unnamed: 0,a,b,c,d,e
0,2021-09-06,1,7.9,2,6.0
1,2021-09-07,1,8.5,2,7.0
2,2021-09-08,2,8.0,1,8.1


In [19]:
# 增加表头和引入更多的干扰项
file2 = './test_data/test_1.csv'

In [26]:
c = pd.read_csv(file2, engine='python', sep=';;')

In [27]:
c

Unnamed: 0,date,index,rate,number,col
0,2021-09-06,TRUE,,2.0,#DIV/0!
1,2021-09-07,#VALUE!,8.5,,#NAME?
2,2021-09-08,2,,1.0,8.1


In [28]:
c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    3 non-null      object 
 1   index   3 non-null      object 
 2   rate    2 non-null      object 
 3   number  2 non-null      float64
 4   col     3 non-null      object 
dtypes: float64(1), object(4)
memory usage: 248.0+ bytes


### NaN值数据

- keep_default_na

- na_values

In [31]:
pd.read_csv(file2, engine='python', sep=';;', keep_default_na=False)

Unnamed: 0,date,index,rate,number,col
0,2021-09-06,TRUE,,2.0,#DIV/0!
1,2021-09-07,#VALUE!,8.5,,#NAME?
2,2021-09-08,2,,1.0,8.1


In [32]:
pd.read_csv(file2, engine='python', sep=';;', keep_default_na=False, na_values=['#DIV/0!', '#VALUE!'])

Unnamed: 0,date,index,rate,number,col
0,2021-09-06,TRUE,,2.0,
1,2021-09-07,,8.5,,#NAME?
2,2021-09-08,2,,1.0,8.1


### false & true

- true_values

- false_values

In [20]:
# 并未直接转换为True or False
pd.read_csv(file2, engine='python', sep=';;', keep_default_na=False, false_values=[1])

Unnamed: 0,date,index,rate,number,col
0,2021-09-06,TRUE,,2.0,#DIV/0!
1,2021-09-07,#VALUE!,8.5,,#NAME?
2,2021-09-08,2,,1.0,8.1


In [46]:
# Integer entries in true_values make the default (C) engine raise ValueError.
# The path is spelled out because `file_3` is only assigned in a later cell, and
# the error is caught and printed so that "Run All" continues past this demo.
try:
    pd.read_csv('./test_data/test_2.csv', true_values=[1])
except ValueError as exc:
    print(f'ValueError: {exc}')

ValueError: Must be all encoded bytes

In [38]:
# 常规的csv
file_3 = './test_data/test_2.csv'

In [47]:
# #N/A, 被自动转换为NaN
pd.read_csv(file_3)

Unnamed: 0,header_a,header_b,header_c,header_d
0,,a,1,1.2
1,True,b,2,1.235
2,,c,3,1.258469


In [48]:
# 不报错, 但是1也没有变成True
pd.read_csv(file_3, true_values=[1], engine='python')

Unnamed: 0,header_a,header_b,header_c,header_d
0,,a,1,1.2
1,True,b,2,1.235
2,,c,3,1.258469


In [49]:
# 文本, 但是也没有变True
pd.read_csv(file_3, true_values=['a'])

Unnamed: 0,header_a,header_b,header_c,header_d
0,,a,1,1.2
1,True,b,2,1.235
2,,c,3,1.258469


In [51]:
pd.read_csv(file_3, true_values=['true'])

Unnamed: 0,header_a,header_b,header_c,header_d
0,,true,1,1.2
1,True,b,2,1.235
2,,c,3,1.258469


In [58]:
data = ('a,b,c\n1,Yes,2\n3,No,4')

In [79]:
tmp = StringIO(data)

In [60]:
pd.read_csv(tmp)

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [63]:
# Parse from a fresh buffer: the shared `tmp` StringIO has already been read to
# EOF by an earlier cell, so reusing it only works after re-running its cell.
pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'])

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4


In [65]:
# Only false_values is given, so 'Yes' has no mapping and (per the output) the
# column is left as text. A fresh buffer is used because `tmp` was already consumed.
pd.read_csv(StringIO(data), false_values=['No'])

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [68]:
# 'yes' differs in case from the data's 'Yes'; per the output the column is not
# converted. A fresh buffer is used because `tmp` was already consumed.
pd.read_csv(StringIO(data), true_values=['yes'], false_values=['No'])

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [70]:
# Numeric entries are not supported: the C engine raises, while the Python engine
# (used here) leaves the column unconverted, as the output shows.
# A fresh buffer is used because the shared `tmp` was consumed by an earlier read.
pd.read_csv(StringIO(data), true_values=[2], false_values=[4], engine='python')

Unnamed: 0,a,b,c
0,1,Yes,2
1,3,No,4


In [72]:
# Same string mapping as with the default engine, now with engine='python'.
# A fresh buffer is used because the shared `tmp` was consumed by an earlier read.
pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No'], engine='python')

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4


In [83]:
pd.read_csv(StringIO('a,b,c\n1,Yes,2\n3,No,4\n5,abc,6'), true_values=['Yes'], false_values=['No'])

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4
2,5,True,6


In [84]:
pd.read_csv(StringIO('a,b,c\n1,Yes,2\n3,No,4\n5,abc,6'), true_values=['Yes',  'abc'], false_values=['No'])

Unnamed: 0,a,b,c
0,1,True,2
1,3,False,4
2,5,True,6


### 小结

**true_values/false_values**

- 不支持数字: `engine='c'` 时直接报错; `engine='python'` 时不会报错, 但也不会进行转换

- 按列进行转换: 只有当某一列的所有值都出现在 `true_values` / `false_values` 中时, 该列才会被转换为布尔类型

### datetime格式数据

- parse_dates

- keep_date_col

In [86]:
e = pd.read_csv(file2, engine='python', sep=';;', keep_default_na=False, parse_dates=['date'])

In [87]:
e.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3 non-null      datetime64[ns]
 1   index   3 non-null      object        
 2   rate    3 non-null      object        
 3   number  3 non-null      object        
 4   col     3 non-null      object        
dtypes: datetime64[ns](1), object(4)
memory usage: 248.0+ bytes


In [94]:
# 将多列数据作为日期解析
pd.read_csv(StringIO('a,b,c\n2011,11,2\n2009,10,4\n2003,03,6'), parse_dates=[[0,1,2]])

Unnamed: 0,a_b_c
0,2011-11-02
1,2009-10-04
2,2003-03-06


In [95]:
# 同时保留数据
pd.read_csv(
    StringIO('a,b,c\n2011,11,2\n2009,10,4\n2003,03,6'),
    parse_dates=[[0,1,2]],
    keep_date_col=True
)

Unnamed: 0,a_b_c,a,b,c
0,2011-11-02,2011,11,2
1,2009-10-04,2009,10,4
2,2003-03-06,2003,3,6


In [97]:
# 合并数据为日期, 同时重命名新列为date_col
f = pd.read_csv(
    StringIO('a,b,c\n2011,11,2\n2009,10,4\n2003,03,6'), 
    parse_dates={'date_col':[0,1,2]}, 
    keep_date_col=True
)

In [98]:
f

Unnamed: 0,date_col,a,b,c
0,2011-11-02,2011,11,2
1,2009-10-04,2009,10,4
2,2003-03-06,2003,3,6


In [99]:
f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   date_col  3 non-null      datetime64[ns]
 1   a         3 non-null      object        
 2   b         3 non-null      object        
 3   c         3 non-null      object        
dtypes: datetime64[ns](1), object(3)
memory usage: 224.0+ bytes


In [262]:
# 欧洲不少国家习惯将日期中的日放在第一位
pd.read_csv(
    StringIO('a,b,c\n11.3.2011,11,2\n9.3.2009,10,4\n1.1.2003,03,6'),
    parse_dates=['a']
)

Unnamed: 0,a,b,c
0,2011-11-03,11,2
1,2009-09-03,10,4
2,2003-01-01,3,6


In [263]:
# 日期转换
pd.read_csv(
    StringIO('a,b,c\n11.3.2011,11,2\n9.3.2009,10,4\n1.1.2003,03,6'),
    parse_dates=['a'],
    dayfirst=True,
)

Unnamed: 0,a,b,c
0,2011-03-11,11,2
1,2009-03-09,10,4
2,2003-01-01,3,6


In [272]:
%%time
pd.read_csv('./test_data/test10.csv', parse_dates=['e'], dayfirst=True)

CPU times: total: 500 ms
Wall time: 520 ms


Unnamed: 0,d,e
0,a,1976-02-19
1,a,1951-01-06
2,a,1957-08-10
3,a,1996-06-20
4,a,1995-03-26
...,...,...
9995,a,1951-05-07
9996,a,1993-07-09
9997,a,1975-10-04
9998,a,1996-07-27


In [273]:
%%time
# 速度狂飙, 500 / 31 > 15
pd.read_csv('./test_data/test10.csv', parse_dates=['e'], dayfirst=True, infer_datetime_format=True)

CPU times: total: 31.2 ms
Wall time: 28 ms


Unnamed: 0,d,e
0,a,1976-02-19
1,a,1951-01-06
2,a,1957-08-10
3,a,1996-06-20
4,a,1995-03-26
...,...,...
9995,a,1951-05-07
9996,a,1993-07-09
9997,a,1975-10-04
9998,a,1996-07-27


In [101]:
# Read only the listed columns; the remaining column 'c' is not loaded.
wanted_columns = ['a', 'b']
g = pd.read_csv(
    StringIO('a,b,c\n2011,11,2\n2009,10,4\n2003,03,6'),
    usecols=wanted_columns,
)

In [102]:
g.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   a       3 non-null      int64
 1   b       3 non-null      int64
dtypes: int64(2)
memory usage: 176.0 bytes


In [104]:
# Force the dtype of individual columns instead of relying on type inference;
# columns not mentioned (here 'c') are still inferred.
column_types = {'a': 'str', 'b': 'int'}
g = pd.read_csv(
    StringIO('a,b,c\n2011,11,2\n2009,10,4\n2003,03,6'),
    dtype=column_types,
)

In [105]:
g.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       3 non-null      object
 1   b       3 non-null      int32 
 2   c       3 non-null      int64 
dtypes: int32(1), int64(1), object(1)
memory usage: 188.0+ bytes


In [107]:
# 关闭空值的检测
pd.read_csv(
    StringIO('a,b,c\n2011,,2\n2009,10,4\n2003,03,6'), 
    na_filter=False
)

Unnamed: 0,a,b,c
0,2011,,2
1,2009,10.0,4
2,2003,3.0,6


In [108]:
pd.read_csv(
    StringIO('a,b,c\n2011,,2\n2009,10,4\n2003,03,6')
)

Unnamed: 0,a,b,c
0,2011,,2
1,2009,10.0,4
2,2003,3.0,6


In [177]:
# Choose which column becomes the row index: position 0, i.e. column 'a'.
index_source = StringIO('a,b,c\n2011,,2\n2009,10,4\n2003,03,6')
n = pd.read_csv(index_source, index_col=[0])

In [178]:
n

Unnamed: 0_level_0,b,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1
2011,,2
2009,10.0,4
2003,3.0,6


In [179]:
n.index

Int64Index([2011, 2009, 2003], dtype='int64', name='a')

In [185]:
# 多级index
pd.read_csv(
    StringIO('a,b,c\nab,a1,bb\nab,ab2,4\nac,03,6'),
    index_col=[0, 1]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,c
a,b,Unnamed: 2_level_1
ab,a1,bb
ab,ab2,4
ac,03,6


In [187]:
# converters: supply a per-column callable that decides how each raw field of
# that column is turned into a value (here column 'b' is kept as text).
text_source = StringIO('a,b,c\nab,1,bb\nab,1,4\nac,3,6')
p = pd.read_csv(text_source, converters={'b': str})

In [188]:
p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       3 non-null      object
 1   b       3 non-null      object
 2   c       3 non-null      object
dtypes: object(3)
memory usage: 200.0+ bytes


In [189]:
p

Unnamed: 0,a,b,c
0,ab,1,bb
1,ab,1,4
2,ac,3,6


In [192]:
pd.read_csv(
    StringIO('a,b,c\nab,1,bb\nab,,4\nac,3,6')
)

Unnamed: 0,a,b,c
0,ab,1.0,bb
1,ab,,4
2,ac,3.0,6


In [191]:
# A converter takes priority over pandas' built-in handling of that column.
# The empty field is therefore preserved as-is rather than being turned into NaN.
pd.read_csv(
    StringIO('a,b,c\nab,1,bb\nab,,4\nac,3,6'),
    converters={'b': lambda x: str(x)}
)

Unnamed: 0,a,b,c
0,ab,1.0,bb
1,ab,,4
2,ac,3.0,6


In [195]:
# Line terminator demo.
# '\n' and '\r' are both recognised; '|' is not, so (per the output) the whole
# text is read as one header line.
pd.read_csv(
    StringIO('a,b,c|ab,1,bb|ab,,4|ac,3,6')
)

Unnamed: 0,a,b,c|ab,1,bb|ab,Unnamed: 5,4|ac,3,6


In [197]:
pd.read_csv(
    StringIO('a,b,c|ab,1,bb|ab,,4|ac,3,6'),
    lineterminator='|'
)

Unnamed: 0,a,b,c
0,ab,1.0,bb
1,ab,,4
2,ac,3.0,6


In [196]:
# lineterminator must be exactly one character long; a longer value raises
# ValueError. The error is caught and printed so this demonstration does not
# abort a full notebook run.
try:
    pd.read_csv(
        StringIO('a,b,c||ab,1,bb||ab,,4||ac,3,6'),
        lineterminator='||'
    )
except ValueError as exc:
    print(f'ValueError: {exc}')

ValueError: Only length-1 line terminators supported

In [216]:
# Fields separated by runs of spaces, read without specifying a separator.
# Do not be fooled by the display: nothing is split, so each line ends up as a
# single column (see the shape check that follows).
spaced_text = 'a  b c\nab  1  bb\nab  10  4\nac  3  6'
t = pd.read_csv(StringIO(spaced_text))

In [217]:
t

Unnamed: 0,a b c
0,ab 1 bb
1,ab 10 4
2,ac 3 6


In [218]:
t.shape

(3, 1)

In [219]:
t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a  b c  3 non-null      object
dtypes: object(1)
memory usage: 152.0+ bytes


In [220]:
# delim_whitespace等价于 \s+
s = pd.read_csv(
    StringIO('a  b c\nab  1  bb\nab  10  4\nac  3  6'),
    delim_whitespace=True
)

In [221]:
s

Unnamed: 0,a,b,c
0,ab,1,bb
1,ab,10,4
2,ac,3,6


In [222]:
s.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       3 non-null      object
 1   b       3 non-null      int64 
 2   c       3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes


In [224]:
# Note: the C engine does not support a multi-space separator (a warning is
# emitted for this call; it is truncated in the output below).
# NOTE(review): the header here is 'a  b c' (one space before 'c'), unlike the
# 'a  b  c' used in the next cell, so it yields two column names for three data
# fields -- confirm whether that difference is intended.
pd.read_csv(
    StringIO('a  b c\nab  1  bb\nab  10  4\nac  3  6'),
    sep='  '
)

  pd.read_csv(


Unnamed: 0,a,b c
ab,1,bb
ab,10,4
ac,3,6


In [225]:
pd.read_csv(
    StringIO('a  b  c\nab  1  bb\nab  10  4\nac  3  6'),
    sep='  ',
    engine='python'
)

Unnamed: 0,a,b,c
0,ab,1,bb
1,ab,10,4
2,ac,3,6


In [227]:
# 展示执行的细节
pd.read_csv(
    StringIO('a,b,c\n7,8,9\n4,,6\n2003,03,6'),
    verbose=True
)

Tokenization took: 0.00 ms
Type conversion took: 0.00 ms
Parser memory cleanup took: 0.00 ms


Unnamed: 0,a,b,c
0,7,8.0,9
1,4,,6
2,2003,3.0,6


In [233]:
# Indicate number of NA values placed in non-numeric columns.
pd.read_csv(
    StringIO('a,b,c\nab,8,9\n,6,6\ntest,03,6'),
    verbose=True
)

Tokenization took: 0.00 ms
Type conversion took: 0.00 ms
Parser memory cleanup took: 0.00 ms


Unnamed: 0,a,b,c
0,ab,8,9
1,,6,6
2,test,3,6


In [237]:
# 转义字符
pd.read_csv(
    StringIO('a,b,c\nab,8,9\n\,6,6\ntest,03,6'),
)

Unnamed: 0,a,b,c
0,ab,8,9
1,\,6,6
2,test,3,6


In [240]:
pd.read_csv('./test_data/test_6.csv', escapechar='\\')

Unnamed: 0,title,description
0,Jeans,blue
1,Jeans,"2"" seam"
2,Jeans,"2"" seam, blue"


In [245]:
# 引用符号
pd.read_csv('./test_data/test_7.csv', skipinitialspace = True, quotechar = '"')

Unnamed: 0,column1,column2,column3,column4,column5,column6
0,AM,7,1,SD,SD,CR
1,AM,8,"1,2 ,3","PR, SD,SD","PR ; , SD,SD","PR , ,, SD ,SD"
2,AM,1,2,SD,SD,SD


In [247]:
# 如何处理双引号
pd.read_csv('./test_data/test_8.csv', quoting=3)

Unnamed: 0,head_a,head_b
0,"""""""""",test
1,abc,ab


In [248]:
pd.read_csv('./test_data/test_8.csv', quoting=0)

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [249]:
pd.read_csv('./test_data/test_8.csv', quoting=2)

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [250]:
pd.read_csv('./test_data/test_8.csv', quoting=1)

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [252]:
pd.read_csv('./test_data/test_8.csv', doublequote=False)

Unnamed: 0,head_a,head_b
0,"""""",test
1,abc,ab


In [253]:
pd.read_csv('./test_data/test_8.csv', doublequote=True)

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [254]:
pd.read_csv('./test_data/test_8.csv')

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [255]:
pd.read_csv('./test_data/test_8.csv', doublequote=True, quoting=3)

Unnamed: 0,head_a,head_b
0,"""""""""",test
1,abc,ab


In [256]:
pd.read_csv('./test_data/test_8.csv', doublequote=True, quoting=1)

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [257]:
pd.read_csv('./test_data/test_8.csv', doublequote=True, quoting=2)

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [258]:
pd.read_csv('./test_data/test_8.csv', doublequote=True, quoting=0)

Unnamed: 0,head_a,head_b
0,"""",test
1,abc,ab


In [281]:
# 数字精度
pd.read_csv(
    StringIO('a,b,c\n1.23555514541412,1.25454574777441,9.215544154545\n1445545.2445544147,6.12541574892236544,6.252455\n125554,3.12445,6.2545455444'),
    float_precision='high'
).to_csv('test_11.csv', index=False)

In [134]:
# "Skipping blank lines" is not the same as skipping rows whose fields are empty:
# a line made only of delimiters (',,') is not a blank line. Here it is consumed
# as the header, so the columns come out as 'Unnamed: N' (see the output).
pd.read_csv(
    StringIO(',,\n7,8,9\n4,5,6\n2003,03,6')
)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,7,8,9
1,4,5,6
2,2003,3,6


In [135]:
# The blank-line logic can be surprising: even with skip_blank_lines=True the
# ',,' line still contains delimiters, so it is kept as a row of missing values
# (see the output) instead of being skipped.
pd.read_csv(
    StringIO('1,2,3\n,,\n4,5,6\n2003,03,6'),
    skip_blank_lines=True
)

Unnamed: 0,1,2,3
0,,,
1,4.0,5.0,6.0
2,2003.0,3.0,6.0


In [136]:
pd.read_csv(
    StringIO(',,\n7,8,9\n4,5,6\n2003,03,6'),
    header = 0
)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2
0,7,8,9
1,4,5,6
2,2003,3,6


In [None]:
pd.read_csv(
    StringIO(',,\n7,8,9\n4,5,6\n2003,03,6'),
    skip_blank_lines=True,
    names=[*'abc']
)

In [137]:
pd.read_csv('./test_data/test_3.csv', skip_blank_lines=True, engine='python', header=0)

Unnamed: 0,a,b,c
0,1.0,2.0,3.0
1,,,
2,4.0,2.0,5.0


In [139]:
# nrows limits how many data rows are read (rows, not columns).
pd.read_csv('./test_data/test_3.csv', nrows=2)

Unnamed: 0,a,b,c
0,1.0,2.0,3.0
1,,,


In [147]:
# skiprows also skips the original header line, even though header= is given:
# the skipped lines are counted from the very first line of the file (index 0),
# so header=0 then refers to the first line that remains (see the output).
pd.read_csv('./test_data/test_3.csv', skiprows=1, header=0)

Unnamed: 0,1,2,3
0,,,
1,4.0,2.0,5.0


In [148]:
# Sample CSV text: three leading '#' comment lines followed by five data lines.
# Adjacent string literals inside the parentheses are joined into one string.
comment = (
    '# empty\n'
    '# second empty line\n'
    '# third empty line\n'
    'X,Y,Z\n'
    '1,2,3\n'
    'A,B,C\n'
    '1,2.,4.\n'
    '5.,NaN,10.0'
)

In [149]:
print(comment)

# empty
# second empty line
# third empty line
X,Y,Z
1,2,3
A,B,C
1,2.,4.
5.,NaN,10.0


In [151]:
# 剔除掉注释的行
pd.read_csv(StringIO(comment), comment='#')

Unnamed: 0,X,Y,Z
0,1,2,3
1,A,B,C
2,1,2.,4.
3,5.,,10.0


In [152]:
pd.read_csv(StringIO(comment), comment='#', skipfooter=1, engine='python')

Unnamed: 0,X,Y,Z
0,1,2,3
1,A,B,C
2,1,2.,4.


In [153]:
# The C engine does not support skipfooter, so this call raises ValueError.
# The error is caught and printed so this demonstration does not abort a full
# notebook run.
try:
    pd.read_csv(StringIO(comment), comment='#', skipfooter=1, engine='c')
except ValueError as exc:
    print(f'ValueError: {exc}')

ValueError: the 'c' engine does not support skipfooter

### skipinitialspace

忽略分隔符之后紧跟的空格

In [155]:
# 往分隔符前后添加了空格
h = pd.read_csv('./test_data/test_4.csv')

In [156]:
h['len'] = h['a'].apply(len)

In [157]:
h

Unnamed: 0,a,b,c,len
0,aa,aa,aa,3
1,bb,bb,bb,2
2,cc,cc,cc,4


In [158]:
k = pd.read_csv('./test_data/test_4.csv', skipinitialspace=True)

In [159]:
k['len'] = k['a'].apply(len)

In [160]:
k

Unnamed: 0,a,b,c,len
0,aa,aa,aa,2
1,bb,bb,bb,2
2,cc,cc,cc,2


In [161]:
# 读取压缩包
pd.read_csv('./test_data/test_4.zip', compression='zip')

Unnamed: 0,a,b,c
0,aa,aa,aa
1,bb,bb,bb
2,cc,cc,cc


### 分块数据读取

- iterator

- chunksize

In [173]:
## 分块读取数据
reader = pd.read_csv('./test_data/test_5.csv', iterator=True, encoding='gb18030')

In [174]:
# get_chunk raises StopIteration once the file is exhausted; suppressing it is
# what ends the otherwise endless loop. The reader is closed afterwards (also on
# an unexpected error) so the underlying file handle is released.
try:
    with suppress(StopIteration):
        while True:
            block = reader.get_chunk(4)
            print(block)
finally:
    reader.close()

   序号      代码     名称       最新价    涨跌额     涨跌幅       成交量       成交额        昨收  \
0   1       1   上证指数   3168.65  -7.88  -0.25%     2.43亿  3052.05亿   3176.53   
1   2  399001   深证成指  11358.11  36.30   0.32%     3.93亿  4610.44亿  11321.81   
2   3     300  沪深300   3951.99  -2.90  -0.07%  8886.47万  1819.95亿   3954.89   
3   4  399005  中小100   7589.63  28.11   0.37%  2159.41万   566.15亿   7561.52   

         今开        最高        最低  
0   3177.20   3179.10   3158.45  
1  11306.79  11371.18  11269.61  
2   3954.67   3963.50   3926.50  
3   7552.24   7597.74   7525.94  
   序号      代码     名称      最新价    涨跌额     涨跌幅       成交量       成交额       昨收  \
4   5  399006   创业板指  2399.12  31.04   1.31%  8763.43万  1503.77亿  2368.08   
5   6      10  上证180  8389.93 -28.09  -0.33%  6136.06万  1031.49亿  8418.02   
6   7      16   上证50  2675.19 -11.18  -0.42%  2315.92万   483.83亿  2686.37   
7   8       9  上证380  5781.25   4.42   0.08%  4181.22万   636.52亿  5776.83   

        今开       最高       最低  
4  2365.23  2399.

In [170]:
read_size =  pd.read_csv('./test_data/test_5.csv', iterator=True, encoding='gb18030', chunksize=10)

In [171]:
for block in read_size:
    print(block)

   序号      代码     名称       最新价    涨跌额     涨跌幅       成交量       成交额        昨收  \
0   1       1   上证指数   3168.65  -7.88  -0.25%     2.43亿  3052.05亿   3176.53   
1   2  399001   深证成指  11358.11  36.30   0.32%     3.93亿  4610.44亿  11321.81   
2   3     300  沪深300   3951.99  -2.90  -0.07%  8886.47万  1819.95亿   3954.89   
3   4  399005  中小100   7589.63  28.11   0.37%  2159.41万   566.15亿   7561.52   
4   5  399006   创业板指   2399.12  31.04   1.31%  8763.43万  1503.77亿   2368.08   
5   6      10  上证180   8389.93 -28.09  -0.33%  6136.06万  1031.49亿   8418.02   
6   7      16   上证50   2675.19 -11.18  -0.42%  2315.92万   483.83亿   2686.37   
7   8       9  上证380   5781.25   4.42   0.08%  4181.22万   636.52亿   5776.83   
8   9     132  上证100   6489.40   6.34   0.10%   767.57万   137.47亿   6483.06   
9  10     133  上证150   5223.62  57.18   1.11%   915.20万   143.31亿   5166.44   

         今开        最高        最低  
0   3177.20   3179.10   3158.45  
1  11306.79  11371.18  11269.61  
2   3954.67   3963.50   3926

In [175]:
# chunksize alone is passed here, without iterator=True.
# NOTE(review): the printed chunks match the previous run that also set
# iterator=True, so the flag appears to make no difference when chunksize is
# given -- confirm against the pandas documentation.
read_size =  pd.read_csv('./test_data/test_5.csv', encoding='gb18030', chunksize=10)

In [176]:
for block in read_size:
    print(block)

   序号      代码     名称       最新价    涨跌额     涨跌幅       成交量       成交额        昨收  \
0   1       1   上证指数   3168.65  -7.88  -0.25%     2.43亿  3052.05亿   3176.53   
1   2  399001   深证成指  11358.11  36.30   0.32%     3.93亿  4610.44亿  11321.81   
2   3     300  沪深300   3951.99  -2.90  -0.07%  8886.47万  1819.95亿   3954.89   
3   4  399005  中小100   7589.63  28.11   0.37%  2159.41万   566.15亿   7561.52   
4   5  399006   创业板指   2399.12  31.04   1.31%  8763.43万  1503.77亿   2368.08   
5   6      10  上证180   8389.93 -28.09  -0.33%  6136.06万  1031.49亿   8418.02   
6   7      16   上证50   2675.19 -11.18  -0.42%  2315.92万   483.83亿   2686.37   
7   8       9  上证380   5781.25   4.42   0.08%  4181.22万   636.52亿   5776.83   
8   9     132  上证100   6489.40   6.34   0.10%   767.57万   137.47亿   6483.06   
9  10     133  上证150   5223.62  57.18   1.11%   915.20万   143.31亿   5166.44   

         今开        最高        最低  
0   3177.20   3179.10   3158.45  
1  11306.79  11371.18  11269.61  
2   3954.67   3963.50   3926

### low_memory

> Internally process the file in chunks, resulting in lower memory use while parsing, but possibly mixed type inference. To ensure no mixed types either set False, or specify the type with the dtype parameter. Note that the entire file is read into a single DataFrame regardless, use the chunksize or iterator parameter to return the data in chunks. (Only valid with C parser).

> 在内部以块的形式处理文件，从而在解析时减少内存使用，但可能会混合类型推断。要确保没有混合类型，可以设置为False，或者使用dtype参数指定类型。注意，无论如何，整个文件都被读入一个DataFrame，使用chunksize或iterator参数以块形式返回数据。(仅对C解析器有效)。

- 在处理混合内容时, 处理方式会发生变化

- 注意仅对 `C engine `有效, 在处理一些欧洲的数字时, 可能会遇到问题, 那些数据的处理可能需要`python engine`

In [29]:
%%time
# thousands=",", low_memory=True
dfm = pd.read_csv('./test_data/test_11.csv', header=7, encoding='utf-8-sig', low_memory=False)

CPU times: total: 562 ms
Wall time: 576 ms


In [30]:
dfm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110328 entries, 0 to 110327
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   date/time                 110328 non-null  object 
 1   settlement id             110328 non-null  float64
 2   type                      109763 non-null  object 
 3   order id                  109341 non-null  object 
 4   sku                       102882 non-null  object 
 5   description               110300 non-null  object 
 6   quantity                  103066 non-null  float64
 7   marketplace               106723 non-null  object 
 8   account type              110328 non-null  object 
 9   fulfillment               99391 non-null   object 
 10  order city                99385 non-null   object 
 11  order state               99267 non-null   object 
 12  order postal              99366 non-null   object 
 13  tax collection model      90722 non-null   o

In [31]:
%%time
# thousands=",", low_memory=True
dfx = pd.read_csv('./test_data/test_11.csv', header=7, encoding='utf-8-sig', low_memory=True)

CPU times: total: 562 ms
Wall time: 564 ms




In [32]:
dfx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110328 entries, 0 to 110327
Data columns (total 28 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   date/time                 110328 non-null  object 
 1   settlement id             110328 non-null  float64
 2   type                      109763 non-null  object 
 3   order id                  109341 non-null  object 
 4   sku                       102882 non-null  object 
 5   description               110300 non-null  object 
 6   quantity                  103066 non-null  float64
 7   marketplace               106723 non-null  object 
 8   account type              110328 non-null  object 
 9   fulfillment               99391 non-null   object 
 10  order city                99385 non-null   object 
 11  order state               99267 non-null   object 
 12  order postal              99366 non-null   object 
 13  tax collection model      90722 non-null   o