In [1]:
print("""
@File         : ch03_creating_and_persisting_dataframes.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-07-21 17:32:17
@Email        : cuixuanstephen@gmail.com
@Description  : 创建和持久化 DataFrames
""")


@File         : ch03_creating_and_persisting_dataframes.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-07-21 17:32:17
@Email        : cuixuanstephen@gmail.com
@Description  : 创建和持久化 DataFrames



In [2]:
%cd ../

d:\Data-Analysis-and-Science\P1XC2E


In [3]:
import pandas as pd
import numpy as np

## 从头创建 DataFrames

In [4]:
fname = ["Paul", "John", "Richard", "George"]
lname = ["McCartney", "Lennon", "Starkey", "Harrison"]
birth = [1942, 1940, 1940, 1943]

In [5]:
people = {'first': fname, 'last': fname, 'birth': birth}

In [6]:
beatles = pd.DataFrame(people)
beatles

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


默认情况下，当我们调用构造函数时， pandas 将为我们的 DataFrame 创建一个RangeIndex：

In [7]:
beatles.index

RangeIndex(start=0, stop=4, step=1)

如果需要，我们可以为 DataFrame 指定另一个索引：

In [8]:
pd.DataFrame(people, index=['a', 'b', 'c', 'd'])

Unnamed: 0,first,last,birth
a,Paul,Paul,1942
b,John,John,1940
c,Richard,Richard,1940
d,George,George,1943


In [10]:
pd.DataFrame(
    [
        {
            "first": "Paul",
            "last": "McCartney",
            "birth": 1942,
            },       
        {
            "first": "John",
            "last": "Lennon",
            "birth": 1940,
            },       
        {
            "first": "Richard",
            "last": "Starkey",
            "birth": 1940,
            },
        {
            "first": "George",
            "last": "Harrison",
            "birth": 1943,
            }
    ]
)

Unnamed: 0,first,last,birth
0,Paul,McCartney,1942
1,John,Lennon,1940
2,Richard,Starkey,1940
3,George,Harrison,1943


请注意，当您使用字典行时，列是按键的字母顺序排序的。您可以使用 `columns` 参数来指定列顺序。
> 新版本中似乎没有这种说法

## 写入 CSV

DataFrame 上有几个以 to_ 开头的方法。这些是导出 DataFrame 的方法。我们将使用 `.to_csv` 方法。

In [11]:
beatles

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


In [12]:
from io import StringIO

In [13]:
fout = StringIO()
beatles.to_csv(fout)
# 示例中将
# 其写入字符串缓冲区，但您通常会使用文件名。

In [14]:
print(fout.getvalue())

,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943



In [15]:
_ = fout.seek(0)

In [16]:
pd.read_csv(fout)

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,Paul,1942
1,1,John,John,1940
2,2,Richard,Richard,1940
3,3,George,George,1943


In [17]:
_ = fout.seek(0)
pd.read_csv(fout, index_col=0)

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


In [19]:
fout = StringIO()
beatles.to_csv(fout, index=False)
print(fout.getvalue())

first,last,birth
Paul,Paul,1942
John,John,1940
Richard,Richard,1940
George,George,1943



## 读取大型 CSV 文件

pandas 库是一种内存工具。您需要能够将数据放入内存中才能使用 pandas。如果您遇到要处理的大型 CSV 文件，则有几个选项。如果您可以一次处理部分内容，则可以将其读入块并处理每个块。或者，如果您知道应该有足够的内存来加载文件，则有一些提示可以帮助缩减文件大小。

> 请注意，一般来说，您应拥有比要操作的 DataFrame 大小多三到十倍的内存。额外的内存应能为您提供足够的额外空间来执行许多常见操作。

**假设文件比实际大很多，或者我的机器的内存有限，以至于当 pandas 尝试使用 read_csv 加载它时，出现内存错误。**

In [20]:
diamonds = pd.read_csv('data/diamonds.csv', nrows=1000)

In [21]:
diamonds

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
995,0.54,Ideal,D,VVS2,61.4,52.0,2897,5.30,5.34,3.26
996,0.72,Ideal,E,SI1,62.5,55.0,2897,5.69,5.74,3.57
997,0.72,Good,F,VS1,59.4,61.0,2897,5.82,5.89,3.48
998,0.74,Premium,D,VS2,61.8,58.0,2897,5.81,5.77,3.58


In [22]:
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float64
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float64
 5   table    1000 non-null   float64
 6   price    1000 non-null   int64  
 7   x        1000 non-null   float64
 8   y        1000 non-null   float64
 9   z        1000 non-null   float64
dtypes: float64(6), int64(1), object(3)
memory usage: 78.2+ KB


使用 `read_csv` 的 `dtype` 参数告诉它使用正确的（或更小的）数字类型：

In [24]:
diamonds2 = pd.read_csv('data/diamonds.csv', nrows=1000,
                        dtype={
                            'carat': np.float32,
                            'depth': np.float32,
                            'table': np.float32,
                            'x': np.float32,
                            'y': np.float32,
                            'z': np.float32,
                            'price': np.int16,
                        })

In [25]:
diamonds2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    1000 non-null   float32
 1   cut      1000 non-null   object 
 2   color    1000 non-null   object 
 3   clarity  1000 non-null   object 
 4   depth    1000 non-null   float32
 5   table    1000 non-null   float32
 6   price    1000 non-null   int16  
 7   x        1000 non-null   float32
 8   y        1000 non-null   float32
 9   z        1000 non-null   float32
dtypes: float32(6), int16(1), object(3)
memory usage: 49.0+ KB


如果基数较低(分类变量)，则可以将它们转换为分类列以节省更多内存：

In [27]:
diamonds3 = pd.read_csv('data/diamonds.csv', nrows=1000,
                        dtype={
                            'carat': np.float32,
                            'depth': np.float32,
                            'table': np.float32,
                            'x': np.float32,
                            'y': np.float32,
                            'z': np.float32,
                            'price': np.int16,
                            'cut': 'category',
                            'color': 'category',
                            'clarity': 'category',
                        })

In [28]:
diamonds3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    1000 non-null   float32 
 1   cut      1000 non-null   category
 2   color    1000 non-null   category
 3   clarity  1000 non-null   category
 4   depth    1000 non-null   float32 
 5   table    1000 non-null   float32 
 6   price    1000 non-null   int16   
 7   x        1000 non-null   float32 
 8   y        1000 non-null   float32 
 9   z        1000 non-null   float32 
dtypes: category(3), float32(6), int16(1)
memory usage: 29.4 KB


如果您可以一次处理数据块并且不需要将全部数据都放在内存中，则可以使用 `chunksize` 参数：

In [29]:
cols = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price']
diamonds_iter = pd.read_csv('data/diamonds.csv', nrows=1000,
    dtype={'carat': np.float32, 'depth': np.float32,
           'table': np.float32, 'price': np.int16,
           'cut': 'category', 'color': 'category',
           'clarity': 'category'},
    usecols=cols,
    chunksize=200)

In [32]:
def process(df):
    return f'processed {df.size} items'

In [33]:
for chunk in diamonds_iter:
    process(chunk)

由于 CSV 文件不包含有关类型的信息，因此 pandas 会尝试推断列的类型。如果列的所有值都是整数且没有缺失值，则使用 int64 类型。如果列是数字但不是整数，或者有缺失值，则使用 float64。这些数据类型可能会存储您需要的更多信息。

从 pandas 0.24 开始，有一种新类型 “Int64” （注意是大写），它支持缺少数字的整数类型。如果要使用此类型，您需要使用 dtype 参数指定它，因为 pandas 会将缺少数字的整数转换为 float64。

如果列不是数字，pandas 会将其转换为对象列，并将值视为字符串。pandas 中的字符串值会占用大量内存，因为每个值都存储为 Python 字符串。如果我们将它们转换为分类值，pandas 将使用更少的内存，因为它只存储一次字符串，而不是为每一行创建新的字符串（即使它们重复）。

In [34]:
np.iinfo(np.int8)

iinfo(min=-128, max=127, dtype=int8)

In [37]:
np.finfo(np.float16)

finfo(resolution=0.001, min=-6.55040e+04, max=6.55040e+04, dtype=float16)

In [38]:
diamonds.price.memory_usage()

8128

In [39]:
diamonds.price.memory_usage(index=False)

8000

可以使用 `.memory_usage` 方法询问 DataFrame 或 Series 使用了多少字节。请注意，这还包括索引的内存要求。此外，您需要传递 `deep=True` 才能获取具有 object 类型的 Series 的使用情况：

In [41]:
diamonds.cut.memory_usage()

8128

In [40]:
diamonds.cut.memory_usage(deep=True)

63461

将数据保存为您喜欢的格式后，您可以将其保存为可跟踪类型的二进制格式，例如 Feather 格式（pandas 利用 pyarrow 库来实现此目的）。此格式旨在实现语言之间结构化数据的内存传输，并经过优化，以便可以按原样使用数据而无需进行内部转换。定义类型后，从此格式读取数据会更快、更轻松：

In [44]:
diamonds3.to_feather('data/d.arr')
diamonds5 = pd.read_feather('data/d.arr')

另一个二进制选项是 Parquet 格式。Feather 针对内存结构优化二进制数据，而 Parquet 针对磁盘格式进行优化。

In [45]:
diamonds3.to_parquet('data/d.pqt')

## 使用 Excel 文件

In [46]:
beatles.to_excel('data/beat.xls')

  beatles.to_excel('data/beat.xls')


In [47]:
beatles.to_excel('data/beat.xlsx')

In [50]:
beat2 = pd.read_excel('data/beat.xls')
beat2

Unnamed: 0.1,Unnamed: 0,first,last,birth
0,0,Paul,Paul,1942
1,1,John,John,1940
2,2,Richard,Richard,1940
3,3,George,George,1943


In [52]:
beat2 = pd.read_excel('data/beat.xls', index_col=0)
beat2

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


In [54]:
beat2.dtypes

first    object
last     object
birth     int64
dtype: object

可以传递 `sheet_name` 给 `.to_excel` 方法来告诉它要创建的工作表的名称：

In [57]:
xl_writer = pd.ExcelWriter('data/beat2.xlsx')

In [58]:
beatles.to_excel(xl_writer, sheet_name='All')
beatles[beatles.birth < 1941].to_excel(xl_writer, sheet_name='1940')
xl_writer.close()

## 使用 ZIP 文件

如果 CSV 文件是 ZIP 文件中的唯一文件，则只需调用 `read_csv` 函数即可

In [60]:
autos = pd.read_csv('data/vehicles.csv.zip')
autos

  autos = pd.read_csv('data/vehicles.csv.zip')


Unnamed: 0,barrels08,barrelsA08,charge120,charge240,city08,city08U,cityA08,cityA08U,cityCD,cityE,...,mfrCode,c240Dscr,charge240b,c240bDscr,createdOn,modifiedOn,startStop,phevCity,phevHwy,phevComb
0,15.695714,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
1,29.964545,0.0,0.0,0.0,9,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
2,12.207778,0.0,0.0,0.0,23,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
3,29.964545,0.0,0.0,0.0,10,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
4,17.347895,0.0,0.0,0.0,17,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39096,14.982273,0.0,0.0,0.0,19,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
39097,14.330870,0.0,0.0,0.0,20,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
39098,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0
39099,15.695714,0.0,0.0,0.0,18,0.0,0,0.0,0.0,0.0,...,,,0.0,,Tue Jan 01 00:00:00 EST 2013,Tue Jan 01 00:00:00 EST 2013,,0,0,0


In [61]:
autos.modifiedOn.dtype

dtype('O')

In [62]:
autos.modifiedOn

0        Tue Jan 01 00:00:00 EST 2013
1        Tue Jan 01 00:00:00 EST 2013
2        Tue Jan 01 00:00:00 EST 2013
3        Tue Jan 01 00:00:00 EST 2013
4        Tue Jan 01 00:00:00 EST 2013
                     ...             
39096    Tue Jan 01 00:00:00 EST 2013
39097    Tue Jan 01 00:00:00 EST 2013
39098    Tue Jan 01 00:00:00 EST 2013
39099    Tue Jan 01 00:00:00 EST 2013
39100    Tue Jan 01 00:00:00 EST 2013
Name: modifiedOn, Length: 39101, dtype: object

In [63]:
pd.to_datetime(autos.modifiedOn)



0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
39096   2013-01-01
39097   2013-01-01
39098   2013-01-01
39099   2013-01-01
39100   2013-01-01
Name: modifiedOn, Length: 39101, dtype: datetime64[ns]

In [64]:
autos = pd.read_csv('data/vehicles.csv.zip', parse_dates=['modifiedOn'])
autos.modifiedOn

  autos = pd.read_csv('data/vehicles.csv.zip', parse_dates=['modifiedOn'])


0       2013-01-01
1       2013-01-01
2       2013-01-01
3       2013-01-01
4       2013-01-01
           ...    
39096   2013-01-01
39097   2013-01-01
39098   2013-01-01
39099   2013-01-01
39100   2013-01-01
Name: modifiedOn, Length: 39101, dtype: datetime64[ns]

如果 ZIP 文件中有许多文件，则从中读取 CSV 文件会稍微复杂一些。read_csv函数无法指定 ZIP 文件内的文件。相反，我们将使用 Python 标准库中的 `zipfile` 模块。

In [65]:
import zipfile

In [68]:
with zipfile.ZipFile('data/kaggle-survey-2018.zip') as z:
    print('\n'.join(z.namelist()))
    kag = pd.read_csv(
        z.open('multipleChoiceResponses.csv')
    )
    kag_questions = kag.iloc[0]
    survey = kag.iloc[1:]

multipleChoiceResponses.csv
freeFormResponses.csv
SurveySchema.csv


  kag = pd.read_csv(


In [69]:
survey.head(2).T

Unnamed: 0,1,2
Time from Start to Finish (seconds),710,434
Q1,Female,Male
Q1_OTHER_TEXT,-1,-1
Q2,45-49,30-34
Q3,United States of America,Indonesia
...,...,...
Q50_Part_5,,
Q50_Part_6,,
Q50_Part_7,,
Q50_Part_8,,


## 使用数据库

In [81]:
import sqlite3
con = sqlite3.connect('data/beat.db')
with con:
    cur = con.cursor()
    cur.execute('DROP TABLE IF EXISTS Band;')
    cur.execute(
    """
    CREATE TABLE Band(id INTEGER PRIMARY KEY,
    fname TEXT, lname TEXT, birthyear INT)
    """
    )
    cur.execute(
    """INSERT INTO Band VALUES(
        0, 'Paul', 'McCartney', 1942
    )
    """
    )
    cur.execute("""INSERT INTO Band VALUES(
        1, 'John', 'Lennon', 1940)""")
    _ = con.commit()

In [84]:
import sqlalchemy as sa

engine = sa.create_engine(
    "sqlite:///data/beat.db", echo=True
)
sa_connection = engine.connect()
beat = pd.read_sql(
    'Band', sa_connection, index_col='id'
)
beat

2024-07-22 14:08:57,683 INFO sqlalchemy.engine.Engine PRAGMA main.table_info("Band")
2024-07-22 14:08:57,684 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-07-22 14:08:57,686 INFO sqlalchemy.engine.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2024-07-22 14:08:57,686 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-07-22 14:08:57,687 INFO sqlalchemy.engine.Engine PRAGMA main.table_xinfo("Band")
2024-07-22 14:08:57,688 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-07-22 14:08:57,691 INFO sqlalchemy.engine.Engine SELECT sql FROM  (SELECT * FROM sqlite_master UNION ALL   SELECT * FROM sqlite_temp_master) WHERE name = ? AND type = 'table'
2024-07-22 14:08:57,693 INFO sqlalchemy.engine.Engine [raw sql] ('Band',)
2024-07-22 14:08:57,694 INFO sqlalchemy.engine.Engine PRAGMA main.foreign_key_list("Band")
2024-07-22 14:08:57,695 INFO sqlalchemy.engine.Engine [raw sql] ()
2024-07-22 14:08:57,696 INFO sqlalchemy.engine.Engine PRAGMA temp.foreign_key_list("Band")
2024-07

Unnamed: 0_level_0,fname,lname,birthyear
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Paul,McCartney,1942
1,John,Lennon,1940


In [85]:
sql = "SELECT fname, birthyear from Band"
fnames = pd.read_sql(sql, con)
fnames

Unnamed: 0,fname,birthyear
0,Paul,1942
1,John,1940


## 读取 JSON

JavaScript 对象表示法(JSON)是一种用于通过互联网传输数据的常见格式。与名称相反，它不需要 JavaScript 来读取或创建。Python 标准库附带了 json 库，该库将对 JSON 进行编码和解码：

In [86]:
import json
encoded = json.dumps(people)
encoded

'{"first": ["Paul", "John", "Richard", "George"], "last": ["Paul", "John", "Richard", "George"], "birth": [1942, 1940, 1940, 1943]}'

In [87]:
json.loads(encoded)

{'first': ['Paul', 'John', 'Richard', 'George'],
 'last': ['Paul', 'John', 'Richard', 'George'],
 'birth': [1942, 1940, 1940, 1943]}

使用 read_json 函数读取数据。如果您的 JSON 是字典映射到列表的形式，则可以毫不费力地提取它。这种方向在 pandas 中称为列：

In [88]:
beatles = pd.read_json(encoded)
beatles

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


读取 JSON 时要注意的一点是，它需要采用特定的格式，以便 pandas 加载它。但是，pandas 支持面向几种样式的数据。

- columns – （默认）列名到列中值列表的映射。
- records – 行列表。每行都是将列映射到值的字典。
- values – 每行数据的列表（每行也是一个列表）。这不包括列或索引值。
- split – 列到列名、索引到索引值以及数据到每行数据列表的映射（每行也是一个表）。
- index – 索引值到行的映射。行是将列映射到值的字典。
- table – 模式到 DataFrame 模式的映射，以及数据到字典列表的映射。

In [89]:
records = beatles.to_json(orient='records')
records
# list[dict[str:str]]

'[{"first":"Paul","last":"Paul","birth":1942},{"first":"John","last":"John","birth":1940},{"first":"Richard","last":"Richard","birth":1940},{"first":"George","last":"George","birth":1943}]'

In [90]:
pd.read_json(records, orient='records')

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


In [91]:
split = beatles.to_json(orient='split')
split
# dict[str:list]

'{"columns":["first","last","birth"],"index":[0,1,2,3],"data":[["Paul","Paul",1942],["John","John",1940],["Richard","Richard",1940],["George","George",1943]]}'

In [92]:
pd.read_json(split, orient='split')

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


In [100]:
index = beatles.to_json(orient='index')
index
# dict[str:dict]

'{"0":{"first":"Paul","last":"Paul","birth":1942},"1":{"first":"John","last":"John","birth":1940},"2":{"first":"Richard","last":"Richard","birth":1940},"3":{"first":"George","last":"George","birth":1943}}'

In [94]:
pd.read_json(index, orient='index')

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


In [95]:
values = beatles.to_json(orient='values')
values
# list[list]

'[["Paul","Paul",1942],["John","John",1940],["Richard","Richard",1940],["George","George",1943]]'

In [96]:
pd.read_json(values, orient='values')

Unnamed: 0,0,1,2
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


In [97]:
table = beatles.to_json(orient='table')
table

'{"schema":{"fields":[{"name":"index","type":"integer"},{"name":"first","type":"string"},{"name":"last","type":"string"},{"name":"birth","type":"integer"}],"primaryKey":["index"],"pandas_version":"1.4.0"},"data":[{"index":0,"first":"Paul","last":"Paul","birth":1942},{"index":1,"first":"John","last":"John","birth":1940},{"index":2,"first":"Richard","last":"Richard","birth":1940},{"index":3,"first":"George","last":"George","birth":1943}]}'

In [98]:
pd.read_json(table, orient='table')

Unnamed: 0,first,last,birth
0,Paul,Paul,1942
1,John,John,1940
2,Richard,Richard,1940
3,George,George,1943


如果您正在开发 Web 服务并需要向 JSON 添加其他数据，只需使用 `.to_dict` 方法生成字典。您可以将新数据添加到字典中，然后将该字典转换为 JSON：

In [101]:
output = beat.to_dict()
output

{'fname': {0: 'Paul', 1: 'John'},
 'lname': {0: 'McCartney', 1: 'Lennon'},
 'birthyear': {0: 1942, 1: 1940}}

In [102]:
output['version'] = '0.4.1'
json.dumps(output)

'{"fname": {"0": "Paul", "1": "John"}, "lname": {"0": "McCartney", "1": "Lennon"}, "birthyear": {"0": 1942, "1": 1940}, "version": "0.4.1"}'

## 读取 HTML 表格

以使用 pandas 读取网站上的 HTML 表格。这样可以轻松提取维基百科或其他网站上的表格。

In [104]:
url = 'https://en.wikipedia.org/wiki/The_Beatles_discography'
dfs = pd.read_html(url)
len(dfs)

60

In [105]:
dfs[0]

Unnamed: 0,The Beatles discography,The Beatles discography.1
0,The Beatles in 1965,The Beatles in 1965
1,Studio albums,"12 (UK), 17 (US)"
2,Live albums,5
3,Compilation albums,51
4,Video albums,22
5,Music videos,53
6,EPs,36
7,Singles,63
8,Mash-ups,2
9,Box sets,17


我们可以循环遍历 `read_html` 创建的每个表，或者我们可以给它一个提示来查找特定的表。该函数具有 `match` 参数，可以是字符串或正则表达式。它还有一个 `attrs` 参数，允许您传入 HTML 标签属性键和值（在字典中）并使用它来识别表。

In [106]:
dfs = pd.read_html(url, match='List of studio albums', na_values='-')

In [107]:
len(dfs)

2

In [111]:
dfs[0].columns

MultiIndex([(               'Title',            'Title'),
            (    'Album details[A]', 'Album details[A]'),
            ('Peak chart positions',        'UK [8][9]'),
            ('Peak chart positions',         'AUS [10]'),
            ('Peak chart positions',         'CAN [11]'),
            ('Peak chart positions',         'FRA [12]'),
            ('Peak chart positions',         'GER [13]'),
            ('Peak chart positions',         'NOR [14]'),
            ('Peak chart positions',      'US [15][16]'),
            (      'Certifications',   'Certifications'),
            (               'Sales',            'Sales')],
           )

In [113]:
dfs[0]

Unnamed: 0_level_0,Title,Album details[A],Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Certifications,Sales
Unnamed: 0_level_1,Title,Album details[A],UK [8][9],AUS [10],CAN [11],FRA [12],GER [13],NOR [14],US [15][16],Certifications,Sales
0,Please Please Me,Released: 22 March 1963 Label: Parlophone,1,—,—,5,5,—,155,BPI: Platinum[17] ARIA: Gold[18] MC: Gold[19] ...,
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,1,—,—,5,1,—,179,BPI: Gold[17] ARIA: Gold[18] BVMI: Gold[21] MC...,
2,A Hard Day's Night,Released: 10 July 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Platinum[17] ARIA: Gold[18],
3,Beatles for Sale,Released: 4 December 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Gold[17] ARIA: Gold[18] MC: Gold[19] RIAA...,"UK: 750,000[22]"
4,Help!,Released: 6 August 1965 Label: Parlophone,1,1,—,5,1,—,—,BPI: Platinum[17] ARIA: Gold[18],
5,Rubber Soul,Released: 3 December 1965 Label: Parlophone,1,1,—,5,1,—,—,BPI: 2× Platinum[17] ARIA: Platinum[18] BVMI: ...,
6,Revolver,Released: 5 August 1966 Label: Parlophone,1,1,—,5,1,14,—,BPI: 2× Platinum[17] ARIA: Platinum[18],
7,Sgt. Pepper's Lonely Hearts Club Band,Released: 26 May 1967[23] Label: Parlophone,1,1,1,4,1,1,1,BPI: 18× Platinum[17] ARIA: 4× Platinum[18] BV...,"UK: 5,340,000[25]"
8,"The Beatles (""The White Album"")",Released: 22 November 1968 Label: Apple,1,1,1,1,1,1,1,BPI: 2× Platinum[17] ARIA: 2× Platinum[18] MC:...,
9,Yellow Submarine[C],Released: 17 January 1969 Label: Apple,3,4,1,4,5,1,2,BPI: Gold[17] MC: Gold[19] RIAA: Platinum[20],


In [114]:
dfs = pd.read_html(
    url,
    match='List of studio albums',
    na_values='-',
    header=[0, 1],
)

In [115]:
len(dfs)

2

In [116]:
dfs[0]

Unnamed: 0_level_0,Title,Album details[A],Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Peak chart positions,Certifications,Sales
Unnamed: 0_level_1,Title,Album details[A],UK [8][9],AUS [10],CAN [11],FRA [12],GER [13],NOR [14],US [15][16],Certifications,Sales
0,Please Please Me,Released: 22 March 1963 Label: Parlophone,1,—,—,5,5,—,155,BPI: Platinum[17] ARIA: Gold[18] MC: Gold[19] ...,
1,With the Beatles[B],Released: 22 November 1963 Label: Parlophone (...,1,—,—,5,1,—,179,BPI: Gold[17] ARIA: Gold[18] BVMI: Gold[21] MC...,
2,A Hard Day's Night,Released: 10 July 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Platinum[17] ARIA: Gold[18],
3,Beatles for Sale,Released: 4 December 1964 Label: Parlophone,1,1,—,—,1,—,—,BPI: Gold[17] ARIA: Gold[18] MC: Gold[19] RIAA...,"UK: 750,000[22]"
4,Help!,Released: 6 August 1965 Label: Parlophone,1,1,—,5,1,—,—,BPI: Platinum[17] ARIA: Gold[18],
5,Rubber Soul,Released: 3 December 1965 Label: Parlophone,1,1,—,5,1,—,—,BPI: 2× Platinum[17] ARIA: Platinum[18] BVMI: ...,
6,Revolver,Released: 5 August 1966 Label: Parlophone,1,1,—,5,1,14,—,BPI: 2× Platinum[17] ARIA: Platinum[18],
7,Sgt. Pepper's Lonely Hearts Club Band,Released: 26 May 1967[23] Label: Parlophone,1,1,1,4,1,1,1,BPI: 18× Platinum[17] ARIA: 4× Platinum[18] BV...,"UK: 5,340,000[25]"
8,"The Beatles (""The White Album"")",Released: 22 November 1968 Label: Apple,1,1,1,1,1,1,1,BPI: 2× Platinum[17] ARIA: 2× Platinum[18] MC:...,
9,Yellow Submarine[C],Released: 17 January 1969 Label: Apple,3,4,1,4,5,1,2,BPI: Gold[17] MC: Gold[19] RIAA: Platinum[20],


In [117]:
dfs[0].columns

MultiIndex([(               'Title',            'Title'),
            (    'Album details[A]', 'Album details[A]'),
            ('Peak chart positions',        'UK [8][9]'),
            ('Peak chart positions',         'AUS [10]'),
            ('Peak chart positions',         'CAN [11]'),
            ('Peak chart positions',         'FRA [12]'),
            ('Peak chart positions',         'GER [13]'),
            ('Peak chart positions',         'NOR [14]'),
            ('Peak chart positions',      'US [15][16]'),
            (      'Certifications',   'Certifications'),
            (               'Sales',            'Sales')],
           )

In [118]:
df = dfs[0]

In [122]:
df.columns = [
    "Title",
    "Release",
    "UK",
    "AUS",
    "CAN",
    "FRA",
    "GER",
    "NOR",
    "US",
    "Certifications",
    "Sales",
]

> 网站内容变化太快，应该要保存处理后的数据