In [49]:
# Print the notebook's metadata banner (file name, author, timestamps, contact) as plain text.
print("""
@File         : 11_combining_pandas_objects.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-01-31 22:07:13
@Email        : cuixuanstephen@gmail.com
@Description  : 
""")


@File         : 11_combining_pandas_objects.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-01-31 22:07:13
@Email        : cuixuanstephen@gmail.com
@Description  : 



In [50]:
import pandas as pd
import numpy as np
import warnings
# Silence every FutureWarning for the rest of the session.
# NOTE(review): presumably added to hide the deprecation notice emitted by
# DataFrame.append in the cells below — confirm; it also hides any other FutureWarning.
warnings.filterwarnings(action='ignore', category=FutureWarning)

## Appending new rows to DataFrames

In [51]:
# Load the small sample table; the path is relative to the notebook's working directory.
names = pd.read_csv('../data/names.csv')
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2


In [52]:
# Add one row by assigning a list to a label that is not yet in the index.
# The list is positional: its values land in the Name and Age columns in that order (see output).
new_data_list = ['Aria', 1]
names.loc[4] = new_data_list  # modifies `names` in place
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [53]:
# Row labels need not be integers: a string label is added next to the integer ones (see output).
names.loc['five'] = ['Zach', 3]
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3


In [54]:
# Assign a dict so values are given by column name rather than by position.
# len(names) is used as the new row label; it evaluates to 6 here (see output).
names.loc[len(names)] = {'Name': 'Zayd', "Age": 2}
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2


In [55]:
# A Series built from a dict whose keys are in a different order (Age, Name) from the frame's columns (Name, Age).
pd.Series({'Age': 32, 'Name': 'Dean'})

Age       32
Name    Dean
dtype: object

In [56]:
# Assign the Series as a new row: although its keys are ordered Age, Name,
# the output shows each value in the matching column.
names.loc[len(names)] = pd.Series({'Age': 32, 'Name': 'Dean'})
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2
7,Dean,32


We will look at the `.append` method, which does not modify the calling DataFrame.

In [57]:
# Re-read the CSV so `names` no longer contains the rows added with .loc above.
names = pd.read_csv('../data/names.csv')

The first argument to `.append` must be another DataFrame, a Series, a dictionary, or a list of these.

In [58]:
# Show that a bare dict is rejected when ignore_index=True is not passed:
# the call raises TypeError, and its message is printed instead of a traceback.
# NOTE(review): DataFrame.append was removed in pandas 2.0; there this call would
# presumably raise AttributeError, which this handler does not catch — confirm the target version.
try:
    names.append({'Name': "Aria", "Age": 1})
except TypeError as e:
    print(e)

Can only append a dict if ignore_index=True


In [59]:
# With ignore_index=True the dict is accepted. The result is only displayed, not assigned back to `names`.
names.append({'Name': "Aria", "Age": 1}, ignore_index=True)

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [60]:
# Replace the default integer index with string labels (duplicates included)
# to see how appending treats a meaningful index.
names.index = ['Canada', 'Canada', 'USA', 'USA']
names

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2


In [61]:
# Same call as before: ignore_index=True renumbers the result 0..4, so the country labels are lost (see output).
names.append({'Name': "Aria", "Age": 1}, ignore_index=True)

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


缺陷在于 index 信息全部丢失

In [63]:
# Build the new row as a Series with a name; len(names) evaluates to 4 here (see output).
s = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s

Name    Zach
Age        3
Name: 4, dtype: object

In [65]:
# Appending a named Series needs no ignore_index: the existing labels are kept
# and the new row is labelled with the Series name (see output).
names.append(s)

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3


The `.append` method is more flexible than the `.loc` attribute. It supports appending multiple rows at the same time. One way to accomplish this is by passing in a list of Series:

In [67]:
# Append several rows in one call by passing a list of named Series;
# each Series name becomes that row's label (4 and 'USA' in the output).
s1 = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s2 = pd.Series({'Name': 'Zayd', 'Age': 2}, name='USA')
names.append([s1, s2])

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3
USA,Zayd,2


In [68]:
# Load the baseball table (path relative to the notebook) and preview its first five rows.
bball_16 = pd.read_csv('../data/baseball16.csv')
bball_16.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,altuvjo01,2016,1,HOU,AL,161,640,108,216,42,...,96.0,30.0,10.0,60,70.0,11.0,7.0,3.0,7.0,15.0
1,bregmal01,2016,1,HOU,AL,49,201,31,53,13,...,34.0,2.0,0.0,15,52.0,0.0,0.0,0.0,1.0,1.0
2,castrja01,2016,1,HOU,AL,113,329,41,69,16,...,32.0,2.0,1.0,45,123.0,0.0,1.0,1.0,0.0,9.0
3,correca01,2016,1,HOU,AL,153,577,76,158,36,...,96.0,13.0,3.0,75,139.0,5.0,5.0,0.0,3.0,12.0
4,gattiev01,2016,1,HOU,AL,128,447,58,112,19,...,72.0,2.0,1.0,43,127.0,6.0,4.0,0.0,5.0,12.0


In [70]:
# Turn the first row into a {column name: value} dict to serve as a reference record.
data_dict = bball_16.iloc[0].to_dict()
data_dict

{'playerID': 'altuvjo01',
 'yearID': 2016,
 'stint': 1,
 'teamID': 'HOU',
 'lgID': 'AL',
 'G': 161,
 'AB': 640,
 'R': 108,
 'H': 216,
 '2B': 42,
 '3B': 5,
 'HR': 24,
 'RBI': 96.0,
 'SB': 30.0,
 'CS': 10.0,
 'BB': 60,
 'SO': 70.0,
 'IBB': 11.0,
 'HBP': 7.0,
 'SH': 3.0,
 'SF': 7.0,
 'GIDP': 15.0}

In [71]:
# Blank template row: same keys as data_dict, holding '' where the reference
# value is a string and NaN for every other value.
new_data_dict = {
    column: ('' if isinstance(sample, str) else np.nan)
    for column, sample in data_dict.items()
}
new_data_dict

{'playerID': '',
 'yearID': nan,
 'stint': nan,
 'teamID': '',
 'lgID': '',
 'G': nan,
 'AB': nan,
 'R': nan,
 'H': nan,
 '2B': nan,
 '3B': nan,
 'HR': nan,
 'RBI': nan,
 'SB': nan,
 'CS': nan,
 'BB': nan,
 'SO': nan,
 'IBB': nan,
 'HBP': nan,
 'SH': nan,
 'SF': nan,
 'GIDP': nan}

手动添加行的话，用这个字典作为模板，防止键入名称时出错。这优雅吗？

### There's more…

In [73]:
# Build 1,000 synthetic rows shaped like the reference record: columns whose
# reference value is a string get a random letter from 'abcd', every other
# column gets a random integer in [0, 10).
# A locally seeded generator makes the rows identical on every Restart & Run All;
# the unseeded global np.random state gave different data on each run.
rng = np.random.default_rng(42)
letters = list('abcd')  # hoisted: no need to rebuild the list for every cell
random_data = []
for i in range(1000):
    d = dict()
    for k, v in data_dict.items():
        if isinstance(v, str):
            d[k] = rng.choice(letters)
        else:
            d[k] = rng.integers(10)
    # Name each Series so its row label continues from len(bball_16).
    random_data.append(pd.Series(d, name=i + len(bball_16)))

In [74]:
# Inspect the first generated row.
random_data[0]

playerID    b
yearID      7
stint       4
teamID      a
lgID        c
G           1
AB          6
R           5
H           0
2B          3
3B          5
HR          4
RBI         5
SB          6
CS          1
BB          2
SO          4
IBB         8
HBP         5
SH          4
SF          3
GIDP        0
Name: 16, dtype: object

In [75]:
%%timeit
bball_16_copy = bball_16.copy()
for row in random_data:
    bball_16_copy = bball_16_copy.append(row)

1.9 s ± 90.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [77]:
%%timeit
bball_16_copy = bball_16.copy()
bball_16_copy = bball_16_copy.append(random_data)

41.9 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


<div class='alert alert-warning'>
Appending a single row to a DataFrame is a fairly expensive operation and if you find yourself writing a loop to append single rows of data to a DataFrame, then you are doing it wrong. 
</div>

如果您传入 Series 对象列表，时间将减少到十分之一秒以下。 在内部，pandas 将 Series 列表转换为单个 DataFrame，然后附加数据。

## Concatenating multiple DataFrames together

The `concat` function enables concatenating two or more DataFrames (or Series) together, both vertically and horizontally. 像往常一样，当同时处理多个 pandas 对象时，concatenation 不会随意发生，而是按索引对齐每个对象。

**这个方法更重要：`DataFrame.append` 已在 pandas 1.4 中被弃用，并在 pandas 2.0 中被移除，应改用 `pd.concat`。**

In [78]:
# Load both yearly stock tables, using the Symbol column as the row index.
stocks_2016 = pd.read_csv('../data/stocks_2016.csv', index_col='Symbol')
stocks_2017 = pd.read_csv('../data/stocks_2017.csv', index_col='Symbol')

In [82]:
# Display the 2016 table.
stocks_2016

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70


In [81]:
# Display the 2017 table.
stocks_2017

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [83]:
# Default concat: stack the frames vertically in list order. Index labels are kept
# as-is, so symbols present in both years (AAPL, TSLA) appear twice in the output.
s_list = [stocks_2016, stocks_2017]
pd.concat(s_list)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


By default, the `concat` function concatenates DataFrames vertically, one on top of the other. The `concat` function allows each piece of the resulting DataFrame to be labeled with the `keys` parameter. This label will appear in the outermost index level of the concatenated frame and force the creation of a `MultiIndex`. Also, the `names` parameter can rename each index level for clarity.

In [85]:
# keys= tags the rows of each input frame and forms the outer level of a MultiIndex;
# names= titles the two index levels (Year, Symbol).
pd.concat(s_list, keys=['2016', '2017'], names=['Year', 'Symbol'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Year,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300


It is also possible to concatenate horizontally by changing the `axis` parameter to `columns` or 1.

In [92]:
# Concatenate side by side: rows are aligned on Symbol, keys= forms the outer column
# level (named 'Year'), and symbols missing from one year show up as NaN (see output).
pd.concat(s_list, keys=['2016', '2017'], axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
WMT,40.0,55.0,70.0,,,
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TXN,,,,500.0,15.0,23.0


The `concat` function, by default, uses an *outer join*, keeping all rows from each DataFrame in the list. However, it gives us an option to keep only
rows that have the same index values in both DataFrames. This is referred to as an *inner join*.

In [93]:
# join='inner' keeps only the symbols present in both frames (AAPL and TSLA in the output).
pd.concat(s_list, join='inner', keys=['2016', '2017'], axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300
