In [1]:
print("""
@File         : 11_combining_pandas_objects.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-01-31 22:07:13
@Email        : cuixuanstephen@gmail.com
@Description  : 
""")


@File         : 11_combining_pandas_objects.ipynb
@Author(s)    : Stephen CUI
@LastEditor(s): Stephen CUI
@CreatedTime  : 2024-01-31 22:07:13
@Email        : cuixuanstephen@gmail.com
@Description  : 



In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings(action='ignore', category=FutureWarning)

## Appending new rows to DataFrames

In [3]:
names = pd.read_csv('../data/names.csv')
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2


In [4]:
# Assigning to a row label that does not exist yet makes `.loc` add a new row in place.
new_data_list = ['Aria', 1]
names.loc[4] = new_data_list
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [5]:
# The new row label does not have to be an integer: a string label is accepted as well.
names.loc['five'] = ['Zach', 3]
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3


In [6]:
# A dict is matched to the columns by key. `names` has 6 rows at this point,
# so `len(names)` yields the new row label 6.
names.loc[len(names)] = {'Name': 'Zayd', "Age": 2}
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2


In [7]:
pd.Series({'Age': 32, 'Name': 'Dean'})

Age       32
Name    Dean
dtype: object

In [8]:
# A Series is aligned on its index labels, so the key order ('Age' before 'Name')
# does not have to match the column order of the DataFrame.
names.loc[len(names)] = pd.Series({'Age': 32, 'Name': 'Dean'})
names

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1
five,Zach,3
6,Zayd,2
7,Dean,32


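Next, we look at the `.append` method, which returns a new DataFrame and does not modify the calling DataFrame. Note that `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so the `.append` cells in this notebook only run on pandas < 2.0; use `pd.concat` in new code.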

In [9]:
names = pd.read_csv('../data/names.csv')

The first argument to `.append` must be either another DataFrame, Series, dictionary, or a list of these

In [10]:
# Appending a bare dict without `ignore_index=True` raises a TypeError;
# catch it so the cell shows the error message instead of a traceback.
try:
    names.append({'Name': "Aria", "Age": 1})
except TypeError as e:
    print(e)

Can only append a dict if ignore_index=True


In [11]:
names.append({'Name': "Aria", "Age": 1}, ignore_index=True)

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


In [12]:
names.index = ['Canada', 'Canada', 'USA', 'USA']
names

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2


In [13]:
names.append({'Name': "Aria", "Age": 1}, ignore_index=True)

Unnamed: 0,Name,Age
0,Cornelia,70
1,Abbas,69
2,Penelope,4
3,Niko,2
4,Aria,1


缺陷在于 index 信息全部丢失

In [14]:
# The Series `name` is used as the row label when the Series is appended;
# `names` has 4 rows here, so the label is 4.
s = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s

Name    Zach
Age        3
Name: 4, dtype: object

In [15]:
names.append(s)

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3


The `.append` method is more flexible than the `.loc` attribute. It supports appending multiple rows at the same time. One way to accomplish this is by passing in a list of Series:

In [16]:
# Each Series carries its own row label in `name`: a new integer label for s1
# and the already existing label 'USA' for s2.
s1 = pd.Series({'Name': 'Zach', 'Age': 3}, name=len(names))
s2 = pd.Series({'Name': 'Zayd', 'Age': 2}, name='USA')
# Passing a list appends both rows in a single call.
names.append([s1, s2])

Unnamed: 0,Name,Age
Canada,Cornelia,70
Canada,Abbas,69
USA,Penelope,4
USA,Niko,2
4,Zach,3
USA,Zayd,2


In [17]:
bball_16 = pd.read_csv('../data/baseball16.csv')
bball_16.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,altuvjo01,2016,1,HOU,AL,161,640,108,216,42,...,96.0,30.0,10.0,60,70.0,11.0,7.0,3.0,7.0,15.0
1,bregmal01,2016,1,HOU,AL,49,201,31,53,13,...,34.0,2.0,0.0,15,52.0,0.0,0.0,0.0,1.0,1.0
2,castrja01,2016,1,HOU,AL,113,329,41,69,16,...,32.0,2.0,1.0,45,123.0,0.0,1.0,1.0,0.0,9.0
3,correca01,2016,1,HOU,AL,153,577,76,158,36,...,96.0,13.0,3.0,75,139.0,5.0,5.0,0.0,3.0,12.0
4,gattiev01,2016,1,HOU,AL,128,447,58,112,19,...,72.0,2.0,1.0,43,127.0,6.0,4.0,0.0,5.0,12.0


In [18]:
data_dict = bball_16.iloc[0].to_dict()
data_dict

{'playerID': 'altuvjo01',
 'yearID': 2016,
 'stint': 1,
 'teamID': 'HOU',
 'lgID': 'AL',
 'G': 161,
 'AB': 640,
 'R': 108,
 'H': 216,
 '2B': 42,
 '3B': 5,
 'HR': 24,
 'RBI': 96.0,
 'SB': 30.0,
 'CS': 10.0,
 'BB': 60,
 'SO': 70.0,
 'IBB': 11.0,
 'HBP': 7.0,
 'SH': 3.0,
 'SF': 7.0,
 'GIDP': 15.0}

In [19]:
# Blank template row: same keys as `data_dict`, with an empty string wherever the
# sample value is a string and NaN for every other value.
new_data_dict = {
    column: ('' if isinstance(sample_value, str) else np.nan)
    for column, sample_value in data_dict.items()
}
new_data_dict

{'playerID': '',
 'yearID': nan,
 'stint': nan,
 'teamID': '',
 'lgID': '',
 'G': nan,
 'AB': nan,
 'R': nan,
 'H': nan,
 '2B': nan,
 '3B': nan,
 'HR': nan,
 'RBI': nan,
 'SB': nan,
 'CS': nan,
 'BB': nan,
 'SO': nan,
 'IBB': nan,
 'HBP': nan,
 'SH': nan,
 'SF': nan,
 'GIDP': nan}

手动添加行的话，用这个字典作为模板，防止键入名称时出错。这优雅吗？

### There's more…

In [20]:
# Build 1,000 synthetic rows with the same keys as `data_dict`: a random letter
# wherever the sample value is a string, a random integer in [0, 10) otherwise.
# A seeded generator is used so the rows are identical on every
# Restart & Run All (the unseeded global RNG produced different data each run).
rng = np.random.default_rng(42)
letters = list('abcd')
first_new_label = len(bball_16)  # row labels continue after the existing rows

random_data = []
for i in range(1000):
    d = {
        # int(...) keeps plain Python ints, as the legacy np.random.randint call returned
        k: rng.choice(letters) if isinstance(v, str) else int(rng.integers(10))
        for k, v in data_dict.items()
    }
    random_data.append(pd.Series(d, name=i + first_new_label))

In [21]:
random_data[0]

playerID    d
yearID      5
stint       3
teamID      a
lgID        d
G           9
AB          6
R           9
H           9
2B          5
3B          5
HR          6
RBI         0
SB          6
CS          9
BB          7
SO          5
IBB         5
HBP         5
SH          9
SF          0
GIDP        9
Name: 16, dtype: object

In [22]:
%%timeit
bball_16_copy = bball_16.copy()
for row in random_data:
    bball_16_copy = bball_16_copy.append(row)

3.42 s ± 821 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit
bball_16_copy = bball_16.copy()
bball_16_copy = bball_16_copy.append(random_data)

54.4 ms ± 4.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


<div class='alert alert-warning'>
Appending a single row to a DataFrame is a fairly expensive operation and if you find yourself writing a loop to append single rows of data to a DataFrame, then you are doing it wrong. 
</div>

如果您传入 Series 对象列表，时间将减少到十分之一秒以下。 在内部，pandas 将 Series 列表转换为单个 DataFrame，然后附加数据。

## Concatenating multiple DataFrames together

The `concat` function enables concatenating two or more DataFrames (or Series) together, both vertically and horizontally. 像往常一样，当同时处理多个 pandas 对象时，concatenation 不会随意发生，而是按索引对齐每个对象。

**这个方法更重要，因为未来版本 append 被弃用了**

In [24]:
stocks_2016 = pd.read_csv('../data/stocks_2016.csv', index_col='Symbol')
stocks_2017 = pd.read_csv('../data/stocks_2017.csv', index_col='Symbol')

In [25]:
stocks_2016

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70


In [26]:
stocks_2017

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [27]:
s_list = [stocks_2016, stocks_2017]
pd.concat(s_list)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


By default, the `concat` function concatenates DataFrames vertically, one on top of the other. The `concat` function allows each piece of the resulting DataFrame to be labeled with the `keys` parameter. This label will appear in the outermost index level of the concatenated frame and force the creation of a `MultiIndex`. Also, the `names` parameter lets you name each index level for clarity.

In [28]:
pd.concat(s_list, keys=['2016', '2017'], names=['Year', 'Symbol'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Year,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300


It is also possible to concatenate horizontally by changing the `axis` parameter to `columns` or 1.

In [29]:
pd.concat(s_list, keys=['2016', '2017'], axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
WMT,40.0,55.0,70.0,,,
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TXN,,,,500.0,15.0,23.0


The `concat` function, by default, uses an *outer join*, keeping all rows from each DataFrame in the list. However, it gives us an option to keep only
rows that have the same index values in both DataFrames. This is referred to as an *inner join*.

In [30]:
pd.concat(s_list, join='inner', keys=['2016', '2017'], axis='columns', names=['Year', None])

Year,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300


### There's more…

The `.append` method is a heavily watered-down version of concat that can only append new rows to a DataFrame. Internally, `.append` just calls the concat function. 

In [31]:
stocks_2016.append(stocks_2017)

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


## Understanding the differences between concat, join, and merge

- concat
  - A pandas function
  - Combines two or more pandas objects vertically or horizontally
  - Aligns only on the index
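  - Errors when it has to align on an index that contains duplicates (for example, when concatenating horizontally); stacking vertically with duplicate index labels works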
  - Defaults to outer join with the option for inner join
- .join
  - A DataFrame method
  - Combines two or more pandas objects horizontally
  - Aligns the calling DataFrame's column(s) or index with the other object's index (and not the columns)
  - Handles duplicate values on the joining columns/index by performing a Cartesian product
  - Defaults to left join with options for inner, outer, and right
- .merge
  - A DataFrame method
  - Combines exactly two DataFrames horizontally
  - Aligns the calling DataFrame's column(s) or index with the other DataFrame's column(s) or index
  - Handles duplicate values on the joining columns or index by performing a cartesian product
  - Defaults to inner join with options for left, outer, and right

In [34]:
# Read one stocks table per year, indexed by ticker symbol; the list order follows `years`.
years = (2016, 2017, 2018)
stock_tables = [
    pd.read_csv(f'../data/stocks_{year}.csv', index_col='Symbol')
    for year in years
]
(stocks_2016, stocks_2017, stocks_2018) = stock_tables

In [35]:
stocks_2016

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,80,95,110
TSLA,50,80,130
WMT,40,55,70


In [36]:
stocks_2017

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,50,120,140
GE,100,30,40
IBM,87,75,95
SLB,20,55,85
TXN,500,15,23
TSLA,100,100,300


In [37]:
stocks_2018

Unnamed: 0_level_0,Shares,Low,High
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,40,135,170
AMZN,8,900,1125
TSLA,50,220,400


In [38]:
pd.concat(stock_tables, keys=[2016, 2017, 2018])

Unnamed: 0_level_0,Unnamed: 1_level_0,Shares,Low,High
Unnamed: 0_level_1,Symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016,AAPL,80,95,110
2016,TSLA,50,80,130
2016,WMT,40,55,70
2017,AAPL,50,120,140
2017,GE,100,30,40
2017,IBM,87,75,95
2017,SLB,20,55,85
2017,TXN,500,15,23
2017,TSLA,100,100,300
2018,AAPL,40,135,170
2018,AMZN,8,900,1125
2018,TSLA,50,220,400


In [42]:
pd.concat(stock_tables, axis='columns', keys=years)

Unnamed: 0_level_0,2016,2016,2016,2017,2017,2017,2018,2018,2018
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


In [43]:
pd.concat(dict(zip(years, stock_tables)), axis='columns')

Unnamed: 0_level_0,2016,2016,2016,2017,2017,2017,2018,2018,2018
Unnamed: 0_level_1,Shares,Low,High,Shares,Low,High,Shares,Low,High
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


We can use the `.join` and `.merge` methods to replicate this functionality of `concat`. Here, we use the `.join` method to combine the `stocks_2016` and `stocks_2017` DataFrames. By default, the DataFrames align on their index. If any of the columns have the same names, then you must supply a value to the `lsuffix` or `rsuffix` parameters to distinguish them in the result:

In [44]:
stocks_2016.join(stocks_2017, lsuffix='_2016', rsuffix='_2017', how='outer')

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0
GE,,,,100.0,30.0,40.0
IBM,,,,87.0,75.0,95.0
SLB,,,,20.0,55.0,85.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0
TXN,,,,500.0,15.0,23.0
WMT,40.0,55.0,70.0,,,


In [47]:
# Suffix the columns of every frame with its year first, so that the joined
# result has unique column names.
other = [stocks_2017.add_suffix('_2017'), stocks_2018.add_suffix('_2018')]
# `.join` accepts a list of DataFrames and aligns all of them on the index.
stocks_2016.add_suffix('_2016').join(other, how='outer')

Unnamed: 0_level_0,Shares_2016,Low_2016,High_2016,Shares_2017,Low_2017,High_2017,Shares_2018,Low_2018,High_2018
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAPL,80.0,95.0,110.0,50.0,120.0,140.0,40.0,135.0,170.0
TSLA,50.0,80.0,130.0,100.0,100.0,300.0,50.0,220.0,400.0
WMT,40.0,55.0,70.0,,,,,,
GE,,,,100.0,30.0,40.0,,,
IBM,,,,87.0,75.0,95.0,,,
SLB,,,,20.0,55.0,85.0,,,
TXN,,,,500.0,15.0,23.0,,,
AMZN,,,,,,,8.0,900.0,1125.0


In [59]:
# Reproduce the `.join` result with `concat`: concatenate horizontally under
# (year, field) column labels, then flatten each label to the 'field_year' string.
stock_join = stocks_2016.add_suffix('_2016').join(other, how='outer')
stock_concat = (
    pd.concat(dict(zip(years, stock_tables)), axis='columns')
    .pipe(lambda df_: df_.set_axis(
        [f'{field}_{year}' for year, field in df_.columns], axis=1))
)
assert stock_join.equals(stock_concat)

Now, let's turn to the `.merge` method that, unlike `concat` and `.join`, can only combine two DataFrames together. By default, `.merge` attempts to align the values in the columns that have the same name for each of the DataFrames. However, you can choose to have it align on the index by setting the Boolean parameters `left_index` and `right_index` to True.

默认得每个字段中的值都相同才行

In [63]:
stocks_2016.merge(stocks_2017)

Unnamed: 0,Shares,Low,High


In [64]:
stocks_2016.merge(stocks_2017, left_index=True, right_index=True)

Unnamed: 0_level_0,Shares_x,Low_x,High_x,Shares_y,Low_y,High_y
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AAPL,80,95,110,50,120,140
TSLA,50,80,130,100,100,300


By default, `.merge` uses an inner join and automatically supplies suffixes for identically named columns. Note that in pandas 1.0, the `merge` index will be sorted and the `concat` version won't be:

In [68]:
# `.merge` combines only two frames at a time, so chain two outer merges on the index.
# The first merge disambiguates the identically named columns through `suffixes`;
# the 2018 columns are suffixed up front, so the second merge has no name clashes.
stock_merge = (
    stocks_2016
    .merge(stocks_2017, left_index=True, right_index=True, suffixes=('_2016', '_2017'), how='outer')
    .merge(stocks_2018.add_suffix('_2018'), left_index=True, right_index=True, how='outer')
)
# Sort the concat result before comparing: its index is not sorted, the merged one is.
assert stock_concat.sort_index().equals(stock_merge)

Now let's turn our comparison to datasets where we are interested in aligning together the values of columns and not the index or column labels themselves. The `.merge` method is built for this situation. 

In [70]:
# Use a dedicated name for the file stems: `names` already refers to the names
# DataFrame loaded earlier in this notebook, and rebinding it to a list of strings
# would break those cells if they were re-run.
food_table_names = ['prices', 'transactions']
# One DataFrame per stem, in the same order as `food_table_names`.
food_tables = [pd.read_csv(f'../data/food_{name}.csv') for name in food_table_names]
food_prices, food_transactions = food_tables

In [71]:
food_prices.sample(5)

Unnamed: 0,item,store,price,Date
2,peach,A,2.99,2017
8,steak,B,4.99,2015
7,steak,B,6.99,2017
1,pear,B,1.99,2017
5,banana,B,0.49,2017


In [72]:
food_transactions

Unnamed: 0,custid,item,store,quantity
0,1,pear,A,5
1,1,banana,A,10
2,2,steak,B,3
3,2,pear,B,1
4,2,peach,B,2
5,2,steak,B,1
6,2,coconut,B,4


In [75]:
food_transactions.merge(food_prices, on=['item', 'store'])

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017
1,1,banana,A,10,0.39,2017
2,2,steak,B,3,6.99,2017
3,2,steak,B,3,4.99,2015
4,2,steak,B,1,6.99,2017
5,2,steak,B,1,4.99,2015
6,2,pear,B,1,1.99,2017
7,2,peach,B,2,3.49,2017


In [76]:
food_transactions.merge(food_prices.query('Date == 2017'), on=['item', 'store'], how='left')

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


The `.join` method only aligns with the index of the passed DataFrame but can use the index or the columns of the calling DataFrame. 

In [77]:
# Keep only the 2017 prices and move the join keys into the index, because
# `.join` matches the passed DataFrame on its index.
is_2017 = food_prices['Date'] == 2017
food_prices_join = food_prices.loc[is_2017].set_index(['item', 'store'])
food_prices_join

Unnamed: 0_level_0,Unnamed: 1_level_0,price,Date
item,store,Unnamed: 2_level_1,Unnamed: 3_level_1
pear,A,0.99,2017
pear,B,1.99,2017
peach,A,2.99,2017
peach,B,3.49,2017
banana,A,0.39,2017
banana,B,0.49,2017
steak,A,5.99,2017
steak,B,6.99,2017


In [80]:
food_transactions.join(food_prices_join, on=['item', 'store'])
# `on` selects columns of the calling DataFrame; `other` (the passed DataFrame)
# is always matched on its index, not on its columns.

Unnamed: 0,custid,item,store,quantity,price,Date
0,1,pear,A,5,0.99,2017.0
1,1,banana,A,10,0.39,2017.0
2,2,steak,B,3,6.99,2017.0
3,2,pear,B,1,1.99,2017.0
4,2,peach,B,2,3.49,2017.0
5,2,steak,B,1,6.99,2017.0
6,2,coconut,B,4,,


In [82]:
from pandas.errors import InvalidIndexError
# A horizontal concat has to align both frames on their (item, store) index.
# Both frames contain duplicate (item, store) pairs, so the alignment fails;
# catch the error to show its message instead of a traceback.
try:
    pd.concat([food_transactions.set_index(['item', 'store']),
               food_prices.set_index(['item', 'store'])], axis='columns')
except InvalidIndexError as e:
    print(e)

Reindexing only valid with uniquely valued Index objects


It is possible to use `.join` in these instances, but all the columns in the passed DataFrame must be moved into the index first. Finally, `concat` is going to be a poor choice whenever you intend to align data by values in their columns.

In summary, I find myself using `.merge` unless I know that the indexes align.

### There's more…

It is possible to read all files from a particular directory into DataFrames without knowing their names. Python provides a few ways to iterate through directories, with the `glob` module being a popular choice. 

In [90]:
import glob

# glob returns matches in an arbitrary, filesystem-dependent order; sort them so the
# column order of `gas` is the same on every machine and on every run.
gas_files = sorted(glob.glob('../data/gas_prices/*.csv'))
# Each file is read with `Week` parsed as dates and used as the index to align on.
df_list = [pd.read_csv(file, index_col='Week', parse_dates=['Week']) for file in gas_files]
gas = pd.concat(df_list, axis='columns', join='outer')
# join='outer' is already the default

## Connecting to SQL databases

In [95]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///../data/chinook.db')

Let's complete a command and read in the tracks table with the `read_sql_table` function. The name of the table is the first argument and the SQLAlchemy engine is the second:

In [96]:
tracks = pd.read_sql_table('tracks', engine)
tracks.sample(5)

Unnamed: 0,TrackId,Name,AlbumId,MediaTypeId,GenreId,Composer,Milliseconds,Bytes,UnitPrice
2265,2266,Play The Game,185,1,1,"Mercury, Freddie",213368,6915832,0.99
2727,2728,Pulse,220,1,4,The Tea Party,250253,8183872,0.99
999,1000,What If I Do?,80,1,1,"Dave Grohl, Taylor Hawkins, Nate Mendel, Chris...",302994,9929799,0.99
953,954,Cuckoo For Caca,76,1,1,"Mike Bordin, Billy Gould, Mike Patton, Trey Sp...",222902,7388369,0.99
3405,3406,"Concerto No. 1 in E Major, RV 269 ""Spring"": I....",275,2,24,Antonio Vivaldi,199086,3347810,0.99
