<h1 style="color:cadetblue; font-size:2em;">Appending & concatenating Series</h1>

<div style="border:2px dashed darkcyan; padding:15px; font-size:18px;">
    <strong>concat() & .append()</strong>
    <ul>
        <li>Equivalence of concat() & .append():</li>
            <ul>
                <li>result1 = pd.concat([s1, s2, s3])</li>
                <li>result2 = s1.append(s2).append(s3)</li>
            </ul>
        <li>result1 == result2 elementwise</li>
    </ul>
</div>

In [1]:
import pandas as pd
# Series of US States
northeast = pd.Series(['CT', 'ME', 'MA', 'NH', 'RI', 'VT', 'NJ', 'NY', 'PA'])
south = pd.Series(['DE', 'FL', 'GA', 'MD', 'NC', 'SC', 'VA', 'DC', 'WV', 'AL', 'KY', 'MS', 'TN', 'AR', 'LA', 'OK', 'TX'])
midwest = pd.Series(['IL', 'IN', 'MN', 'MO', 'NE', 'ND', 'SD', 'IA', 'KS', 'MI', 'OH', 'WI'])
west = pd.Series(['AZ', 'CO', 'ID', 'MT', 'NV', 'NM', 'UT', 'WY', 'AK', 'CA', 'HI', 'OR','WA'])


In [2]:
# Using .append()
east = northeast.append(south)
print(east)

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
0     DE
1     FL
2     GA
3     MD
4     NC
5     SC
6     VA
7     DC
8     WV
9     AL
10    KY
11    MS
12    TN
13    AR
14    LA
15    OK
16    TX
dtype: object


In [3]:
# The appended Index
print(east.index)

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
             8,  9, 10, 11, 12, 13, 14, 15, 16],
           dtype='int64')


In [4]:
print(east.loc[3])

3    NH
3    MD
dtype: object


In [5]:
# Using .reset_index()
new_east = northeast.append(south).reset_index(drop=True)

In [6]:
print(new_east.head(11))

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object


In [7]:
print(new_east.index)

RangeIndex(start=0, stop=26, step=1)


In [8]:
# Using concat()
east = pd.concat([northeast, south])
print(east.head(11))

0    CT
1    ME
2    MA
3    NH
4    RI
5    VT
6    NJ
7    NY
8    PA
0    DE
1    FL
dtype: object


In [9]:
print(east.index)

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  0,  1,  2,  3,  4,  5,  6,  7,
             8,  9, 10, 11, 12, 13, 14, 15, 16],
           dtype='int64')


In [10]:
# Using ignore_index
new_east = pd.concat([northeast, south], ignore_index=True)
print(new_east.head(11))

0     CT
1     ME
2     MA
3     NH
4     RI
5     VT
6     NJ
7     NY
8     PA
9     DE
10    FL
dtype: object


In [11]:
print(new_east.index)

RangeIndex(start=0, stop=26, step=1)


<h1 style="color:cadetblue; font-size:2em;">Appending & concatenating DataFrames</h1>

In [13]:
import pandas as pd
pop1 = pd.read_csv('datasets/population_01.csv', index_col=0)
pop2 = pd.read_csv('datasets/population_02.csv', index_col=0)
print(type(pop1), pop1.shape)
print(type(pop2), pop2.shape)

<class 'pandas.core.frame.DataFrame'> (4, 1)
<class 'pandas.core.frame.DataFrame'> (4, 1)


In [14]:
print(pop1)

               2010 Census Population
Zip Code ZCTA                        
66407                             479
72732                            4716
50579                            2405
46241                           30670


In [15]:
print(pop2)

               2010 Census Population
Zip Code ZCTA                        
12776                            2180
76092                           26669
98360                           12221
49464                           27481


In [16]:
# Appending population DataFrames
pop1.append(pop2)

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670
12776,2180
76092,26669
98360,12221
49464,27481


In [17]:
print(pop1.index.name, pop1.columns)

Zip Code ZCTA Index(['2010 Census Population'], dtype='object')


In [18]:
print(pop2.index.name, pop2.columns)

Zip Code ZCTA Index(['2010 Census Population'], dtype='object')


In [19]:
# Population & unemployment data
population = pd.read_csv('datasets/population_00.csv', index_col=0)
unemployment = pd.read_csv('datasets/unemployment_00.csv', index_col=0)

In [20]:
print(population)

               2010 Census Population
Zip Code ZCTA                        
57538                             322
59916                             130
37660                           40038
2860                            45199


In [21]:
print(unemployment)

       unemployment  participants
Zip                              
2860           0.11         34447
46167          0.02          4800
1097           0.33            42
80808          0.07          4310


In [22]:
# Appending population & unemployment
population.append(unemployment)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


In [23]:
# Concatenating rows
pd.concat([population, unemployment], axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


In [25]:
# Concatenating columns
pd.concat([population, unemployment], axis=1)

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


<h1 style="color:cadetblue; font-size:2em;">Concatenation, keys, & MultiIndexes</h1>

In [28]:
import pandas as pd
file1 = 'datasets/q1_rainfall_2013.csv'
rain2013 = pd.read_csv(file1, index_col='Month', parse_dates=True)
file2 = 'datasets/q1_rainfall_2014.csv'
rain2014 = pd.read_csv(file2, index_col='Month', parse_dates=True)

In [29]:
print(rain2013)

       Precipitation
Month               
Jan         0.096129
Feb         0.067143
Mar         0.061613


In [30]:
print(rain2014)

       Precipitation
Month               
Jan         0.050323
Feb         0.082143
Mar         0.070968


In [31]:
# Concatenating rows
pd.concat([rain2013, rain2014], axis=0)

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613
Jan,0.050323
Feb,0.082143
Mar,0.070968


In [32]:
# Using multi-index on rows
rain1314 = pd.concat([rain2013, rain2014], keys=[2013, 2014], axis=0)
print(rain1314)

            Precipitation
     Month               
2013 Jan         0.096129
     Feb         0.067143
     Mar         0.061613
2014 Jan         0.050323
     Feb         0.082143
     Mar         0.070968


In [33]:
# Accessing a multi-index
print(rain1314.loc[2014])

       Precipitation
Month               
Jan         0.050323
Feb         0.082143
Mar         0.070968


In [34]:
# Concatenating columns
rain1314 = pd.concat([rain2013, rain2014], axis='columns')
print(rain1314)

       Precipitation  Precipitation
Month                              
Jan         0.096129       0.050323
Feb         0.067143       0.082143
Mar         0.061613       0.070968


In [35]:
# Using a multi-index on columns
rain1314 = pd.concat([rain2013, rain2014], keys=[2013, 2014], axis='columns')
print(rain1314)

               2013          2014
      Precipitation Precipitation
Month                            
Jan        0.096129      0.050323
Feb        0.067143      0.082143
Mar        0.061613      0.070968


In [36]:
rain1314[2013]

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613


In [37]:
# pd.concat() with dict
rain_dict = {2013: rain2013, 2014: rain2014}
rain1314 = pd.concat(rain_dict, axis='columns')
print(rain1314)

               2013          2014
      Precipitation Precipitation
Month                            
Jan        0.096129      0.050323
Feb        0.067143      0.082143
Mar        0.061613      0.070968


<h1 style="color:cadetblue; font-size:2em;">Outer & inner joins</h1>

In [43]:
# Using with arrays
import numpy as np
import pandas as pd

In [44]:
A = np.arange(8).reshape(2,4) + 0.1
print(A)

[[0.1 1.1 2.1 3.1]
 [4.1 5.1 6.1 7.1]]


In [45]:
B = np.arange(6).reshape(2,3) + 0.2
print(B)

[[0.2 1.2 2.2]
 [3.2 4.2 5.2]]


In [46]:
C = np.arange(12).reshape(3,4) + 0.3
print(C)

[[ 0.3  1.3  2.3  3.3]
 [ 4.3  5.3  6.3  7.3]
 [ 8.3  9.3 10.3 11.3]]


In [48]:
# Stacking arrays horizontally
np.hstack([B, A])

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [49]:
np.concatenate([B, A], axis=1)

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [50]:
# Stacking arrays vertically
np.vstack([A, C])

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [51]:
np.concatenate([A, C], axis=0)

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [53]:
# Incompatible array dimensions
np.concatenate([A, B], axis=0) # incompatible columns

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [54]:
np.concatenate([A, C], axis=1) # incompatible rows

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [56]:
# Population & unemployment data
population = pd.read_csv('datasets/population_00.csv', index_col=0)
unemployment = pd.read_csv('datasets/unemployment_00.csv', index_col=0)

In [57]:
print(population)

               2010 Census Population
Zip Code ZCTA                        
57538                             322
59916                             130
37660                           40038
2860                            45199


In [58]:
print(unemployment)

       unemployment  participants
Zip                              
2860           0.11         34447
46167          0.02          4800
1097           0.33            42
80808          0.07          4310


In [59]:
# Converting to arrays
population_array = np.array(population)
print(population_array) # Index info is lost

[[  322]
 [  130]
 [40038]
 [45199]]


In [60]:
unemployment_array = np.array(unemployment)
print(population_array)

[[  322]
 [  130]
 [40038]
 [45199]]


In [62]:
# Manipulating data as arrays
print(np.concatenate([population_array, unemployment_array], axis=1))

[[3.2200e+02 1.1000e-01 3.4447e+04]
 [1.3000e+02 2.0000e-02 4.8000e+03]
 [4.0038e+04 3.3000e-01 4.2000e+01]
 [4.5199e+04 7.0000e-02 4.3100e+03]]


<div style="border:2px dashed darkcyan; padding:15px; font-size:18px;">
    <strong>Joins</strong>
    <ul>
        <li>Joining tables: Combining rows of multiple tables</li>
        <li>Outer join</li>
            <ul>
                <li>Union of index sets (all labels, no repetition)</li>
                <li>Missing fields filled with NaN</li>
            </ul>
        <li>Inner join</li>
            <ul>
                <li>Intersection of index sets (only common labels)</li>
            </ul>
    </ul>
</div>

In [63]:
# Concatenation & inner join
pd.concat([population, unemployment], axis=1, join='inner')

Unnamed: 0,2010 Census Population,unemployment,participants
2860,45199,0.11,34447


In [64]:
# Concatenation & outer join
pd.concat([population, unemployment], axis=1, join='outer')

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


In [66]:
# Inner join on other axis
pd.concat([population, unemployment], join='inner', axis=0)

57538
59916
37660
2860
2860
46167
1097
80808
