### Appending and concatenating Series

#### append()
- .append(): Series & DataFrame *method*
- Invocation:
 - s1.append(s2) <br>
- Stacks rows of s2 below s1
- Method for Series & DataFrames

#### concat()
- concat(): pandas module *function*
- Invocation;
 - pd.concat([s1, s2, s3])
- Can stack row-wise or column-wise

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
sns.set()

In [3]:
bronze = pd.read_csv('summer_olympic_medals/bronze_top5.csv', index_col=0)
silver = pd.read_csv('summer_olympic_medals/silver_top5.csv', index_col=0)
gold = pd.read_csv('summer_olympic_medals/gold_top5.csv', index_col=0)

In [4]:
jan = pd.read_csv('Sales/sales-jan-2015.csv', parse_dates=True, index_col='Date')
feb = pd.read_csv('Sales/sales-feb-2015.csv', parse_dates=True, index_col='Date')
mar = pd.read_csv('Sales/sales-mar-2015.csv', parse_dates=True, index_col='Date')

In [5]:
jan_units = jan['Units']
feb_units = feb['Units']
mar_units = mar['Units']

In [6]:
quarter1 = jan_units.append(feb_units).append(mar_units)

In [7]:
quarter1.loc['jan 27, 2015':'feb 2, 2015']

Date
2015-01-27 07:11:55    18
2015-02-02 08:33:01     3
2015-02-02 20:54:49     9
Name: Units, dtype: int64

In [8]:
quarter1.loc['feb 26, 2015':'mar 7, 2015']

Date
2015-02-26 08:57:45     4
2015-02-26 08:58:51     1
2015-03-06 10:11:45    17
2015-03-06 02:03:56    17
Name: Units, dtype: int64

In [9]:
quarter1.sum()

642

In [10]:
units = []

In [11]:
for month in [jan, feb, mar]:
    units.append(month['Units'])

In [12]:
quarter1 = pd.concat(units, axis='rows')

In [13]:
quarter1.head()

Date
2015-01-21 19:13:21    11
2015-01-09 05:23:51     8
2015-01-06 17:19:34    17
2015-01-02 09:51:06    16
2015-01-11 14:51:02    11
Name: Units, dtype: int64

### Appending & concatenating DataFrames

In [14]:
pop1 = pd.read_csv('population_01.csv', index_col=0)
pop2 = pd.read_csv('population_02.csv', index_col=0)
print(type(pop1), pop1.shape)
print(type(pop2), pop2.shape)

<class 'pandas.core.frame.DataFrame'> (4, 1)
<class 'pandas.core.frame.DataFrame'> (4, 1)


In [15]:
pop1

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670


In [16]:
pop2

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
12776,2180
76092,26669
98360,12221
49464,27481


In [17]:
pop1.append(pop2)

Unnamed: 0_level_0,2010 Census Population
Zip Code ZCTA,Unnamed: 1_level_1
66407,479
72732,4716
50579,2405
46241,30670
12776,2180
76092,26669
98360,12221
49464,27481


In [18]:
print(pop1.index.name)
print(pop1.columns)
print(pop2.index.name)
print(pop2.columns)

Zip Code ZCTA
Index(['2010 Census Population'], dtype='object')
Zip Code ZCTA
Index(['2010 Census Population'], dtype='object')


In [19]:
population = pd.read_csv('population_00.csv', index_col=0)
unemployment = pd.read_csv('unemployment_00.csv', index_col=0)
print(population)
print(unemployment)

               2010 Census Population
Zip Code ZCTA                        
57538                             322
59916                             130
37660                           40038
2860                            45199
       unemployment  participants
Zip                              
2860           0.11         34447
46167          0.02          4800
1097           0.33            42
80808          0.07          4310


In [20]:
population.append(unemployment)

Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


In [21]:
pd.concat([population, unemployment], axis=0)

Unnamed: 0,2010 Census Population,participants,unemployment
57538,322.0,,
59916,130.0,,
37660,40038.0,,
2860,45199.0,,
2860,,34447.0,0.11
46167,,4800.0,0.02
1097,,42.0,0.33
80808,,4310.0,0.07


In [22]:
pd.concat([population, unemployment], axis=1)

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


#### Appending DataFrames with ignore_index

In [23]:
names_1981 = pd.read_csv('baby_names/names1981.csv', 
                         header=None, 
                         names=['name','gender','count'])
names_1881 = pd.read_csv('baby_names/names1881.csv', 
                         header=None, 
                         names=['name','gender','count'])

In [24]:
names_1881['year'] = 1881
names_1981['year'] = 1981

In [25]:
combined_names = names_1881.append(names_1981, ignore_index=True)
print(names_1981.shape)
print(names_1881.shape)
print(combined_names.shape)

(19455, 4)
(1935, 4)
(21390, 4)


In [26]:
print(combined_names[combined_names['name'] == 'Morgan'])

         name gender  count  year
1283   Morgan      M     23  1881
2096   Morgan      F   1769  1981
14390  Morgan      M    766  1981


#### Concatenating pandas DataFrames along column axis

In [27]:
df = pd.read_csv('pittsburgh2013.csv', index_col="Date", parse_dates=True)
by_quarter = df.groupby(pd.PeriodIndex(df.index, freq="Q")).mean()
weather_mean = by_quarter['Mean TemperatureF']
weather_mean.index = ['Jan', 'Apr', 'Jul', 'Oct']
weather_mean.index.name = 'Month'
weather_mean

Month
Jan    32.133333
Apr    61.956044
Jul    68.934783
Oct    43.434783
Name: Mean TemperatureF, dtype: float64

In [28]:
by_quarter = df.groupby(pd.PeriodIndex(df.index, freq="Q")).max()
weather_max = by_quarter['Max TemperatureF']
weather_max.index = ['Jan', 'Apr', 'Jul', 'Oct']
weather_max.index.name = 'Month'
weather_max

Month
Jan    68
Apr    89
Jul    91
Oct    84
Name: Max TemperatureF, dtype: int64

In [29]:
weather = pd.concat([weather_max, weather_mean], axis=1)
print(weather)

       Max TemperatureF  Mean TemperatureF
Month                                     
Jan                  68          32.133333
Apr                  89          61.956044
Jul                  91          68.934783
Oct                  84          43.434783


In [30]:
medals = []
medal_type = ['bronze','silver','gold']

In [31]:
for medal in medal_type:
    file_name = 'summer_olympic_medals/%s_top5.csv' % medal
    columns = ['Country', medal]
    medal_df = pd.read_csv(file_name, header=0, index_col='Country', names=columns)
    medals.append(medal_df)

In [32]:
medals = pd.concat(medals, axis='columns')
medals

Unnamed: 0,bronze,silver,gold
France,475.0,461.0,
Germany,454.0,,407.0
Italy,,394.0,460.0
Soviet Union,584.0,627.0,838.0
United Kingdom,505.0,591.0,498.0
United States,1052.0,1195.0,2088.0


### Concatenation, keys, & MultiIndexes

In [33]:
rain2013 = pd.read_csv('q1_rainfall_2013.csv', index_col='Month', parse_dates=True)
rain2014 = pd.read_csv('q1_rainfall_2014.csv', index_col='Month', parse_dates=True)
rain2013

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613


In [34]:
rain2014

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.050323
Feb,0.082143
Mar,0.070968


In [35]:
pd.concat([rain2013, rain2014], axis=0)

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613
Jan,0.050323
Feb,0.082143
Mar,0.070968


In [36]:
rain1314 = pd.concat([rain2013, rain2014], keys=[2013,2014], axis=0)
rain1314

Unnamed: 0_level_0,Unnamed: 1_level_0,Precipitation
Unnamed: 0_level_1,Month,Unnamed: 2_level_1
2013,Jan,0.096129
2013,Feb,0.067143
2013,Mar,0.061613
2014,Jan,0.050323
2014,Feb,0.082143
2014,Mar,0.070968


In [37]:
rain1314.loc[2014]

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.050323
Feb,0.082143
Mar,0.070968


In [38]:
rain1314 = pd.concat([rain2013, rain2014], axis='columns')
rain1314

Unnamed: 0_level_0,Precipitation,Precipitation
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,0.096129,0.050323
Feb,0.067143,0.082143
Mar,0.061613,0.070968


In [39]:
rain1314 = pd.concat([rain2013, rain2014], keys=[2013,2014], axis='columns')
rain1314

Unnamed: 0_level_0,2013,2014
Unnamed: 0_level_1,Precipitation,Precipitation
Month,Unnamed: 1_level_2,Unnamed: 2_level_2
Jan,0.096129,0.050323
Feb,0.067143,0.082143
Mar,0.061613,0.070968


In [40]:
rain1314[2013]

Unnamed: 0_level_0,Precipitation
Month,Unnamed: 1_level_1
Jan,0.096129
Feb,0.067143
Mar,0.061613


In [41]:
rain_dict = {2013: rain2013, 2014: rain2014}
rain1314 = pd.concat(rain_dict, axis='columns')
rain1314

Unnamed: 0_level_0,2013,2014
Unnamed: 0_level_1,Precipitation,Precipitation
Month,Unnamed: 1_level_2,Unnamed: 2_level_2
Jan,0.096129,0.050323
Feb,0.067143,0.082143
Mar,0.061613,0.070968


In [42]:
rain1314 = pd.concat(rain_dict, axis=0)
rain1314

Unnamed: 0_level_0,Unnamed: 1_level_0,Precipitation
Unnamed: 0_level_1,Month,Unnamed: 2_level_1
2013,Jan,0.096129
2013,Feb,0.067143
2013,Mar,0.061613
2014,Jan,0.050323
2014,Feb,0.082143
2014,Mar,0.070968


#### Concatenating vertically to get MultiIndexed rows

In [43]:
medals=[]
for medal in medal_type:
    file_name = 'summer_olympic_medals/%s_top5.csv' % medal
    medal_df = pd.read_csv(file_name, index_col='Country')
    medals.append(medal_df)
    
medals = pd.concat(medals, keys=['bronze','silver','gold'], axis=0)
medals

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,United States,1052.0
bronze,Soviet Union,584.0
bronze,United Kingdom,505.0
bronze,France,475.0
bronze,Germany,454.0
silver,United States,1195.0
silver,Soviet Union,627.0
silver,United Kingdom,591.0
silver,France,461.0
silver,Italy,394.0


In [44]:
medals_sorted = medals.sort_index(level=0)
medals_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,France,475.0
bronze,Germany,454.0
bronze,Soviet Union,584.0
bronze,United Kingdom,505.0
bronze,United States,1052.0
gold,Germany,407.0
gold,Italy,460.0
gold,Soviet Union,838.0
gold,United Kingdom,498.0
gold,United States,2088.0


In [45]:
idx = pd.IndexSlice

In [46]:
UK = medals_sorted.loc[idx[:,'United Kingdom'], :]
UK

Unnamed: 0_level_0,Unnamed: 1_level_0,Total
Unnamed: 0_level_1,Country,Unnamed: 2_level_1
bronze,United Kingdom,505.0
gold,United Kingdom,498.0
silver,United Kingdom,591.0


In [47]:
dataframes = []
sale_type = ['Hardware', 'Software', 'Service']
for frame in sale_type:
    file_name = 'Sales/feb-sales-%s.csv' % frame
    sale_df = pd.read_csv(file_name, index_col='Date')
    dataframes.append(sale_df)

dataframes

[                             Company   Product  Units
 Date                                                 
 2015-02-04 21:52:45  Acme Coporation  Hardware     14
 2015-02-07 22:58:10  Acme Coporation  Hardware      1
 2015-02-19 10:59:33        Mediacore  Hardware     16
 2015-02-02 20:54:49        Mediacore  Hardware      9
 2015-02-21 20:41:47            Hooli  Hardware      3,
                              Company   Product  Units
 Date                                                 
 2015-02-16 12:09:19            Hooli  Software     10
 2015-02-03 14:14:18          Initech  Software     13
 2015-02-02 08:33:01            Hooli  Software      3
 2015-02-05 01:53:06  Acme Coporation  Software     19
 2015-02-11 20:03:08          Initech  Software      7
 2015-02-09 13:09:55        Mediacore  Software      7
 2015-02-11 22:50:44            Hooli  Software      4
 2015-02-04 15:36:29        Streeplex  Software     13
 2015-02-21 05:01:26        Mediacore  Software      3,
        

In [48]:
february = pd.concat(dataframes, axis=1, keys=['Hardware', 'Software', 'Service'])
print(february.info())

<class 'pandas.core.frame.DataFrame'>
Index: 20 entries, 2015-02-02 08:33:01 to 2015-02-26 08:58:51
Data columns (total 9 columns):
(Hardware, Company)    5 non-null object
(Hardware, Product)    5 non-null object
(Hardware, Units)      5 non-null float64
(Software, Company)    9 non-null object
(Software, Product)    9 non-null object
(Software, Units)      9 non-null float64
(Service, Company)     6 non-null object
(Service, Product)     6 non-null object
(Service, Units)       6 non-null float64
dtypes: float64(3), object(6)
memory usage: 1.6+ KB
None


In [49]:
february

Unnamed: 0_level_0,Hardware,Hardware,Hardware,Software,Software,Software,Service,Service,Service
Unnamed: 0_level_1,Company,Product,Units,Company,Product,Units,Company,Product,Units
2015-02-02 08:33:01,,,,Hooli,Software,3.0,,,
2015-02-02 20:54:49,Mediacore,Hardware,9.0,,,,,,
2015-02-03 14:14:18,,,,Initech,Software,13.0,,,
2015-02-04 15:36:29,,,,Streeplex,Software,13.0,,,
2015-02-04 21:52:45,Acme Coporation,Hardware,14.0,,,,,,
2015-02-05 01:53:06,,,,Acme Coporation,Software,19.0,,,
2015-02-05 22:05:03,,,,,,,Hooli,Service,10.0
2015-02-07 22:58:10,Acme Coporation,Hardware,1.0,,,,,,
2015-02-09 08:57:30,,,,,,,Streeplex,Service,19.0
2015-02-09 13:09:55,,,,Mediacore,Software,7.0,,,


In [50]:
idx = pd.IndexSlice
slice_2_8 = february.loc['2015-02-02':'2015-02-08', idx[:, 'Company']]
slice_2_8

Unnamed: 0_level_0,Hardware,Software,Service
Unnamed: 0_level_1,Company,Company,Company
2015-02-02 08:33:01,,Hooli,
2015-02-02 20:54:49,Mediacore,,
2015-02-03 14:14:18,,Initech,
2015-02-04 15:36:29,,Streeplex,
2015-02-04 21:52:45,Acme Coporation,,
2015-02-05 01:53:06,,Acme Coporation,
2015-02-05 22:05:03,,,Hooli
2015-02-07 22:58:10,Acme Coporation,,


In [51]:
jan = pd.read_csv('Sales/sales-jan-2015.csv')
feb = pd.read_csv('Sales/sales-feb-2015.csv')
mar = pd.read_csv('Sales/sales-mar-2015.csv')
mar.head(3)

Unnamed: 0,Date,Company,Product,Units
0,2015-03-22 14:42:25,Mediacore,Software,6
1,2015-03-12 18:33:06,Initech,Service,19
2,2015-03-22 03:58:28,Streeplex,Software,8


In [52]:
month_list = [('january', jan), ('february', feb), ('march', mar)]
month_dict = {}

for month_name, month_data in month_list:
    month_dict[month_name] = month_data.groupby('Company').sum()
    
sales = pd.concat(month_dict)
sales

Unnamed: 0_level_0,Unnamed: 1_level_0,Units
Unnamed: 0_level_1,Company,Unnamed: 2_level_1
february,Acme Coporation,34
february,Hooli,30
february,Initech,30
february,Mediacore,45
february,Streeplex,37
january,Acme Coporation,76
january,Hooli,70
january,Initech,37
january,Mediacore,15
january,Streeplex,50


In [53]:
idx = pd.IndexSlice
print(sales.loc[idx[:, 'Mediacore'], :])

                    Units
         Company         
february Mediacore     45
january  Mediacore     15
march    Mediacore     68


### Outer and inner joins

In [54]:
A = np.arange(8).reshape(2,4) + 0.1
print(A)

[[0.1 1.1 2.1 3.1]
 [4.1 5.1 6.1 7.1]]


In [55]:
B = np.arange(6).reshape(2,3) + 0.2
print(B)

[[0.2 1.2 2.2]
 [3.2 4.2 5.2]]


In [56]:
C = np.arange(12).reshape(3,4) + 0.3
print(C)

[[ 0.3  1.3  2.3  3.3]
 [ 4.3  5.3  6.3  7.3]
 [ 8.3  9.3 10.3 11.3]]


In [57]:
np.hstack([B,A])

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [58]:
np.concatenate([B,A], axis=1)

array([[0.2, 1.2, 2.2, 0.1, 1.1, 2.1, 3.1],
       [3.2, 4.2, 5.2, 4.1, 5.1, 6.1, 7.1]])

In [59]:
np.vstack([A,C])

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [60]:
np.concatenate([A,C], axis=0)

array([[ 0.1,  1.1,  2.1,  3.1],
       [ 4.1,  5.1,  6.1,  7.1],
       [ 0.3,  1.3,  2.3,  3.3],
       [ 4.3,  5.3,  6.3,  7.3],
       [ 8.3,  9.3, 10.3, 11.3]])

In [61]:
print(population)
print(unemployment)

               2010 Census Population
Zip Code ZCTA                        
57538                             322
59916                             130
37660                           40038
2860                            45199
       unemployment  participants
Zip                              
2860           0.11         34447
46167          0.02          4800
1097           0.33            42
80808          0.07          4310


In [62]:
population_array = np.array(population)
print(population_array)

[[  322]
 [  130]
 [40038]
 [45199]]


In [63]:
unemployment_array = np.array(unemployment)
unemployment_array

array([[1.1000e-01, 3.4447e+04],
       [2.0000e-02, 4.8000e+03],
       [3.3000e-01, 4.2000e+01],
       [7.0000e-02, 4.3100e+03]])

In [64]:
print(np.concatenate([population_array, unemployment_array], axis=1))

[[3.2200e+02 1.1000e-01 3.4447e+04]
 [1.3000e+02 2.0000e-02 4.8000e+03]
 [4.0038e+04 3.3000e-01 4.2000e+01]
 [4.5199e+04 7.0000e-02 4.3100e+03]]


### Joins
- Joining tables: Combining rows of multiple tables
- Outer join
 - Union of index sets (all labels, no repitition)
 - Missing fields filled with NaN
- Inner join
 - Intersection of index sets (only common labels)

In [65]:
pd.concat([population, unemployment], axis=1, join='inner')

Unnamed: 0,2010 Census Population,unemployment,participants
2860,45199,0.11,34447


In [66]:
pd.concat([population, unemployment], axis=1, join='outer')

Unnamed: 0,2010 Census Population,unemployment,participants
1097,,0.33,42.0
2860,45199.0,0.11,34447.0
37660,40038.0,,
46167,,0.02,4800.0
57538,322.0,,
59916,130.0,,
80808,,0.07,4310.0


In [67]:
medal_list = []
for medal in medal_type:
    file_name = 'summer_olympic_medals/%s_top5.csv' % medal
    medal_df = pd.read_csv(file_name, index_col='Country')
    medal_list.append(medal_df)
medal_list

[                 Total
 Country               
 United States   1052.0
 Soviet Union     584.0
 United Kingdom   505.0
 France           475.0
 Germany          454.0,                  Total
 Country               
 United States   1195.0
 Soviet Union     627.0
 United Kingdom   591.0
 France           461.0
 Italy            394.0,                  Total
 Country               
 United States   2088.0
 Soviet Union     838.0
 United Kingdom   498.0
 Italy            460.0
 Germany          407.0]

In [68]:
medals = pd.concat(medal_list, keys=['bronze','silver','gold'], axis=1, join='inner')
medals

Unnamed: 0_level_0,bronze,silver,gold
Unnamed: 0_level_1,Total,Total,Total
Country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
United States,1052.0,1195.0,2088.0
Soviet Union,584.0,627.0,838.0
United Kingdom,505.0,591.0,498.0


In [70]:
us = pd.read_csv('GDP/gdp_usa.csv', parse_dates=True, index_col='DATE')
us.index.names = ['Year']
us.rename(columns={'VALUE': 'USA'}, inplace=True)
us.head()

Unnamed: 0_level_0,USA
Year,Unnamed: 1_level_1
1947-01-01,243.1
1947-04-01,246.3
1947-07-01,250.1
1947-10-01,260.3
1948-01-01,266.2


In [73]:
china = pd.read_csv('GDP/gdp_china.csv', parse_dates=True, index_col='Year')
china.rename(columns={'GDP': 'China'}, inplace=True)
china.head()

Unnamed: 0_level_0,China
Year,Unnamed: 1_level_1
1960-01-01,59.184116
1961-01-01,49.55705
1962-01-01,46.685179
1963-01-01,50.097303
1964-01-01,59.062255


In [74]:
china_annual = china.resample('A').last().pct_change(10).dropna()
china_annual.head()

Unnamed: 0_level_0,China
Year,Unnamed: 1_level_1
1970-12-31,0.546128
1971-12-31,0.98886
1972-12-31,1.402472
1973-12-31,1.730085
1974-12-31,1.408556


In [75]:
us_annual = us.resample('A').last().pct_change(10).dropna()
us_annual.head()

Unnamed: 0_level_0,USA
Year,Unnamed: 1_level_1
1957-12-31,0.827507
1958-12-31,0.782686
1959-12-31,0.953137
1960-12-31,0.689354
1961-12-31,0.630959


In [76]:
gdp = pd.concat([china_annual, us_annual], join='inner', axis=1)
print(gdp.resample('10A').last())

               China       USA
Year                          
1970-12-31  0.546128  1.017187
1980-12-31  1.072537  1.742556
1990-12-31  0.892820  1.012126
2000-12-31  2.357522  0.738632
2010-12-31  4.011081  0.454332
2020-12-31  3.789936  0.361780
