# Chapter 8

# Data Wrangling: Join, Combine, and Reshape

### Heirarchical Indexing

In [1]:
import sys
import numpy as np
import pandas as pd
import csv
import json
from lxml import objectify
import pyarrow
import openpyxl
import xlrd
from bs4 import BeautifulSoup
import sqlalchemy as sqla
import sqlite3
import os
from pathlib import Path

In [2]:
data = pd.Series(np.random.uniform(size=9), 
                 index=[["a", "a", "a", "b", "b", "c", "c", "d", "d"], [1,2,3,1,3,1,2,2,3]])

In [3]:
data

a  1    0.121465
   2    0.849694
   3    0.447863
b  1    0.047320
   3    0.272606
c  1    0.877291
   2    0.840403
d  2    0.955843
   3    0.809467
dtype: float64

In [4]:
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 1),
            ('b', 3),
            ('c', 1),
            ('c', 2),
            ('d', 2),
            ('d', 3)],
           )

In [5]:
data['b']

1    0.047320
3    0.272606
dtype: float64

In [6]:
data['b':'c']

b  1    0.047320
   3    0.272606
c  1    0.877291
   2    0.840403
dtype: float64

In [7]:
data.loc[["b", "d"]]

b  1    0.047320
   3    0.272606
d  2    0.955843
   3    0.809467
dtype: float64

In [8]:
data.loc[:,2]

a    0.849694
c    0.840403
d    0.955843
dtype: float64

In [9]:
data.unstack()

Unnamed: 0,1,2,3
a,0.121465,0.849694,0.447863
b,0.04732,,0.272606
c,0.877291,0.840403,
d,,0.955843,0.809467


In [10]:
data.unstack().stack()

a  1    0.121465
   2    0.849694
   3    0.447863
b  1    0.047320
   3    0.272606
c  1    0.877291
   2    0.840403
d  2    0.955843
   3    0.809467
dtype: float64

In [11]:
frame = pd.DataFrame(np.arange(12).reshape((4,3,)),
                     index=[['a','a','b','b'], [1,2,1,2]],
                     columns=[["Ohio", "Ohio", "Colorado"], 
                              ["Green", "Red", "Green"]])

In [12]:
frame

Unnamed: 0_level_0,Unnamed: 1_level_0,Ohio,Ohio,Colorado
Unnamed: 0_level_1,Unnamed: 1_level_1,Green,Red,Green
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [13]:
frame.index.names = ['key1', 'key2']

In [14]:
frame.columns.names= ["state", "color"]

In [15]:
frame

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
a,2,3,4,5
b,1,6,7,8
b,2,9,10,11


In [16]:
frame.index.nlevels

2

In [17]:
frame["Ohio"]

Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,0,1
a,2,3,4
b,1,6,7
b,2,9,10


In [18]:
pd.MultiIndex.from_arrays([["Ohio", "Ohio", "Colorado"], ["Green", "Red", "Green"]], names = ["state", "color"])

MultiIndex([(    'Ohio', 'Green'),
            (    'Ohio',   'Red'),
            ('Colorado', 'Green')],
           names=['state', 'color'])

### Reordering and Sorting Levels

In [19]:
frame.swaplevel("key1", "key2")

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
2,a,3,4,5
1,b,6,7,8
2,b,9,10,11


In [20]:
frame.sort_index(level=1)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key1,key2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,1,0,1,2
b,1,6,7,8
a,2,3,4,5
b,2,9,10,11


In [21]:
frame.swaplevel(0,1).sort_index(level=0)

Unnamed: 0_level_0,state,Ohio,Ohio,Colorado
Unnamed: 0_level_1,color,Green,Red,Green
key2,key1,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,a,0,1,2
1,b,6,7,8
2,a,3,4,5
2,b,9,10,11


### Summary Statistics By Level

In [22]:
frame.groupby(level="key2").sum()

state,Ohio,Ohio,Colorado
color,Green,Red,Green
key2,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,6,8,10
2,12,14,16


In [23]:
frame.groupby(level="color", axis="columns").sum()

  frame.groupby(level="color", axis="columns").sum()


Unnamed: 0_level_0,color,Green,Red
key1,key2,Unnamed: 2_level_1,Unnamed: 3_level_1
a,1,2,1
a,2,8,4
b,1,14,7
b,2,20,10


### Indexing with DataFrame's Columns

In [24]:
frame = pd.DataFrame({"a": range(7), "b": range(7,0,-1), "c": ["one", "one", "one", "two", "two", "two", "two"], "d": [0,1,2,0,1,2,3]})

In [25]:
frame

Unnamed: 0,a,b,c,d
0,0,7,one,0
1,1,6,one,1
2,2,5,one,2
3,3,4,two,0
4,4,3,two,1
5,5,2,two,2
6,6,1,two,3


In [26]:
frame2= frame.set_index(["c", "d"])

In [27]:
frame2

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1
one,0,0,7
one,1,1,6
one,2,2,5
two,0,3,4
two,1,4,3
two,2,5,2
two,3,6,1


In [28]:
frame.set_index(["c", "d"], drop = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b,c,d
c,d,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,0,0,7,one,0
one,1,1,6,one,1
one,2,2,5,one,2
two,0,3,4,two,0
two,1,4,3,two,1
two,2,5,2,two,2
two,3,6,1,two,3


In [29]:
frame2.reset_index()

Unnamed: 0,c,d,a,b
0,one,0,0,7
1,one,1,1,6
2,one,2,2,5
3,two,0,3,4
4,two,1,4,3
5,two,2,5,2
6,two,3,6,1


### Combining and Merging Datasets<br>
### Database Style DataFrame Joins

In [30]:
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"],
                    "data1": pd.Series(range(7), dtype="Int64")})

In [31]:
df2 = pd.DataFrame({"key": ["a", "b", "d"],
                    "data2": pd.Series(range(3), dtype="Int64")})

In [32]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,a,5
6,b,6


In [33]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,d,2


In [34]:
pd.merge(df1, df2)

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,b,6,1


In [35]:
pd.merge(df1, df2, on="key")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,1,1
2,a,2,0
3,a,4,0
4,a,5,0
5,b,6,1


In [36]:
df3 = pd.DataFrame({"lkey": ["b", "b", "a", "c", "a", "a", "b"], 
                   "data1": pd.Series(range(7), dtype="Int64")})

In [37]:
df4 = pd.DataFrame({"rkey": ["a", "b", "d"],
                   "data2": pd.Series(range(3), dtype="Int64")})

Inner join is default:

In [38]:
pd.merge(df3, df4, left_on="lkey", right_on="rkey")

Unnamed: 0,lkey,data1,rkey,data2
0,b,0,b,1
1,b,1,b,1
2,a,2,a,0
3,a,4,a,0
4,a,5,a,0
5,b,6,b,1


In [39]:
pd.merge(df1, df2, how="outer")

Unnamed: 0,key,data1,data2
0,a,2.0,0.0
1,a,4.0,0.0
2,a,5.0,0.0
3,b,0.0,1.0
4,b,1.0,1.0
5,b,6.0,1.0
6,c,3.0,
7,d,,2.0


In an outer join the rows that don't match appear as NA values on the other DataFrame's columns

In [40]:
pd.merge(df3, df4, left_on="lkey", right_on="rkey", how ="outer")

Unnamed: 0,lkey,data1,rkey,data2
0,a,2.0,a,0.0
1,a,4.0,a,0.0
2,a,5.0,a,0.0
3,b,0.0,b,1.0
4,b,1.0,b,1.0
5,b,6.0,b,1.0
6,c,3.0,,
7,,,d,2.0


### Different join types with the how argument <br>
**how="inner"** <br> 
*Use only the key combinations observed in **both** tables.* <br><br>
**how="left"** <br> 
*Use all the key combinations found on left table.* <br><br>
**how="right"** <br> 
*Use all the key combinations found on right table.* <br><br>
**how="outer"** <br> 
*Use all the key combinations found on both tables.* <br><br>

In [41]:
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "b"],
                   "data1": pd.Series(range(6), dtype="Int64")})

In [42]:
df2= pd.DataFrame({"key": ["a", "b", "a", "b", "d"], 
                  "data2": pd.Series(range(5), dtype= "Int64")})

In [43]:
df1

Unnamed: 0,key,data1
0,b,0
1,b,1
2,a,2
3,c,3
4,a,4
5,b,5


In [44]:
df2

Unnamed: 0,key,data2
0,a,0
1,b,1
2,a,2
3,b,3
4,d,4


In [45]:
pd.merge(df1, df2, on="key", how="left")

Unnamed: 0,key,data1,data2
0,b,0,1.0
1,b,0,3.0
2,b,1,1.0
3,b,1,3.0
4,a,2,0.0
5,a,2,2.0
6,c,3,
7,a,4,0.0
8,a,4,2.0
9,b,5,1.0


In [46]:
pd.merge(df1, df2, how="inner")

Unnamed: 0,key,data1,data2
0,b,0,1
1,b,0,3
2,b,1,1
3,b,1,3
4,a,2,0
5,a,2,2
6,a,4,0
7,a,4,2
8,b,5,1
9,b,5,3


In [47]:
left = pd.DataFrame({"key1": ["foo", "foo", "bar"],
                    "key2": ["one", "two", "one"],
                    "lval": pd.Series([1,2,3], dtype="Int64")})

In [48]:
right = pd.DataFrame({"key1": ["foo", "foo", "bar", "bar"],
                      "key2": ["one", "one", "one", "two"],
                      "rval": pd.Series([4,5,6,7], dtype='Int64')})

In [49]:
pd.merge(left, right, on=["key1", "key2"], how="outer")

Unnamed: 0,key1,key2,lval,rval
0,bar,one,3.0,6.0
1,bar,two,,7.0
2,foo,one,1.0,4.0
3,foo,one,1.0,5.0
4,foo,two,2.0,


In [50]:
pd.merge(left, right, on="key1")

Unnamed: 0,key1,key2_x,lval,key2_y,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


In [51]:
pd.merge(left,right, on = "key1", suffixes=("_left", "_right"))

Unnamed: 0,key1,key2_left,lval,key2_right,rval
0,foo,one,1,one,4
1,foo,one,1,one,5
2,foo,two,2,one,4
3,foo,two,2,one,5
4,bar,one,3,one,6
5,bar,one,3,two,7


### Pandas Merge Function Arguments<br>

**left**<br>	*DataFrame to be merged on the left side.* <br><br>
**right**<br>	*DataFrame to be merged on the right side.* <br><br>
**how**<br>	*Type of join to apply: one of "inner", "outer", "left", or "right"; defaults to "inner".* <br><br>
**on**<br>	*Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys.* <br><br>
**left_on**<br>	*Columns in left DataFrame to use as join keys. Can be a single column name or a list of column names.*<br><br>
**right_on**<br>	*Analogous to left_on for right DataFrame.*
**left_index**<br>	*Use row index in left as its join key (or keys, if a MultiIndex).* <br><br>
**right_index**<br>	*Analogous to left_index.* <br><br>
**sort**<br>	*Sort merged data lexicographically by join keys; False by default.* <br>
**suffixes**<br>	*Tuple of string values to append to column names in case of overlap; defaults to ("_x", "_y") (e.g., if "data" in both DataFrame objects, would appear as "data_x" and "data_y" in result).* <br><br>
**copy**<br>	*If False, avoid copying data into resulting data structure in some exceptional cases; by default always copies.* <br><br>
**validate** <br>	*Verifies if the merge is of the specified type, whether one-to-one, one-to-many, or many-to-many. See the docstring for full details on the options.* <br><br>
**indicator** <br>	*Adds a special column _merge that indicates the source of each row; values will be "left_only", "right_only", or "both" based on the origin of the joined data in each row.*

### Merging On Index

In [52]:
left1 = pd.DataFrame({'key': ["a", 'b', 'a', "a", 'b', 'c'],
                     "value": pd.Series(range(6), dtype="Int64")})

In [53]:
right1 = pd.DataFrame({"group_val": [3.5,7]}, index=["a", "b"])

In [54]:
left1

Unnamed: 0,key,value
0,a,0
1,b,1
2,a,2
3,a,3
4,b,4
5,c,5


In [55]:
right1

Unnamed: 0,group_val
a,3.5
b,7.0


In [56]:
pd.merge(left1, right1, left_on="key", right_index=True)

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0


In [57]:
pd.merge(left1, right1, left_on="key", right_index=True, how = "outer")

Unnamed: 0,key,value,group_val
0,a,0,3.5
2,a,2,3.5
3,a,3,3.5
1,b,1,7.0
4,b,4,7.0
5,c,5,


In [58]:
lefth= pd.DataFrame({"key1": ["Ohio", "Ohio", "Ohio", "Nevada", "Nevada"],
                    "key2": [2000, 2001, 2002, 2001, 2002],
                     "data": pd.Series(range(5), dtype="Int64")})

In [59]:
righth_index = pd.MultiIndex.from_arrays(
    [
        ["Nevada", "Nevada","Ohio","Ohio","Ohio","Ohio"],
        [2001, 2000, 2000, 2000, 2001, 2002]
    ]
)

In [60]:
righth = pd.DataFrame({"event1": pd.Series([0,2,4,6,8,10], dtype="Int64", 
                                          index= righth_index),
                       "event2": pd.Series([1,3,5,7,9,11], dtype= "Int64",
                                          index= righth_index)})

In [61]:
lefth

Unnamed: 0,key1,key2,data
0,Ohio,2000,0
1,Ohio,2001,1
2,Ohio,2002,2
3,Nevada,2001,3
4,Nevada,2002,4


In [62]:
righth

Unnamed: 0,Unnamed: 1,event1,event2
Nevada,2001,0,1
Nevada,2000,2,3
Ohio,2000,4,5
Ohio,2000,6,7
Ohio,2001,8,9
Ohio,2002,10,11


In [63]:
pd.merge(lefth, righth, left_on=["key1", "key2"], right_index=True)

Unnamed: 0,key1,key2,data,event1,event2
0,Ohio,2000,0,4,5
0,Ohio,2000,0,6,7
1,Ohio,2001,1,8,9
2,Ohio,2002,2,10,11
3,Nevada,2001,3,0,1


In [64]:
pd.merge(lefth, righth, left_on=["key1", "key2"], 
        right_index=True, how="outer")

Unnamed: 0,key1,key2,data,event1,event2
4,Nevada,2000,,2.0,3.0
3,Nevada,2001,3.0,0.0,1.0
4,Nevada,2002,4.0,,
0,Ohio,2000,0.0,4.0,5.0
0,Ohio,2000,0.0,6.0,7.0
1,Ohio,2001,1.0,8.0,9.0
2,Ohio,2002,2.0,10.0,11.0


In [65]:
left2= pd.DataFrame([[1.,2.], [3.,4.],[5.,6.]],
                   index=["a", "c", "e"],
                   columns=["Ohio", "Nevada"]).astype("Int64")

In [66]:
right2 = pd.DataFrame([[7.,8.], [9., 10.], [11.,12.],[13,14]],
                     index=["b", "c", "d", "e"],
                     columns=["Missouri", "Alabama"]).astype("Int64")

In [67]:
left2

Unnamed: 0,Ohio,Nevada
a,1,2
c,3,4
e,5,6


In [68]:
right2

Unnamed: 0,Missouri,Alabama
b,7,8
c,9,10
d,11,12
e,13,14


In [69]:
pd.merge(left2, right2, how="outer", left_index=True, right_index=True)

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [70]:
left2.join(right2, how="outer")

Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


In [71]:
left1.join(right1, on="key")

Unnamed: 0,key,value,group_val
0,a,0,3.5
1,b,1,7.0
2,a,2,3.5
3,a,3,3.5
4,b,4,7.0
5,c,5,


In [72]:
another = pd.DataFrame([[7.,8.], [9.,10.], [11.,12.], [16., 17.]],
                      index=['a', 'c', 'e', 'f'],
                      columns=['New York', 'Oregon'])

In [73]:
another

Unnamed: 0,New York,Oregon
a,7.0,8.0
c,9.0,10.0
e,11.0,12.0
f,16.0,17.0


In [74]:
left2.join([right2, another])

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1,2,,,7.0,8.0
c,3,4,9.0,10.0,9.0,10.0
e,5,6,13.0,14.0,11.0,12.0


In [75]:
left2.join([right2, another], how = "outer")

Unnamed: 0,Ohio,Nevada,Missouri,Alabama,New York,Oregon
a,1.0,2.0,,,7.0,8.0
c,3.0,4.0,9.0,10.0,9.0,10.0
e,5.0,6.0,13.0,14.0,11.0,12.0
b,,,7.0,8.0,,
d,,,11.0,12.0,,
f,,,,,16.0,17.0


### Concatenating Along an Axis

In [76]:
arr = np.arange(12).reshape((3,4))

In [77]:
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [78]:
np.concatenate([arr, arr], axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

In [79]:
s1 = pd.Series([0,1], index = ["a", "b"], dtype="Int64")

In [80]:
s2 = pd.Series([2,3,4], index=["c","d","e"], dtype="Int64")

In [81]:
s3 = pd.Series([5,6], index= ["f", "g"], dtype="Int64")

In [82]:
s1

a    0
b    1
dtype: Int64

In [83]:
s2

c    2
d    3
e    4
dtype: Int64

In [84]:
s3

f    5
g    6
dtype: Int64

In [85]:
pd.concat([s1,s2,s3])

a    0
b    1
c    2
d    3
e    4
f    5
g    6
dtype: Int64

In [86]:
s4 = pd.concat([s1, s3])

In [87]:
pd.concat([s1,s4], axis="columns")

Unnamed: 0,0,1
a,0.0,0
b,1.0,1
f,,5
g,,6


In [88]:
pd.concat([s1,s4], axis="columns", join="inner")

Unnamed: 0,0,1
a,0,0
b,1,1


In [89]:
result = pd.concat([s1, s1, s3], keys=["one", "two", "three"])

In [90]:
result

one    a    0
       b    1
two    a    0
       b    1
three  f    5
       g    6
dtype: Int64

In [91]:
result.unstack()

Unnamed: 0,a,b,f,g
one,0.0,1.0,,
two,0.0,1.0,,
three,,,5.0,6.0


In [92]:
pd.concat([s1,s2,s3], axis= "columns", keys=["one", "two", "three"])

Unnamed: 0,one,two,three
a,0.0,,
b,1.0,,
c,,2.0,
d,,3.0,
e,,4.0,
f,,,5.0
g,,,6.0


In [93]:
df1 = pd.DataFrame(np.arange(6).reshape(3,2), index=["a", "b", "c"],
                  columns=['one', 'two'])

In [94]:
df2 = pd.DataFrame(5 + np.arange(4).reshape(2,2), index=["a", "c"],
                  columns=['three', 'four'])

In [95]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [96]:
df2

Unnamed: 0,three,four
a,5,6
c,7,8


In [97]:
pd.concat([df1, df2], axis="columns", keys=["level1", "level2"])

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [98]:
pd.concat({"level1": df1, "level2": df2}, axis="columns")

Unnamed: 0_level_0,level1,level1,level2,level2
Unnamed: 0_level_1,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [99]:
pd.concat([df1, df2], axis = "columns", keys=["level1", "level2"], names =['upper', 'lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [100]:
df1= pd.DataFrame(np.random.standard_normal((3,4)), columns=["b", "d", "a"])

ValueError: Shape of passed values is (3, 4), indices imply (3, 3)

In [101]:
df2= pd.DataFrame(np.random.standard_normal((2,3)), columns=["b", "d", "a"])

In [102]:
df1

Unnamed: 0,one,two
a,0,1
b,2,3
c,4,5


In [103]:
df2

Unnamed: 0,b,d,a
0,0.818561,0.502072,-1.226262
1,0.283071,-0.243562,-1.376158


In [104]:
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,one,two,b,d,a
0,0.0,1.0,,,
1,2.0,3.0,,,
2,4.0,5.0,,,
3,,,0.818561,0.502072,-1.226262
4,,,0.283071,-0.243562,-1.376158


### Pandas.Concat Function Arguments <br> 

**objs**<br>	*List or dictionary of pandas objects to be concatenated; this is the only required argument*<br><br>
**axis**<br> *Axis to concatenate along; defaults to concatenating along rows (axis="index")* <br><br>
**join**<br>	*Either "inner" or "outer" ("outer" by default); whether to intersect (inner) or union (outer) indexes along the other axes*<br><br>
**keys**<br>	*Values to associate with objects being concatenated, forming a hierarchical index along the concatenation axis; can be a list or array of arbitrary values, an array of tuples, or a list of arrays (if multiple-level arrays passed in levels)*<br><br>
**levels**<br>	*Specific indexes to use as hierarchical index level or levels if keys passed*<br><br>
**names**<br>	*Names for created hierarchical levels if keys and/or levels passed*<br><br>
**verify_integrity**<br>	*Check new axis in concatenated object for duplicates and raise an exception if so; by default (False) allows duplicates*<br><br>
**ignore_index**<br>	*Do not preserve indexes along concatenation axis, instead produce a new range(total_length) index* 

### Combining Data with Overlap

In [105]:
a = pd.Series([np.nan, 2.5, 0.0, 3.5, 4.5, np.nan], index=["f", "e", "d", "c", "b", "a"])

In [106]:
b = pd.Series([0., np.nan, 2., np.nan, np.nan, 5.],
             index=["a", "b", "c", "d", "e", "f"])

In [107]:
a

f    NaN
e    2.5
d    0.0
c    3.5
b    4.5
a    NaN
dtype: float64

In [108]:
b

a    0.0
b    NaN
c    2.0
d    NaN
e    NaN
f    5.0
dtype: float64

In [109]:
np.where(pd.isna(a), b, a)

array([0. , 2.5, 0. , 3.5, 4.5, 5. ])

In [110]:
a.combine_first(b)

a    0.0
b    4.5
c    3.5
d    0.0
e    2.5
f    5.0
dtype: float64

In [111]:
df1 = pd.DataFrame({"a": [1., np.nan, 5., np.nan], 
                    "b": [np.nan, 2., np.nan, 6],
                    "c": range(2,18,4)})

In [112]:
df2 = pd.DataFrame({"a": [5., 4., np.nan, 3., 7.],
                   "b": [np.nan, 3., 4., 6., 8.]})

In [113]:
df1

Unnamed: 0,a,b,c
0,1.0,,2
1,,2.0,6
2,5.0,,10
3,,6.0,14


In [114]:
df2

Unnamed: 0,a,b
0,5.0,
1,4.0,3.0
2,,4.0
3,3.0,6.0
4,7.0,8.0


In [115]:
df1.combine_first(df2)

Unnamed: 0,a,b,c
0,1.0,,2.0
1,4.0,2.0,6.0
2,5.0,4.0,10.0
3,3.0,6.0,14.0
4,7.0,8.0,


# Reshaping and Pivoting
### Reshaping with Heirachical Indexing

In [116]:
data = pd.DataFrame(np.arange(6).reshape((2,3)),
                   index=pd.Index(["Ohio", "Colorado"], name="state"),
                   columns=pd.Index(["one", "two", "three"],
                   name="number"))

In [117]:
data

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [118]:
result = data.stack()

In [119]:
result

state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int64

In [120]:
result.unstack()

number,one,two,three
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,0,1,2
Colorado,3,4,5


In [121]:
result.unstack(level=0)

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [122]:
result.unstack(level="state")

state,Ohio,Colorado
number,Unnamed: 1_level_1,Unnamed: 2_level_1
one,0,3
two,1,4
three,2,5


In [123]:
s1= pd.Series([0,1,2,3], index=["a", "b", "c", "d"], dtype="Int64")

In [124]:
s2= pd.Series([4,5,6], index=["c", "d", "e"], dtype="Int64")

In [125]:
data2 = pd.concat([s1, s2], keys=["one", "two"])

In [126]:
data2

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: Int64

In [127]:
data2.unstack()


Unnamed: 0,a,b,c,d,e
one,0.0,1.0,2,3,
two,,,4,5,6.0


In [128]:
data2.unstack().stack()

one  a    0
     b    1
     c    2
     d    3
two  c    4
     d    5
     e    6
dtype: Int64

In [129]:
data2.unstack().stack(dropna=False)

  data2.unstack().stack(dropna=False)


one  a       0
     b       1
     c       2
     d       3
     e    <NA>
two  a    <NA>
     b    <NA>
     c       4
     d       5
     e       6
dtype: Int64

In [130]:
df = pd.DataFrame({"left": result, "right": result + 5},
                 columns=pd.Index(["left", "right"], name = "side"))

In [131]:
df

Unnamed: 0_level_0,side,left,right
state,number,Unnamed: 2_level_1,Unnamed: 3_level_1
Ohio,one,0,5
Ohio,two,1,6
Ohio,three,2,7
Colorado,one,3,8
Colorado,two,4,9
Colorado,three,5,10


In [132]:
df.unstack(level="state")

side,left,left,right,right
state,Ohio,Colorado,Ohio,Colorado
number,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
one,0,3,5,8
two,1,4,6,9
three,2,5,7,10


In [133]:
df.unstack(level="state").stack(level="side")

  df.unstack(level="state").stack(level="side")


Unnamed: 0_level_0,state,Ohio,Colorado
number,side,Unnamed: 2_level_1,Unnamed: 3_level_1
one,left,0,3
one,right,5,8
two,left,1,4
two,right,6,9
three,left,2,5
three,right,7,10


### Pivoting "Long" to "Wide" Format

In [141]:
data = pd.read_csv("examples/macrodata.csv")

In [142]:
data 

Unnamed: 0,year,quarter,realgdp,realcons,realinv,realgovt,realdpi,cpi,m1,tbilrate,unemp,pop,infl,realint
0,1959.0,1.0,2710.349,1707.4,286.898,470.045,1886.9,28.980,139.7,2.82,5.8,177.146,0.00,0.00
1,1959.0,2.0,2778.801,1733.7,310.859,481.301,1919.7,29.150,141.7,3.08,5.1,177.830,2.34,0.74
2,1959.0,3.0,2775.488,1751.8,289.226,491.260,1916.4,29.350,140.5,3.82,5.3,178.657,2.74,1.09
3,1959.0,4.0,2785.204,1753.7,299.356,484.052,1931.3,29.370,140.0,4.33,5.6,179.386,0.27,4.06
4,1960.0,1.0,2847.699,1770.5,331.722,462.199,1955.5,29.540,139.6,3.50,5.2,180.007,2.31,1.19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,2008.0,3.0,13324.600,9267.7,1990.693,991.551,9838.3,216.889,1474.7,1.17,6.0,305.270,-3.16,4.33
199,2008.0,4.0,13141.920,9195.3,1857.661,1007.273,9920.4,212.174,1576.5,0.12,6.9,305.952,-8.79,8.91
200,2009.0,1.0,12925.410,9209.2,1558.494,996.287,9926.4,212.671,1592.8,0.22,8.1,306.547,0.94,-0.71
201,2009.0,2.0,12901.504,9189.0,1456.678,1023.528,10077.5,214.469,1653.6,0.18,9.2,307.226,3.37,-3.19


In [143]:
data = data.loc[:, ["year", "quarter", "realgdp", "infl", "unemp"]]

In [144]:
data.head

<bound method NDFrame.head of        year  quarter    realgdp  infl  unemp
0    1959.0      1.0   2710.349  0.00    5.8
1    1959.0      2.0   2778.801  2.34    5.1
2    1959.0      3.0   2775.488  2.74    5.3
3    1959.0      4.0   2785.204  0.27    5.6
4    1960.0      1.0   2847.699  2.31    5.2
..      ...      ...        ...   ...    ...
198  2008.0      3.0  13324.600 -3.16    6.0
199  2008.0      4.0  13141.920 -8.79    6.9
200  2009.0      1.0  12925.410  0.94    8.1
201  2009.0      2.0  12901.504  3.37    9.2
202  2009.0      3.0  12990.341  3.56    9.6

[203 rows x 5 columns]>

In [139]:
periods = pd.PeriodIndex(year=data.pop("year"),
                        quarter=data.pop("quarter"), 
                        name="date")

KeyError: 'year'

In [145]:
periods

NameError: name 'periods' is not defined

In [146]:
data.index = periods.to_timestamp("D")

NameError: name 'periods' is not defined

In [147]:
data.head()

Unnamed: 0,year,quarter,realgdp,infl,unemp
0,1959.0,1.0,2710.349,0.0,5.8
1,1959.0,2.0,2778.801,2.34,5.1
2,1959.0,3.0,2775.488,2.74,5.3
3,1959.0,4.0,2785.204,0.27,5.6
4,1960.0,1.0,2847.699,2.31,5.2


In [148]:
data = data.reindex(columns=["realgdp", "infl", "unemp"])

In [149]:
data.columns.name= "item"

In [150]:
data.head()

item,realgdp,infl,unemp
0,2710.349,0.0,5.8
1,2778.801,2.34,5.1
2,2775.488,2.74,5.3
3,2785.204,0.27,5.6
4,2847.699,2.31,5.2


In [152]:
long_data=(data.stack()
           .reset_index()
           .rename(columns={0: "value"}))

In [153]:
long_data[:10]

Unnamed: 0,level_0,item,value
0,0,realgdp,2710.349
1,0,infl,0.0
2,0,unemp,5.8
3,1,realgdp,2778.801
4,1,infl,2.34
5,1,unemp,5.1
6,2,realgdp,2775.488
7,2,infl,2.74
8,2,unemp,5.3
9,3,realgdp,2785.204


In [154]:
pivoted = long_data.pivot(index="date", columns="item", values="value")

KeyError: 'date'

In [155]:
long_data["value2"]=np.random.standard_normal(len(long_data))

In [156]:
long_data[:10]

Unnamed: 0,level_0,item,value,value2
0,0,realgdp,2710.349,0.033154
1,0,infl,0.0,0.185773
2,0,unemp,5.8,-1.873095
3,1,realgdp,2778.801,-0.06826
4,1,infl,2.34,0.62931
5,1,unemp,5.1,0.281512
6,2,realgdp,2775.488,0.873847
7,2,infl,2.74,-1.798442
8,2,unemp,5.3,-1.454125
9,3,realgdp,2785.204,0.539004


In [157]:
pivoted = long_data.pivot(index="date", columns="item")

KeyError: "None of ['date'] are in the columns"

In [158]:
pivoted.head()

NameError: name 'pivoted' is not defined

In [159]:
pivoted["value"].head()

NameError: name 'pivoted' is not defined

In [160]:
unstacked = long_data.set_index(["date", "item"]).unstack(level="item")

KeyError: "None of ['date'] are in the columns"

In [161]:
unstacked.head()

NameError: name 'unstacked' is not defined

### Pivoting Wide to Long Format

In [162]:
df= pd.DataFrame({"key": ["foo", "bar", "baz"],
                 "A": [1,2,3],
                 "B": [4,5,6],
                 "C": [7,8,9]})

In [163]:
df

Unnamed: 0,key,A,B,C
0,foo,1,4,7
1,bar,2,5,8
2,baz,3,6,9


In [164]:
melted = pd.melt(df, id_vars="key")

In [165]:
melted

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6
6,foo,C,7
7,bar,C,8
8,baz,C,9


In [168]:
reshaped =melted.pivot(index="key", columns="variable", values = "value")

In [169]:
reshaped

variable,A,B,C
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,5,8
baz,3,6,9
foo,1,4,7


In [170]:
reshaped.reset_index()

variable,key,A,B,C
0,bar,2,5,8
1,baz,3,6,9
2,foo,1,4,7


In [171]:
pd.melt(df, id_vars="key", value_vars=["A", "B"])

Unnamed: 0,key,variable,value
0,foo,A,1
1,bar,A,2
2,baz,A,3
3,foo,B,4
4,bar,B,5
5,baz,B,6


In [None]:
pd.melt(df, value_vars=["A", "B", ])