In [2]:
# Standard data science libraries
import pandas as pd
import numpy as np
from scipy import stats
import featuretools as ft
# Visualization
import matplotlib.pyplot as plt
%matplotlib notebook
import seaborn as sns
plt.style.use('bmh')
# Options for pandas
pd.options.display.max_columns = 20
# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'



## Exercise 1 ##
- create a `DataFrame` object with multiple index columns
- swap index rows, and sort from the outermost index in
- summarize the dataframe at the level of the multi-index
- select subsets of rows by each index, then by all indices

In [3]:
# - create a `DataFrame` object with multiple index columns

df1=pd.read_csv('./data/quarterly.csv', index_col=['quarter','year'])
df1
df1_index=df1.index
df1_index
df1_index.levels
# - swap index rows, and sort from the outermost index in
df1=df1.swaplevel().sort_index(level=1).sort_index(level=0)
df1
# - summarize the dataframe at the level of the multi-index
df1.sum(level=1)
# - select subsets of rows by each index, then by all indices
df1.loc[2017]
df1.loc[:,'Q2']

Unnamed: 0_level_0,Unnamed: 1_level_0,count,amount
quarter,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Q3,2017,1,891.11
Q3,2018,8,878.02
Q4,2017,5,805.45
Q2,2016,3,760.61
Q1,2017,1,696.12
Q3,2016,4,674.76
Q4,2016,1,541.3
Q2,2017,5,533.21
Q1,2018,7,486.81
Q1,2016,4,255.93


MultiIndex([('Q3', 2017),
            ('Q3', 2018),
            ('Q4', 2017),
            ('Q2', 2016),
            ('Q1', 2017),
            ('Q3', 2016),
            ('Q4', 2016),
            ('Q2', 2017),
            ('Q1', 2018),
            ('Q1', 2016),
            ('Q4', 2018),
            ('Q2', 2018)],
           names=['quarter', 'year'])

FrozenList([['Q1', 'Q2', 'Q3', 'Q4'], [2016, 2017, 2018]])

Unnamed: 0_level_0,Unnamed: 1_level_0,count,amount
year,quarter,Unnamed: 2_level_1,Unnamed: 3_level_1
2016,Q1,4,255.93
2016,Q2,3,760.61
2016,Q3,4,674.76
2016,Q4,1,541.3
2017,Q1,1,696.12
2017,Q2,5,533.21
2017,Q3,1,891.11
2017,Q4,5,805.45
2018,Q1,7,486.81
2018,Q2,8,101.93


Unnamed: 0_level_0,count,amount
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1
Q1,12,1438.86
Q2,16,1395.75
Q3,13,2443.89
Q4,12,1474.34


Unnamed: 0_level_0,count,amount
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1
Q1,1,696.12
Q2,5,533.21
Q3,1,891.11
Q4,5,805.45


KeyError: 'Q2'

## Exercise 2 ##
- create a `MultiIndex` object using `.from_arrays()` method
- create a dataframe using the multindex object for columns and using a list of lists to create a multindex for the index
- calculate row sums by each column index
- use `stack()` to move the column headings into row labels
- flatten the row labels and create an integer index

In [None]:
# - create a `MultiIndex` object using `.from_arrays()` method
mi1=pd.MultiIndex.from_arrays([['X','X','X','X','Y','Y','Y'],['x1','x2','x3','x4','y1','y2','y3']],
                              names=['vector','element'])
# - create a dataframe using the multindex object for columns and using a list of lists to create a multindex for the index
df2=pd.DataFrame(np.random.rand(70).reshape(10,7),
                index=[['A','A','A','A','B','B','B','B','B','B'],
                      ['a1','a2','a3','a4','b1','b2','b3','b4','b5','b6']],
                columns=mi1)
df2
# - calculate row sums by each column index
df2.sum(axis=1)
df2.sum(axis=1,level=0)
# - use `stack()` to move the column headings into row labels
df2=df2.stack()
# - flatten the row labels and create an integer index
df2.reset_index()
df2

## Exercise 3 ##
- create two dataframes with at least one column name in common and another column to join on but with differnet column names and some shared values in that column (e.g., from `merge1.csv` and `merge2.csv`)
- use the `merge()` function without optional paramaters to combine the dataframes
- use the `merge()` function with a parameter to deal with the different column names
- use the `merge()` function with a parameter to retain join column values that appear in only one of the two dataframes, include an column indicating which of the original dataframes the row used
- use the `merge()` function to join on index instead of column values, retaining all data from both dataframes, specify suffixes for the column labels taht appear in both dataframes

In [None]:
# - create two dataframes with at least one column name in common and another column to join 
# on but with differnet column names and some shared values in that column 
# (e.g., from `merge1.csv` and `merge2.csv`)
df3=pd.read_csv('./data/merge1.csv')
df3
df4=pd.read_csv('./data/merge2.csv')
df4
# - use the `merge()` function without optional paramaters to combine the dataframes
pd.merge(df3,df4)
# - use the `merge()` function with a parameter to deal with the different column names
pd.merge(df3,df4,left_on=['year','qtr'],right_on=['year','quarter'])
# - use the `merge()` function with a parameter to retain join column values that appear in 
# only one of the two dataframes, include an column indicating which of the original 
# dataframes the row used
pd.merge(df3,df4,left_on=['year','qtr'],right_on=['year','quarter'],how='outer',indicator=True)
# - use the `merge()` function to join on index instead of column values, retaining all data 
# from both dataframes, specify suffixes for the column labels taht appear in both dataframes
pd.merge(df3,df4,left_index=True,right_index=True,how='outer',indicator=True)

## Exercise 4 ##
- create two dataframes with at least one column name in common and another column to join on but with differnet column names and some shared values in that column (e.g., from `merge1.csv` and `merge2.csv`)
- use the `.join()` method to combine the dataframes on index, you will have to specify a suffix for at least one side for columns that are in both dataframes
- make the correct columns into indices so the `.join()` method will combine the data correctly

In [None]:
# - create two dataframes with at least one column name in common and another column to join 
# on but with differnet column names and some shared values in that column 
# (e.g., from `merge1.csv` and `merge2.csv`)
df3=pd.read_csv('./data/merge1.csv')
df3
df4=pd.read_csv('./data/merge2.csv')
df4
# - use the `.join()` method to combine the dataframes, you will have to specify a suffix for 
# at least one side for columns that are in both dataframes
df3.join(df4,lsuffix='_l')
# - make the correct columns into indices so the `.join()` method will combine the data correctly, 
# include all rows from both dataframes
df3.rename(columns={'qtr':'quarter'},inplace=True)
df3.set_index(['year','quarter'],inplace=True)
df3
df4.set_index(['year','quarter'],inplace=True)
df4
df3.join(df4,lsuffix='_l',how='outer')

## Exercise 5 ##
- create two dataframes with at least one column name in common and another column to join on but with differnet column names and some shared values in that column (e.g., from `merge1.csv` and `merge2.csv`)
- use the `concat` function to append one dataframe to the other vertically
- use the `concat` function to append one dataframe to the other vertically and create an index indicating which dataframe the row came from, keeping only collumns that appear in both dataframes
- use the `concat` function to append one dataframe to the other horizontally, include column group labels by passing a dictionary argument

In [None]:
# - create two dataframes with at least one column name in common and another column to join 
# on but with differnet column names and some shared values in that column 
# (e.g., from `merge1.csv` and `merge2.csv`)
df3=pd.read_csv('./data/merge1.csv').rename(columns={'qtr':'quarter'})
# df3
df4=pd.read_csv('./data/merge2.csv')
# df4
# - use the `concat` function to append one dataframe to the other vertically
pd.concat([df3,df4])
# - use the `concat` function to append one dataframe to the other vertically and create 
# an index indicating which dataframe the row came from, keeping only collumns that appear 
# in both dataframes
pd.concat([df3,df4],keys=['dataframe1','daataframe2'],join='inner')
# - use the `concat` function to append one dataframe to the other horizontally, include 
# column group labels by passing a dictionary argument
pd.concat({'dataframe1':df3,'daataframe2':df4},axis=1,names=['source','parameter'])


## Exercise 6 ##
- create two dataframes with at least one column name in common and another column to join on but with different column names and some shared values in that column (e.g., from `merge1.csv` and `merge2.csv`), make sure there are overlapping indices and columns
- combine the datasets so that when there are overlapping cells, the values from the first dataframe are used

In [4]:
# - create two dataframes with at least one column name in common and another column to join 
# on but with different column names and some shared values in that column 
# (e.g., from `merge1.csv` and `merge2.csv`), make sure there are overlapping indices and columns
df3=pd.read_csv('./data/merge1.csv').rename(columns={'qtr':'quarter'})
df3
df4=pd.read_csv('./data/merge2.csv')
df4['a']=.5
df4
# - combine the datasets so that when there are overlapping cells, the values from the first 
# dataframe are used
df3.combine_first(df4)

Unnamed: 0,year,quarter,a,b,c
0,2010,1,0.35375,0.006548,0.623233
1,2011,1,0.385441,0.45164,0.525797
2,2012,1,0.841329,0.025067,0.780011
3,2010,3,0.680465,0.391174,0.191392
4,2011,3,0.603404,0.863669,0.645043
5,2012,3,0.989605,0.123076,0.657214
6,2010,2,0.833007,0.345412,0.4986
7,2011,2,0.775425,0.333903,0.113928
8,2012,2,0.296919,0.525885,0.136487
9,2010,4,0.419915,0.015624,0.021092


Unnamed: 0,year,quarter,x,y,z,a
0,2011,1,0.642814,0.962332,0.467752,0.5
1,2011,2,0.692682,0.085931,0.296144,0.5
2,2011,3,0.200236,0.465231,0.516087,0.5
3,2011,4,0.42745,0.352248,0.12334,0.5
4,2012,1,0.859883,0.877451,0.249366,0.5
5,2012,2,0.931486,0.540727,0.168296,0.5
6,2012,3,0.228344,0.051946,0.995317,0.5
7,2012,4,0.771774,0.162715,0.547585,0.5
8,2013,1,0.803939,0.023856,0.520435,0.5
9,2013,2,0.525905,0.9894,0.451665,0.5


Unnamed: 0,a,b,c,quarter,x,y,year,z
0,0.35375,0.006548,0.623233,1.0,0.642814,0.962332,2010.0,0.467752
1,0.385441,0.45164,0.525797,1.0,0.692682,0.085931,2011.0,0.296144
2,0.841329,0.025067,0.780011,1.0,0.200236,0.465231,2012.0,0.516087
3,0.680465,0.391174,0.191392,3.0,0.42745,0.352248,2010.0,0.12334
4,0.603404,0.863669,0.645043,3.0,0.859883,0.877451,2011.0,0.249366
5,0.989605,0.123076,0.657214,3.0,0.931486,0.540727,2012.0,0.168296
6,0.833007,0.345412,0.4986,2.0,0.228344,0.051946,2010.0,0.995317
7,0.775425,0.333903,0.113928,2.0,0.771774,0.162715,2011.0,0.547585
8,0.296919,0.525885,0.136487,2.0,0.803939,0.023856,2012.0,0.520435
9,0.419915,0.015624,0.021092,4.0,0.525905,0.9894,2010.0,0.451665


## Exercise 7 ##
- create two dataframes with at least one column name in common and another column to join on but with differnet column names and some shared values in that column (e.g., from `merge1.csv` and `merge2.csv`)
- `.stack()` both of the dataframes and swap levels
- `unstack()` the dataframes and combine them vertically

In [5]:
df3=pd.read_csv('./data/merge1.csv').rename(columns={'qtr':'quarter'})
df3
df4=pd.read_csv('./data/merge2.csv')
# df4
# - `.stack()` both of the dataframes and swap levels
df5=df3.stack().swaplevel()
df5
df6=df4.stack().swaplevel()
df6
# - `unstack()` the dataframes and combine them
df7=pd.concat([df5.unstack(),df6.unstack()])
df7


Unnamed: 0,year,quarter,a,b,c
0,2010,1,0.35375,0.006548,0.623233
1,2011,1,0.385441,0.45164,0.525797
2,2012,1,0.841329,0.025067,0.780011
3,2010,3,0.680465,0.391174,0.191392
4,2011,3,0.603404,0.863669,0.645043
5,2012,3,0.989605,0.123076,0.657214
6,2010,2,0.833007,0.345412,0.4986
7,2011,2,0.775425,0.333903,0.113928
8,2012,2,0.296919,0.525885,0.136487
9,2010,4,0.419915,0.015624,0.021092


year     0     2010.000000
quarter  0        1.000000
a        0        0.353750
b        0        0.006548
c        0        0.623233
year     1     2011.000000
quarter  1        1.000000
a        1        0.385441
b        1        0.451640
c        1        0.525797
year     2     2012.000000
quarter  2        1.000000
a        2        0.841329
b        2        0.025067
c        2        0.780011
year     3     2010.000000
quarter  3        3.000000
a        3        0.680465
b        3        0.391174
c        3        0.191392
year     4     2011.000000
quarter  4        3.000000
a        4        0.603404
b        4        0.863669
c        4        0.645043
year     5     2012.000000
quarter  5        3.000000
a        5        0.989605
b        5        0.123076
c        5        0.657214
year     6     2010.000000
quarter  6        2.000000
a        6        0.833007
b        6        0.345412
c        6        0.498600
year     7     2011.000000
quarter  7        2.000000
a

year     0     2011.000000
quarter  0        1.000000
x        0        0.642814
y        0        0.962332
z        0        0.467752
                  ...     
year     15    2014.000000
quarter  15       4.000000
x        15       0.678974
y        15       0.172293
z        15       0.916729
Length: 80, dtype: float64

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
year,2010.0,2011.0,2012.0,2010.0,2011.0,2012.0,2010.0,2011.0,2012.0,2010.0,2011.0,2012.0,,,,
quarter,1.0,1.0,1.0,3.0,3.0,3.0,2.0,2.0,2.0,4.0,4.0,4.0,,,,
a,0.35375,0.385441,0.841329,0.680465,0.603404,0.989605,0.833007,0.775425,0.296919,0.419915,0.374054,0.988209,,,,
b,0.006548,0.45164,0.025067,0.391174,0.863669,0.123076,0.345412,0.333903,0.525885,0.015624,0.281761,0.963099,,,,
c,0.623233,0.525797,0.780011,0.191392,0.645043,0.657214,0.4986,0.113928,0.136487,0.021092,0.823217,0.48738,,,,
year,2011.0,2011.0,2011.0,2011.0,2012.0,2012.0,2012.0,2012.0,2013.0,2013.0,2013.0,2013.0,2014.0,2014.0,2014.0,2014.0
quarter,1.0,2.0,3.0,4.0,1.0,2.0,3.0,4.0,1.0,2.0,3.0,4.0,1.0,2.0,3.0,4.0
x,0.642814,0.692682,0.200236,0.42745,0.859883,0.931486,0.228344,0.771774,0.803939,0.525905,0.731953,0.984646,0.358981,0.408011,0.481476,0.678974
y,0.962332,0.085931,0.465231,0.352248,0.877451,0.540727,0.051946,0.162715,0.023856,0.9894,0.019131,0.953599,0.995009,0.931475,0.897445,0.172293
z,0.467752,0.296144,0.516087,0.12334,0.249366,0.168296,0.995317,0.547585,0.520435,0.451665,0.468323,0.266323,0.993527,0.418217,0.528483,0.916729


## Exercise 8 ##
- create a dataframes
- use `.pivot()` to extract one of the variable columns
- use `.melt()` to put the pivot into a long ("skinny") layout
- melt the original dataframe, specifying which columns are value columns and key columns

In [6]:
# - create a dataframes
df3=pd.read_csv('./data/merge1.csv').rename(columns={'qtr':'quarter'})
# - use `.pivot()` to extract one of the variable columns
df8=df3.pivot(index='quarter',columns='year',values='a')
df8
# - use `.melt()` to put the pivot into a long ("skinny") layout
df8.melt()
# - melt the original dataframe, specifying which columns are value columns and key columns
df3.melt(value_vars=['a','b','c'],id_vars=['year','quarter'])

year,2010,2011,2012
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.35375,0.385441,0.841329
2,0.833007,0.775425,0.296919
3,0.680465,0.603404,0.989605
4,0.419915,0.374054,0.988209


Unnamed: 0,year,value
0,2010,0.35375
1,2010,0.833007
2,2010,0.680465
3,2010,0.419915
4,2011,0.385441
5,2011,0.775425
6,2011,0.603404
7,2011,0.374054
8,2012,0.841329
9,2012,0.296919


Unnamed: 0,year,quarter,variable,value
0,2010,1,a,0.35375
1,2011,1,a,0.385441
2,2012,1,a,0.841329
3,2010,3,a,0.680465
4,2011,3,a,0.603404
5,2012,3,a,0.989605
6,2010,2,a,0.833007
7,2011,2,a,0.775425
8,2012,2,a,0.296919
9,2010,4,a,0.419915
