In [2]:
# Standard data science libraries
import pandas as pd
import numpy as np
from scipy import stats
import featuretools as ft
# Visualization
import matplotlib.pyplot as plt
%matplotlib notebook
import seaborn as sns
plt.style.use('bmh')
# Options for pandas
pd.options.display.max_columns = 20
# Display all cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'


## Exercise 1 ##
- create or load a dataframe
- `GroupBy` one of the columns and apply an aggregate function

In [23]:
df1=pd.read_csv('./data/quarterly.csv')
df1
df1.groupby('year').sum()

Unnamed: 0,quarter,year,count,amount
0,Q3,2017,1,891.11
1,Q3,2018,8,878.02
2,Q4,2017,5,805.45
3,Q2,2016,3,760.61
4,Q1,2017,1,696.12
5,Q3,2016,4,674.76
6,Q4,2016,1,541.3
7,Q2,2017,5,533.21
8,Q1,2018,7,486.81
9,Q1,2016,4,255.93


Unnamed: 0_level_0,count,amount
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2016,12,2232.6
2017,12,2925.89
2018,29,1594.35


## Exercise 2 ##
- create or load a dataframe with multiple groupby columns
- `GroupBy` more than one column 
- select a subset of columns and apply an aggregate function


In [24]:
type(df1.groupby(['year','quarter']).median()['count'])
type(df1.groupby(['year','quarter']).median()[['count']])
df1.groupby(['year','quarter']).median()[['count']]

pandas.core.series.Series

pandas.core.frame.DataFrame

Unnamed: 0_level_0,Unnamed: 1_level_0,count
year,quarter,Unnamed: 2_level_1
2016,Q1,4
2016,Q2,3
2016,Q3,4
2016,Q4,1
2017,Q1,1
2017,Q2,5
2017,Q3,1
2017,Q4,5
2018,Q1,7
2018,Q2,8


## Exercise 3 ##
- create or load a dataframe with hierarchical columns
- `GroupBy` one of the columns and apply an aggregate function

In [32]:
mi1=pd.MultiIndex.from_arrays([['X','X','X','X','Y','Y','Y'],['x1','x2','x3','x4','y1','y2','y3']],
                              names=['vector','element'])
# - create a dataframe using the multindex object for columns and using a list of lists to create a multindex for the index
df2=pd.DataFrame(np.random.rand(70).reshape(10,7),
                index=[['A','A','A','A','B','B','B','B','B','B'],
                      ['a1','a2','a3','a4','b1','b2','b3','b4','b5','b6']],
                columns=mi1)
df2
df2.groupby('vector',axis=1).max()

Unnamed: 0_level_0,vector,X,X,X,X,Y,Y,Y
Unnamed: 0_level_1,element,x1,x2,x3,x4,y1,y2,y3
A,a1,0.538448,0.11879,0.611184,0.497389,0.583318,0.366015,0.318531
A,a2,0.048315,0.879891,0.297876,0.058874,0.648632,0.024112,0.946247
A,a3,0.157243,0.826071,0.506038,0.556523,0.152718,0.804498,0.55897
A,a4,0.407986,0.65782,0.212958,0.506303,0.161045,0.09018,0.17491
B,b1,0.138016,0.088412,0.375228,0.649843,0.606831,0.486236,0.664308
B,b2,0.746011,0.595627,0.583776,0.714912,0.986667,0.960092,0.963833
B,b3,0.15017,0.277193,0.831247,0.888379,0.737681,0.627701,0.628004
B,b4,0.122706,0.290511,0.666921,0.994391,0.781334,0.477827,0.372372
B,b5,0.797865,0.971486,0.196642,0.614936,0.852256,0.707435,0.677283
B,b6,0.53628,0.956013,0.023594,0.596004,0.79385,0.108515,0.692252


Unnamed: 0,vector,X,Y
A,a1,0.611184,0.583318
A,a2,0.879891,0.946247
A,a3,0.826071,0.804498
A,a4,0.65782,0.17491
B,b1,0.649843,0.664308
B,b2,0.746011,0.986667
B,b3,0.888379,0.737681
B,b4,0.994391,0.781334
B,b5,0.971486,0.852256
B,b6,0.956013,0.79385


## Exercise 4 ##
- create or load a dataframe
- `GroupBy` using dictionary or series
- apply multiple aggregate functions `.agg()`

In [49]:
df3=pd.read_csv('./data/quarterly.csv', index_col=['quarter'])
df3.groupby({'Q1':'first half','Q2':'first half','Q3':'second half','Q4':'second half'})\
        .agg(['median','mean'])

Unnamed: 0_level_0,year,year,count,count,amount,amount
Unnamed: 0_level_1,median,mean,median,mean,median,mean
first half,2017,2017,4.5,4.666667,510.01,472.435
second half,2017,2017,4.5,4.166667,740.105,653.038333


## Exercise 5 ##
- create or load a dataframe with a `MultiIndex`
- `GroupBy` using the `MultiIndex` and apply an aggregate function

In [51]:
df2.groupby(level=0,axis=1).std()


Unnamed: 0,vector,X,Y
A,a1,0.220194,0.141178
A,a2,0.389855,0.470626
A,a3,0.274779,0.329176
A,a4,0.18642,0.045448
B,b1,0.257161,0.090881
B,b2,0.082396,0.014385
B,b3,0.377352,0.06341
B,b4,0.390355,0.212323
B,b5,0.33261,0.09354
B,b6,0.383976,0.369855


## Exercise 6 ##
- create or load a dataframe
- `GroupBy` using a built-in or custom function
- apply an aggregate function

In [57]:
excel_file=pd.ExcelFile('data/sales_summary.xlsx')
df4=pd.read_excel(excel_file,'Summary',skiprows=3,index_col='location')
df4
df4.groupby(len).sum()

Unnamed: 0_level_0,start_date,units_sold,total_sales
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
London,2010-07-01,12500000,312500000
Oslo,2011-08-15,9800000,245000000
Berlin,2009-01-01,14000000,350000000
Paris,2016-04-01,2100000,52500000
Istanbul,2018-10-01,400000,10000000


Unnamed: 0,units_sold,total_sales
4,9800000,245000000
5,2100000,52500000
6,26500000,662500000
8,400000,10000000


## Exercise 7 ##
- create or load a dataframe with some missing values for some rows of the groupby
- `GroupBy` one of the columns and apply an aggregate function
- fill missing values with the group mean

In [62]:
df5=pd.read_csv('data/missing-values.csv',
                     na_values=np.nan)
df5
df5.groupby('year').apply(lambda x: x.fillna(x.mean))

Unnamed: 0,year,month,count,sum,exposure,users
0,2010,Jan,67830.0,621521.0,812240.0,335.0
1,2010,Feb,93099.0,178392.0,,
2,2010,Mar,310739.0,707974.0,396605.0,604.0
3,2010,Apr,116191.0,499533.0,700482.0,888.0
4,2010,May,48907.0,45271.0,546622.0,
5,2010,Jun,243890.0,467405.0,201393.0,703.0
6,2010,Jul,284681.0,431272.0,640114.0,964.0
7,2010,Aug,66161.0,,,
8,2010,Sep,54391.0,748262.0,475010.0,90.0
9,2010,Oct,28566.0,187409.0,409017.0,688.0


ValueError: max() arg is an empty sequence

ValueError: max() arg is an empty sequence

## Exercise 8 ##
- create or load a dataframe
- calculate a group weighted average

In [85]:
get_wavg = lambda x: np.average(x['count'],weights=x['amount'])
df1.groupby('year').apply(get_wavg)

year
2016    2.931958
2017    2.830089
2018    7.534613
dtype: float64

## Exercise 9 ##
- create or load a dataframe
- group by quantiles

In [90]:
df7=pd.DataFrame(np.random.rand(120).reshape(20,6),\
    columns={'a','b','c','d','e','f'})
df7.groupby(pd.qcut(df7['a'],4)).mean()

Unnamed: 0_level_0,b,e,d,f,a,c
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(-0.00087, 0.166]",0.410996,0.412326,0.389743,0.512927,0.076622,0.58046
"(0.166, 0.339]",0.317207,0.314986,0.366318,0.733726,0.271913,0.587904
"(0.339, 0.587]",0.580386,0.749127,0.51503,0.511278,0.43244,0.458088
"(0.587, 0.974]",0.745348,0.365824,0.61537,0.454058,0.772985,0.659125


## Exercise 10 ##
- create or load a dataframe
- use `.pivot_table()` to sum by groups
- include row and column totals

In [96]:
df1.pivot_table(['count','amount'],index='quarter',columns='year',aggfunc=sum,margins=True)


Unnamed: 0_level_0,amount,amount,amount,amount,count,count,count,count
year,2016,2017,2018,All,2016,2017,2018,All
quarter,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Q1,255.93,696.12,486.81,1438.86,4,1,7,12
Q2,760.61,533.21,101.93,1395.75,3,5,8,16
Q3,674.76,891.11,878.02,2443.89,4,1,8,13
Q4,541.3,805.45,127.59,1474.34,1,5,6,12
All,2232.6,2925.89,1594.35,6752.84,12,12,29,53


## Exercise 11 ##
- create or load a dataframe
- use `.crosstab()` to compute grouped frequencies
- include row and column totals

In [98]:
pd.crosstab(df1['quarter'],df1['year'],margins=True)

year,2016,2017,2018,All
quarter,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q1,1,1,1,3
Q2,1,1,1,3
Q3,1,1,1,3
Q4,1,1,1,3
All,4,4,4,12
