# Digesting PythonDataScienceHandbook by Jake VanderPlas 

<div class="alert alert-block alert-success">
<a href="https://github.com/IRebri/PythonDataScienceHandbook/tree/master/notebooks">PythonDataScienceHandbook</a>
<ul>
<li>My notes</li>
</ul>
</div>


## Part 1. GroupBy and Pivot tables

In [1]:
# "Author": Rinat R. Ismagilov <ismagil@polly.phys.msu.ru>
#
# License: GNU General Public License v3.0

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time 
import seaborn as sns
%matplotlib inline

# Cap DataFrame reprs at 10 rows so the tables shown below stay compact.
pd.set_option('display.max_rows', 10)
# Optional: uncomment to render floats with a single decimal place.
# pd.set_option('display.float_format', '{:.1f}'.format)

In [2]:
# Good practice: record the Python, OS and library versions this notebook was
# run with, so the outputs below can be tied to a concrete environment.

# The magic comes from the third-party `version_information` extension
# (install with: pip install version_information).
%load_ext version_information

# List the version of each package named here (table shown in the cell output).
%version_information numpy, pandas, matplotlib, seaborn, version_information

Software,Version
Python,3.5.6 64bit [MSC v.1900 64 bit (AMD64)]
IPython,6.5.0
OS,Windows 10 10.0.18362 SP0
numpy,1.10.1
pandas,0.20.3
matplotlib,3.0.0
seaborn,0.9.0
version_information,1.0.3
Tue Aug 20 16:49:49 2019 RTZ 2 (ceia),Tue Aug 20 16:49:49 2019 RTZ 2 (ceia)


### Dataset <i>titanic</i> = sns.load_dataset('titanic')

In [3]:
# Load seaborn's "titanic" example dataset into a DataFrame and preview
# its first five rows.
titanic = sns.load_dataset(name='titanic')
titanic.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [9]:
# A pivot table built by hand: the mean of the 0/1 `survived` flag
# (i.e. the survival rate) for each value of `sex`.
# Selecting with a list ([['survived']]) keeps the result a one-column DataFrame.
titanic.groupby(by='sex')[['survived']].agg('mean')

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [17]:
# One step deeper: group by sex AND passenger class, average `survived`,
# then unstack so that `class` (the inner index level) becomes the columns.
# (Selecting [['survived']] and skipping the unstack would instead give a
# one-column frame indexed by the (sex, class) pairs.)
titanic.groupby(by=['sex', 'class'])['survived'].mean().unstack(level='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


#### pivot_table

In [18]:
# The same table via pandas' DataFrame.pivot_table.
# 'mean' is the default aggregation; it is spelled out here for clarity.
titanic.pivot_table(values='survived', index='sex', columns='class', aggfunc='mean')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [23]:
# Two-level row index against a single-level column index:
# rows are (sex, age bin), columns are class.
# Ages are binned into the intervals (0, 18] and (18, 80].
age = pd.cut(titanic['age'], bins=[0, 18, 80])
titanic.pivot_table(values='survived', index=['sex', age], columns='class')


Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [28]:
# Two-level index on both axes: rows are (sex, age bin), columns are
# (fare bin, class). qcut with q=2 splits the fares into two quantile bins,
# i.e. below and above the median fare.
fare = pd.qcut(titanic['fare'], q=2)
titanic.pivot_table(values='survived', index=['sex', age], columns=[fare, 'class'])

fare            (-0.001, 14.454]                     (14.454, 512.329]  \
class                      First    Second     Third             First   
sex    age                                                               
female (0, 18]               NaN  1.000000  0.714286          0.909091   
       (18, 80]              NaN  0.880000  0.444444          0.972973   
male   (0, 18]               NaN  0.000000  0.260870          0.800000   
       (18, 80]              0.0  0.098039  0.125000          0.391304   

fare                                 
class              Second     Third  
sex    age                           
female (0, 18]   1.000000  0.318182  
       (18, 80]  0.914286  0.391304  
male   (0, 18]   0.818182  0.178571  
       (18, 80]  0.030303  0.192308  

In [30]:
# Per-column aggregation: `aggfunc` may be a dict mapping a column name to the
# aggregation applied to it. No `values` argument is passed; the dict keys
# determine which columns appear in the result.
# Here: number of survivors (sum of the 0/1 `survived` flag) and mean fare.
# The aggregations are named by string ('sum' rather than the builtin `sum`):
# newer pandas releases emit a FutureWarning when a builtin callable is passed.
titanic.pivot_table(index=['sex', age], columns='class',
                    aggfunc={'survived': 'sum', 'fare': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,fare,fare,fare,survived,survived,survived
Unnamed: 0_level_1,class,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,"(0, 18]",127.474245,25.064286,17.370835,10,14,22
female,"(18, 80]",105.043469,21.224653,14.785453,72,54,25
male,"(0, 18]",114.63832,26.116947,20.639055,4,9,11
male,"(18, 80]",68.877389,20.219593,10.022624,36,6,27


In [31]:
# margins=True appends an "All" row and an "All" column holding the
# aggregate (here the mean survival rate) over each full group.
titanic.pivot_table(values='survived', index='sex', columns='class', margins=True)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838
