# Warsztaty Python w Data Science

# Data Wrangling 3
---

## Mistrz Pandas - zaawansowana funkcjonalność

### - Zmiana kształtu `DataFrame`
### - MultiIndex
### - `stack` i `unstack`

---

# https://github.com/Bits-of-Data-PL/PythonDataScience

---
# Zmiana kształtu DataFrame
## Szeroki w długi - `melt`


In [1]:
import pandas as pd
import numpy as np

# Wide-format gradebook: one row per student, one column per subject.
df = pd.DataFrame(
    {
        'Student': ['Kowalski J.', 'Nowak A.', 'Korzycki M.'],
        'WuEf': [5, 4, 2],
        'Polski': [4, 4, 2],
        'Matma': [5, 3, 2],
    },
    index=[0, 1, 2],
)
df

Unnamed: 0,Student,WuEf,Polski,Matma
0,Kowalski J.,5,4,5
1,Nowak A.,4,4,3
2,Korzycki M.,2,2,2


In [2]:
# Wide -> long: keep 'Student' as the identifier and fold the three subject
# columns into a ('Przedmiot', 'Ocena') pair of columns.
df1 = df.melt(
    id_vars=['Student'],
    value_vars=['WuEf', 'Matma', 'Polski'],
    var_name='Przedmiot',
    value_name='Ocena',
)
df1

Unnamed: 0,Student,Przedmiot,Ocena
0,Kowalski J.,WuEf,5
1,Nowak A.,WuEf,4
2,Korzycki M.,WuEf,2
3,Kowalski J.,Matma,5
4,Nowak A.,Matma,3
5,Korzycki M.,Matma,2
6,Kowalski J.,Polski,4
7,Nowak A.,Polski,4
8,Korzycki M.,Polski,2


---
## Długi w szeroki - `pivot`

![title](img/pivot.png)

Źródło: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [3]:
# Long -> wide. No `values` argument is given, so the remaining column
# ('Ocena') is pivoted and the result gets two-level (MultiIndex) columns.
df_pivot = df1.pivot(index='Student', columns='Przedmiot')
df_pivot

Unnamed: 0_level_0,Ocena,Ocena,Ocena
Przedmiot,Matma,Polski,WuEf
Student,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Korzycki M.,2,2,2
Kowalski J.,5,4,5
Nowak A.,3,4,4


In [4]:
# Row index after the pivot: one entry per student, named 'Student'.
df_pivot.index

Index(['Korzycki M.', 'Kowalski J.', 'Nowak A.'], dtype='object', name='Student')

In [5]:
# Column index: a two-level MultiIndex of (value column, 'Przedmiot' value).
df_pivot.columns

MultiIndex([('Ocena',  'Matma'),
            ('Ocena', 'Polski'),
            ('Ocena',   'WuEf')],
           names=[None, 'Przedmiot'])

In [6]:
# Turn the 'Student' index back into an ordinary column. A new frame is
# returned; df_pivot itself is not modified.
df_pivot.reset_index()

Unnamed: 0_level_0,Student,Ocena,Ocena,Ocena
Przedmiot,Unnamed: 1_level_1,Matma,Polski,WuEf
0,Korzycki M.,2,2,2
1,Kowalski J.,5,4,5
2,Nowak A.,3,4,4


In [7]:
# The columns remain a MultiIndex; 'Student' gets an empty-string second level.
df_pivot.reset_index().columns

MultiIndex([('Student',       ''),
            (  'Ocena',  'Matma'),
            (  'Ocena', 'Polski'),
            (  'Ocena',   'WuEf')],
           names=[None, 'Przedmiot'])

In [8]:
# Seed NumPy's legacy global generator so that the random sample, and every
# table derived from it further down, is identical after each
# "Restart Kernel -> Run All".
np.random.seed(42)

# Ten random integers from the half-open range [0, 100).
df = pd.DataFrame({'value': np.random.randint(0, 100, 10)})
df

Unnamed: 0,value
0,18
1,57
2,33
3,14
4,16
5,27
6,21
7,96
8,64
9,26


In [9]:
import pandas as pd
import numpy as np

# Bin edges 0, 10, ..., 100 define ten left-closed intervals [lo, hi).
bin_edges = list(range(0, 105, 10))

# One label per interval, derived from the same edges: "0 - 9", "10 - 19", ...
labels = [f"{lo} - {hi - 1}" for lo, hi in zip(bin_edges[:-1], bin_edges[1:])]

# right=False makes each bin include its left edge and exclude its right edge.
df['Group'] = pd.cut(df['value'], bins=bin_edges, right=False, labels=labels)
df

Unnamed: 0,value,Group
0,18,10 - 19
1,57,50 - 59
2,33,30 - 39
3,14,10 - 19
4,16,10 - 19
5,27,20 - 29
6,21,20 - 29
7,96,90 - 99
8,64,60 - 69
9,26,20 - 29


In [10]:
# Count and sum 'value' within every bin.
# - observed=False is spelled out so that empty categories (e.g. "0 - 9")
#   stay in the result regardless of the pandas version's default.
# - The aggregation is named by the string 'sum'; passing the builtin `sum`
#   is deprecated in recent pandas and emits a FutureWarning.
rez = df.groupby('Group', observed=False).agg({'value': ['count', 'sum']})
rez

Unnamed: 0_level_0,value,value
Unnamed: 0_level_1,count,sum
Group,Unnamed: 1_level_2,Unnamed: 2_level_2
0 - 9,0,0
10 - 19,3,48
20 - 29,3,74
30 - 39,1,33
40 - 49,0,0
50 - 59,1,57
60 - 69,1,64
70 - 79,0,0
80 - 89,0,0
90 - 99,1,96


In [11]:
# Aggregating one column with several functions yields (column, function)
# MultiIndex columns.
rez.columns

MultiIndex([('value', 'count'),
            ('value',   'sum')],
           )

---
## MultiIndex

In [12]:
# Per-bin count and sum of 'value'.
# observed=False keeps empty bins in the result under every pandas default;
# the string 'sum' replaces the builtin `sum`, whose use here is deprecated.
df_grouped = df.groupby('Group', observed=False).agg({'value': ['count', 'sum']})

In [13]:
# Display the aggregated frame: bins on the rows, (column, function) pairs
# on the columns.
df_grouped

Unnamed: 0_level_0,value,value
Unnamed: 0_level_1,count,sum
Group,Unnamed: 1_level_2,Unnamed: 2_level_2
0 - 9,0,0
10 - 19,3,48
20 - 29,3,74
30 - 39,1,33
40 - 49,0,0
50 - 59,1,57
60 - 69,1,64
70 - 79,0,0
80 - 89,0,0
90 - 99,1,96


In [14]:
# Inspect the two-level column index produced by the multi-function aggregation.
df_grouped.columns

MultiIndex([('value', 'count'),
            ('value',   'sum')],
           )

In [15]:
# Load the sample stock quotes from a relative path.
# No date parsing is requested, so 'Date' stays a plain string column.
stocks = pd.read_csv('data/stocks.csv')
stocks

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT
5,2016-10-04,31.35,18460400,CSCO
6,2016-10-05,57.64,16726400,MSFT
7,2016-10-05,31.59,11808600,CSCO
8,2016-10-05,113.05,21453100,AAPL


In [16]:
# Default row index of the loaded frame: a RangeIndex over the rows.
stocks.index

RangeIndex(start=0, stop=9, step=1)

In [17]:
# Average closing price per ticker symbol.
stocks.groupby('Symbol')['Close'].mean()

Symbol
AAPL    112.856667
CSCO     31.480000
MSFT     57.433333
Name: Close, dtype: float64

In [18]:
# Grouping by two keys gives a Series with a two-level (Symbol, Date) index.
seria = stocks.groupby(['Symbol', 'Date'])['Close'].mean()
seria

Symbol  Date      
AAPL    2016-10-03    112.52
        2016-10-04    113.00
        2016-10-05    113.05
CSCO    2016-10-03     31.50
        2016-10-04     31.35
        2016-10-05     31.59
MSFT    2016-10-03     57.42
        2016-10-04     57.24
        2016-10-05     57.64
Name: Close, dtype: float64

![](img/unstack.png)

Źródło: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [19]:
# unstack() moves the innermost index level ('Date') into the columns,
# leaving one row per Symbol.
df = seria.unstack()
df

Date,2016-10-03,2016-10-04,2016-10-05
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,112.52,113.0,113.05
CSCO,31.5,31.35,31.59
MSFT,57.42,57.24,57.64


In [20]:
# The former 'Date' index level is now the (single-level) column index.
df.columns

Index(['2016-10-03', '2016-10-04', '2016-10-05'], dtype='object', name='Date')

### `groupby` i `unstack` - to to samo co `pivot_table`

In [21]:
# pivot_table aggregates 'Close' per (Symbol, Date) using its default
# aggregation (mean) - the same table as the groupby + unstack version.
stocks.pivot_table(values='Close', index='Symbol', columns='Date')

Date,2016-10-03,2016-10-04,2016-10-05
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,112.52,113.0,113.05
CSCO,31.5,31.35,31.59
MSFT,57.42,57.24,57.64


In [22]:
# Highest closing price per symbol. The aggregation is named by the string
# 'max'; passing the builtin `max` is deprecated in recent pandas and emits
# a FutureWarning.
stocks.pivot_table(values='Close', index='Symbol', aggfunc='max')

Unnamed: 0_level_0,Close
Symbol,Unnamed: 1_level_1
AAPL,113.05
CSCO,31.59
MSFT,57.64


In [23]:
# Mean closing price per symbol. The aggregation is named by the string
# 'mean'; passing `np.mean` is deprecated in recent pandas and emits a
# FutureWarning.
stocks.pivot_table(values='Close', index='Symbol', aggfunc='mean')

Unnamed: 0_level_0,Close
Symbol,Unnamed: 1_level_1
AAPL,112.856667
CSCO,31.48
MSFT,57.433333


In [25]:
# Use (Symbol, Date) as a two-level row index; the rows keep their original order.
stocks_indexed = stocks.set_index(['Symbol','Date'])
stocks_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
AAPL,2016-10-03,112.52,21701800
MSFT,2016-10-03,57.42,19189500
AAPL,2016-10-04,113.0,29736800
MSFT,2016-10-04,57.24,20085900
CSCO,2016-10-04,31.35,18460400
MSFT,2016-10-05,57.64,16726400
CSCO,2016-10-05,31.59,11808600
AAPL,2016-10-05,113.05,21453100


In [26]:
# On a DataFrame, unstack() spreads every value column over the innermost
# index level ('Date'), producing (value column, Date) columns.
stocks_indexed.unstack()

Unnamed: 0_level_0,Close,Close,Close,Volume,Volume,Volume
Date,2016-10-03,2016-10-04,2016-10-05,2016-10-03,2016-10-04,2016-10-05
Symbol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
AAPL,112.52,113.0,113.05,21701800,29736800,21453100
CSCO,31.5,31.35,31.59,14070500,18460400,11808600
MSFT,57.42,57.24,57.64,19189500,20085900,16726400


In [27]:
# Pair the outer and inner labels position by position to obtain the eight
# (first, second) tuples used to build a MultiIndex.
outer_labels = ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"]
inner_labels = ["one", "two", "one", "two", "one", "two", "one", "two"]
tuples = list(zip(outer_labels, inner_labels))
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [28]:
# Variant 1: build the two-level row index from explicit (first, second)
# tuples and attach it to an 8x2 frame of standard-normal noise.
index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
df = pd.DataFrame(
    data=np.random.standard_normal((8, 2)),
    index=index,
    columns=["A", "B"],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.181743,-0.885355
bar,two,0.114307,0.286387
baz,one,-0.467161,1.226411
baz,two,0.705026,-0.214754
foo,one,1.106444,-1.127292
foo,two,2.152312,-0.855215
qux,one,-0.16462,-0.037585
qux,two,0.604289,0.456554


In [29]:
# Variant 2: give only the distinct labels of each level; from_product
# builds every (first, second) combination.
iterables = [["bar", "baz", "foo", "qux"], ["one", "two"]]
index = pd.MultiIndex.from_product(iterables, names=["first", "second"])

df = pd.DataFrame(
    data=np.random.standard_normal((8, 2)),
    index=index,
    columns=["A", "B"],
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.139973,-0.122045
bar,two,0.114063,0.577797
baz,one,-1.034845,0.509806
baz,two,-2.025523,0.585107
foo,one,2.220696,1.025835
foo,two,-1.242206,0.684521
qux,one,-0.359738,1.069602
qux,two,-1.844441,0.459921


In [30]:
# Put the same label pairs into an ordinary two-column frame.
dfi = pd.DataFrame(data=tuples, columns=["first", "second"])
dfi

Unnamed: 0,first,second
0,bar,one
1,bar,two
2,baz,one
3,baz,two
4,foo,one
5,foo,two
6,qux,one
7,qux,two


In [31]:
# Variant 3: MultiIndex.from_frame - each row of dfi becomes one index entry,
# each column of dfi one index level.
index = pd.MultiIndex.from_frame(dfi, names=["first", "second"])
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=["A", "B"])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.975973,2.650622
bar,two,1.167706,2.115502
baz,one,-1.34945,0.632971
baz,two,1.327878,0.7977
foo,one,0.510302,-0.215863
foo,two,0.383672,0.244366
qux,one,1.114605,-0.257038
qux,two,0.44254,1.594791


In [32]:
# Move the innermost row level ('second') into the columns.
df.unstack()

Unnamed: 0_level_0,A,A,B,B
second,one,two,one,two
first,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,-0.975973,1.167706,2.650622,2.115502
baz,-1.34945,1.327878,0.632971,0.7977
foo,0.510302,0.383672,-0.215863,0.244366
qux,1.114605,0.44254,-0.257038,1.594791


In [33]:
# Unstacking again moves 'first' as well. With no row levels left, the result
# is a Series indexed by (column, second, first).
df.unstack().unstack()

   second  first
A  one     bar     -0.975973
           baz     -1.349450
           foo      0.510302
           qux      1.114605
   two     bar      1.167706
           baz      1.327878
           foo      0.383672
           qux      0.442540
B  one     bar      2.650622
           baz      0.632971
           foo     -0.215863
           qux     -0.257038
   two     bar      2.115502
           baz      0.797700
           foo      0.244366
           qux      1.594791
dtype: float64

In [34]:
# A third unstack moves the innermost level of that Series ('first') back
# into the columns.
df.unstack().unstack().unstack()

Unnamed: 0_level_0,first,bar,baz,foo,qux
Unnamed: 0_level_1,second,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A,one,-0.975973,-1.34945,0.510302,1.114605
A,two,1.167706,1.327878,0.383672,0.44254
B,one,2.650622,0.632971,-0.215863,-0.257038
B,two,2.115502,0.7977,0.244366,1.594791


In [35]:
# Small long-format example: two 'foo' groups, each with bar = A, B, C.
df = pd.DataFrame(
    {
        'foo': ['one'] * 3 + ['two'] * 3,
        'bar': list('ABC') * 2,
        'baz': list(range(1, 7)),
        'zoo': list('xyzqwt'),
    }
)

In [36]:
# Show the frame that the stack examples below operate on.
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


## Stack 

![](img/stack.png)

Źródło: https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

In [37]:
# stack() moves the column labels into a new innermost row level, producing
# a Series with one entry per (row, column) cell.
df.stack()

0  foo    one
   bar      A
   baz      1
   zoo      x
1  foo    one
   bar      B
   baz      2
   zoo      y
2  foo    one
   bar      C
   baz      3
   zoo      z
3  foo    two
   bar      A
   baz      4
   zoo      q
4  foo    two
   bar      B
   baz      5
   zoo      w
5  foo    two
   bar      C
   baz      6
   zoo      t
dtype: object

In [38]:
# reset_index() turns both index levels into columns. The levels are unnamed,
# so they get the default names level_0 / level_1, and the values column is 0.
df.stack().reset_index()

Unnamed: 0,level_0,level_1,0
0,0,foo,one
1,0,bar,A
2,0,baz,1
3,0,zoo,x
4,1,foo,one
5,1,bar,B
6,1,baz,2
7,1,zoo,y
8,2,foo,one
9,2,bar,C


In [39]:
# Column labels of the original frame - these form the inner index level
# of df.stack().
df.columns

Index(['foo', 'bar', 'baz', 'zoo'], dtype='object')

In [40]:
# Row labels of the original frame - these form the outer index level
# of df.stack().
df.index

RangeIndex(start=0, stop=6, step=1)

In [41]:
# stack() followed by unstack() restores the original wide layout.
df.stack().unstack()

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


In [42]:
# The original frame is unchanged: stack() and unstack() return new objects.
df

Unnamed: 0,foo,bar,baz,zoo
0,one,A,1,x
1,one,B,2,y
2,one,C,3,z
3,two,A,4,q
4,two,B,5,w
5,two,C,6,t


---

### Długi w szeroki - `pivot` jeszcze raz


In [43]:
# One row per 'foo' value, one column per 'bar' value, cells filled from 'baz'.
df.pivot(index='foo', columns='bar', values='baz')

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,1,2,3
two,4,5,6


In [44]:
# A list of value columns gives two-level columns: (value column, 'bar' value).
df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])

Unnamed: 0_level_0,baz,baz,baz,zoo,zoo,zoo
bar,A,B,C,A,B,C
foo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
one,1,2,3,x,y,z
two,4,5,6,q,w,t


In [45]:
# Problem case for pivot(): the pair (foo='one', bar='A') occurs twice.
df_e = pd.DataFrame(
    {
        "foo": ['one'] * 2 + ['two'] * 2,
        "bar": ['A', 'A', 'B', 'C'],
        "baz": list(range(1, 5)),
    }
)
df_e

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,A,2
2,two,B,3
3,two,C,4


In [46]:
# pivot() cannot reshape when an (index, columns) pair occurs more than once;
# here ('one', 'A') appears twice in df_e. The error is the point of this
# cell, so it is caught and displayed instead of aborting "Run All".
try:
    df_e.pivot(index='foo', columns='bar', values='baz')
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: Index contains duplicate entries, cannot reshape

In [47]:
# Every (foo, bar) pair is unique here, so pivot() works in either direction.
df = pd.DataFrame(
    {
        "foo": ['one', 'two', 'three', 'four'],
        "bar": list('ABCD'),
        "baz": list(range(1, 5)),
    }
)
df

Unnamed: 0,foo,bar,baz
0,one,A,1
1,two,B,2
2,three,C,3
3,four,D,4


In [48]:
# Each (bar, foo) pair occurs once, so every row has a single filled cell and
# NaN elsewhere. Without `values`, the remaining column 'baz' becomes the top
# column level.
df.pivot(index='bar', columns='foo')

Unnamed: 0_level_0,baz,baz,baz,baz
foo,four,one,three,two
bar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
A,,1.0,,
B,,,,2.0
C,,,3.0,
D,4.0,,,


In [49]:
# The same pivot with the roles of 'foo' and 'bar' swapped.
df.pivot(index='foo', columns='bar')

Unnamed: 0_level_0,baz,baz,baz,baz
bar,A,B,C,D
foo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
four,,,,4.0
one,1.0,,,
three,,,3.0,
two,,2.0,,


In [50]:
# Mean of 'baz' for every (foo, bar) pair; the result is a Series with a
# two-level index.
df.groupby(['foo', 'bar'])['baz'].mean()

foo    bar
four   D      4.0
one    A      1.0
three  C      3.0
two    B      2.0
Name: baz, dtype: float64

In [51]:
# Aggregate per (foo, bar), then move 'bar' into the columns - the same
# table that pivot(index='foo', columns='bar') produces.
(
    df.groupby(['foo', 'bar'])['baz']
    .mean()
    .unstack()
)

bar,A,B,C,D
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
four,,,,4.0
one,1.0,,,
three,,,3.0,
two,,2.0,,


---

Napraw `df_e`, biorąc maksimum wartości `baz` dla każdej pary `foo`, `bar` (odrzuć konflikty).

In [54]:
import pandas as pd

# Re-create the frame with the duplicated (foo='one', bar='A') pair so the
# exercise starts from a known state.
df_e = pd.DataFrame(
    data={
        "foo": ['one', 'one', 'two', 'two'],
        "bar": ['A', 'A', 'B', 'C'],
        "baz": list(range(1, 5)),
    }
)
df_e

Unnamed: 0,foo,bar,baz
0,one,A,1
1,one,A,2
2,two,B,3
3,two,C,4


In [55]:
# Collapse duplicated (foo, bar) pairs by keeping the largest 'baz'.
df_e.groupby(['foo', 'bar']).max()

Unnamed: 0_level_0,Unnamed: 1_level_0,baz
foo,bar,Unnamed: 2_level_1
one,A,2
two,B,3
two,C,4


In [56]:
# Once duplicates are collapsed with max, every (foo, bar) pair is unique,
# so pivot() succeeds - ... and the error is gone.
(
    df_e.groupby(['foo', 'bar'])
    .max()
    .reset_index()
    .pivot(index='foo', columns='bar', values='baz')
)

bar,A,B,C
foo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,2.0,,
two,,3.0,4.0
