# More advance proccessing with Pandas

In [11]:
import pandas as pd
import numpy as np

## Operations on the metadata

### Transpose

In [22]:
df1.T

Unnamed: 0,0,1,2,3,4
state,VA,VA,VA,MD,MD
year,2012,2013,1214,2014,2015
pop,5.0,5.1,5.2,4.0,4.1
unempl,,,0.6,0.6,0.6


### Reindexing row(s)/column(s)

Create a new object with the data conformed to a new index.  Any missing values are set to NaN.

In [28]:
df2 = pd.DataFrame( {'year' : [2012, 2013, 2014, 2014, 2015],
                     'pop' : [5.0, 5.1, 5.2, 4.0, 4.1],
                     'unempl' : range(5)
                    } )
df2

Unnamed: 0,year,pop,unempl
0,2012,5.0,0
1,2013,5.1,1
2,2014,5.2,2
3,2014,4.0,3
4,2015,4.1,4


Reindexing rows returns a new frame with the specified index:

In [33]:
df2.reindex( range(6, 0, -1) )

Unnamed: 0,year,pop,unempl
6,,,
5,,,
4,2015.0,4.1,4.0
3,2014.0,4.0,3.0
2,2014.0,5.2,2.0
1,2013.0,5.1,1.0


Missing values can be set to something other than NaN:

In [38]:
df2.reindex(range(5, -1, -1), fill_value=0)

Unnamed: 0,year,pop,unempl
5,0,0.0,0
4,2015,4.1,4
3,2014,4.0,3
2,2014,5.2,2
1,2013,5.1,1
0,2012,5.0,0


Interpolate ordered data like a time series:

In [39]:
serie5 = pd.Series(['foo', 'bar', 'baz'], index=[0, 2, 4])
serie5

0    foo
2    bar
4    baz
dtype: object

In [41]:
serie5.reindex(range(5), method='ffill')

0    foo
1    foo
2    bar
3    bar
4    baz
dtype: object

In [43]:
serie5.reindex(range(5), method='bfill')

0    foo
1    bar
2    bar
3    baz
4    baz
dtype: object

Reindex columns:

In [44]:
df2.reindex(columns=['state', 'pop', 'unempl', 'year'])

Unnamed: 0,state,pop,unempl,year
0,,5.0,0,2012
1,,5.1,1,2013
2,,5.2,2,2014
3,,4.0,3,2014
4,,4.1,4,2015


Reindex rows and columns while filling rows:

In [49]:
df2.reindex(index = np.arange(6, -1, -1),
           fill_value=0,
           columns = ['state', 'pop', 'unempl', 'year'])

Unnamed: 0,state,pop,unempl,year
6,0,0.0,0,0
5,0,0.0,0,0
4,0,4.1,4,2015
3,0,4.0,3,2014
2,0,5.2,2,2014
1,0,5.1,1,2013
0,0,5.0,0,2012


Reindex using ix:

In [64]:
df3 = df2.iloc[np.arange(0, 5)][['pop', 'unempl', 'year']]
df3

Unnamed: 0,pop,unempl,year
0,5.0,0,2012
1,5.1,1,2013
2,5.2,2,2014
3,4.0,3,2014
4,4.1,4,2015


## Operations on the data

### Select data base on a  filter 

In [77]:
df3['pop'] > 5

0    False
1     True
2     True
3    False
4    False
Name: pop, dtype: bool

In [105]:
df3.loc[df3['pop'] > 5, ]

Unnamed: 0,pop,unempl,year
1,5.1,1,2013
2,5.2,2,2014


multiple filters.

In [83]:
df3.query('pop > 5 and year == 2013')

Unnamed: 0,pop,unempl,year
1,5.1,1,2013


In [87]:
df3.loc[ (df3['pop'] > 5) | (df3['year'] == 2013)]

Unnamed: 0,pop,unempl,year
1,5.1,1,2013
2,5.2,2,2014


Perform a scalar comparison on a DataFrame, retain the values that pass the filter:

In [90]:
df3[ df3 > 5 ]

Unnamed: 0,pop,unempl,year
0,,,2012
1,5.1,,2013
2,5.2,,2014
3,,,2014
4,,,2015


Select rows with specific column base on a filter.

In [95]:
df3

Unnamed: 0,pop,unempl,year
0,5.0,0,2012
1,5.1,1,2013
2,5.2,2,2014
3,4.0,3,2014
4,4.1,4,2015


In [103]:
df3.loc[df3['pop'] > 5.0, 'year']

1    2013
2    2014
Name: year, dtype: int64

<div class="alert alert-warning">
    <strong>Warning:</strong> Becareful with accessing columns with attributes because, ex. <code>df3.pop</code> may refer to a column named 'pop' or an inplicit method called 'pop'. For that it's better to use <code>df3['pop']</code>.
<div>

### Updating a column values

Create a **data1** and **df1**. 

In [23]:
data1 = {'state' : ['VA', 'VA', 'VA', 'MD', 'MD'],
         'year' : [2012, 2013, 1214, 2014, 2015],
         'pop' : [5.0, 5.1, 5.2, 4.0, 4.1]}

df1 = pd.DataFrame(data1)

Update with **arange()** method.

In [24]:
df1['unempl'] = np.arange(5)
df1

Unnamed: 0,state,year,pop,unempl
0,VA,2012,5.0,0
1,VA,2013,5.1,1
2,VA,1214,5.2,2
3,MD,2014,4.0,3
4,MD,2015,4.1,4


Update with **Series**.

In [21]:
unempl = pd.Series([0.6, 0.6, 0.6], index=[2, 3, 4])

df1['unempl'] = unempl
df1

Unnamed: 0,state,year,pop,unempl
0,VA,2012,5.0,
1,VA,2013,5.1,
2,VA,1214,5.2,0.6
3,MD,2014,4.0,0.6
4,MD,2015,4.1,0.6


<div class="alert alert-warning">
    <strong>Warning:</strong> If assigning a list or array, the lengt must match the dataframe, unlike Series.
</div>

### Dropping Entries

Drop rows from a Series or DataFrame:

In [69]:
df4 = df3.drop([0, 1])
df4

Unnamed: 0,pop,unempl,year
2,5.2,2,2014
3,4.0,3,2014
4,4.1,4,2015


Drop columns from a DatFrame:

In [70]:
df4 = df4.drop('unempl', axis=1)
df4

Unnamed: 0,pop,year
2,5.2,2014
3,4.0,2014
4,4.1,2015
