### Indexing and merging two data frames

#### References
1. Python for data analysis
2. Think stats: exploratory data analysis
3. https://pandas.pydata.org

#### Purpose
1. Work in the abstract (it's good for the brain)
2. Keep up to date with changes in the library
3. Explore new ways of doing common tasks --- get better

In [1]:
import pandas as pd
import numpy as np


In [6]:
# create some data frames
# Seed NumPy's legacy global generator so that Restart & Run All rebuilds the
# same frames every time. The values will differ from outputs that were saved
# by an earlier, unseeded run.
SEED = 42
np.random.seed(SEED)

# row / column labels for the small integer frames
index = ['a', 'b', 'c', 'd', 'e']
index_2 = ['f', 'g', 'h', 'i', 'j']
columns = ['bacon', 'eggs', 'ham', 'spam', 'hash']
columns_2 = ['toast', 'jelly', 'butter', 'wheat', 'white']
index_3 = ['k', 'l', 'm']
columns_3 = ['pancakes', 'waffles', 'crepes']

# thirty consecutive days starting 2017-04-01, one column per restaurant
index_4 = pd.date_range('04/01/2017', periods=30, freq='D')
columns_4 = ['Waffle house', 'Dennys', 'Panera']
names = ['Sheila', 'Dave', 'Pat']
resto = columns_4  # alias: the restaurant names double as the choices for e

# NOTE: the draw order (a, b, c, d, e) determines the values under a fixed seed
a = pd.DataFrame(np.random.randint(6, size=(5, 5)), index=index, columns=columns)
b = pd.DataFrame(np.random.randint(4, size=(5, 5)), index=index_2, columns=columns_2)
c = pd.DataFrame(np.random.randint(4, size=(3, 3)), index=index_3, columns=columns_3)

# floats in [3.99, 9.99), rounded to two decimals, indexed by date
d = pd.DataFrame(
    6 * np.random.random_sample((30, 3)) + 3.99,
    index=index_4,
    columns=columns_4,
).round(2)

# one randomly chosen restaurant per person per row (default integer index)
e = pd.DataFrame(np.random.choice(resto, (30, 3)), columns=names)

In [7]:
# dump all five frames as plain text, one after the other
print(a, b, c, d, e, sep='\n')

   bacon  eggs  ham  spam  hash
a      4     1    5     3     1
b      5     5    5     0     0
c      0     2    1     0     4
d      2     1    1     4     1
e      4     0    0     4     2
   toast  jelly  butter  wheat  white
f      3      1       3      3      1
g      1      2       0      0      3
h      0      3       1      2      2
i      0      3       0      3      1
j      2      3       1      2      1
   pancakes  waffles  crepes
k         2        0       2
l         0        1       1
m         3        0       0
            Waffle house  Dennys  Panera
2017-04-01          9.07    6.09    4.79
2017-04-02          7.92    5.66    8.50
2017-04-03          5.82    5.79    4.72
2017-04-04          9.60    7.00    4.44
2017-04-05          6.95    7.41    6.32
2017-04-06          6.39    4.31    7.95
2017-04-07          9.03    4.93    7.34
2017-04-08          8.12    5.06    7.01
2017-04-09          6.66    7.52    5.89
2017-04-10          8.03    6.01    5.72
2017-04-11   

In [13]:
# rotate the three name columns; .values drops the labels, so assignment is positional:
# Pat <- old Sheila, Sheila <- old Dave, Dave <- old Pat
# NOTE: this mutates e in place, so re-running the cell rotates the columns again
e.loc[:, ['Pat', 'Sheila', 'Dave']] = e[names].values

In [16]:
# first four rows by integer position
e.iloc[:4]

Unnamed: 0,Sheila,Dave,Pat
0,Waffle house,Waffle house,Panera
1,Waffle house,Panera,Dennys
2,Panera,Dennys,Waffle house
3,Waffle house,Panera,Waffle house


In [17]:
# can access columns as attributes
e_sheila, e_dave, e_pat = e.Sheila, e.Dave, e.Pat
e_dave[:4]

0    Waffle house
1          Panera
2          Dennys
3          Panera
Name: Dave, dtype: object

In [18]:
# check what kind of object a single column is
type(e_dave)

pandas.core.series.Series

In [19]:
# so that means all series slicing syntax works here
e_dave[::5]
# start from the front and take every fifth element

0     Waffle house
5           Panera
10          Dennys
15    Waffle house
20          Panera
25    Waffle house
Name: Dave, dtype: object

In [21]:
# get the whole thing in reverse
e_dave[::-1]
# notice the index labels travel with the values, so here we start at 29

29    Waffle house
28    Waffle house
27          Dennys
26          Dennys
25    Waffle house
24          Dennys
23          Panera
22          Panera
21          Dennys
20          Panera
19          Panera
18          Dennys
17    Waffle house
16    Waffle house
15    Waffle house
14    Waffle house
13          Dennys
12          Panera
11    Waffle house
10          Dennys
9     Waffle house
8           Dennys
7     Waffle house
6           Dennys
5           Panera
4     Waffle house
3           Panera
2           Dennys
1           Panera
0     Waffle house
Name: Dave, dtype: object

In [23]:
# dave says that he ate at Waffle house on the third (row label 2)
# Write through the frame with .loc instead of through the extracted Series:
# assigning into e_dave only reaches e when that Series happens to be a view,
# which pandas does not guarantee (and never happens with copy-on-write).
e.loc[2, 'Dave'] = "Waffle house"
# re-extract the column so e_dave reflects the corrected frame
e_dave = e['Dave']

In [25]:
e_dave[:5]
# changed from Dennys to Waffle house

0    Waffle house
1          Panera
2    Waffle house
3          Panera
4    Waffle house
Name: Dave, dtype: object

In [26]:
# this works on data frames too
e[:3]

Unnamed: 0,Sheila,Dave,Pat
0,Waffle house,Waffle house,Panera
1,Waffle house,Panera,Dennys
2,Panera,Waffle house,Waffle house


In [27]:
# every fifth row of the frame, starting from the first
e[::5]

Unnamed: 0,Sheila,Dave,Pat
0,Waffle house,Waffle house,Panera
5,Waffle house,Panera,Panera
10,Dennys,Dennys,Panera
15,Panera,Waffle house,Dennys
20,Panera,Panera,Dennys
25,Dennys,Waffle house,Dennys


In [28]:
# selection by label is the same
e['Dave'][:5]

0    Waffle house
1          Panera
2    Waffle house
3          Panera
4    Waffle house
Name: Dave, dtype: object

In [30]:
# with dates it is a little different
# ---- > d.loc[2:7] this will throw an error: .loc slices by label (dates here), not by integer position
# TypeError: cannot do slice indexing on <class 'pandas.core.indexes.datetimes.DatetimeIndex'> 
# with these indexers [2] of <class 'int'>

In [31]:
# you need to use the label value or a date-like string that pandas can convert
d.loc['20170410': '20170414']
# notice the endpoint is included...

Unnamed: 0,Waffle house,Dennys,Panera
2017-04-10,8.03,6.01,5.72
2017-04-11,5.67,6.56,6.31
2017-04-12,4.26,6.34,7.96
2017-04-13,7.3,6.34,6.52
2017-04-14,4.65,7.89,4.67


In [32]:
# here we can slice with labels
b.loc['b':'d']
# returns empty: b's index holds 'f'..'j', so no label falls between 'b' and 'd'

Unnamed: 0,toast,jelly,butter,wheat,white


In [33]:
# what's in the index then
b.index.values

array(['f', 'g', 'h', 'i', 'j'], dtype=object)

In [34]:
# so do this:
b_index = b.index.values
b_index[3]

'i'

In [35]:
# use the array of labels and pass a slice of it to .loc
# (positions 2 and 3 of b_index are the labels 'h' and 'i')
b.loc[b_index[2:4]]

Unnamed: 0,toast,jelly,butter,wheat,white
h,0,3,1,2,2
i,0,3,0,3,1


In [36]:
# these will come in handy
# yes we could have used the list to create the df
# but what if you don't have that list?
a_index = a.index.values
c_index = c.index.values


In [42]:
# so labels can be used just like integers(almost)
c.loc['k':]

Unnamed: 0,pancakes,waffles,crepes
k,2,0,2
l,0,1,1
m,3,0,0


In [43]:
#or
c.loc[:'m']

Unnamed: 0,pancakes,waffles,crepes
k,2,0,2
l,0,1,1
m,3,0,0


In [45]:
# grab a mix of rows and columns:
a.loc['b':, 'ham':'hash']

Unnamed: 0,ham,spam,hash
b,5,0,0
c,1,0,4
d,1,4,1
e,0,4,2


In [46]:
# or 
a.loc['c':'e', 'ham':'spam']

Unnamed: 0,ham,spam
c,1,0
d,1,4
e,0,4


In [47]:
# cross section of one row
a.loc['d']

bacon    2
eggs     1
ham      1
spam     4
hash     1
Name: d, dtype: int64

In [48]:
# get a specific value:
# for example how much was spent at 'Waffle house' on April 14
d.loc['20170414', 'Waffle house']

4.65

In [61]:
# if you know the integer position (position 13 is the 14th row, i.e. April 14)
d.iloc[13]['Waffle house']

4.65

In [62]:
# a single date label returns that whole row
d.loc['20170414']

Waffle house    4.65
Dennys          7.89
Panera          4.67
Name: 2017-04-14 00:00:00, dtype: float64

In [63]:
# return to iloc
e.iloc[:5]
# iloc is like numpy or python.... the upper bound is excluded

Unnamed: 0,Sheila,Dave,Pat
0,Waffle house,Waffle house,Panera
1,Waffle house,Panera,Dennys
2,Panera,Waffle house,Waffle house
3,Waffle house,Panera,Waffle house
4,Waffle house,Waffle house,Waffle house


In [64]:
# get the rows at positions 11-18 (April 12-19; the stop position 19 is excluded)
# for Dennys and Panera
d.iloc[11:19, 1:]

Unnamed: 0,Dennys,Panera
2017-04-12,6.34,7.96
2017-04-13,6.34,6.52
2017-04-14,7.89,4.67
2017-04-15,5.05,8.12
2017-04-16,9.51,7.86
2017-04-17,6.83,7.98
2017-04-18,9.92,7.9
2017-04-19,6.55,7.43


In [67]:
# this will not work ---> d.iloc[11:19, resto[1:]]
# cannot use a string indexer with iloc
# TypeError: cannot perform reduce with flexible type