### Indexing and merging two DataFrames

#### References
1. Python for data analysis
2. Think stats: exploratory data analysis
3. https://pandas.pydata.org

#### Purpose
1. Work in the abstract (it's good for the brain)
2. Keep up to date with changes in the library
3. Explore new ways of doing common tasks --- get better

In [1]:
import pandas as pd
import numpy as np


In [2]:
# create some data frames
# Seed NumPy's global generator first so that a fresh "Restart & Run All"
# rebuilds exactly the same random frames (and therefore the same outputs).
SEED = 42
np.random.seed(SEED)

index = ['a', 'b', 'c', 'd', 'e']
index_2 = ['f', 'g', 'h', 'i', 'j']
columns = ['bacon', 'eggs', 'ham', 'spam', 'hash']
columns_2 = ['toast', 'jelly', 'butter', 'wheat', 'white']
index_3 = ['k', 'l', 'm']
columns_3 = ['pancakes', 'waffles', 'crepes']
# one row per day of April 2017
index_4 = pd.date_range('04/01/2017', periods=30, freq='D')
columns_4 = ['Waffle house', 'Dennys', 'Panera']
names = ['Sheila', 'Dave', 'Pat']
# resto is the same list object as columns_4 (an alias, not a copy)
resto = columns_4


# a, b, c: small integer frames with string row labels
a = pd.DataFrame(np.random.randint(6, size=(5,5)), index=index, columns=columns)
b = pd.DataFrame(np.random.randint(4, size=(5,5)), index=index_2, columns=columns_2)
c = pd.DataFrame(np.random.randint(4, size=(3,3)), index=index_3, columns=columns_3)
# d: a price per restaurant per day, drawn uniformly from [3.99, 9.99)
d = pd.DataFrame(6 * np.random.random_sample((30, 3)) + 3.99, dtype='float64', index=index_4, columns=columns_4)
# e: which restaurant each person chose; default integer index, no dates yet
e = pd.DataFrame(np.random.choice(resto,(30,3)), columns = names)
# round the prices to cents
d[columns_4]=d[columns_4].round(2)

In [3]:
# Rotate the three columns: Pat takes Sheila's values, Sheila takes Dave's
# and Dave takes Pat's.
# .values hands .loc a bare array, so the assignment is positional; a labelled
# frame on the right-hand side would be re-aligned by column name and the
# rotation would be undone.
e.loc[:, ['Pat', 'Sheila', 'Dave']] = e[names].values

In [4]:
e.iloc[:4]

Unnamed: 0,Sheila,Dave,Pat
0,Panera,Dennys,Panera
1,Panera,Waffle house,Waffle house
2,Dennys,Waffle house,Waffle house
3,Dennys,Dennys,Panera


In [5]:
# can access columns as attributes
# (this only works when the column label is a valid Python identifier)
e_sheila, e_dave, e_pat = e.Sheila, e.Dave, e.Pat
e_dave[:4]

0          Dennys
1    Waffle house
2    Waffle house
3          Dennys
Name: Dave, dtype: object

In [6]:
type(e_dave)

pandas.core.series.Series

In [7]:
# so that means all series slicing syntax works here
e_dave[::5]
# start from the front (row 0) and take every fifth element

0           Dennys
5           Dennys
10    Waffle house
15          Dennys
20          Panera
25          Panera
Name: Dave, dtype: object

In [8]:
# get the whole thing in reverse
e_dave[::-1]
# notice each value keeps its original index label, so the output starts at 29

29    Waffle house
28          Dennys
27          Panera
26          Dennys
25          Panera
24          Dennys
23    Waffle house
22          Panera
21    Waffle house
20          Panera
19    Waffle house
18          Dennys
17    Waffle house
16          Panera
15          Dennys
14          Dennys
13    Waffle house
12          Panera
11          Dennys
10    Waffle house
9     Waffle house
8           Panera
7     Waffle house
6     Waffle house
5           Dennys
4           Panera
3           Dennys
2     Waffle house
1     Waffle house
0           Dennys
Name: Dave, dtype: object

In [9]:
# dave says that he ate at Waffle house on the third (row label 2)
# Write through the frame with .loc and then re-take the column, so that both
# e and e_dave carry the correction whether or not pandas hands out a view
# when a column is extracted.
e.loc[2, 'Dave'] = "Waffle house"
e_dave = e['Dave']

In [10]:
e_dave[:5]
# row 2 now reads Waffle house (the data is random, so it may already have
# done so before the assignment -- as in the output recorded here)

0          Dennys
1    Waffle house
2    Waffle house
3          Dennys
4          Panera
Name: Dave, dtype: object

In [11]:
# this works on data frames too
e[:3]

Unnamed: 0,Sheila,Dave,Pat
0,Panera,Dennys,Panera
1,Panera,Waffle house,Waffle house
2,Dennys,Waffle house,Waffle house


In [12]:
e[::5]

Unnamed: 0,Sheila,Dave,Pat
0,Panera,Dennys,Panera
5,Dennys,Dennys,Panera
10,Dennys,Waffle house,Waffle house
15,Waffle house,Dennys,Panera
20,Panera,Panera,Panera
25,Panera,Panera,Waffle house


In [13]:
# selection by label is the same
e['Dave'][:5]

0          Dennys
1    Waffle house
2    Waffle house
3          Dennys
4          Panera
Name: Dave, dtype: object

In [14]:
# with dates it is a little different
# ---- > d.loc[2:7] this will throw an error
# TypeError: cannot do slice indexing on <class 'pandas.core.indexes.datetimes.DatetimeIndex'> 
# with these indexers [2] of <class 'int'>

In [15]:
# you need to use the index value itself or a date-like string pandas can convert
d.loc['20170410': '20170414']
# notice the endpoint is included...

Unnamed: 0,Waffle house,Dennys,Panera
2017-04-10,7.97,7.53,5.66
2017-04-11,6.19,9.15,4.15
2017-04-12,6.06,4.69,9.11
2017-04-13,9.85,5.41,5.61
2017-04-14,4.63,8.2,4.25


In [16]:
# here we can slice with labels
b.loc['b':'d']
# returns empty: b is indexed 'f'..'j', so no row label falls between 'b' and 'd'

Unnamed: 0,toast,jelly,butter,wheat,white


In [17]:
# whats in the index then
b.index.values

array(['f', 'g', 'h', 'i', 'j'], dtype=object)

In [18]:
# so do this:
b_index = b.index.values
b_index[3]

'i'

In [19]:
# use the list and pass your index range as a slice value
b.loc[b_index[2:4]]

Unnamed: 0,toast,jelly,butter,wheat,white
h,3,3,2,3,0
i,1,2,2,3,3


In [20]:
# these will come in handy
# yes we could have used the list to create the df
# but what if you don't have that list?
a_index = a.index.values
c_index = c.index.values


In [21]:
# so labels can be used just like integers(almost)
c.loc['k':]

Unnamed: 0,pancakes,waffles,crepes
k,3,2,2
l,2,1,2
m,3,2,0


In [22]:
#or
c.loc[:'m']

Unnamed: 0,pancakes,waffles,crepes
k,3,2,2
l,2,1,2
m,3,2,0


In [23]:
# grab a mix of rows:
a.loc['b':, 'ham':'hash']

Unnamed: 0,ham,spam,hash
b,1,4,0
c,1,1,0
d,1,3,2
e,2,2,3


In [24]:
# or 
a.loc['c':'e', 'ham':'spam']

Unnamed: 0,ham,spam
c,1,1
d,1,3
e,2,2


In [25]:
# cross section of one row
a.loc['d']

bacon    3
eggs     3
ham      1
spam     3
hash     2
Name: d, dtype: int64

In [26]:
# get a specific value:
# for example how much was spent at 'Waffle house' on April 14
d.loc['20170414', 'Waffle house']

4.63

In [27]:
# if you know the index number
# (position 13 is the 14th row, i.e. April 14, because the index starts on April 1)
d.iloc[13]['Waffle house']

4.63

In [28]:
d.loc['20170414']

Waffle house    4.63
Dennys          8.20
Panera          4.25
Name: 2017-04-14 00:00:00, dtype: float64

In [29]:
# return to iloc
e.iloc[:5]
# iloc is like numpy or python.... the upper bound is excluded

Unnamed: 0,Sheila,Dave,Pat
0,Panera,Dennys,Panera
1,Panera,Waffle house,Waffle house
2,Dennys,Waffle house,Waffle house
3,Dennys,Dennys,Panera
4,Panera,Panera,Panera


In [30]:
# get rows 11-18 for Dennys and panera
d.iloc[11:19, 1:]

Unnamed: 0,Dennys,Panera
2017-04-12,4.69,9.11
2017-04-13,5.41,5.61
2017-04-14,8.2,4.25
2017-04-15,4.58,9.82
2017-04-16,7.71,6.37
2017-04-17,7.53,4.9
2017-04-18,9.73,7.62
2017-04-19,4.83,7.1


In [31]:
# this will not work ---> d.iloc[11:19, resto[1:]]
# cannot put a string indexer with iloc
# TypeError: cannot perform reduce with flexible type

#### Selection by callable

In [32]:
# iloc accepts a callable as an indexer: it is called with the frame (q here)
# and must return a valid positional indexer -- this one ignores q and always
# selects column positions 1 and 2
d.iloc[13:20, lambda q: [1,2]]

Unnamed: 0,Dennys,Panera
2017-04-14,8.2,4.25
2017-04-15,4.58,9.82
2017-04-16,7.71,6.37
2017-04-17,7.53,4.9
2017-04-18,9.73,7.62
2017-04-19,4.83,7.1
2017-04-20,6.57,6.66


In [33]:
# same idea with loc: here the callable returns a list of column labels
# (resto[2] is 'Panera')
d.loc['20170407':'20170413', lambda q: [resto[2]]]

Unnamed: 0,Panera
2017-04-07,8.07
2017-04-08,9.58
2017-04-09,7.48
2017-04-10,5.66
2017-04-11,4.15
2017-04-12,9.11
2017-04-13,5.61


In [34]:
# how much did sheila pay for breakfast on april 20th and where did she eat?
# so where everybody ate is given as a df in ascending order with no dates
# what was paid at which restaurant is given by date

# we know this
cost = d.loc['20170420']
# row position 19 is April 20 (position 0 is April 1); column position 0 is Sheila
where = e.iloc[19, 0]
print(cost)
print(where)

Waffle house    9.41
Dennys          6.57
Panera          6.66
Name: 2017-04-20 00:00:00, dtype: float64
Dennys


In [35]:
# so combined that would be
d.loc['20170420', e.iloc[19,0]]

6.57

In [36]:
# Map every restaurant column of d to its integer column position (the form
# iloc needs). Enumerating the columns avoids hard-coding how many there are.
rest = {name: position for position, name in enumerate(d.columns)}

In [37]:
rest

{'Waffle house': 0, 'Dennys': 1, 'Panera': 2}

In [38]:
def get_cost(x,y):
    """Look up, row by row, what person `x` paid.

    Reads the notebook-level frames `e` (restaurant chosen per person) and
    `d` (price per restaurant); the two are matched purely by row position.

    x -- column label in `e`
    y -- mapping from restaurant name to that restaurant's column position in `d`

    Returns a list with one price per row of `e[x]`.
    """
    return [d.iloc[row, y[place]] for row, place in enumerate(e[x])]
cost_sheila = get_cost('Sheila', rest)
cost_sheila[:5]


[7.51, 8.37, 6.88, 6.99, 8.81]

In [39]:
# okay so we can get the location and the price for sheila
# but it is not in a df
# join dfs and see what that gives
e.index, d.index
# normally the len should match, and the data in e is given chronologically
# so we'll check

(RangeIndex(start=0, stop=30, step=1),
 DatetimeIndex(['2017-04-01', '2017-04-02', '2017-04-03', '2017-04-04',
                '2017-04-05', '2017-04-06', '2017-04-07', '2017-04-08',
                '2017-04-09', '2017-04-10', '2017-04-11', '2017-04-12',
                '2017-04-13', '2017-04-14', '2017-04-15', '2017-04-16',
                '2017-04-17', '2017-04-18', '2017-04-19', '2017-04-20',
                '2017-04-21', '2017-04-22', '2017-04-23', '2017-04-24',
                '2017-04-25', '2017-04-26', '2017-04-27', '2017-04-28',
                '2017-04-29', '2017-04-30'],
               dtype='datetime64[ns]', freq='D'))

In [40]:
len(d.index), len(e.index)

(30, 30)

In [41]:
e_i = d.index
e_i

DatetimeIndex(['2017-04-01', '2017-04-02', '2017-04-03', '2017-04-04',
               '2017-04-05', '2017-04-06', '2017-04-07', '2017-04-08',
               '2017-04-09', '2017-04-10', '2017-04-11', '2017-04-12',
               '2017-04-13', '2017-04-14', '2017-04-15', '2017-04-16',
               '2017-04-17', '2017-04-18', '2017-04-19', '2017-04-20',
               '2017-04-21', '2017-04-22', '2017-04-23', '2017-04-24',
               '2017-04-25', '2017-04-26', '2017-04-27', '2017-04-28',
               '2017-04-29', '2017-04-30'],
              dtype='datetime64[ns]', freq='D')

In [42]:
e.set_index(e_i, inplace=True)
e.iloc[:5]

Unnamed: 0,Sheila,Dave,Pat
2017-04-01,Panera,Dennys,Panera
2017-04-02,Panera,Waffle house,Waffle house
2017-04-03,Dennys,Waffle house,Waffle house
2017-04-04,Dennys,Dennys,Panera
2017-04-05,Panera,Panera,Panera


In [43]:
#okay so this should work
d_e = pd.concat([d,e], axis=1)
d_e.iloc[:5]

Unnamed: 0,Waffle house,Dennys,Panera,Sheila,Dave,Pat
2017-04-01,7.78,4.83,7.51,Panera,Dennys,Panera
2017-04-02,9.95,4.55,8.37,Panera,Waffle house,Waffle house
2017-04-03,5.48,6.88,7.15,Dennys,Waffle house,Waffle house
2017-04-04,9.57,6.99,7.75,Dennys,Dennys,Panera
2017-04-05,5.47,6.78,8.81,Panera,Panera,Panera


In [44]:
# so now set a value for the cost of sheilas breakfast
d_e['cost_sheila'] = cost_sheila

In [45]:
#d_e.iloc[:10]

In [46]:
# okay!
# the function could be abstracted a little more to incorporate all the players
# by taking a list of names and outputting directly to the df
# because now the df is combined

def get_cost(df_cost, x, y):
    """Add a '<name>_cost' column to `df_cost` for every name in `x`.

    NOTE: this reuses the name of the earlier two-argument get_cost(x, y);
    once this cell has run, only this three-argument version is callable.

    Parameters
    ----------
    df_cost : DataFrame
        Holds the price columns and, for each person, a column naming the
        restaurant chosen on that row. It is modified in place.
    x : iterable of str
        Person column labels in `df_cost`.
    y : mapping
        Restaurant name -> integer column position of its price column in
        `df_cost`.

    Returns
    -------
    dict
        '<name>_cost' -> list of prices, one per row of `df_cost`.
    """
    costs = {}
    for person in x:
        costs[person + '_cost'] = [
            df_cost.iloc[row, y[place]]
            for row, place in enumerate(df_cost[person])
        ]
    # Attach the new columns only after every lookup is done, and attach them
    # to the frame that was passed in rather than to the notebook global d_e.
    for column, prices in costs.items():
        df_cost[column] = prices
    return costs
try_this = get_cost(d_e, ['Sheila', 'Dave', 'Pat'], rest)

In [47]:
d_e.iloc[:5]

Unnamed: 0,Waffle house,Dennys,Panera,Sheila,Dave,Pat,cost_sheila,Sheila_cost,Dave_cost,Pat_cost
2017-04-01,7.78,4.83,7.51,Panera,Dennys,Panera,7.51,7.51,4.83,7.51
2017-04-02,9.95,4.55,8.37,Panera,Waffle house,Waffle house,8.37,8.37,9.95,9.95
2017-04-03,5.48,6.88,7.15,Dennys,Waffle house,Waffle house,6.88,6.88,5.48,5.48
2017-04-04,9.57,6.99,7.75,Dennys,Dennys,Panera,6.99,6.99,6.99,7.75
2017-04-05,5.47,6.78,8.81,Panera,Panera,Panera,8.81,8.81,8.81,8.81


In [48]:
# clean that up real quick
# errors='ignore' keeps the cell re-runnable: on a second run the column is
# already gone and the call does nothing instead of raising KeyError
d_e.drop('cost_sheila', axis=1, inplace=True, errors='ignore')

In [49]:
# so now getting the cost for one person or all three on any day would look like this:
d_e.loc['20170421', ['Pat_cost', 'Dave_cost']]

Pat_cost     6.39
Dave_cost    6.39
Name: 2017-04-21 00:00:00, dtype: object

In [50]:
# and if want to know when, where and how much:
d_e.loc['20170415':'20170423', ['Sheila', 'Sheila_cost']]

Unnamed: 0,Sheila,Sheila_cost
2017-04-15,Dennys,4.58
2017-04-16,Waffle house,7.61
2017-04-17,Waffle house,4.2
2017-04-18,Panera,7.62
2017-04-19,Waffle house,9.9
2017-04-20,Dennys,6.57
2017-04-21,Panera,6.39
2017-04-22,Panera,8.39
2017-04-23,Waffle house,6.03


In [51]:
# time for a new notebook -- the next one will continue from this one