### Indexing and merging two

#### References
1. Python for data analysis
2. Think stats: exploratory data analysis
3. https://pandas.pydata.org

#### Purpose
1. Work in the abstract (it's good for the brain)
2. Keep up to date with changes in the library
3. Explore new ways of doing common tasks --- get better

In [1]:
import pandas as pd
import numpy as np


In [2]:
# create some data frames
# seed NumPy's global generator so a fresh Restart & Run All rebuilds the same frames
np.random.seed(42)

# row and column labels for the small integer frames a, b, c
index = ['a', 'b', 'c', 'd', 'e']
index_2 = ['f', 'g', 'h', 'i', 'j']
columns = ['bacon', 'eggs', 'ham', 'spam', 'hash']
columns_2 = ['toast', 'jelly', 'butter', 'wheat', 'white']
index_3 = ['k', 'l', 'm']
columns_3 = ['pancakes', 'waffles', 'crepes']

# thirty consecutive days starting 1 April 2017 (ISO form avoids day/month ambiguity)
index_4 = pd.date_range('2017-04-01', periods=30, freq='D')
columns_4 = ['Waffle house', 'Dennys', 'Panera']
names = ['Sheila', 'Dave', 'Pat']
resto = columns_4

# a, b, c: random small integers under letter labels
a = pd.DataFrame(np.random.randint(6, size=(5, 5)), index=index, columns=columns)
b = pd.DataFrame(np.random.randint(4, size=(5, 5)), index=index_2, columns=columns_2)
c = pd.DataFrame(np.random.randint(4, size=(3, 3)), index=index_3, columns=columns_3)

# d: one float per restaurant per day (6 * U[0, 1) + 3.99), rounded to two decimals
d = pd.DataFrame(
    6 * np.random.random_sample((30, 3)) + 3.99,
    index=index_4,
    columns=columns_4,
).round(2)

# e: a randomly chosen restaurant per person per row, on a default integer index
e = pd.DataFrame(np.random.choice(resto, (30, 3)), columns=names)

In [3]:
#print(a)
#print(b)
#print(c)
#print(d)
#print(e)

In [4]:
# switch the columns so that Dave = Pat and Pat = Sheila and Sheila = Dave
e.loc[:, ['Pat', 'Sheila', 'Dave']] = e[names].values

In [5]:
e.iloc[:4]

Unnamed: 0,Sheila,Dave,Pat
0,Dennys,Waffle house,Waffle house
1,Dennys,Panera,Waffle house
2,Panera,Panera,Panera
3,Panera,Waffle house,Waffle house


In [6]:
# can access columns as attributes
e_sheila, e_dave, e_pat = e.Sheila, e.Dave, e.Pat
e_dave[:4]

0    Waffle house
1          Panera
2          Panera
3    Waffle house
Name: Dave, dtype: object

In [7]:
type(e_dave)

pandas.core.series.Series

In [8]:
# so that means all series slicing syntax works here
e_dave[::5]
# start from the front and get every fifth element

0     Waffle house
5     Waffle house
10          Dennys
15          Panera
20    Waffle house
25    Waffle house
Name: Dave, dtype: object

In [9]:
# get the whole thing in reverse
e_dave[::-1]
# notice the index value is valid, here we start at 29

29          Dennys
28          Panera
27          Dennys
26          Panera
25    Waffle house
24          Dennys
23          Panera
22    Waffle house
21          Panera
20    Waffle house
19          Panera
18          Dennys
17    Waffle house
16    Waffle house
15          Panera
14          Panera
13    Waffle house
12          Panera
11          Panera
10          Dennys
9           Dennys
8           Dennys
7           Dennys
6           Panera
5     Waffle house
4           Panera
3     Waffle house
2           Panera
1           Panera
0     Waffle house
Name: Dave, dtype: object

In [10]:
# dave says that he ate at Waffle house on the third
# write through the frame with .loc: assigning via the e_dave alias is chained
# assignment, which may only change a copy and leave e untouched
e.loc[2, 'Dave'] = "Waffle house"
# re-take the column so e_dave shows the corrected value
e_dave = e['Dave']

In [11]:
e_dave[:5]
# changed from Panera to Waffle house

0    Waffle house
1          Panera
2    Waffle house
3    Waffle house
4          Panera
Name: Dave, dtype: object

In [12]:
# this works on data frames too
e[:3]

Unnamed: 0,Sheila,Dave,Pat
0,Dennys,Waffle house,Waffle house
1,Dennys,Panera,Waffle house
2,Panera,Waffle house,Panera


In [13]:
e[::5]

Unnamed: 0,Sheila,Dave,Pat
0,Dennys,Waffle house,Waffle house
5,Dennys,Waffle house,Waffle house
10,Waffle house,Dennys,Dennys
15,Dennys,Panera,Waffle house
20,Dennys,Waffle house,Panera
25,Dennys,Waffle house,Panera


In [14]:
# selection by label is the same
e['Dave'][:5]

0    Waffle house
1          Panera
2    Waffle house
3    Waffle house
4          Panera
Name: Dave, dtype: object

In [15]:
# with dates it is a little different
# ---- > d.loc[2:7] this will throw an error
# TypeError: cannot do slice indexing on <class 'pandas.core.indexes.datetimes.DatetimeIndex'> 
# with these indexers [2] of <class 'int'>

In [16]:
# you need to use the value or a convertible date-like string
d.loc['20170410': '20170414']
# notice the endpoint is included...

Unnamed: 0,Waffle house,Dennys,Panera
2017-04-10,6.68,6.47,7.73
2017-04-11,4.36,7.92,8.93
2017-04-12,4.36,5.86,4.0
2017-04-13,4.52,4.55,5.14
2017-04-14,6.61,8.45,6.7


In [17]:
# here we can slice with labels
b.loc['b':'d']
# returns empty: b is indexed 'f' to 'j', so no labels fall between 'b' and 'd'

Unnamed: 0,toast,jelly,butter,wheat,white


In [18]:
# whats in the index then
b.index.values

array(['f', 'g', 'h', 'i', 'j'], dtype=object)

In [19]:
# so do this:
b_index = b.index.values
b_index[3]

'i'

In [20]:
# use the list and pass your index range as a slice value
b.loc[b_index[2:4]]

Unnamed: 0,toast,jelly,butter,wheat,white
h,2,3,2,0,3
i,2,0,0,1,3


In [21]:
# these will come in handy
# yes we could have used the list to create the df
# but what if you don't have that list?
a_index = a.index.values
c_index = c.index.values


In [22]:
# so labels can be used just like integers(almost)
c.loc['k':]

Unnamed: 0,pancakes,waffles,crepes
k,2,2,2
l,3,1,0
m,1,1,2


In [23]:
#or
c.loc[:'m']

Unnamed: 0,pancakes,waffles,crepes
k,2,2,2
l,3,1,0
m,1,1,2


In [24]:
# grab a mix of rows:
a.loc['b':, 'ham':'hash']

Unnamed: 0,ham,spam,hash
b,4,4,3
c,4,0,2
d,0,3,5
e,5,2,2


In [25]:
# or 
a.loc['c':'e', 'ham':'spam']

Unnamed: 0,ham,spam
c,4,0
d,0,3
e,5,2


In [26]:
# cross section of one row
a.loc['d']

bacon    5
eggs     0
ham      0
spam     3
hash     5
Name: d, dtype: int64

In [27]:
# get a specific value:
# for example how much was spent at 'Waffle house' on April 14
d.loc['20170414', 'Waffle house']

6.61

In [28]:
# if you know the index number
d.iloc[13]['Waffle house']

6.61

In [29]:
d.loc['20170414']

Waffle house    6.61
Dennys          8.45
Panera          6.70
Name: 2017-04-14 00:00:00, dtype: float64

In [30]:
# return to iloc
e.iloc[:5]
# iloc is like numpy or python.... the upper bound is excluded

Unnamed: 0,Sheila,Dave,Pat
0,Dennys,Waffle house,Waffle house
1,Dennys,Panera,Waffle house
2,Panera,Waffle house,Panera
3,Panera,Waffle house,Waffle house
4,Waffle house,Panera,Panera


In [31]:
# get rows 11-18 for Dennys and panera
d.iloc[11:19, 1:]

Unnamed: 0,Dennys,Panera
2017-04-12,5.86,4.0
2017-04-13,4.55,5.14
2017-04-14,8.45,6.7
2017-04-15,4.35,5.16
2017-04-16,8.3,4.56
2017-04-17,5.77,9.37
2017-04-18,9.34,4.39
2017-04-19,4.07,5.59


In [32]:
# this will not work ---> d.iloc[11:19, resto[1:]]
# cannot put a string indexer with iloc
# TypeError: cannot perform reduce with flexible type

#### Selection by callable

In [33]:
d.iloc[13:20, lambda q: [1,2]]

Unnamed: 0,Dennys,Panera
2017-04-14,8.45,6.7
2017-04-15,4.35,5.16
2017-04-16,8.3,4.56
2017-04-17,5.77,9.37
2017-04-18,9.34,4.39
2017-04-19,4.07,5.59
2017-04-20,6.6,5.88


In [34]:
d.loc['20170407':'20170413', lambda q: [resto[2]]]

Unnamed: 0,Panera
2017-04-07,4.79
2017-04-08,7.73
2017-04-09,5.23
2017-04-10,7.73
2017-04-11,8.93
2017-04-12,4.0
2017-04-13,5.14


In [35]:
# how much did sheila pay for breakfast on april 20th and where did she eat?
# so where everybody ate is given as a df in ascending order with no dates
# what was paid, and at which restaurant, is given by date

# we know this
cost = d.loc['20170420']
where = e.iloc[19, 0]
print(cost)
print(where)

Waffle house    5.02
Dennys          6.60
Panera          5.88
Name: 2017-04-20 00:00:00, dtype: float64
Waffle house


In [36]:
# so combined that would be
d.loc['20170420', e.iloc[19,0]]

5.02

In [37]:
# map each column label of d to its integer position, for use with iloc;
# derived from d.columns so no column count is hard-coded
rest = {label: position for position, label in enumerate(d.columns)}

In [38]:
rest

{'Dennys': 1, 'Panera': 2, 'Waffle house': 0}

In [39]:
def get_cost(x, y):
    """Look up one value in ``d`` for every row of column ``x`` of ``e``.

    Both ``e`` and ``d`` are read from the enclosing (notebook) scope.

    Parameters
    ----------
    x : column label in ``e``; each cell of that column is used as a key into ``y``.
    y : mapping from those cell values to integer column positions in ``d``.

    Returns
    -------
    list
        One entry per row of ``e[x]``: the value of ``d`` at the same row
        position and at the column position ``y`` gives for that row's cell.
    """
    # rows are paired purely by position, so e and d must be in the same order
    return [d.iloc[row, y[place]] for row, place in enumerate(e[x])]
cost_sheila = get_cost('Sheila', rest)
cost_sheila[:5]


[8.02, 6.86, 5.62, 4.41, 6.4]

In [40]:
# okay so we can get the location and the price for sheila
# but it is not in a df
# join dfs and see what that gives
e.index, d.index
# normally the len should match, and the data in e is given chronologically
# so we'll check

(RangeIndex(start=0, stop=30, step=1),
 DatetimeIndex(['2017-04-01', '2017-04-02', '2017-04-03', '2017-04-04',
                '2017-04-05', '2017-04-06', '2017-04-07', '2017-04-08',
                '2017-04-09', '2017-04-10', '2017-04-11', '2017-04-12',
                '2017-04-13', '2017-04-14', '2017-04-15', '2017-04-16',
                '2017-04-17', '2017-04-18', '2017-04-19', '2017-04-20',
                '2017-04-21', '2017-04-22', '2017-04-23', '2017-04-24',
                '2017-04-25', '2017-04-26', '2017-04-27', '2017-04-28',
                '2017-04-29', '2017-04-30'],
               dtype='datetime64[ns]', freq='D'))

In [41]:
len(d.index), len(e.index)

(30, 30)

In [42]:
e_i = d.index
e_i

DatetimeIndex(['2017-04-01', '2017-04-02', '2017-04-03', '2017-04-04',
               '2017-04-05', '2017-04-06', '2017-04-07', '2017-04-08',
               '2017-04-09', '2017-04-10', '2017-04-11', '2017-04-12',
               '2017-04-13', '2017-04-14', '2017-04-15', '2017-04-16',
               '2017-04-17', '2017-04-18', '2017-04-19', '2017-04-20',
               '2017-04-21', '2017-04-22', '2017-04-23', '2017-04-24',
               '2017-04-25', '2017-04-26', '2017-04-27', '2017-04-28',
               '2017-04-29', '2017-04-30'],
              dtype='datetime64[ns]', freq='D')

In [43]:
e.set_index(e_i, inplace=True)
e.iloc[:5]

Unnamed: 0,Sheila,Dave,Pat
2017-04-01,Dennys,Waffle house,Waffle house
2017-04-02,Dennys,Panera,Waffle house
2017-04-03,Panera,Waffle house,Panera
2017-04-04,Panera,Waffle house,Waffle house
2017-04-05,Waffle house,Panera,Panera


In [44]:
#okay so this should work
d_e = pd.concat([d,e], axis=1)
d_e.iloc[:5]

Unnamed: 0,Waffle house,Dennys,Panera,Sheila,Dave,Pat
2017-04-01,5.57,8.02,6.4,Dennys,Waffle house,Waffle house
2017-04-02,9.16,6.86,4.76,Dennys,Panera,Waffle house
2017-04-03,7.34,4.82,5.62,Panera,Waffle house,Panera
2017-04-04,8.51,5.95,4.41,Panera,Waffle house,Waffle house
2017-04-05,6.4,7.03,6.87,Waffle house,Panera,Panera


In [45]:
# so now set a value for the cost of sheilas breakfast
d_e['cost_sheila'] = cost_sheila

In [46]:
#d_e.iloc[:10]

In [47]:
# okay!
# the function could be abstracted a little more to incorporate all the players
# by taking a list of names and outputting directly to the df
# because now the df is combined

def get_cost(df_cost, x, y):
    """Add a ``<name>_cost`` column to ``df_cost`` for every name in ``x``.

    Parameters
    ----------
    df_cost : DataFrame holding one column per name in ``x`` plus the columns
        whose integer positions appear as values in ``y``.
    x : iterable of column labels; each cell of such a column is used as a key
        into ``y``.
    y : mapping from those cell values to integer column positions in
        ``df_cost``.

    Returns
    -------
    dict
        ``{name + '_cost': [value per row, ...]}`` for every name in ``x``.
        The same lists are also written into ``df_cost`` as new columns.
    """
    costs = {}
    for name in x:
        # for each row, read the cell at the column position this row's entry maps to
        costs[name + '_cost'] = [
            df_cost.iloc[row, y[place]]
            for row, place in enumerate(df_cost[name])
        ]
    # write to the frame that was passed in (not a notebook-level global), and
    # only once every lookup has been made
    for column, values in costs.items():
        df_cost[column] = values
    return costs
try_this = get_cost(d_e, ['Sheila', 'Dave', 'Pat'], rest)

In [48]:
d_e.iloc[:5]

Unnamed: 0,Waffle house,Dennys,Panera,Sheila,Dave,Pat,cost_sheila,Sheila_cost,Dave_cost,Pat_cost
2017-04-01,5.57,8.02,6.4,Dennys,Waffle house,Waffle house,8.02,8.02,5.57,5.57
2017-04-02,9.16,6.86,4.76,Dennys,Panera,Waffle house,6.86,6.86,4.76,9.16
2017-04-03,7.34,4.82,5.62,Panera,Waffle house,Panera,5.62,5.62,7.34,5.62
2017-04-04,8.51,5.95,4.41,Panera,Waffle house,Waffle house,4.41,4.41,8.51,8.51
2017-04-05,6.4,7.03,6.87,Waffle house,Panera,Panera,6.4,6.4,6.87,6.87


In [49]:
# clean that up real quick
# rebind rather than drop in place, and tolerate a missing column, so that
# re-running this cell does not raise KeyError
d_e = d_e.drop('cost_sheila', axis=1, errors='ignore')

In [52]:
# so now getting the cost for one person or all three on any day would look like this:
d_e.loc['20170421', ['Pat_cost', 'Dave_cost']]

Pat_cost     6.18
Dave_cost    4.58
Name: 2017-04-21 00:00:00, dtype: object

In [54]:
# and if want to know when, where and how much:
d_e.loc['20170415':'20170423', ['Sheila', 'Sheila_cost']]

Unnamed: 0,Sheila,Sheila_cost
2017-04-15,Waffle house,6.94
2017-04-16,Dennys,8.3
2017-04-17,Dennys,5.77
2017-04-18,Dennys,9.34
2017-04-19,Panera,5.59
2017-04-20,Waffle house,5.02
2017-04-21,Dennys,9.02
2017-04-22,Panera,9.1
2017-04-23,Waffle house,8.11


In [None]:
# new note book time will continue with this one