# Accessing Data in Pandas

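Practice notebook on the different ways to access data in a pandas DataFrame: plain `[]` selection, label-based selection (`.loc` / `.at`), position-based selection (`.iloc` / `.iat`), selection by dtype, boolean indexing, and setting values.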

In [61]:
import numpy as np
import pandas as pd

In [62]:
# Seed NumPy's global generator so the random frame below (and every output
# derived from it) is identical on each Restart & Run All.
SEED = 0
np.random.seed(SEED)

# Index: six consecutive daily timestamps starting 2013-01-01.
dates = pd.date_range('20130101', periods=6)

# Main practice frame: 6 rows x 4 columns of standard-normal draws,
# rows labelled by `dates`, columns labelled A-D.
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))

# Second frame built from a dict with one differently-typed value per column
# (float scalar, Timestamp, float32 Series, int32 array, Categorical, string).
# NOTE(review): in the cells visible here df2 is rebound to a copy of another
# frame before this version is ever displayed or used.
df2 = pd.DataFrame({'A': 1.,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                    'D': np.array([3] * 4, dtype='int32'),
                    'E': pd.Categorical(['test', 'train', 'test', 'train']),
                    'F': 'foo'})

In [63]:
# Bracket indexing with a column label returns that column as a Series.
# This is the recommended way of referring to a column.
df['A']

2013-01-01    0.299848
2013-01-02    0.360215
2013-01-03   -0.078738
2013-01-04   -0.008761
2013-01-05    1.143464
2013-01-06   -0.313855
Freq: D, Name: A, dtype: float64

In [64]:
# Attribute access returns the same Series as df['A'] (the two outputs are
# identical), but it stops working when a column name clashes with an existing
# DataFrame attribute or method, or is not a valid Python identifier.
# Prefer the bracket form for that reason.
df.A

2013-01-01    0.299848
2013-01-02    0.360215
2013-01-03   -0.078738
2013-01-04   -0.008761
2013-01-05    1.143464
2013-01-06   -0.313855
Freq: D, Name: A, dtype: float64

In [65]:
# An integer slice inside [] selects rows by position: here the first three
# rows (the stop position 3 is excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.299848,-1.160849,0.057704,-1.025912
2013-01-02,0.360215,0.288683,0.098138,1.519857
2013-01-03,-0.078738,0.155162,0.855873,-0.004563


In [66]:
# A slice of date strings inside [] selects rows by index label; both end
# points are included (2013-01-02 through 2013-01-04 in the output).
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.360215,0.288683,0.098138,1.519857
2013-01-03,-0.078738,0.155162,0.855873,-0.004563
2013-01-04,-0.008761,0.569021,-1.65858,-0.120472


### Selection by Label

In [67]:
# .loc with a single row label returns that row as a Series whose index is the
# column names (A-D).
df.loc['2013-01-01']

A    0.299848
B   -1.160849
C    0.057704
D   -1.025912
Name: 2013-01-01 00:00:00, dtype: float64

In [68]:
# ':' keeps every row; the list of labels keeps only columns A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.299848,-1.160849
2013-01-02,0.360215,0.288683
2013-01-03,-0.078738,0.155162
2013-01-04,-0.008761,0.569021
2013-01-05,1.143464,-1.596417
2013-01-06,-0.313855,1.023412


In [69]:
# Label slice on the rows (end label included) combined with a list of column
# labels: rows 2013-01-02..2013-01-04, columns A and B.
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,0.360215,0.288683
2013-01-03,-0.078738,0.155162
2013-01-04,-0.008761,0.569021


In [70]:
# One row label plus one column label returns a single scalar value
# (column A on the first date).
df.loc[dates[0], 'A']

0.29984777236380955

### Selection by Position

In [71]:
# .iloc selects by integer position: position 3 is the fourth row
# (2013-01-04), returned as a Series.
df.iloc[3]

A   -0.008761
B    0.569021
C   -1.658580
D   -0.120472
Name: 2013-01-04 00:00:00, dtype: float64

In [72]:
# Positional slices exclude the stop value: rows at positions 3-4 and columns
# at positions 0-1 (A and B).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-0.008761,0.569021
2013-01-05,1.143464,-1.596417


In [73]:
# Rows at positions 1-2; the bare ':' keeps every column.
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,0.360215,0.288683,0.098138,1.519857
2013-01-03,-0.078738,0.155162,0.855873,-0.004563


### Selection by dtype

In [74]:
# Demo frame with one column per dtype, used by the dtype-selection, boolean
# indexing and value-setting cells below.
# NOTE: this rebinds `df`; the date-indexed frame created at the top of the
# notebook is no longer reachable under that name after this cell runs.
df = pd.DataFrame({'string': list('abc'),
                   'int64': list(range(1, 4)),
                   'uint8': np.arange(3, 6).astype('u1'),
                   'float64': np.arange(4.0, 7.0),
                   # NOTE(review): the last element is the *string* 'True', so
                   # this column holds mixed Python objects rather than bools;
                   # the saved select_dtypes output (only bool2) depends on
                   # that. Confirm it is intentional.
                   'bool1': [True, False, 'True'],
                   'bool2': [False, True, False],
                   # Fixed start timestamp (the one shown in the saved outputs)
                   # instead of 'now', so re-running reproduces the same dates.
                   'dates': pd.date_range('2022-11-07 14:03:39.440734', periods=3),
                   'category': pd.Series(list('ABC')).astype('category')})

In [75]:
# Keep only the columns whose dtype is bool. The saved output contains bool2
# only: bool1 was built with the string 'True' as its last element, so it is
# not a boolean column.
df.select_dtypes(include=[bool])

Unnamed: 0,bool2
0,False
1,True
2,False


### Boolean Indexing

In [76]:
# The comparison yields a boolean Series; indexing with it keeps only the rows
# where float64 is at least 5.
df[df['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2022-11-08 14:03:39.440734,B
2,c,3,5,6.0,True,False,2022-11-09 14:03:39.440734,C


In [77]:
# Build df2 as a separate frame: every column of df plus a new string column E.
# assign() returns a new DataFrame, so df itself is left without column E.
df2 = df.assign(E=['one', 'two', 'three'])

In [81]:
# isin() builds a boolean mask that is True where E equals 'one' or 'two';
# indexing with the mask keeps just those rows.
df2[df2['E'].isin(['one', 'two'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2022-11-07 14:03:39.440734,A,one
1,b,2,4,5.0,False,True,2022-11-08 14:03:39.440734,B,two


In [82]:
# .iat sets a single value by integer position: row 0, column 1 (the int64
# column). This mutates df in place.
df.iat[0, 1] = -1
# Display the frame to show the changed cell.
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-1,3,4.0,True,False,2022-11-07 14:03:39.440734,A
1,b,2,4,5.0,False,True,2022-11-08 14:03:39.440734,B
2,c,3,5,6.0,True,False,2022-11-09 14:03:39.440734,C


In [84]:
# .iloc with two scalar positions addresses the same cell as the .iat call
# (row 0, column 1); overwrite it with 2.
df.iloc[0, 1] = 2
# Display the frame to show the changed cell.
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,4.0,True,False,2022-11-07 14:03:39.440734,A
1,b,2,4,5.0,False,True,2022-11-08 14:03:39.440734,B
2,c,3,5,6.0,True,False,2022-11-09 14:03:39.440734,C


In [89]:
# .at sets a single value by row label (0) and column label ('float64').
# NOTE(review): the saved output of this cell already shows uint8 == 50, yet
# uint8 is only assigned in a later cell. The outputs look stale; Restart &
# Run All to refresh them.
df.at[0, 'float64'] = -10
# Display the frame to show the changed cell.
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,50,-10.0,True,False,2022-11-07 14:03:39.440734,A
1,b,2,50,5.0,False,True,2022-11-08 14:03:39.440734,B
2,c,3,50,6.0,True,False,2022-11-09 14:03:39.440734,C


In [88]:
# .loc with a scalar row label and a scalar column label sets the same cell
# as the .at call.
# NOTE(review): this cell's execution counter (88) is lower than that of the
# preceding df.at cell (89), and the final cell's saved output shows -10.0
# rather than -20.0. The cells appear to have been run out of file order;
# re-run top to bottom to refresh the outputs.
df.loc[0, 'float64'] = -20
# Display the frame to show the changed cell.
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,50,-20.0,True,False,2022-11-07 14:03:39.440734,A
1,b,2,50,5.0,False,True,2022-11-08 14:03:39.440734,B
2,c,3,50,6.0,True,False,2022-11-09 14:03:39.440734,C


In [90]:
# Overwrite a whole column through .loc: ':' selects every row, and the array
# supplies one value (50) per row of df.
df.loc[:, 'uint8'] = np.array([50] * len(df))
# Display the frame to show the updated column.
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,50,-10.0,True,False,2022-11-07 14:03:39.440734,A
1,b,2,50,5.0,False,True,2022-11-08 14:03:39.440734,B
2,c,3,50,6.0,True,False,2022-11-09 14:03:39.440734,C
