In [1]:
import numpy as np
import pandas as pd

## Object creation

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.240059,0.959935,-1.2763,0.022332
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-03,0.690096,1.847301,1.12129,-0.010143
2013-01-04,0.314499,1.042629,1.678483,0.841972
2013-01-05,-0.062216,-0.332579,-0.880776,-0.142824
2013-01-06,-0.259303,-0.574966,1.418908,0.728684


In [5]:
df2 = pd.DataFrame({'A': 1.0,
                       'B': pd.Timestamp('20130102'),
                       'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                       'D': np.array([3] * 4, dtype='int32'),
                       'E': pd.Categorical(["test", "train", "test", "train"]),
                       'F': 'foo'})
df2


Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [6]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

## Viewing Data

In [7]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.240059,0.959935,-1.2763,0.022332
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-03,0.690096,1.847301,1.12129,-0.010143
2013-01-04,0.314499,1.042629,1.678483,0.841972
2013-01-05,-0.062216,-0.332579,-0.880776,-0.142824


In [8]:
df.tail()

Unnamed: 0,A,B,C,D
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-03,0.690096,1.847301,1.12129,-0.010143
2013-01-04,0.314499,1.042629,1.678483,0.841972
2013-01-05,-0.062216,-0.332579,-0.880776,-0.142824
2013-01-06,-0.259303,-0.574966,1.418908,0.728684


In [9]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
df.to_numpy()

array([[ 0.24005944,  0.95993503, -1.27629976,  0.02233203],
       [-0.71455013, -0.25121933, -2.22262834,  0.28476047],
       [ 0.69009606,  1.84730136,  1.12128984, -0.01014346],
       [ 0.31449861,  1.04262861,  1.67848316,  0.8419722 ],
       [-0.06221617, -0.33257941, -0.88077578, -0.14282371],
       [-0.25930252, -0.574966  ,  1.41890781,  0.72868445]])

In [12]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.034764,0.448517,-0.026837,0.287464
std,0.491367,0.971429,1.638802,0.411382
min,-0.71455,-0.574966,-2.222628,-0.142824
25%,-0.210031,-0.312239,-1.177419,-0.002025
50%,0.088922,0.354358,0.120257,0.153546
75%,0.295889,1.021955,1.344503,0.617703
max,0.690096,1.847301,1.678483,0.841972


In [13]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.240059,-0.71455,0.690096,0.314499,-0.062216,-0.259303
B,0.959935,-0.251219,1.847301,1.042629,-0.332579,-0.574966
C,-1.2763,-2.222628,1.12129,1.678483,-0.880776,1.418908
D,0.022332,0.28476,-0.010143,0.841972,-0.142824,0.728684


In [14]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.022332,-1.2763,0.959935,0.240059
2013-01-02,0.28476,-2.222628,-0.251219,-0.71455
2013-01-03,-0.010143,1.12129,1.847301,0.690096
2013-01-04,0.841972,1.678483,1.042629,0.314499
2013-01-05,-0.142824,-0.880776,-0.332579,-0.062216
2013-01-06,0.728684,1.418908,-0.574966,-0.259303


In [15]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-06,-0.259303,-0.574966,1.418908,0.728684
2013-01-05,-0.062216,-0.332579,-0.880776,-0.142824
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-01,0.240059,0.959935,-1.2763,0.022332
2013-01-04,0.314499,1.042629,1.678483,0.841972
2013-01-03,0.690096,1.847301,1.12129,-0.010143


## Filtering

### Getting

In [16]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.240059,0.959935,-1.2763,0.022332
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-03,0.690096,1.847301,1.12129,-0.010143
2013-01-04,0.314499,1.042629,1.678483,0.841972
2013-01-05,-0.062216,-0.332579,-0.880776,-0.142824
2013-01-06,-0.259303,-0.574966,1.418908,0.728684


In [17]:
df['A']

2013-01-01    0.240059
2013-01-02   -0.714550
2013-01-03    0.690096
2013-01-04    0.314499
2013-01-05   -0.062216
2013-01-06   -0.259303
Freq: D, Name: A, dtype: float64

In [18]:
df.A

2013-01-01    0.240059
2013-01-02   -0.714550
2013-01-03    0.690096
2013-01-04    0.314499
2013-01-05   -0.062216
2013-01-06   -0.259303
Freq: D, Name: A, dtype: float64

#### Slice rows:

In [19]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.240059,0.959935,-1.2763,0.022332
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-03,0.690096,1.847301,1.12129,-0.010143


In [20]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-03,0.690096,1.847301,1.12129,-0.010143
2013-01-04,0.314499,1.042629,1.678483,0.841972


#### Selection by label

In [21]:
df.loc['2013-01-01']

A    0.240059
B    0.959935
C   -1.276300
D    0.022332
Name: 2013-01-01 00:00:00, dtype: float64

In [22]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.240059,0.959935
2013-01-02,-0.71455,-0.251219
2013-01-03,0.690096,1.847301
2013-01-04,0.314499,1.042629
2013-01-05,-0.062216,-0.332579
2013-01-06,-0.259303,-0.574966


In [23]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.71455,-0.251219
2013-01-03,0.690096,1.847301
2013-01-04,0.314499,1.042629


In [24]:
df.loc['20130102', ['A','B']]

A   -0.714550
B   -0.251219
Name: 2013-01-02 00:00:00, dtype: float64

In [25]:
df.loc[dates[0], 'A']

0.24005943542409586

#### Selection by position

In [26]:
df.iloc[3]

A    0.314499
B    1.042629
C    1.678483
D    0.841972
Name: 2013-01-04 00:00:00, dtype: float64

In [27]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.314499,1.042629
2013-01-05,-0.062216,-0.332579


In [28]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.71455,-0.251219,-2.222628,0.28476
2013-01-03,0.690096,1.847301,1.12129,-0.010143


#### Selection by dtype

In [29]:
df = pd.DataFrame({'string': list('abc'),
                   'int64': list(range(1, 4)),
                   'uint8': np.arange(3, 6).astype('u1'),
                   'float64': np.arange(4.0, 7.0),
                   'bool1': [True, False, True],
                   'bool2': [False, True, False],
                   'dates': pd.date_range('now', periods=3),
                   'category': pd.Series(list("ABC")).astype('category')})
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


In [42]:
df.dtypes

string              object
int64                int64
uint8                int32
float64            float64
bool1                 bool
bool2                 bool
dates       datetime64[ns]
category          category
dtype: object

In [30]:
df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


#### Boolean indexing

Take the rows where column `A` is higher than 5

In [31]:
df[df['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


In [32]:
df2 = df.copy()
df2

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


In [33]:
df2['E'] = ['one', 'two', 'three']

In [34]:
df2

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2022-07-07 11:32:57.223240,A,one
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B,two
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C,three


use `isin()` to take only rows where `E` is `'one'` or `'two'`

In [35]:
df2[df2['E'].isin(['one','two'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2022-07-07 11:32:57.223240,A,one
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B,two


#### Set values in a dataframe

In [36]:
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


In [37]:
df.iat[0,1] = -2
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,-2,3,4.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


In [38]:
df.iloc[0,1] = 2
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,4.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


In [39]:
df.at[0,'float64'] = -10
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,-10.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


In [40]:
df.loc[0, 'float64'] = -20
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,3,-20.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,4,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,5,6.0,True,False,2022-07-09 11:32:57.223240,C


Set by assigning a numpy array:

In [41]:
df.loc[:, 'uint8'] = np.array([50] * len(df))
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,2,50,-20.0,True,False,2022-07-07 11:32:57.223240,A
1,b,2,50,5.0,False,True,2022-07-08 11:32:57.223240,B
2,c,3,50,6.0,True,False,2022-07-09 11:32:57.223240,C
