In [1]:
import numpy as np
import pandas as pd

### Object creation

### Creating a Series by passing a list of values, letting pandas create a default integer index:

In [2]:
# Build a Series from a plain list; pandas supplies the default 0..5 integer
# index, and the NaN entry means the values are stored as float64.
values = [1, 3, 5, np.nan, 6, 8]
s = pd.Series(values)
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

### Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns:

In [3]:
# Six consecutive calendar days starting 2021-01-01 (daily frequency).
dates = pd.date_range(start='20210101', periods=6, freq='D')
dates

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x4 frame of uniform random samples, one row per entry of `dates`,
# columns labelled A-D.
# NOTE: no seed is set before this call in the notebook, so the values
# differ on every fresh run.
df = pd.DataFrame(
    data=np.random.rand(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


### Creating a DataFrame by passing a dict of objects that can be converted to a series-like structure:

In [9]:
# Each entry is either a scalar (repeated for every row) or a length-4
# array-like, so the resulting frame has four rows and one dtype per column.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp('20130102'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.array([3, 3, 3, 3], dtype='int32'),
        E=pd.Categorical(['test', 'train', 'test', 'train']),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


#### The columns of the resulting DataFrame have different dtypes.

In [10]:
# Per-column dtypes of df2.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing data

#### View the top and bottom rows of the frame

In [11]:
# First five rows (n=5 is the default for head).
df.head(n=5)

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093


In [12]:
# Last five rows (n=5 is the default for tail).
df.tail(n=5)

Unnamed: 0,A,B,C,D
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [13]:
# Last three rows only.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


#### Display the index and columns

In [14]:
# Row labels of df: the DatetimeIndex it was built with (`dates`).
df.index

DatetimeIndex(['2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04',
               '2021-01-05', '2021-01-06'],
              dtype='datetime64[ns]', freq='D')

In [15]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

#### `DataFrame.to_numpy()` returns a NumPy representation of the data. When you call it, pandas will find the NumPy dtype that can hold all of the dtypes in the DataFrame. This may end up being `object`, which requires casting every value to a Python object.

In [16]:
# Show df again before converting it with to_numpy() in the next cell.
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [17]:
# All columns of df hold floats, so the result is a plain 2-D float array.
df.to_numpy()

array([[0.5737963 , 0.06036275, 0.6551015 , 0.06804554],
       [0.0641097 , 0.448241  , 0.37059714, 0.51804701],
       [0.22480055, 0.65525999, 0.12272795, 0.74227311],
       [0.85249334, 0.2281427 , 0.91520158, 0.27386165],
       [0.84472764, 0.42288188, 0.2310348 , 0.8600929 ],
       [0.68198374, 0.64233866, 0.94419616, 0.30017013]])

In [24]:
# Show df2 again before converting it with to_numpy() in the next cell.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [20]:
# The columns of df2 have different dtypes, so the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

#### `describe()` shows a quick statistical summary of your data

In [25]:
# Per-column count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.540319,0.409538,0.53981,0.460415
std,0.327913,0.232843,0.350905,0.302305
min,0.06411,0.060363,0.122728,0.068046
25%,0.312049,0.276827,0.265925,0.280439
50%,0.62789,0.435561,0.512849,0.409109
75%,0.804042,0.593814,0.850177,0.686217
max,0.852493,0.65526,0.944196,0.860093


In [26]:
# Show df again for comparison with its transpose in the next cell.
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [27]:
# Swap rows and columns: the dates become column labels, A-D become row labels.
df.transpose()

Unnamed: 0,2021-01-01,2021-01-02,2021-01-03,2021-01-04,2021-01-05,2021-01-06
A,0.573796,0.06411,0.224801,0.852493,0.844728,0.681984
B,0.060363,0.448241,0.65526,0.228143,0.422882,0.642339
C,0.655102,0.370597,0.122728,0.915202,0.231035,0.944196
D,0.068046,0.518047,0.742273,0.273862,0.860093,0.30017


#### Sorting by an axis:

In [31]:
# Show df again for comparison with the column-sorted view in the next cell.
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [42]:
# Sort the column labels in descending order (D, C, B, A); row order is unchanged.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2021-01-01,0.068046,0.655102,0.060363,0.573796
2021-01-02,0.518047,0.370597,0.448241,0.06411
2021-01-03,0.742273,0.122728,0.65526,0.224801
2021-01-04,0.273862,0.915202,0.228143,0.852493
2021-01-05,0.860093,0.231035,0.422882,0.844728
2021-01-06,0.30017,0.944196,0.642339,0.681984


#### Sorting by values:

In [44]:
# Show df again for comparison with the value-sorted view in the next cell.
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [45]:
# Reorder the rows by ascending value of column B.
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-06,0.681984,0.642339,0.944196,0.30017
2021-01-03,0.224801,0.65526,0.122728,0.742273


### Selection

#### Getting
Selecting a single column, which yields a Series, equivalent to df.A:

In [46]:
# Attribute-style access to column A; the result is a Series.
df.A

2021-01-01    0.573796
2021-01-02    0.064110
2021-01-03    0.224801
2021-01-04    0.852493
2021-01-05    0.844728
2021-01-06    0.681984
Freq: D, Name: A, dtype: float64

In [47]:
# Bracket-style access to the same column; the result is the same Series.
df['A']

2021-01-01    0.573796
2021-01-02    0.064110
2021-01-03    0.224801
2021-01-04    0.852493
2021-01-05    0.844728
2021-01-06    0.681984
Freq: D, Name: A, dtype: float64

In [48]:
# Slicing with [] selects rows: the first three here.
df[:3]

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273


In [52]:
# First three rows, columns from position 2 onward (C and D).
df.iloc[:3, 2:]

Unnamed: 0,C,D
2021-01-01,0.655102,0.068046
2021-01-02,0.370597,0.518047
2021-01-03,0.122728,0.742273


In [61]:
# Show the full frame again as a reference for the label-based selections that follow.
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [60]:
# Label-based lookup of the row for the first date; returns a Series indexed by column.
df.loc[dates[0]]

A    0.573796
B    0.060363
C    0.655102
D    0.068046
Name: 2021-01-01 00:00:00, dtype: float64

In [62]:
# All rows, restricted to columns A and B by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2021-01-01,0.573796,0.060363
2021-01-02,0.06411,0.448241
2021-01-03,0.224801,0.65526
2021-01-04,0.852493,0.228143
2021-01-05,0.844728,0.422882
2021-01-06,0.681984,0.642339


In [64]:
# Label slices include both endpoints: rows 2021-01-02 through 2021-01-04.
df.loc['20210102':'20210104', ['A', 'B']]

Unnamed: 0,A,B
2021-01-02,0.06411,0.448241
2021-01-03,0.224801,0.65526
2021-01-04,0.852493,0.228143


In [72]:
# First three rows and first two columns, by integer position.
df.iloc[:3, :2]

Unnamed: 0,A,B
2021-01-01,0.573796,0.060363
2021-01-02,0.06411,0.448241
2021-01-03,0.224801,0.65526


In [73]:
# A single row label reduces the result to a Series over the selected columns.
df.loc['20210102', ['A', 'B']]

A    0.064110
B    0.448241
Name: 2021-01-02 00:00:00, dtype: float64

In [74]:
# Getting a scalar value: one row label plus one column label.
df.loc[dates[0], 'A']

0.573796303618867

#### Selection by position

In [75]:
# Row at integer position 3 (the fourth row), returned as a Series.
df.iloc[3]

A    0.852493
B    0.228143
C    0.915202
D    0.273862
Name: 2021-01-04 00:00:00, dtype: float64

In [77]:
# Rows at positions 3 and 4 with the first two columns; the slice stop is excluded.
df.iloc[3:5, :2]

Unnamed: 0,A,B
2021-01-04,0.852493,0.228143
2021-01-05,0.844728,0.422882


In [78]:
# Show the full frame again as a reference for the positional selections that follow.
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [79]:
# Lists of integer positions pick specific rows (1, 2, 4) and columns (0, 2).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2021-01-02,0.06411,0.370597
2021-01-03,0.224801,0.122728
2021-01-05,0.844728,0.231035


In [80]:
# Rows at positions 1 and 2, keeping every column.
df.iloc[1:3]

Unnamed: 0,A,B,C,D
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273


In [81]:
# Scalar at row position 1, column position 1.
df.iloc[1, 1]

0.44824099778492454

In [82]:
# Boolean row filter: keep the rows whose A value is positive.
# Every A value in this frame is positive, so all six rows are kept.
a_is_positive = df['A'] > 0
df[a_is_positive]

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [85]:
# Element-wise mask: entries failing the condition would be replaced by NaN.
# All values in this frame are positive, so the output equals df.
df[df > 0]

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [84]:
# Show the full frame again before computing the means below.
df

Unnamed: 0,A,B,C,D
2021-01-01,0.573796,0.060363,0.655102,0.068046
2021-01-02,0.06411,0.448241,0.370597,0.518047
2021-01-03,0.224801,0.65526,0.122728,0.742273
2021-01-04,0.852493,0.228143,0.915202,0.273862
2021-01-05,0.844728,0.422882,0.231035,0.860093
2021-01-06,0.681984,0.642339,0.944196,0.30017


In [86]:
# Mean of each column (axis=0 is the default).
df.mean(axis=0)

A    0.540319
B    0.409538
C    0.539810
D    0.460415
dtype: float64

In [87]:
# Mean across the columns of each row: one value per date.
df.mean(axis=1)

2021-01-01    0.339327
2021-01-02    0.350249
2021-01-03    0.436265
2021-01-04    0.567425
2021-01-05    0.589684
2021-01-06    0.642172
Freq: D, dtype: float64

### Merge

#### Concat

In [88]:
# NOTE: this rebinds `df`. From here on it is a 10x4 frame of standard-normal
# samples with default integer labels, not the date-indexed frame used above.
# No seed is set, so the values differ on every fresh run.
df = pd.DataFrame(data=np.random.randn(10, 4))
df

Unnamed: 0,0,1,2,3
0,0.894408,0.339489,0.581561,-0.265512
1,0.42879,-0.573673,0.496397,-0.54542
2,0.725869,-1.259297,1.767878,-0.021257
3,-1.415811,0.86505,1.089919,-0.99369
4,0.313581,1.385031,-0.543226,1.194961
5,0.504945,0.563048,-1.53822,-2.430731
6,0.657757,-1.769434,-0.37757,-0.358055
7,0.191797,-0.462502,0.595797,1.314069
8,0.434022,-0.535035,0.792585,2.14607
9,-0.724771,1.389752,0.851312,-0.572359


In [89]:
# Split the frame row-wise into three consecutive chunks: rows 0-2, 3-6 and 7-9.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]

In [92]:
# Stack the chunks back together row-wise; the result matches the frame they were cut from.
pd.concat(pieces, axis=0)

Unnamed: 0,0,1,2,3
0,0.894408,0.339489,0.581561,-0.265512
1,0.42879,-0.573673,0.496397,-0.54542
2,0.725869,-1.259297,1.767878,-0.021257
3,-1.415811,0.86505,1.089919,-0.99369
4,0.313581,1.385031,-0.543226,1.194961
5,0.504945,0.563048,-1.53822,-2.430731
6,0.657757,-1.769434,-0.37757,-0.358055
7,0.191797,-0.462502,0.595797,1.314069
8,0.434022,-0.535035,0.792585,2.14607
9,-0.724771,1.389752,0.851312,-0.572359


#### Join

In [93]:
# Left table: two rows that share the join key "foo".
left = pd.DataFrame(dict(key=['foo', 'foo'], lval=[1, 2]))
left

Unnamed: 0,key,lval
0,foo,1
1,foo,2


In [94]:
# Right table: two rows with the same join key "foo".
right = pd.DataFrame(dict(key=['foo', 'foo'], rval=[4, 5]))
right

Unnamed: 0,key,rval
0,foo,4
1,foo,5


In [95]:
# Inner join on `key`. Every left row matches every right row, so the
# result has 2 x 2 = 4 rows.
pd.merge(left, right, how='inner', on='key')

Unnamed: 0,key,lval,rval
0,foo,1,4
1,foo,1,5
2,foo,2,4
3,foo,2,5
