In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Create a Series
Create a `Series` by passing a list of values, letting pandas create a default integer index.

In [3]:
# np.nan marks a missing entry; with it present the values are stored as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Create a DataFrame
Build the `DataFrame` from a NumPy array, supplying a datetime index and labeled columns.

In [4]:
# Six consecutive calendar days beginning 2018-01-01.
dates = pd.date_range(start='20180101', periods=6, freq='D')
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
df = pd.DataFrame(
    data=np.random.randn(6, 4),  # 6 rows x 4 columns of standard-normal draws
    index=dates,
    columns=list('ABCD'),
)

In [6]:
df

Unnamed: 0,A,B,C,D
2018-01-01,0.068109,-0.164937,0.460388,-1.481396
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079
2018-01-03,0.129725,-0.745612,0.66762,-0.668749
2018-01-04,-0.652158,0.680535,-0.370779,1.688905
2018-01-05,-0.379358,-1.454433,-0.272374,-0.744914
2018-01-06,0.813482,-1.928843,1.244931,-1.400788


# Create a DataFrame
Build the `DataFrame` from a dict of objects that can each be converted to a series-like column.

In [7]:
# The scalar entries (A, B, F) are repeated on every row; C, D and E supply four values each.
df2 = pd.DataFrame(dict(
    A=1,
    B=pd.Timestamp('20180102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.array([3] * 4, dtype='int32'),
    E=pd.Categorical(['test', 'train'] * 2),
    F='foo',
))
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2018-01-02,1.0,3,test,foo
1,1,2018-01-02,1.0,3,train,foo
2,1,2018-01-02,1.0,3,test,foo
3,1,2018-01-02,1.0,3,train,foo


The columns have different dtypes

In [8]:
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [9]:
df.head()

Unnamed: 0,A,B,C,D
2018-01-01,0.068109,-0.164937,0.460388,-1.481396
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079
2018-01-03,0.129725,-0.745612,0.66762,-0.668749
2018-01-04,-0.652158,0.680535,-0.370779,1.688905
2018-01-05,-0.379358,-1.454433,-0.272374,-0.744914


In [10]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.261947,-0.477853,0.038527,-0.443337
std,0.805959,1.101707,0.965004,1.16917
min,-1.551484,-1.928843,-1.498626,-1.481396
25%,-0.583958,-1.277228,-0.346178,-1.23682
50%,-0.155624,-0.455274,0.094007,-0.706832
75%,0.114321,0.469167,0.615812,-0.206996
max,0.813482,0.746173,1.244931,1.688905


Transposing data

In [11]:
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,0.068109,-1.551484,0.129725,-0.652158,-0.379358,0.813482
B,-0.164937,0.746173,-0.745612,0.680535,-1.454433,-1.928843
C,0.460388,-1.498626,0.66762,-0.370779,-0.272374,1.244931
D,-1.481396,-0.053079,-0.668749,1.688905,-0.744914,-1.400788


Sorting by an axis

In [12]:
# Sort the column labels in descending order (D, C, B, A); row order is untouched.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2018-01-01,-1.481396,0.460388,-0.164937,0.068109
2018-01-02,-0.053079,-1.498626,0.746173,-1.551484
2018-01-03,-0.668749,0.66762,-0.745612,0.129725
2018-01-04,1.688905,-0.370779,0.680535,-0.652158
2018-01-05,-0.744914,-0.272374,-1.454433,-0.379358
2018-01-06,-1.400788,1.244931,-1.928843,0.813482


Sorting by values

In [13]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2018-01-06,0.813482,-1.928843,1.244931,-1.400788
2018-01-05,-0.379358,-1.454433,-0.272374,-0.744914
2018-01-03,0.129725,-0.745612,0.66762,-0.668749
2018-01-01,0.068109,-0.164937,0.460388,-1.481396
2018-01-04,-0.652158,0.680535,-0.370779,1.688905
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079


# Selection

### Getting

In [14]:
df['A']

2018-01-01    0.068109
2018-01-02   -1.551484
2018-01-03    0.129725
2018-01-04   -0.652158
2018-01-05   -0.379358
2018-01-06    0.813482
Freq: D, Name: A, dtype: float64

In [15]:
# An integer slice inside [] selects rows by position; the stop value (3) is excluded.
df[0:3]

Unnamed: 0,A,B,C,D
2018-01-01,0.068109,-0.164937,0.460388,-1.481396
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079
2018-01-03,0.129725,-0.745612,0.66762,-0.668749


In [16]:
# A date-string slice selects rows by label on the datetime index; the end label is included.
df['20180102': '20180104']

Unnamed: 0,A,B,C,D
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079
2018-01-03,0.129725,-0.745612,0.66762,-0.668749
2018-01-04,-0.652158,0.680535,-0.370779,1.688905


### Selection by Label

In [17]:
df.loc[dates[0]]

A    0.068109
B   -0.164937
C    0.460388
D   -1.481396
Name: 2018-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [18]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2018-01-01,0.068109,-0.164937
2018-01-02,-1.551484,0.746173
2018-01-03,0.129725,-0.745612
2018-01-04,-0.652158,0.680535
2018-01-05,-0.379358,-1.454433
2018-01-06,0.813482,-1.928843


Showing label slicing, both endpoints are included

In [19]:
df.loc['20180102': '20180104', ['A', 'B']]

Unnamed: 0,A,B
2018-01-02,-1.551484,0.746173
2018-01-03,0.129725,-0.745612
2018-01-04,-0.652158,0.680535


For getting a scalar value

In [20]:
df.loc[dates[0], 'A']

0.06810942107488382

For getting fast access to a scalar (equivalent to the prior method)

In [21]:
df.at[dates[0], 'A']

0.06810942107488382

### Selection by Position

In [22]:
df.iloc[3]

A   -0.652158
B    0.680535
C   -0.370779
D    1.688905
Name: 2018-01-04 00:00:00, dtype: float64

By integer slices, which behave like NumPy/Python slicing (the end position is excluded)

In [23]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-01-04,-0.652158,0.680535
2018-01-05,-0.379358,-1.454433


By lists of integer position locations

In [24]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2018-01-02,-1.551484,-1.498626
2018-01-03,0.129725,0.66762
2018-01-05,-0.379358,-0.272374


For getting a value explicitly:

In [25]:
df.iloc[1, 1]

0.7461725839000001

# Boolean Indexing

Using a single column’s values to select data.

In [26]:
# Keep only the rows whose value in column B is positive.
df.loc[df['B'] > 0]

Unnamed: 0,A,B,C,D
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079
2018-01-04,-0.652158,0.680535,-0.370779,1.688905


Selecting values from a DataFrame where a boolean condition is met

In [27]:
# Indexing with a boolean frame keeps the original shape; cells that fail the
# condition show up as missing values.
df[df > 0]

Unnamed: 0,A,B,C,D
2018-01-01,0.068109,,0.460388,
2018-01-02,,0.746173,,
2018-01-03,0.129725,,0.66762,
2018-01-04,,0.680535,,1.688905
2018-01-05,,,,
2018-01-06,0.813482,,1.244931,


In [28]:
# Work on a copy so that adding the label column does not touch df itself.
df2 = df.copy()
# split() yields six labels, one per row of the frame.
df2['E'] = 'one one two three four three'.split()
df2

Unnamed: 0,A,B,C,D,E
2018-01-01,0.068109,-0.164937,0.460388,-1.481396,one
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079,one
2018-01-03,0.129725,-0.745612,0.66762,-0.668749,two
2018-01-04,-0.652158,0.680535,-0.370779,1.688905,three
2018-01-05,-0.379358,-1.454433,-0.272374,-0.744914,four
2018-01-06,0.813482,-1.928843,1.244931,-1.400788,three


Using the `isin()` method for filtering

In [29]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2018-01-03,0.129725,-0.745612,0.66762,-0.668749,two
2018-01-05,-0.379358,-1.454433,-0.272374,-0.744914,four


# Setting

Setting a new column automatically aligns the data by the indexes

In [30]:
# Six integers indexed by consecutive days, 2018-01-02 through 2018-01-07.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20180102', periods=6),
)
s1

2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
2018-01-06    5
2018-01-07    6
Freq: D, dtype: int64

In [31]:
# Assignment aligns s1 to df's index: 2018-01-01 has no match in s1 and is left
# missing, while s1's extra 2018-01-07 entry is dropped. The column displays as float.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2018-01-01,0.068109,-0.164937,0.460388,-1.481396,
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079,1.0
2018-01-03,0.129725,-0.745612,0.66762,-0.668749,2.0
2018-01-04,-0.652158,0.680535,-0.370779,1.688905,3.0
2018-01-05,-0.379358,-1.454433,-0.272374,-0.744914,4.0
2018-01-06,0.813482,-1.928843,1.244931,-1.400788,5.0


Setting values by label

In [32]:
# .at addresses a single cell by row label and column label.
df.at[dates[0], 'A'] = 0

Set values by position

In [33]:
# .iat addresses a single cell by integer position: row 0, column 1 (column 'B').
df.iat[0, 1] = 0

In [34]:
df

Unnamed: 0,A,B,C,D,F
2018-01-01,0.0,0.0,0.460388,-1.481396,
2018-01-02,-1.551484,0.746173,-1.498626,-0.053079,1.0
2018-01-03,0.129725,-0.745612,0.66762,-0.668749,2.0
2018-01-04,-0.652158,0.680535,-0.370779,1.688905,3.0
2018-01-05,-0.379358,-1.454433,-0.272374,-0.744914,4.0
2018-01-06,0.813482,-1.928843,1.244931,-1.400788,5.0


# Stats

In [35]:
# Mean of each column. The missing value in F is skipped, so F averages 1..5 to 3.0.
df.mean()

A   -0.273299
B   -0.450363
C    0.038527
D   -0.443337
F    3.000000
dtype: float64

Same operation on the other axis:

In [36]:
# axis=1 averages across the columns, giving one value per row (date).
df.mean(axis=1)

2018-01-01   -0.255252
2018-01-02   -0.271403
2018-01-03    0.276597
2018-01-04    0.869301
2018-01-05    0.229784
2018-01-06    0.745756
Freq: D, dtype: float64

# Append

Append rows to a dataframe

In [37]:
# Rebind df to a fresh 8x4 frame of standard-normal draws with the default integer index.
df = pd.DataFrame(data=np.random.randn(8, 4), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,0.023698,-0.647321,-0.117195,0.493708
1,0.290516,-1.55377,-0.606913,-0.952092
2,0.755714,1.256348,0.583311,-1.02578
3,0.170138,-0.178835,0.129028,0.078387
4,-1.283891,0.142117,1.693252,0.767056
5,-0.048587,-0.407315,1.687611,0.01916
6,-0.379364,0.655522,0.928558,1.509784
7,-1.657888,0.518611,-0.706903,-1.033656


In [38]:
# Pull out the row at position 3 as a Series; its name is the row label (3).
s = df.iloc[3]

In [39]:
s

A    0.170138
B   -0.178835
C    0.129028
D    0.078387
Name: 3, dtype: float64

In [40]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported replacement. s is a Series holding one row, so it is first turned into a
# one-row frame (to_frame().T) whose columns line up with df's. ignore_index=True
# renumbers the result 0..n so the appended row gets the next integer label.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.023698,-0.647321,-0.117195,0.493708
1,0.290516,-1.55377,-0.606913,-0.952092
2,0.755714,1.256348,0.583311,-1.02578
3,0.170138,-0.178835,0.129028,0.078387
4,-1.283891,0.142117,1.693252,0.767056
5,-0.048587,-0.407315,1.687611,0.01916
6,-0.379364,0.655522,0.928558,1.509784
7,-1.657888,0.518611,-0.706903,-1.033656
8,0.170138,-0.178835,0.129028,0.078387


# Grouping

In [41]:
# Two string key columns (A, B) plus two columns of standard-normal draws (C, D).
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})

In [42]:
df

Unnamed: 0,A,B,C,D
0,foo,one,0.07802,-1.162469
1,bar,one,0.253905,-0.282616
2,foo,two,-0.925955,1.058501
3,bar,three,-0.794684,0.684385
4,foo,two,0.817291,-0.82279
5,bar,two,-0.045245,0.684317
6,foo,one,1.372854,-0.778512
7,foo,three,1.787548,0.097992


Grouping and then applying the sum() function to the resulting groups.

In [43]:
# B holds strings and is not a grouping key here. On pandas >= 2.0 a bare sum()
# would concatenate those strings into an extra B column, so restrict the
# aggregation to numeric columns to get only the C and D totals per group.
df.groupby('A').sum(numeric_only=True)

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.586025,1.086086
foo,3.129758,-1.607277


Grouping by multiple columns forms a hierarchical index, and again we can apply the sum function.

In [44]:
# Both string columns are grouping keys here, so only the numeric C and D columns
# are summed; the result is indexed by the (A, B) pairs.
df.groupby(['A','B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,0.253905,-0.282616
bar,three,-0.794684,0.684385
bar,two,-0.045245,0.684317
foo,one,1.450874,-1.940981
foo,three,1.787548,0.097992
foo,two,-0.108664,0.235711


# Reshaping

In [47]:
# Pair each first-level label with the second-level label at the same position.
tuples = list(zip(
    ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],
    ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'],
))

In [48]:
tuples

[('bar', 'one'),
 ('bar', 'two'),
 ('baz', 'one'),
 ('baz', 'two'),
 ('foo', 'one'),
 ('foo', 'two'),
 ('qux', 'one'),
 ('qux', 'two')]

In [51]:
# Build a two-level index from the label pairs; split() gives the two level names.
index = pd.MultiIndex.from_tuples(
    tuples,
    names='first second'.split(),
)

In [56]:
# 8x2 frame of standard-normal draws, one row per (first, second) pair in index.
df = pd.DataFrame(
    data=np.random.randn(8, 2),
    index=index,
    columns='A B'.split(),
)

In [57]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.471777,1.174707
bar,two,0.682369,-0.309311
baz,one,-0.030181,0.833159
baz,two,1.12281,-0.892403
foo,one,0.126326,-0.700221
foo,two,-1.150092,1.155512
qux,one,0.639301,0.325748
qux,two,1.287168,0.689102


In [59]:
# Take the first four rows by position (the 'bar' and 'baz' groups).
df2 = df.iloc[:4]
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.471777,1.174707
bar,two,0.682369,-0.309311
baz,one,-0.030181,0.833159
baz,two,1.12281,-0.892403


In [61]:
# stack() moves the column labels (A, B) into a new innermost index level,
# turning the frame into a Series with a three-level index.
stacked = df2.stack()
stacked

first  second   
bar    one     A   -0.471777
               B    1.174707
       two     A    0.682369
               B   -0.309311
baz    one     A   -0.030181
               B    0.833159
       two     A    1.122810
               B   -0.892403
dtype: float64

In [62]:
# With no argument, unstack() pivots the innermost level (the A/B labels) back into columns.
stacked.unstack()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.471777,1.174707
bar,two,0.682369,-0.309311
baz,one,-0.030181,0.833159
baz,two,1.12281,-0.892403


In [74]:
# Level 0 is 'first': its labels (bar, baz) become the columns.
stacked.unstack(0)

Unnamed: 0_level_0,first,bar,baz
second,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.471777,-0.030181
one,B,1.174707,0.833159
two,A,0.682369,1.12281
two,B,-0.309311,-0.892403


In [75]:
# Level 1 is 'second': its labels (one, two) become the columns.
stacked.unstack(1)

Unnamed: 0_level_0,second,one,two
first,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.471777,0.682369
bar,B,1.174707,-0.309311
baz,A,-0.030181,1.12281
baz,B,0.833159,-0.892403


In [76]:
# Level 2 is the unnamed innermost level (A/B), so this gives the same result as
# calling unstack() with no argument.
stacked.unstack(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
first,second,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.471777,1.174707
bar,two,0.682369,-0.309311
baz,one,-0.030181,0.833159
baz,two,1.12281,-0.892403
