In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Create a Series
Create a `Series` by passing a list of values, letting pandas create a default integer index.

In [7]:
# The np.nan entry among the integers makes pandas store the values as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Create a DataFrame
Create a `DataFrame` by passing a NumPy array, with a datetime index and labeled columns.

In [8]:
# Six consecutive calendar days starting on 2018-01-01.
dates = pd.date_range(start='20180101', periods=6, freq='D')
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Seed the generator so the random frame, and every output derived from it,
# is reproducible under Restart Kernel -> Run All.
rng = np.random.default_rng(seed=0)
# 6 rows (one per entry in `dates`) x 4 columns labelled A-D of standard-normal draws.
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))

In [10]:
# Display the full 6x4 frame built in the previous cell.
df

Unnamed: 0,A,B,C,D
2018-01-01,-0.254672,-0.687515,0.633759,-0.416487
2018-01-02,-0.845215,0.447447,-0.367929,0.976476
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039
2018-01-04,-1.920521,1.256841,1.638848,0.028886
2018-01-05,-0.4608,-0.433103,2.001748,0.147762
2018-01-06,-1.432754,0.434096,-0.301818,0.319898


# Create a DataFrame
Create a `DataFrame` by passing a dict of objects that can be converted to a series-like structure.

In [11]:
# Each dict key becomes a column; the scalar entries (A, B, F) are repeated
# for every row of the array-like entries (C, D, E).
df2 = pd.DataFrame(
    dict(
        A=1,
        B=pd.Timestamp('20180102'),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        D=np.array([3, 3, 3, 3], dtype='int32'),
        E=pd.Categorical(['test', 'train'] * 2),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2018-01-02,1.0,3,test,foo
1,1,2018-01-02,1.0,3,train,foo
2,1,2018-01-02,1.0,3,test,foo
3,1,2018-01-02,1.0,3,train,foo


The columns of the resulting `DataFrame` have different dtypes:

In [12]:
# One dtype per column of df2.
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [13]:
# First five rows (the default count, spelled out).
df.head(n=5)

Unnamed: 0,A,B,C,D
2018-01-01,-0.254672,-0.687515,0.633759,-0.416487
2018-01-02,-0.845215,0.447447,-0.367929,0.976476
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039
2018-01-04,-1.920521,1.256841,1.638848,0.028886
2018-01-05,-0.4608,-0.433103,2.001748,0.147762


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.876171,0.017749,0.632824,0.002416
std,0.669987,0.831514,0.995503,0.683781
min,-1.920521,-0.911272,-0.367929,-1.042039
25%,-1.28587,-0.623912,-0.17828,-0.305144
50%,-0.653008,0.000496,0.413047,0.088324
75%,-0.372496,0.444109,1.387576,0.276864
max,-0.254672,1.256841,2.001748,0.976476


Transposing data

In [15]:
# Swap rows and columns: the dates become column labels, A-D become the index.
df.transpose()

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,-0.254672,-0.845215,-0.343061,-1.920521,-0.4608,-1.432754
B,-0.687515,0.447447,-0.911272,1.256841,-0.433103,0.434096
C,0.633759,-0.367929,0.192334,1.638848,2.001748,-0.301818
D,-0.416487,0.976476,-1.042039,0.028886,0.147762,0.319898


Sorting by an axis

In [16]:
# Order the column labels in descending order (D, C, B, A); rows are untouched.
df.sort_index(axis='columns', ascending=False)

Unnamed: 0,D,C,B,A
2018-01-01,-0.416487,0.633759,-0.687515,-0.254672
2018-01-02,0.976476,-0.367929,0.447447,-0.845215
2018-01-03,-1.042039,0.192334,-0.911272,-0.343061
2018-01-04,0.028886,1.638848,1.256841,-1.920521
2018-01-05,0.147762,2.001748,-0.433103,-0.4608
2018-01-06,0.319898,-0.301818,0.434096,-1.432754


Sorting by values

In [17]:
# Reorder the rows by the values in column B, smallest first.
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039
2018-01-01,-0.254672,-0.687515,0.633759,-0.416487
2018-01-05,-0.4608,-0.433103,2.001748,0.147762
2018-01-06,-1.432754,0.434096,-0.301818,0.319898
2018-01-02,-0.845215,0.447447,-0.367929,0.976476
2018-01-04,-1.920521,1.256841,1.638848,0.028886


# Selection

### Getting

In [18]:
# Selecting a single column by name returns it as a Series named 'A'.
df['A']

2018-01-01   -0.254672
2018-01-02   -0.845215
2018-01-03   -0.343061
2018-01-04   -1.920521
2018-01-05   -0.460800
2018-01-06   -1.432754
Freq: D, Name: A, dtype: float64

In [20]:
# A positional slice inside [] selects rows: here the first three.
df[:3]

Unnamed: 0,A,B,C,D
2018-01-01,-0.254672,-0.687515,0.633759,-0.416487
2018-01-02,-0.845215,0.447447,-0.367929,0.976476
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039


In [22]:
# Slicing rows by date strings; the end label 2018-01-04 is part of the result.
df['20180102':'20180104']

Unnamed: 0,A,B,C,D
2018-01-02,-0.845215,0.447447,-0.367929,0.976476
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039
2018-01-04,-1.920521,1.256841,1.638848,0.028886


### Selection by Label

In [23]:
# Row for the first timestamp in `dates`, returned as a Series keyed by column name.
df.loc[dates[0]]

A   -0.254672
B   -0.687515
C    0.633759
D   -0.416487
Name: 2018-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [24]:
# All rows (`:`), restricted to columns A and B.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2018-01-01,-0.254672,-0.687515
2018-01-02,-0.845215,0.447447
2018-01-03,-0.343061,-0.911272
2018-01-04,-1.920521,1.256841
2018-01-05,-0.4608,-0.433103
2018-01-06,-1.432754,0.434096


Showing label slicing; both endpoints are included

In [25]:
# Rows 2018-01-02 through 2018-01-04 (inclusive), columns A and B.
df.loc['20180102': '20180104', ['A', 'B']]

Unnamed: 0,A,B
2018-01-02,-0.845215,0.447447
2018-01-03,-0.343061,-0.911272
2018-01-04,-1.920521,1.256841


For getting a scalar value

In [31]:
# Single cell: row label dates[0], column label 'A'.
df.loc[dates[0], 'A']

-0.25467196077670373

For getting fast access to a scalar (equivalent to the prior method)

In [33]:
# Same cell as the .loc lookup, read through the scalar accessor .at.
df.at[dates[0], 'A']

-0.25467196077670373

### Selection by Position

In [35]:
# Row at zero-based position 3, i.e. the fourth row (2018-01-04).
df.iloc[3]

A   -1.920521
B    1.256841
C    1.638848
D    0.028886
Name: 2018-01-04 00:00:00, dtype: float64

By integer slices, acting similarly to Python slicing (the end position is excluded)

In [37]:
# Rows at positions 3 and 4, first two columns; slice ends are exclusive.
df.iloc[3:5, :2]

Unnamed: 0,A,B
2018-01-04,-1.920521,1.256841
2018-01-05,-0.4608,-0.433103


By lists of integer position locations

In [38]:
# Rows at positions 1, 2 and 4; columns at positions 0 and 2 (A and C).
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2018-01-02,-0.845215,-0.367929
2018-01-03,-0.343061,0.192334
2018-01-05,-0.4608,2.001748


For getting a value explicitly:

In [40]:
# Value at row position 1, column position 1 (2018-01-02, column B).
df.iloc[1, 1]

0.4474469303396715

# Boolean Indexing

Using a single column’s values to select data.

In [55]:
# Keep only the rows where column B is positive.
df[df['B'] > 0]

Unnamed: 0,A,B,C,D
2018-01-02,-0.845215,0.447447,-0.367929,0.976476
2018-01-04,-1.920521,1.256841,1.638848,0.028886
2018-01-06,-1.432754,0.434096,-0.301818,0.319898


Selecting values from a DataFrame where a boolean condition is met

In [57]:
# The frame keeps its shape; cells that fail the condition are shown as missing.
df[df > 0]

Unnamed: 0,A,B,C,D
2018-01-01,,,0.633759,
2018-01-02,,0.447447,,0.976476
2018-01-03,,,0.192334,
2018-01-04,,1.256841,1.638848,0.028886
2018-01-05,,,2.001748,0.147762
2018-01-06,,0.434096,,0.319898


In [62]:
# Build a separate frame with an extra label column E, leaving `df` unchanged.
df2 = df.assign(E='one one two three four three'.split())
df2

Unnamed: 0,A,B,C,D,E
2018-01-01,-0.254672,-0.687515,0.633759,-0.416487,one
2018-01-02,-0.845215,0.447447,-0.367929,0.976476,one
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039,two
2018-01-04,-1.920521,1.256841,1.638848,0.028886,three
2018-01-05,-0.4608,-0.433103,2.001748,0.147762,four
2018-01-06,-1.432754,0.434096,-0.301818,0.319898,three


Using the `isin()` method for filtering

In [63]:
# Keep only the rows whose E value is 'two' or 'four'.
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039,two
2018-01-05,-0.4608,-0.433103,2.001748,0.147762,four


# Setting

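Setting a new column automatically aligns the data by index: rows whose label is missing from the new `Series` get `NaN`, and labels that are not in the `DataFrame`'s index are dropped.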

In [81]:
# Integers 1..6 labelled with six daily timestamps starting on 2018-01-02.
s1 = pd.Series(
    data=list(range(1, 7)),
    index=pd.date_range(start='20180102', periods=6),
)
s1

2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
2018-01-06    5
2018-01-07    6
Freq: D, dtype: int64

In [85]:
# Assigning a Series as a column matches values to rows by index label:
# 2018-01-01 has no entry in s1 so F is missing there, and s1's 2018-01-07
# entry has no matching row so it does not appear.
# NOTE(review): the stored output shows A == 0.0 for 2018-01-01, which no
# visible cell assigns — presumably this cell was re-run after a later
# assignment; confirm by re-executing from a fresh kernel.
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2018-01-01,0.0,-0.687515,0.633759,-0.416487,
2018-01-02,-0.845215,0.447447,-0.367929,0.976476,1.0
2018-01-03,-0.343061,-0.911272,0.192334,-1.042039,2.0
2018-01-04,-1.920521,1.256841,1.638848,0.028886,3.0
2018-01-05,-0.4608,-0.433103,2.001748,0.147762,4.0
2018-01-06,-1.432754,0.434096,-0.301818,0.319898,5.0
