In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Create a Series
Create a `Series` by passing a list of values, letting pandas create a default integer index.

In [10]:
# No index is given, so pandas assigns the default integer index.
# The np.nan entry is a missing value; the resulting dtype is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

# Create a DataFrame
Create a `DataFrame` by passing a NumPy array, with a datetime index and labeled columns.

In [11]:
# Six consecutive calendar days starting on 2018-01-01 (daily frequency).
dates = pd.date_range(start='20180101', periods=6, freq='D')
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# 6x4 random samples indexed by `dates`; no seed is set, so the values
# differ on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)

In [13]:
df

Unnamed: 0,A,B,C,D
2018-01-01,1.695515,-0.108853,1.070644,0.875974
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329
2018-01-03,0.827825,-0.429568,0.899303,0.229025
2018-01-04,1.456598,-0.723269,-0.625745,1.878387
2018-01-05,1.275196,-0.055154,0.630329,-0.455672
2018-01-06,-0.014405,-0.864493,0.744423,-0.349221


# Create a DataFrame
Create a `DataFrame` by passing a dict of objects that can be converted into a series-like structure.

In [14]:
# Scalar entries ('A', 'B', 'F') are broadcast to the length of the
# array-like entries, so every column ends up with four rows.
df2 = pd.DataFrame(
    dict(
        A=1,
        B=pd.Timestamp('20180102'),
        C=pd.Series(1, index=list(range(4)), dtype='float32'),
        D=np.full(4, 3, dtype='int32'),
        E=pd.Categorical(['test', 'train'] * 2),
        F='foo',
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2018-01-02,1.0,3,test,foo
1,1,2018-01-02,1.0,3,train,foo
2,1,2018-01-02,1.0,3,test,foo
3,1,2018-01-02,1.0,3,train,foo


The columns have different dtypes

In [15]:
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing Data

In [16]:
df.head()

Unnamed: 0,A,B,C,D
2018-01-01,1.695515,-0.108853,1.070644,0.875974
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329
2018-01-03,0.827825,-0.429568,0.899303,0.229025
2018-01-04,1.456598,-0.723269,-0.625745,1.878387
2018-01-05,1.275196,-0.055154,0.630329,-0.455672


In [17]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.651443,-0.331299,0.404307,0.264027
std,1.143348,0.412017,0.693281,0.960487
min,-1.332071,-0.864493,-0.625745,-0.594329
25%,0.196153,-0.649844,-0.062253,-0.429059
50%,1.05151,-0.269211,0.687376,-0.060098
75%,1.411247,-0.068579,0.860583,0.714237
max,1.695515,0.193541,1.070644,1.878387


Transposing data

In [18]:
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00
A,1.695515,-1.332071,0.827825,1.456598,1.275196,-0.014405
B,-0.108853,0.193541,-0.429568,-0.723269,-0.055154,-0.864493
C,1.070644,-0.293114,0.899303,-0.625745,0.630329,0.744423
D,0.875974,-0.594329,0.229025,1.878387,-0.455672,-0.349221


Sorting by an axis

In [19]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2018-01-01,0.875974,1.070644,-0.108853,1.695515
2018-01-02,-0.594329,-0.293114,0.193541,-1.332071
2018-01-03,0.229025,0.899303,-0.429568,0.827825
2018-01-04,1.878387,-0.625745,-0.723269,1.456598
2018-01-05,-0.455672,0.630329,-0.055154,1.275196
2018-01-06,-0.349221,0.744423,-0.864493,-0.014405


Sorting by values

In [20]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2018-01-06,-0.014405,-0.864493,0.744423,-0.349221
2018-01-04,1.456598,-0.723269,-0.625745,1.878387
2018-01-03,0.827825,-0.429568,0.899303,0.229025
2018-01-01,1.695515,-0.108853,1.070644,0.875974
2018-01-05,1.275196,-0.055154,0.630329,-0.455672
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329


# Selection

### Getting

In [21]:
df['A']

2018-01-01    1.695515
2018-01-02   -1.332071
2018-01-03    0.827825
2018-01-04    1.456598
2018-01-05    1.275196
2018-01-06   -0.014405
Freq: D, Name: A, dtype: float64

In [22]:
# First three rows, selected by integer position.
df.iloc[0:3]

Unnamed: 0,A,B,C,D
2018-01-01,1.695515,-0.108853,1.070644,0.875974
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329
2018-01-03,0.827825,-0.429568,0.899303,0.229025


In [23]:
# Row slice by date label; both the start and the end date are included.
df.loc['20180102':'20180104']

Unnamed: 0,A,B,C,D
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329
2018-01-03,0.827825,-0.429568,0.899303,0.229025
2018-01-04,1.456598,-0.723269,-0.625745,1.878387


### Selection by Label

In [24]:
df.loc[dates[0]]

A    1.695515
B   -0.108853
C    1.070644
D    0.875974
Name: 2018-01-01 00:00:00, dtype: float64

Selecting on a multi-axis by label

In [25]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2018-01-01,1.695515,-0.108853
2018-01-02,-1.332071,0.193541
2018-01-03,0.827825,-0.429568
2018-01-04,1.456598,-0.723269
2018-01-05,1.275196,-0.055154
2018-01-06,-0.014405,-0.864493


Showing label slicing; both endpoints are included.

In [26]:
# Rows 2018-01-02 through 2018-01-04 (inclusive), restricted to columns A and B.
df.loc['20180102':'20180104', ['A', 'B']]

Unnamed: 0,A,B
2018-01-02,-1.332071,0.193541
2018-01-03,0.827825,-0.429568
2018-01-04,1.456598,-0.723269


For getting a scalar value

In [27]:
df.loc[dates[0], 'A']

1.69551530048718

For getting fast access to a scalar (equivalent to the prior method)

In [28]:
df.at[dates[0], 'A']

1.69551530048718

### Selection by Position

In [29]:
df.iloc[3]

A    1.456598
B   -0.723269
C   -0.625745
D    1.878387
Name: 2018-01-04 00:00:00, dtype: float64

By integer slices, acting similarly to NumPy/Python slicing (the end position is excluded)

In [30]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2018-01-04,1.456598,-0.723269
2018-01-05,1.275196,-0.055154


By lists of integer position locations

In [31]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2018-01-02,-1.332071,-0.293114
2018-01-03,0.827825,0.899303
2018-01-05,1.275196,0.630329


For getting a value explicitly:

In [32]:
df.iloc[1, 1]

0.1935412264270998

# Boolean Indexing

Using a single column’s values to select data.

In [33]:
# Keep only the rows whose value in column B is positive.
df[df['B'] > 0]

Unnamed: 0,A,B,C,D
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329


Selecting values from a DataFrame where a boolean condition is met

In [34]:
df[df > 0]

Unnamed: 0,A,B,C,D
2018-01-01,1.695515,,1.070644,0.875974
2018-01-02,,0.193541,,
2018-01-03,0.827825,,0.899303,0.229025
2018-01-04,1.456598,,,1.878387
2018-01-05,1.275196,,0.630329,
2018-01-06,,,0.744423,


In [35]:
# Work on a copy so that adding column E leaves `df` unchanged.
df2 = df.copy()
df2['E'] = ['one', 'one', 'two', 'three', 'four', 'three']
df2

Unnamed: 0,A,B,C,D,E
2018-01-01,1.695515,-0.108853,1.070644,0.875974,one
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329,one
2018-01-03,0.827825,-0.429568,0.899303,0.229025,two
2018-01-04,1.456598,-0.723269,-0.625745,1.878387,three
2018-01-05,1.275196,-0.055154,0.630329,-0.455672,four
2018-01-06,-0.014405,-0.864493,0.744423,-0.349221,three


Using the `isin()` method for filtering

In [36]:
df2[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2018-01-03,0.827825,-0.429568,0.899303,0.229025,two
2018-01-05,1.275196,-0.055154,0.630329,-0.455672,four


# Setting

Setting a new column automatically aligns the data by the indexes

In [37]:
# Daily index running from 2018-01-02 to 2018-01-07, one integer per day.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20180102', periods=6, freq='D'),
)
s1

2018-01-02    1
2018-01-03    2
2018-01-04    3
2018-01-05    4
2018-01-06    5
2018-01-07    6
Freq: D, dtype: int64

In [38]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2018-01-01,1.695515,-0.108853,1.070644,0.875974,
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329,1.0
2018-01-03,0.827825,-0.429568,0.899303,0.229025,2.0
2018-01-04,1.456598,-0.723269,-0.625745,1.878387,3.0
2018-01-05,1.275196,-0.055154,0.630329,-0.455672,4.0
2018-01-06,-0.014405,-0.864493,0.744423,-0.349221,5.0


Setting values by label

In [43]:
df.at[dates[0], 'A'] = 0

Set values by position

In [44]:
df.iat[0, 1] = 0

In [45]:
df

Unnamed: 0,A,B,C,D,F
2018-01-01,0.0,0.0,1.070644,0.875974,
2018-01-02,-1.332071,0.193541,-0.293114,-0.594329,1.0
2018-01-03,0.827825,-0.429568,0.899303,0.229025,2.0
2018-01-04,1.456598,-0.723269,-0.625745,1.878387,3.0
2018-01-05,1.275196,-0.055154,0.630329,-0.455672,4.0
2018-01-06,-0.014405,-0.864493,0.744423,-0.349221,5.0


# Stats

In [46]:
df.mean()

A    0.368857
B   -0.313157
C    0.404307
D    0.264027
F    3.000000
dtype: float64

Same operation on the other axis:

In [47]:
# axis=1 averages across the columns, giving one mean per row.
df.mean(axis=1)

2018-01-01    0.486654
2018-01-02   -0.205195
2018-01-03    0.705317
2018-01-04    0.997194
2018-01-05    1.078940
2018-01-06    0.903261
Freq: D, dtype: float64

# Append

Append rows to a `DataFrame`.

In [49]:
# New 8x4 frame of random values with the default integer index.
# Note: this rebinds the name `df` used in the earlier sections.
df = pd.DataFrame(data=np.random.randn(8, 4), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,0.46025,0.530873,0.386705,-1.806661
1,-0.185253,-0.884916,-1.746256,-1.270412
2,-0.319003,-0.805226,-1.887339,-0.248177
3,-0.805977,-0.492576,-1.634447,0.378433
4,-2.075965,2.25997,-0.573028,0.010059
5,1.247687,1.287659,-0.505102,-0.593215
6,0.338704,0.092339,-0.923662,1.553639
7,-0.070897,0.590112,-1.275117,0.77763


In [50]:
s = df.iloc[3]

In [51]:
s

A   -0.805977
B   -0.492576
C   -1.634447
D    0.378433
Name: 3, dtype: float64

In [52]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement.
# The Series `s` is turned into a one-row frame (to_frame().T) so that its
# labels A..D line up with the columns of `df`; ignore_index=True renumbers
# the rows of the result from 0, matching the original append call.
pd.concat([df, s.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,0.46025,0.530873,0.386705,-1.806661
1,-0.185253,-0.884916,-1.746256,-1.270412
2,-0.319003,-0.805226,-1.887339,-0.248177
3,-0.805977,-0.492576,-1.634447,0.378433
4,-2.075965,2.25997,-0.573028,0.010059
5,1.247687,1.287659,-0.505102,-0.593215
6,0.338704,0.092339,-0.923662,1.553639
7,-0.070897,0.590112,-1.275117,0.77763
8,-0.805977,-0.492576,-1.634447,0.378433


# Grouping

In [53]:
# Two string key columns (A, B) and two random numeric columns (C, D).
df = pd.DataFrame(
    {
        'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
        'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
        'C': np.random.randn(8),
        'D': np.random.randn(8),
    }
)

In [54]:
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.632324,-0.352763
1,bar,one,-1.209649,0.195765
2,foo,two,-0.073214,-1.641542
3,bar,three,0.561152,1.24731
4,foo,two,-1.973072,-0.9334
5,bar,two,0.063183,-1.261542
6,foo,one,0.09334,1.141003
7,foo,three,0.299553,0.578927


Grouping and then applying the sum() function to the resulting groups.

In [57]:
# Sum only the numeric columns C and D within each group of A.
# Since pandas 2.0, GroupBy.sum() no longer drops non-numeric columns by
# default, so a bare .sum() would also "sum" (concatenate) the strings in
# column B. Selecting the columns explicitly gives the same C/D result on
# every pandas version.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.585314,0.181533
foo,-2.285717,-1.207775
