# Pandas基本介绍

In [15]:
import pandas as pd
import numpy as np

In [19]:
# One-dimensional labelled array with six entries, one of which is missing (np.nan).
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])

In [20]:
# Display the Series: a default 0-5 integer index, values shown as float64 with one NaN.
s

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

In [21]:
# Six consecutive timestamps beginning on 2016-01-01, to be used as a row index.
dates = pd.date_range(start='20160101', periods=6)

In [22]:
# Display the DatetimeIndex: six daily timestamps (freq='D') from 2016-01-01 to 2016-01-06.
dates

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')

In [24]:
# 6x4 frame of standard-normal draws: one row per entry of `dates`, columns a-d.
# No random seed is set in the visible cells, so the values change on every run.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list("abcd"),
)

In [25]:
# Display the frame: the dates form the row index and a-d are the column labels.
df

Unnamed: 0,a,b,c,d
2016-01-01,-1.348567,1.564128,1.56153,-0.793511
2016-01-02,2.762322,0.17101,0.248115,1.742636
2016-01-03,-1.126552,0.982214,-0.406223,0.486113
2016-01-04,0.986524,0.750885,0.318843,1.15228
2016-01-05,1.417459,-0.539709,0.142173,0.09758
2016-01-06,0.202111,0.414185,0.328448,1.440292


In [26]:
# The integers 0..11 laid out as 3 rows x 4 columns; no index or column labels are supplied.
df1 = pd.DataFrame(data=np.arange(12).reshape(3, 4))

In [27]:
# Display the frame: with no labels given, rows and columns are both numbered from 0.
df1

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [28]:
# Build a frame from a dict: each key becomes a column name, and every column
# may carry its own dtype. Scalar entries (A, B, F) are repeated on every row;
# the four-element entries (C, D, E) fix the number of rows at four.
df2 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
    "D": np.full(4, 3, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
})

In [29]:
# Display the frame: the scalar entries (A, B, F) appear repeated on all four rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [33]:
# Per-column dtypes: every column of df2 keeps its own type.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [34]:
# Row labels of df2 (here the integers 0-3).
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [35]:
df2.columns    # `columns` is an attribute, not a method, so no parentheses are needed

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [36]:
# The frame's data as a NumPy array; with mixed column types the result has dtype=object.
df2.values

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [37]:
df2.describe()   # only numeric columns are summarized, so B, E and F are not shown

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [38]:
# Transpose: the column labels A-F become the row index and the rows 0-3 become columns.
df2.transpose()

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [39]:
# Order by column label, descending (F ... A); the rows keep their original order.
df2.sort_index(axis="columns", ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2013-01-02,1.0
1,foo,train,3,1.0,2013-01-02,1.0
2,foo,test,3,1.0,2013-01-02,1.0
3,foo,train,3,1.0,2013-01-02,1.0


In [40]:
# Order by row label, descending (3 ... 0); the columns keep their original order.
df2.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
0,1.0,2013-01-02,1.0,3,test,foo


In [41]:
# Order the rows by the values in column E (the "test" rows come before the "train" rows).
df2.sort_values(by=["E"])

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
2,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
3,1.0,2013-01-02,1.0,3,train,foo
