# pandas

## introduction

In [15]:
import numpy as np
import pandas as pd

# Series: a one-dimensional labelled array; np.nan marks a missing value
s = pd.Series([1, 3, 6, np.nan, 67, 4])
print(s)

# Build a DatetimeIndex of six consecutive days starting on 2017-01-01
dates = pd.date_range('20170101', periods=6)
print(dates)

# DataFrame of random values indexed by the dates above.
# A dedicated, seeded RandomState makes the printed numbers identical on
# every run and leaves NumPy's global random state untouched.
rng = np.random.RandomState(0)
df_random = pd.DataFrame(rng.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df_random)

# DataFrame from a 3x4 integer array; no labels are given, so rows and
# columns get the default 0..n-1 labels
df_range = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df_range)

# Pieces used as columns of the dict-based DataFrame, defined once so the
# frame and the stand-alone printouts below share a single definition
B = pd.Timestamp('20170101')
C = pd.Series(1, index=list(range(4)), dtype='float32')
D = np.array([3] * 4, dtype='int32')

# DataFrame from a dict: each key becomes a column; the scalar values
# ('A', 'B', 'F') are repeated on every row.
# `df` is the frame the following cells operate on.
df = pd.DataFrame({'A': 1.,
                   'B': B,
                   'C': C,
                   'D': D,
                   'E': pd.Categorical(['test', 'train', 'test', 'train']),
                   'F': 'foo'})
print(df)
print('B:', B)
print('C:', C)
print('D:', D)

0     1.0
1     3.0
2     6.0
3     NaN
4    67.0
5     4.0
dtype: float64
DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04',
               '2017-01-05', '2017-01-06'],
              dtype='datetime64[ns]', freq='D')
                   a         b         c         d
2017-01-01 -0.506652  0.208121  1.064361 -0.465262
2017-01-02  1.271673  1.658515 -1.176396  1.249311
2017-01-03 -0.991712  0.600136  0.722521 -2.312928
2017-01-04  1.459415 -0.657185  1.235445 -0.675061
2017-01-05  0.597027 -1.393711  0.195447 -0.194661
2017-01-06 -0.218653  1.360652  1.022581 -0.135605
   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11
     A          B    C  D      E    F
0  1.0 2017-01-01  1.0  3   test  foo
1  1.0 2017-01-01  1.0  3  train  foo
2  1.0 2017-01-01  1.0  3   test  foo
3  1.0 2017-01-01  1.0  3  train  foo
B: 2017-01-01 00:00:00
C: 0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32
D: [3 3 3 3]


In [23]:
# DataFrame attributes, printed one after another (a newline between each)
print(
    df.dtypes,      # data type of each column
    df.index,       # row labels
    df.columns,     # column labels
    df.values,      # the underlying data as a NumPy array
    df.describe(),  # summary statistics (numeric columns in the recorded output)
    df.T,           # transpose: rows become columns and vice versa
    sep='\n',
)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object
Int64Index([0, 1, 2, 3], dtype='int64')
Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')
[[1.0 Timestamp('2017-01-01 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2017-01-01 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2017-01-01 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2017-01-01 00:00:00') 1.0 3 'train' 'foo']]
         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0
                     0                    1                    2  \
A                    1                    1                    1   
B  2017-01-01 00:00:00  2017-01-01 00:00:00  2017-01-01 00:00:00   
C                    1                    1                    1   
D                    3                    3                    3   
E   

In [27]:
# Sorting
## Sort by index labels
### along the rows: row labels in descending order
print(df.sort_index(axis='index', ascending=False))
### along the columns: column labels in descending order
print(df.sort_index(axis='columns', ascending=False))

## Sort rows by the values of column 'E', descending
print(df.sort_values(by='E', ascending=False))

     A          B    C  D      E    F
3  1.0 2017-01-01  1.0  3  train  foo
2  1.0 2017-01-01  1.0  3   test  foo
1  1.0 2017-01-01  1.0  3  train  foo
0  1.0 2017-01-01  1.0  3   test  foo
     F      E  D    C          B    A
0  foo   test  3  1.0 2017-01-01  1.0
1  foo  train  3  1.0 2017-01-01  1.0
2  foo   test  3  1.0 2017-01-01  1.0
3  foo  train  3  1.0 2017-01-01  1.0
     A          B    C  D      E    F
3  1.0 2017-01-01  1.0  3  train  foo
1  1.0 2017-01-01  1.0  3  train  foo
2  1.0 2017-01-01  1.0  3   test  foo
0  1.0 2017-01-01  1.0  3   test  foo
