In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
s = pd.Series([1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [3]:
dates = pd.date_range('20210301', periods=6, freq='d')
dates

DatetimeIndex(['2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04',
               '2021-03-05', '2021-03-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2021-03-01,-1.183416,1.033259,0.326932,1.725981
2021-03-02,0.116951,0.756386,0.814913,-0.749633
2021-03-03,-0.254456,-1.669079,0.550803,-0.470475
2021-03-04,0.108437,-0.527687,-1.025504,1.07025
2021-03-05,0.915446,-1.813385,-0.34992,1.737633
2021-03-06,0.092384,1.606678,0.335138,0.301319


In [5]:
df = pd.DataFrame({
    'A': 1.,
    'B': pd.date_range('20210301', periods=4, freq='d'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo'
})
df
# 数据自动填充

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
1,1.0,2021-03-02,1.0,3,train,foo
2,1.0,2021-03-03,1.0,3,test,foo
3,1.0,2021-03-04,1.0,3,train,foo


In [6]:
df.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
# 查看最前面三行
df.head(3)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
1,1.0,2021-03-02,1.0,3,train,foo
2,1.0,2021-03-03,1.0,3,test,foo


In [8]:
# 列索引
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [9]:
# 行索引
df.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [10]:
array = df.to_numpy()
array

array([[1.0, Timestamp('2021-03-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-03-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2021-03-03 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2021-03-04 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [11]:
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
1,1.0,2021-03-02,1.0,3,train,foo
2,1.0,2021-03-03,1.0,3,test,foo
3,1.0,2021-03-04,1.0,3,train,foo


In [12]:
df.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2021-03-01 00:00:00,2021-03-02 00:00:00,2021-03-03 00:00:00,2021-03-04 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [13]:
# 按照B列的值进行排序，支持lambda
df.sort_values(by='B', ascending=True)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
1,1.0,2021-03-02,1.0,3,train,foo
2,1.0,2021-03-03,1.0,3,test,foo
3,1.0,2021-03-04,1.0,3,train,foo


In [14]:
df['A']

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [15]:
df[0:2]

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
1,1.0,2021-03-02,1.0,3,train,foo


In [20]:
df.loc[2: 3]

Unnamed: 0,A,B,C,D,E,F
2,1.0,2021-03-03,1.0,3,test,foo
3,1.0,2021-03-04,1.0,3,train,foo


In [31]:
df2 = pd.DataFrame(np.random.randn(6, 4), columns=list('ABCD'), index=pd.date_range('20210301', periods=6, freq='D'))
df2

Unnamed: 0,A,B,C,D
2021-03-01,1.001624,1.236549,-1.137904,0.128509
2021-03-02,1.94723,-0.157445,1.147431,-0.659523
2021-03-03,-1.441142,0.131658,-1.644041,-0.10245
2021-03-04,1.797989,-0.24533,0.006412,-0.378509
2021-03-05,0.211846,-0.680002,-0.745484,0.30881
2021-03-06,-0.264111,-1.045884,1.056168,-0.681433


In [32]:
# 按照标签提取，可以用在行名称不是int的时候
df2.loc['20210302': '20210304']

Unnamed: 0,A,B,C,D
2021-03-02,1.94723,-0.157445,1.147431,-0.659523
2021-03-03,-1.441142,0.131658,-1.644041,-0.10245
2021-03-04,1.797989,-0.24533,0.006412,-0.378509


In [35]:
df2.loc['20210301':'20210304', ['A', 'B']]

Unnamed: 0,A,B
2021-03-01,1.001624,1.236549
2021-03-02,1.94723,-0.157445
2021-03-03,-1.441142,0.131658
2021-03-04,1.797989,-0.24533


In [28]:
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
1,1.0,2021-03-02,1.0,3,train,foo
2,1.0,2021-03-03,1.0,3,test,foo
3,1.0,2021-03-04,1.0,3,train,foo


In [37]:
df.iloc[0]

A                    1.0
B    2021-03-01 00:00:00
C                    1.0
D                      3
E                   test
F                    foo
Name: 0, dtype: object

In [43]:
# 类似python的切片方式
df.iloc[[0, 1, 2], [0, 4]]

Unnamed: 0,A,E
0,1.0,test
1,1.0,train
2,1.0,test


In [44]:
df

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
1,1.0,2021-03-02,1.0,3,train,foo
2,1.0,2021-03-03,1.0,3,test,foo
3,1.0,2021-03-04,1.0,3,train,foo


In [46]:
df[df.E == 'test']

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-01,1.0,3,test,foo
2,1.0,2021-03-03,1.0,3,test,foo


In [50]:
# 增加一列
df['G'] = np.random.randn(4, 1)
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2021-03-01,1.0,3,test,0.390441,0.346262
1,1.0,2021-03-02,1.0,3,train,-0.640777,0.240316
2,1.0,2021-03-03,1.0,3,test,-0.769066,-1.94518
3,1.0,2021-03-04,1.0,3,train,-0.086116,-1.980008


In [51]:
df['F'] = ['Apple', 'Orange', 'Potato', 'Carrot']
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2021-03-01,1.0,3,test,Apple,0.346262
1,1.0,2021-03-02,1.0,3,train,Orange,0.240316
2,1.0,2021-03-03,1.0,3,test,Potato,-1.94518
3,1.0,2021-03-04,1.0,3,train,Carrot,-1.980008


In [52]:
df[df['F'].isin(['Apple', 'Potato'])]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2021-03-01,1.0,3,test,Apple,0.346262
2,1.0,2021-03-03,1.0,3,test,Potato,-1.94518


In [53]:
df.to_excel('foo.xlsx', sheet_name='Sheet1')