这一节介绍 Series 和 DataFrame 两种数据结构

In [2]:
import pandas as pd

In [3]:
import numpy as np

一个带有默认标签的序列
Series 相当于带标签的一维数组；未指定 index 时，标签默认为从 0 开始的整数

In [4]:
# Build a Series from a plain list; no index is passed, so pandas assigns
# the default integer labels.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 1])

In [5]:
# Display the Series; the np.nan entry makes the integers show up as float64.
s

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

自定义标签的序列
pd.Series(data, index=index_list)
data：一维数据，例如列表或 NumPy 一维数组
index_list：与 data 等长的标签序列，例如 np.arange(3,6) 生成的 3、4、5

In [6]:
# Three standard-normal samples labelled 3, 4, 5 instead of the default 0, 1, 2.
# A seeded Generator is used so that Restart & Run All reproduces the same values.
s1 = pd.Series(np.random.default_rng(42).standard_normal(3), index=np.arange(3, 6))

In [7]:
# Display the Series; its labels are 3, 4, 5 as passed through index=.
s1

3   -0.925190
4   -0.203900
5   -0.314005
dtype: float64

一个日期序列

In [8]:
# Six consecutive timestamps, the first one being 2025-09-20.
dates = pd.date_range(start='20250920', periods=6)

In [9]:
# Display the DatetimeIndex produced by pd.date_range.
dates

DatetimeIndex(['2025-09-20', '2025-09-21', '2025-09-22', '2025-09-23',
               '2025-09-24', '2025-09-25'],
              dtype='datetime64[ns]', freq='D')

指定行和列的标签

In [10]:
# 6x4 frame of standard-normal samples: rows labelled by `dates`, columns by 'a'..'d'.
# A seeded Generator is used so that Restart & Run All reproduces the same values.
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((6, 4)),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)

In [11]:
# Display the frame: rows are labelled by `dates`, columns by 'a'..'d'.
df

Unnamed: 0,a,b,c,d
2025-09-20,1.402608,-0.904716,-1.404392,-0.26088
2025-09-21,-0.690798,0.787976,-0.536739,-0.585693
2025-09-22,-1.720823,0.720672,-0.382544,0.911185
2025-09-23,0.68462,2.089258,1.102352,-0.758509
2025-09-24,1.62201,0.820028,0.206039,-0.851916
2025-09-25,-0.36467,1.568196,0.940683,0.823041


新建一个未指定行、列标签的 DataFrame；此时行与列的标签默认都是从 0 开始的整数序列

In [12]:
# 3x4 frame holding 0..11; no labels are passed, so both axes get the
# default integer labels.
df1 = pd.DataFrame(data=np.arange(12).reshape((3, 4)))

In [13]:
# Display the frame; row and column labels are the default 0, 1, 2, ...
df1

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [14]:
# Build a frame from a dict: one column per key. Scalar values ('A', 'B', 'F')
# are repeated on every row; 'C', 'D' and 'E' each supply four values.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20251001"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [15]:
# Display df2; the scalar entries 'A', 'B' and 'F' repeat on each of the 4 rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2025-10-01,1.0,3,test,foo
1,1.0,2025-10-01,1.0,3,train,foo
2,1.0,2025-10-01,1.0,3,test,foo
3,1.0,2025-10-01,1.0,3,train,foo


In [16]:
# Per-column dtypes: each column of df2 keeps its own type.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [17]:
# Row labels of df2.
df2.index

Index([0, 1, 2, 3], dtype='int64')

In [18]:
# Column labels of df2.
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [19]:
# Underlying data as a NumPy array. The columns have different dtypes, so the
# result is a single object-dtype array. to_numpy() is the accessor the pandas
# documentation recommends over the older .values attribute.
df2.to_numpy()

array([[1.0, Timestamp('2025-10-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2025-10-01 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2025-10-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2025-10-01 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [21]:
# Display df2 again before summarising, transposing and sorting it.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2025-10-01,1.0,3,test,foo
1,1.0,2025-10-01,1.0,3,train,foo
2,1.0,2025-10-01,1.0,3,test,foo
3,1.0,2025-10-01,1.0,3,train,foo


In [22]:
# Summary statistics (count, mean, min, quartiles, max, std); the output
# covers columns A-D only.
df2.describe()

Unnamed: 0,A,B,C,D
count,4.0,4,4.0,4.0
mean,1.0,2025-10-01 00:00:00,1.0,3.0
min,1.0,2025-10-01 00:00:00,1.0,3.0
25%,1.0,2025-10-01 00:00:00,1.0,3.0
50%,1.0,2025-10-01 00:00:00,1.0,3.0
75%,1.0,2025-10-01 00:00:00,1.0,3.0
max,1.0,2025-10-01 00:00:00,1.0,3.0
std,0.0,,0.0,0.0


In [23]:
# Transpose: the column labels A-F become the row labels.
df2.T

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2025-10-01 00:00:00,2025-10-01 00:00:00,2025-10-01 00:00:00,2025-10-01 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [24]:
# Sort by column label in descending order (F, E, ..., A); returns a new frame.
df2.sort_index(axis="columns", ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2025-10-01,1.0
1,foo,train,3,1.0,2025-10-01,1.0
2,foo,test,3,1.0,2025-10-01,1.0
3,foo,train,3,1.0,2025-10-01,1.0


In [26]:
# Sort by row label in descending order (3, 2, 1, 0); returns a new frame.
df2.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2025-10-01,1.0,3,train,foo
2,1.0,2025-10-01,1.0,3,test,foo
1,1.0,2025-10-01,1.0,3,train,foo
0,1.0,2025-10-01,1.0,3,test,foo


In [27]:
# Sort the rows by the values in column 'E'; returns a new frame.
df2.sort_values("E")

Unnamed: 0,A,B,C,D,E,F
0,1.0,2025-10-01,1.0,3,test,foo
2,1.0,2025-10-01,1.0,3,test,foo
1,1.0,2025-10-01,1.0,3,train,foo
3,1.0,2025-10-01,1.0,3,train,foo
