# <span style="color:white">pandas是python数据分析核心库，充当数据读取、清洗、分析、统计、输出的高效工具</span>

## 1. Series
### 1.1 Series的创建

In [63]:
import numpy as np
import pandas as pd

# Shared inputs for the three construction variants below.
values = [11, 12, 13, 14, 15]
labels = ['A', 'B', 'C', 'D', 'E']
divider = "----------------------"

# Plain construction: no index given, so a default integer index is used.
s = pd.Series(values)
print(type(s), s)
print(divider)

# Custom index labels and an explicit element dtype.
s = pd.Series(data=values, index=labels, dtype=float)
print(s)
print(divider)

# Custom index labels plus a name for the Series.
s = pd.Series(data=values, index=labels, name='sample')
print(s)

<class 'pandas.core.series.Series'> 0    11
1    12
2    13
3    14
4    15
dtype: int64
----------------------
A    11.0
B    12.0
C    13.0
D    14.0
E    15.0
dtype: float64
----------------------
A    11
B    12
C    13
D    14
E    15
Name: sample, dtype: int64


In [64]:
# Build from a dict: the keys become index labels, the values become data.
mapping = {"A": 11, "B": 12}
s = pd.Series(data=mapping)
print(s)
print("----------------------")

# Build from an existing Series; `index` picks which labels are kept.
s = pd.Series(data=s, index=['A'])
print(s)

A    11
B    12
dtype: int64
----------------------
A    11
dtype: int64


### 1.2 Series的属性

In [65]:

s = pd.Series([1, 2, 3, 4, 5], index=list('abcde'))
# The Series itself, then its raw values and its index object.
print(s, s.values, s.index, sep='\n')

s.name = 'sample'
# Shape, number of dimensions and name of the Series.
print(s.shape, s.ndim, s.name)

rule = '==========================='
print(rule)
# .loc selects by label (the slice includes the end label 'c');
# .iloc selects by position (the end position 3 is excluded).
print(s.loc['a':'c'], s.iloc[1:3], sep='\n')
print(rule)
# .at / .iat read a single element by label / by position.
print(s.at['a'], s.iat[2], sep='\n')

a    1
b    2
c    3
d    4
e    5
dtype: int64
[1 2 3 4 5]
Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
(5,) 1 sample
a    1
b    2
c    3
Name: sample, dtype: int64
b    2
c    3
Name: sample, dtype: int64
1
3


In [66]:
# Element access.
# NOTE: assumes `s` is the string-labelled Series built in the previous cell.
# Positional access goes through .iloc: using a bare integer key (s[1]) on a
# label-indexed Series is deprecated in pandas 2.x.
print(s.iloc[1])  # by position
print(s['a'])  # by label
print(s[(s > 1) & (s < 4)])  # boolean mask: values strictly between 1 and 4
s['f'] = 6  # assigning to a label that does not exist yet adds a new element
print(s.head(3))  # first n rows (default 5)
print(s.tail(3))  # last n rows (default 5)

2
1
b    2
c    3
Name: sample, dtype: int64
a    1
b    2
c    3
Name: sample, dtype: int64
d    4
e    5
f    6
Name: sample, dtype: int64


### 1.3 Series常用方法

In [67]:
import numpy as np

# Both np.nan and None are stored as missing values (NaN) in this float Series.
# np.nan is used rather than the np.NaN alias, which was removed in NumPy 2.0.
s = pd.Series([10, 2, 7, np.nan, None, 3],
              index=['a', 'b', 'c', 'd', 'e', 'f'],
              name='data')

# The statistics below skip missing values: only the 4 present values count.
print(s.describe())  # summary statistics (count, mean, std, min, quartiles, max)
print(s.mean())
print(s.median())
print(s.sum())
print(s.std())
print(s.var())
print(s.min())
print(s.max())
print("--------------------")

count     4.000000
mean      5.500000
std       3.696846
min       2.000000
25%       2.750000
50%       5.000000
75%       7.750000
max      10.000000
Name: data, dtype: float64
5.5
5.0
22.0
3.696845502136472
13.666666666666666
2.0
10.0
--------------------


In [68]:
# The index is available both as an attribute and through the keys() method.
print(s.index, s.keys(), sep='\n')
# Element-wise check: True where the value is missing.
print(s.isna())
# Element-wise check: True where the value is one of the given candidates.
print(s.isin([1, 2, 3]))

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')
a    False
b    False
c    False
d     True
e     True
f    False
Name: data, dtype: bool
a    False
b     True
c    False
d    False
e    False
f     True
Name: data, dtype: bool


In [69]:
# Ascending sort by value; missing values are not dropped, they are placed last.
print(s.sort_values())
# 75th percentile.
print(s.quantile(q=0.75))

# Add one more element under the new label 'g'.
s['g'] = 2

# Most frequent value(s), returned as a Series.
print(s.mode())
# Number of occurrences of each distinct value.
print(s.value_counts())
# Drop repeated values, keeping the first occurrence; the result is a Series.
print(s.drop_duplicates())
# Distinct values as an array (not a list or a Series).
print(s.unique())
# Number of distinct values.
print(s.nunique())

b     2.0
f     3.0
c     7.0
a    10.0
d     NaN
e     NaN
Name: data, dtype: float64
7.75
0    2.0
dtype: float64
2.0     2
10.0    1
7.0     1
3.0     1
Name: data, dtype: int64
a    10.0
b     2.0
c     7.0
d     NaN
f     3.0
Name: data, dtype: float64
[10.  2.  7. nan  3.]
4


In [70]:
by_label = s.sort_index()   # ordered by index label
by_value = s.sort_values()  # ordered by element value
print(by_label, by_value, sep='\n')

a    10.0
b     2.0
c     7.0
d     NaN
e     NaN
f     3.0
g     2.0
Name: data, dtype: float64
b     2.0
g     2.0
f     3.0
c     7.0
a    10.0
d     NaN
e     NaN
Name: data, dtype: float64


### 1.4 Series案例

In [71]:
"""
创建一个包含10名学生数学成绩的Series, 成绩范围在50-100之间. 计算平均分、
最高分、最低分, 并找出高于平均分的学生人数
"""
np.random.seed(30)
scores = pd.Series(
    np.random.randint(50, 101, 10),
    index=['student' + str(i) for i in range(1, 11)])
print(scores)
print(scores.mean())
print(scores.max())
print(scores.min())
print(scores[scores < scores.mean()].count())
print(len(scores[scores < scores.mean()]))

student1     87
student2     87
student3     95
student4     95
student5     62
student6     73
student7     52
student8     67
student9     96
student10    53
dtype: int32
76.7
96
52
5
5


In [72]:
"""
给定一周每天的最高温度Series. 找出温度超过30度的天数、计算平均温度、
将温度从高到低排序、找出温度变化最大的两天
"""
temperatures = pd.Series(
    [28, 31, 29, 32, 30, 27, 33],
    index=['周一', '周二', '周三', '周四', '周五', '周六', '周日'])
print(temperatures)
print('温度高于30度的天数:', temperatures[temperatures > 30].count())
print('平均温度:', temperatures.mean())
print('高到低排序:', temperatures.sort_values(ascending=False))
temperatures.diff()  # 计算元素变化值
print('变化最大的两天:', temperatures.diff().abs()
      .sort_values(ascending=False)
      .keys()[:2]
      .tolist())

周一    28
周二    31
周三    29
周四    32
周五    30
周六    27
周日    33
dtype: int64
温度高于30度的天数: 3
平均温度: 30.0
高到低排序: 周日    33
周四    32
周二    31
周五    30
周三    29
周一    28
周六    27
dtype: int64
变化最大的两天: ['周日', '周二']


In [73]:
"""
给定某股票连续10个交易日的收盘价Series. 计算每日收益率(当日收盘价-前日收盘价-1)、
找出收益率最高和最低的日期、计算波动值(收益率的标准差)
"""
stocks = pd.Series(
    [102.3, 103.5, 105.1, 104.8, 106.2, 107.0, 106.5, 108.1, 109.3, 110.2],
    index=pd.date_range('2025-09-01', periods=10))
print(stocks)
yields = stocks.pct_change()  # 收益率计算
print('收益率:', yields)
# print('最高收益率日期:', yields[yields == yields.max()].keys())
print('最高收益率日期:', yields.idxmax())  # 最大值的标签
# print('最第收益率日期:', yields[yields == yields.min()].keys())
print('最第收益率日期:', yields.idxmin())  # 最小值的标签
print('波动率:', yields.std())

2025-09-01    102.3
2025-09-02    103.5
2025-09-03    105.1
2025-09-04    104.8
2025-09-05    106.2
2025-09-06    107.0
2025-09-07    106.5
2025-09-08    108.1
2025-09-09    109.3
2025-09-10    110.2
Freq: D, dtype: float64
收益率: 2025-09-01         NaN
2025-09-02    0.011730
2025-09-03    0.015459
2025-09-04   -0.002854
2025-09-05    0.013359
2025-09-06    0.007533
2025-09-07   -0.004673
2025-09-08    0.015023
2025-09-09    0.011101
2025-09-10    0.008234
Freq: D, dtype: float64
最高收益率日期: 2025-09-03 00:00:00
最第收益率日期: 2025-09-07 00:00:00
波动率: 0.007373623845361105


In [79]:
"""
产品过去12个月的销售量Series. 计算季度平均销量、找出销量最高的月份、
计算月环比增长率(与上个月比较)、找出连续增长超过两个月的月份
"""
sales = pd.Series(
    [125, 135, 145, 160, 155, 170, 180, 175, 190, 200, 210, 220],
    index=pd.date_range('2025-09-01', periods=12, freq='MS'))
print(sales)
print('季度平均销量:', sales.resample('QS').mean())  # 重采样
print('销量最高月份:', sales.idxmax())
print('月环比增长率:', sales.pct_change())
temp = sales.pct_change() > 0  #  月环比是否增长
print('连续增长超过两个月的月份:',
      temp[temp.rolling(3).sum() == 3]  # 滚动窗口
      .keys()
      .tolist())

2025-09-01    125
2025-10-01    135
2025-11-01    145
2025-12-01    160
2026-01-01    155
2026-02-01    170
2026-03-01    180
2026-04-01    175
2026-05-01    190
2026-06-01    200
2026-07-01    210
2026-08-01    220
Freq: MS, dtype: int64
季度平均销量: 2025-07-01    125.000000
2025-10-01    146.666667
2026-01-01    168.333333
2026-04-01    188.333333
2026-07-01    215.000000
Freq: QS-JAN, dtype: float64
销量最高月份: 2026-08-01 00:00:00
月环比增长率: 2025-09-01         NaN
2025-10-01    0.080000
2025-11-01    0.074074
2025-12-01    0.103448
2026-01-01   -0.031250
2026-02-01    0.096774
2026-03-01    0.058824
2026-04-01   -0.027778
2026-05-01    0.085714
2026-06-01    0.052632
2026-07-01    0.050000
2026-08-01    0.047619
Freq: MS, dtype: float64
连续增长超过两个月的月份: [Timestamp('2025-12-01 00:00:00'), Timestamp('2026-07-01 00:00:00'), Timestamp('2026-08-01 00:00:00')]


In [99]:
"""
商店每小时销售额Series. 按天重采样计算每日总销售额、计算每天营业时间(08:00-22:00)
和非营业时间的销售额比例、找出销售额最高的三个小时
"""
np.random.seed(30)
hourly_sales = pd.Series(
    np.random.randint(0, 100, 24),
    index=pd.date_range('2025-09-01', periods=24, freq='H'))
print(hourly_sales)
print('每日总销售额:', hourly_sales.resample('D').sum())

# 筛选一段时间内的Series
# print(hourly_sales.between_time('08:00', '22:00'))
# business_sales = hourly_sales[(hourly_sales.index.hour >= 8) & (hourly_sales.index.hour <= 22)]
business_period = (hourly_sales.index.hour >= 8) & (hourly_sales.index.hour <= 22)
business_sales = hourly_sales[business_period]

# 删掉对应标签
# non_business_sales = hourly_sales.drop(business_sales.index)
non_business_sales = hourly_sales[~business_period]

print('营业时间和非营业时间销售额比例:',
      business_sales.sum() / non_business_sales.sum())
print('销售额最高的三个小时:', hourly_sales.nlargest(3).keys().tolist())



2025-09-01 00:00:00    37
2025-09-01 01:00:00    37
2025-09-01 02:00:00    45
2025-09-01 03:00:00    45
2025-09-01 04:00:00    12
2025-09-01 05:00:00    23
2025-09-01 06:00:00     2
2025-09-01 07:00:00    53
2025-09-01 08:00:00    17
2025-09-01 09:00:00    46
2025-09-01 10:00:00     3
2025-09-01 11:00:00    41
2025-09-01 12:00:00     7
2025-09-01 13:00:00    65
2025-09-01 14:00:00    49
2025-09-01 15:00:00    45
2025-09-01 16:00:00    61
2025-09-01 17:00:00    35
2025-09-01 18:00:00    18
2025-09-01 19:00:00    18
2025-09-01 20:00:00    76
2025-09-01 21:00:00    16
2025-09-01 22:00:00     6
2025-09-01 23:00:00    62
Freq: H, dtype: int32
每日总销售额: 2025-09-01    819
Freq: D, dtype: int32
营业时间和非营业时间销售额比例: 1.5917721518987342
销售额最高的三个小时: [Timestamp('2025-09-01 20:00:00'), Timestamp('2025-09-01 13:00:00'), Timestamp('2025-09-01 23:00:00')]
