# 3.0 简介

知识点：数据整理是将原始数据转换成整洁、组织合理的形式的过程。最常用数据结构是DataFrame（表格状，用行和列表示数据）。
示例：泰坦尼克号乘客数据

# 3.1 创建一个数据帧

In [4]:
# 加载库
import pandas as pd

# 创建数据帧
dataframe = pd.DataFrame()

# 增加列
dataframe['Name'] = ['Jacky Jackson', 'Steven Stevenson']
dataframe['Age'] = [38, 25]
dataframe['Driver'] = [True, False]

# 查看数据帧
dataframe

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


创建后追加行

In [6]:
# 创建一行（转为DataFrame格式）
new_person = pd.DataFrame([['Molly Mooney', 40, True]],
                          columns=['Name', 'Age', 'Driver'])

# 用 pd.concat 合并
dataframe = pd.concat([dataframe, new_person], ignore_index=True)

# 3.2 描述数据

In [7]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 查看前两行数据
dataframe.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


查看维度和描述性统计量

In [8]:
# 查看维数
dataframe.shape  # (891, 15)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [9]:
# 查看描述性统计量
dataframe.describe()
# 输出: count, mean, std, min, 25%, 50%, 75%, max 等统计量

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


讨论：
head()：查看前几行（默认5行）
tail()：查看最后几行
describe()：提供数值列的基本描述性统计量

# 3.3 浏览数据帧

In [10]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 选择第一行
dataframe.iloc[0]

survived                 0
pclass                   3
sex                   male
age                   22.0
sibsp                    1
parch                    0
fare                  7.25
embarked                 S
class                Third
who                    man
adult_male            True
deck                   NaN
embark_town    Southampton
alive                   no
alone                False
Name: 0, dtype: object

选择多行

In [11]:
# 选择三行（第2-4行）
dataframe.iloc[1:4]

# 获取到第4行为止的所有行
dataframe.iloc[:4]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False


使用标签索引

In [12]:
# 选择第一行
dataframe.loc[0]

# 选择多行
dataframe.loc[0:2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


讨论：
iloc：按整数位置索引
loc：按标签索引
索引可以是字符串或自定义数字

# 3.4 根据条件语句来选择行

In [13]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 展示Sex列是female的前两行
dataframe[dataframe['sex'] == 'female'].head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


多个条件

In [14]:
# 筛选出所有年龄大于或等于65岁的女性乘客
dataframe[(dataframe['sex'] == 'female') & (dataframe['age'] >= 65)]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone


# 3.5替换值

In [22]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 替换列中所有的"female"
dataframe['sex'].replace("female", "Woman").head(2)

0     male
1    Woman
Name: sex, dtype: object

替换多个值

In [23]:
# 用"Woman"和"Man"分别替换"female"和"male"
dataframe['sex'].replace(["female", "male"], ["Woman", "Man"]).head(5)

0      Man
1    Woman
2    Woman
3    Woman
4      Man
Name: sex, dtype: object

替换整列中的值

In [24]:
# 替换sex列的值，并查看两行数据
dataframe['sex'] = dataframe['sex'].replace(1, "One")
dataframe.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


使用正则表达式

In [26]:
# 将"First"类标签替换
dataframe['class'] = dataframe['class'].cat.rename_categories({'First': '1st'})
dataframe['class'].head(2)

0    Third
1      1st
Name: class, dtype: category
Categories (3, object): ['1st', 'Second', 'Third']

# 3.6 重命名列

In [27]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 重命名列，查看两行数据
dataframe.rename(columns={'pclass': 'Passenger Class'}).head(2)

Unnamed: 0,survived,Passenger Class,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


讨论：rename方法灵活，可批量重命名列。

# 3.7 计算最小值、最大值、总和、平均值与计数值

In [28]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 计算描述统计量
print('Maximum:', dataframe['age'].max())
print('Minimum:', dataframe['age'].min())
print('Mean:', dataframe['age'].mean())
print('Sum:', dataframe['age'].sum())
print('Count:', dataframe['age'].count())

Maximum: 80.0
Minimum: 0.42
Mean: 29.69911764705882
Sum: 21205.17
Count: 714


对整个数据帧应用统计方法

In [29]:
# 查看计数
dataframe.count()
# 输出每列的非空值数量

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

其他描述性统计量：
var()：方差
std()：标准差
kurt()：峰态
skew()：偏态
sem()：平均值标准误差
mode()：众数
median()：中位数

# 3.8 寻找唯一值

In [30]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 筛选出唯一值
dataframe['sex'].unique()
# 输出: array(['male', 'female'], dtype=object)

array(['male', 'female'], dtype=object)

统计唯一值出现的次数

In [31]:
# 查看计数
dataframe['sex'].value_counts()
# 输出:
# male      577
# female    314
# Name: sex, dtype: int64

sex
male      577
female    314
Name: count, dtype: int64

处理分类数据问题

In [32]:
# 查看class列的计数
dataframe['class'].value_counts()
# 输出:
# Third    491
# First    216
# Second   184
# Name: class, dtype: int64

class
Third     491
First     216
Second    184
Name: count, dtype: int64

统计唯一值的数量

In [33]:
# 查看唯一值的个数
dataframe['class'].nunique()  # 3

3

# 3.9 处理缺失值

In [34]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# isnull和notnull返回布尔值
dataframe[dataframe['age'].isnull()].head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
17,1,2,male,,0,0,13.0,S,Second,man,True,,Southampton,yes,True


使用numpy中的NaN

In [35]:
# 加载NumPy
import numpy as np

# 用NaN替换值
dataframe['sex'] = dataframe['sex'].replace('male', np.nan)

加载时指定缺失值

In [36]:
# 加载数据，设置缺失值
dataframe = sns.load_dataset('titanic')
# seaborn自带数据集通常已处理好缺失值

# 3.10 删除一列

In [37]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 删除列
dataframe.drop("age", axis=1).head(2)

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


删除多列

In [39]:
# 删除多列
dataframe.drop(['age', 'sex'], axis=1).head(2)

Unnamed: 0,survived,pclass,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


按列下标删除

In [40]:
# 删除第二列
dataframe.drop(dataframe.columns[1], axis=1).head(2)

Unnamed: 0,survived,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


# 3.11 删除1行

In [42]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 删除一行，查看输出结果的前两行
dataframe[dataframe['sex'] != 'female'].head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


按索引删除

In [43]:
# 删除一行，查看输出结果的前两行
dataframe[dataframe.index != 0].head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


讨论：推荐将数据帧视为不可变对象，使用布尔条件筛选而非直接修改。

# 3.12 删除重复行

In [44]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 删除重复行
dataframe.drop_duplicates().head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


根据指定列删除

In [45]:
# 删除sex列的重复行（保留第一个）
dataframe.drop_duplicates(subset=['sex'])

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


保留最后出现的行

In [46]:
# 删除重复行（保留最后出现的）
dataframe.drop_duplicates(subset=['sex'], keep='last')

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


讨论：默认keep='first'保留第一次出现的重复行。

# 3.13 根据值对行进行分组

In [51]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')
# 直接指定要计算平均值的数值列
dataframe.groupby('sex')[['survived', 'age', 'fare']].mean()

Unnamed: 0_level_0,survived,age,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.742038,27.915709,44.479818
male,0.188908,30.726645,25.523893


多列分组和统计

In [53]:
# 按行分组，计算行数
dataframe.groupby('survived')['sex'].count()

survived
0    549
1    342
Name: sex, dtype: int64

In [54]:
# 多列分组，计算平均年龄
dataframe.groupby(['sex', 'survived'])['age'].mean()

sex     survived
female  0           25.046875
        1           28.847716
male    0           31.618056
        1           27.276022
Name: age, dtype: float64

# 3.14 按时间段进行分组

In [55]:
# 加载库
import pandas as pd
import numpy as np

# 创建日期范围
time_index = pd.date_range('06/06/2017', periods=100000, freq='30s')

# 创建数据帧
dataframe = pd.DataFrame(index=time_index)

# 创建一列随机变量
dataframe['Sale_Amount'] = np.random.randint(1, 10, 100000)

# 按周对行分组，计算每一周的总和
dataframe.resample('W').sum()
# 输出:
#             Sale_Amount
# 2017-06-11        86423
# 2017-06-18       101045
# 2017-06-25       100867
# 2017-07-02       100894
# 2017-07-09       100438
# 2017-07-16        10297

Unnamed: 0,Sale_Amount
2017-06-11,86420
2017-06-18,100638
2017-06-25,101196
2017-07-02,100562
2017-07-09,100344
2017-07-16,10387


按不同时间段分组

In [56]:
# 按两周分组，计算平均值
dataframe.resample('2W').mean()

# 按月分组，计算行数
dataframe.resample('M').count()

  dataframe.resample('M').count()


Unnamed: 0,Sale_Amount
2017-06-30,72000
2017-07-31,28000


调整标签

In [57]:
# 按月分组，计算行数（使用左标签）
dataframe.resample('M', label='left').count()

  dataframe.resample('M', label='left').count()


Unnamed: 0,Sale_Amount
2017-05-31,72000
2017-06-30,28000


讨论：
数据帧的索引必须是datetime类型
时间偏移别名：W（周），2W（两周），M（月）
label参数控制使用左边界还是右边界作为标签

# 3.15 遍历一个列的数据

In [61]:
# 导入所需库
import pandas as pd

# 加载 Kaggle 完整泰坦尼克数据集（无需手动下载，直接通过链接加载）
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
dataframe = pd.read_csv(url)

# 循环打印 'Name' 列前两行的大写形式（注意列名是 Name，首字母大写）
for person in dataframe['Name'][0:2]:
    # 预期输出：注释里的姓名大写形式
    # BRAUND, MR. OWEN HARRIS
    # ALLEN, MISS. ELISABETH WALTON
    print(person.upper())

BRAUND, MR. OWEN HARRIS
CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS THAYER)


列表解析式代替方案

In [63]:
# 以大写的形式打印前两行的名字
[name.upper() for name in dataframe['Name'][0:2]]
# 输出: ['ALLEN,MISS ELISABETH WALTON', 'ALLISON,MISS HELEN LORAINE']

['BRAUND, MR. OWEN HARRIS',
 'CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS THAYER)']

# 3.16对一列的所有元素应用某个函数

In [67]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
url = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
dataframe = pd.read_csv(url)
# 创建一个函数
def uppercase(x):
    return x.upper()

# 应用函数，查看两行
dataframe['Name'].apply(uppercase)[0:2]
# 输出:
# 0    ALLEN,MISS ELISABETH WALTON
# 1    ALLISON,MISS HELEN LORAINE
# Name: name, dtype: object

0                              BRAUND, MR. OWEN HARRIS
1    CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
Name: Name, dtype: object

讨论：apply比for循环更高效，是pandas推荐的方式。


# 3.17对所有分组应用一个函数

In [68]:
# 加载库
import pandas as pd
import seaborn as sns

# 加载泰坦尼克号数据集
dataframe = sns.load_dataset('titanic')

# 对行分组，然后在每一组上应用函数
dataframe.groupby('sex').apply(lambda x: x.count())

  dataframe.groupby('sex').apply(lambda x: x.count())


Unnamed: 0_level_0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
female,314,314,314,261,314,314,314,312,314,314,314,97,312,314,314
male,577,577,577,453,577,577,577,577,577,577,577,106,577,577,577


讨论：联合使用groupby和apply可计算自定义统计量。

# 3.18连接多个数据帧

连接两个数据帧

In [69]:
# 加载库
import pandas as pd

# 创建数据帧A
data_a = {'id': ['1', '2', '3'],
          'first': ['Alex', 'Amy', 'Allen'],
          'last': ['Anderson', 'Ackerman', 'Ali']}
dataframe_a = pd.DataFrame(data_a, columns=['id', 'first', 'last'])

# 创建数据帧B
data_b = {'id': ['4', '5', '6'],
          'first': ['Billy', 'Brian', 'Bran'],
          'last': ['Bonder', 'Black', 'Balwner']}
dataframe_b = pd.DataFrame(data_b, columns=['id', 'first', 'last'])

# 沿着行的方向连接两个数据帧
pd.concat([dataframe_a, dataframe_b], axis=0)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner


在列方向上连接

In [70]:
# 在列的方向上连接两个数据帧
pd.concat([dataframe_a, dataframe_b], axis=1)

Unnamed: 0,id,first,last,id.1,first.1,last.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner


追加一行

In [75]:
# 创建一行
row = pd.Series([10, 'Chris', 'Chillon'], index=['id', 'first', 'last'])
row_df = pd.DataFrame([row])
dataframe_a = pd.concat([dataframe_a, row_df], ignore_index=True)

# 3.19合并两个数据帧

In [76]:
# 加载库
import pandas as pd

# 创建员工数据
employee_data = {'employee_id': ['1', '2', '3', '4'],
                 'name': ['Amy Jones', 'Allen Keys', 'Alice Bees', 'Tim Horton']}
dataframe_employees = pd.DataFrame(employee_data, columns=['employee_id', 'name'])

# 创建销售数据
sales_data = {'employee_id': ['3', '4', '5', '6'],
              'total_sales': [23456, 2512, 2345, 1455]}
dataframe_sales = pd.DataFrame(sales_data, columns=['employee_id', 'total_sales'])

# 合并数据帧（等值连接）
pd.merge(dataframe_employees, dataframe_sales, on='employee_id')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


外链接

In [77]:
# 合并两个数据帧（外连接）
pd.merge(dataframe_employees, dataframe_sales, on='employee_id', how='outer')

Unnamed: 0,employee_id,name,total_sales
0,1,Amy Jones,
1,2,Allen Keys,
2,3,Alice Bees,23456.0
3,4,Tim Horton,2512.0
4,5,,2345.0
5,6,,1455.0


指定左右两个列名

In [78]:
# 指定每个数据帧中的列名进行合并
pd.merge(dataframe_employees, dataframe_sales, left_on='employee_id', right_on='employee_id')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


连接类型说明：
Inner（默认）：只返回左右数据帧匹配的行
Outer：返回所有行，用NaN填充缺失值
Left：返回左数据帧所有行，只返回右数据帧匹配的行
Right：返回右数据帧所有行，只返回左数据帧匹配的行
讨论：现实场景中数据常分散在多个查询结果或文件中，需要合并为大数据帧。


知识点总结
DataFrame基础操作：
创建：直接赋值或使用外部数据源
查看：head(), tail(), describe(), info()
索引：iloc（按位置）, loc（按标签）
数据清洗：
值替换：replace()（支持正则）
缺失值处理：isnull(), notnull(), dropna(), fillna()
列操作：drop()（删除）, rename()（重命名）
行操作：布尔条件筛选, drop_duplicates()（去重）
数据探索：
统计计算：min(), max(), mean(), sum(), count()
唯一值：unique(), value_counts(), nunique()
分组聚合：groupby() + 聚合函数
高级操作：
时间序列：resample()（按时间段分组）
函数应用：apply()（列级）, map()（元素级）
数据合并：concat()（连接）, merge()（合并）
迭代操作：iterrows(), itertuples()（不推荐，性能差）
最佳实践：
优先使用向量化操作（apply）而非循环
将DataFrame视为不可变对象，使用筛选而非直接修改
处理分类数据时注意检查异常值
设置随机种子保证可复现性
SQL是工业界数据提取的标准方式