In [None]:
# 注意事项

In [None]:
'''
区分copy（拷贝的数据）与view（数据的索引）的区别：
1 SettingWithCopyWarning警告: 确认自己是否要改变最开始的表格的值
2 有时可以使用.copy()直接进行拷贝
'''

In [3]:
# 以前面的 workout 为例
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 创建表格
workout_dict = {
  "calories": [420, 380, 390, 390],
  "duration": [50, 40, 45, 45],
  "type": ['run', 'walk', 'walk', 'run']
}

workout = pd.DataFrame(workout_dict)
display(workout)
print(type(workout))

Unnamed: 0,calories,duration,type
0,420,50,run
1,380,40,walk
2,390,45,walk
3,390,45,run


<class 'pandas.core.frame.DataFrame'>


In [5]:
workout = pd.DataFrame(workout_dict, 
                       index=['day1', 'day2', 'day3', 'day4'])
workout


Unnamed: 0,calories,duration,type
day1,420,50,run
day2,380,40,walk
day3,390,45,walk
day4,390,45,run


In [6]:
# 对workout可以提取dataframe（多维数据）或者series（一维的）
workout_sub1 = workout.loc[["day1"], :]
workout_sub1['calories'] = 1700
display(workout_sub1)
display(workout)

Unnamed: 0,calories,duration,type
day1,1700,50,run


Unnamed: 0,calories,duration,type
day1,420,50,run
day2,380,40,walk
day3,390,45,walk
day4,390,45,run


In [7]:
# 再对上面提取的dataframe（多维数据）或者series（一维的）进行赋值
# 会有易错提示
workout_sub2 = workout.loc["day1", :]
workout_sub2['calories'] = 2700
display(workout_sub2)
display(workout)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workout_sub2['calories'] = 2700


calories    2700
duration      50
type         run
Name: day1, dtype: object

Unnamed: 0,calories,duration,type
day1,420,50,run
day2,380,40,walk
day3,390,45,walk
day4,390,45,run


In [9]:
# 在series上进行重新赋值 其给了一个警告 意思是原来DataFrame中的值没有被改变
# 从本质上说 一般不能'串联'赋值表格

In [10]:
# 等价于: workout_sub2['calories'] = 2700
workout.loc["day1"]["calories"] = 2700 # 原来workout中的值没有被改变
workout

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  workout.loc["day1"]["calories"] = 2700


Unnamed: 0,calories,duration,type
day1,420,50,run
day2,380,40,walk
day3,390,45,walk
day4,390,45,run


In [14]:
# 正确的方法
# 法一
workout.at["day1", "calories"] = 2700 #.at赋值单个数据效率更高一点
workout

Unnamed: 0,calories,duration,type
day1,2700,50,run
day2,380,40,walk
day3,390,45,walk
day4,390,45,run


In [15]:
# 法二
# 使用.copy()复制DataFrame
workout_copy = workout.copy()
workout_copy.at["day1", "calories"] = 12345
workout # 原始DataFrame没有变化（法一已经更改了 这里好像提现不出来）

Unnamed: 0,calories,duration,type
day1,2700,50,run
day2,380,40,walk
day3,390,45,walk
day4,390,45,run


In [None]:
'''
缺失值的处理:
1 替换为0 (fill with 0)
2 替换为均值/中位数值 (fill with mean/median)
3 直接删除对应纪律 (delete the records with NAs)
4 根据其他列差值 (interpolate)
5 根据其他列均值回归 (linear regression)
'''

In [None]:
'''
Pandas的不足：
1 数据全部在内存之中(一般个人电脑设备的内存限制在8GB-64GB)
2 单个核心运行
3 在大数据/云计算/多核计算中，可以使用Spark/PySpark等工具进行运算
'''