# Missing Data

In [7]:
import numpy as np
import pandas as pd

# Side length of the square demo matrix.
n = 8

# Fix the legacy global NumPy seed so the drawn integers are reproducible.
np.random.seed(42)

# n x n integers in 1..9 (the upper bound 10 is exclusive).
random_matrix = np.random.randint(low=1, high=10, size=(n, n))
random_matrix

array([[7, 4, 8, 5, 7, 3, 7, 8],
       [5, 4, 8, 8, 3, 6, 5, 2],
       [8, 6, 2, 5, 1, 6, 9, 1],
       [3, 7, 4, 9, 3, 5, 3, 7],
       [5, 9, 7, 2, 4, 9, 2, 9],
       [5, 2, 4, 7, 8, 3, 1, 4],
       [2, 8, 4, 2, 6, 6, 4, 6],
       [2, 2, 4, 8, 7, 9, 8, 5]])

In [8]:
random_matrix.shape  # (rows, columns) tuple of the array

(8, 8)

In [9]:
random_matrix.size  # total number of elements (rows * columns)

64

In [14]:
np.random.choice(random_matrix.size)

# Picks one random integer from the range 0 to random_matrix.size - 1.
# No seed is set in this cell, so the value depends on the current global RNG state.

52

In [36]:
# Re-seed so the same 10 positions are picked every time this cell runs.
np.random.seed(42)

# Sample 10 distinct flat positions from the range [0, random_matrix.size - 1].
# np.random.choice defaults to replace=True (duplicates allowed);
# replace=False samples without replacement, so no position is picked twice.
index = np.random.choice(random_matrix.size, 10, replace=False)

# NaN is a floating-point value, so convert the integer matrix to float first.
# astype returns a new array, so the array created earlier is not modified.
random_matrix = random_matrix.astype(float)

# Write NaN at the sampled positions through the flat (1-D) iterator.
# .ravel() flattens to 1-D and returns a view only when possible; on a copy
# the assignment would be silently lost. .flat always writes into the array.
random_matrix.flat[index] = np.nan
random_matrix

array([[nan,  4.,  8.,  5.,  7., nan,  7.,  8.],
       [ 5.,  4.,  8.,  8., nan,  6.,  5.,  2.],
       [nan,  6.,  2.,  5.,  1.,  6.,  9.,  1.],
       [ 3., nan,  4.,  9.,  3.,  5.,  3.,  7.],
       [ 5.,  9.,  7.,  2., nan,  9.,  2.,  9.],
       [ 5.,  2.,  4.,  7., nan,  3.,  1.,  4.],
       [ 2.,  8.,  4.,  2., nan,  6.,  4.,  6.],
       [ 2.,  2., nan,  8.,  7., nan,  8.,  5.]])

In [38]:
# Wrap the array in a DataFrame; with no labels given, rows and columns get default integer labels.
scores = pd.DataFrame(data=random_matrix)
scores

Unnamed: 0,0,1,2,3,4,5,6,7
0,,4.0,8.0,5.0,7.0,,7.0,8.0
1,5.0,4.0,8.0,8.0,,6.0,5.0,2.0
2,,6.0,2.0,5.0,1.0,6.0,9.0,1.0
3,3.0,,4.0,9.0,3.0,5.0,3.0,7.0
4,5.0,9.0,7.0,2.0,,9.0,2.0,9.0
5,5.0,2.0,4.0,7.0,,3.0,1.0,4.0
6,2.0,8.0,4.0,2.0,,6.0,4.0,6.0
7,2.0,2.0,,8.0,7.0,,8.0,5.0


In [39]:
# Readable 1-based labels: one player per row, one round per column.
player_labels = [f"Player {i}" for i in range(1, n + 1)]
round_labels = [f"Round {i}" for i in range(1, n + 1)]

# Rebuild the frame from the same matrix, this time with the labels attached.
scores = pd.DataFrame(data=random_matrix, index=player_labels, columns=round_labels)
scores

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,,4.0,8.0,5.0,7.0,,7.0,8.0
Player 2,5.0,4.0,8.0,8.0,,6.0,5.0,2.0
Player 3,,6.0,2.0,5.0,1.0,6.0,9.0,1.0
Player 4,3.0,,4.0,9.0,3.0,5.0,3.0,7.0
Player 5,5.0,9.0,7.0,2.0,,9.0,2.0,9.0
Player 6,5.0,2.0,4.0,7.0,,3.0,1.0,4.0
Player 7,2.0,8.0,4.0,2.0,,6.0,4.0,6.0
Player 8,2.0,2.0,,8.0,7.0,,8.0,5.0


## Missing Data

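- None - Python's built-in null object. In a NumPy array it forces the generic `object` dtype, which is slow; pandas converts it to NaN in numeric columns.
- NaN - floating-point "Not a Number". It supports fast vectorized operations, and any arithmetic operation involving NaN results in NaN. It is the standard missing-value marker in NumPy and pandas.
- NA / pd.NA - pandas' own missing-value marker, designed to behave consistently across data types (integers, booleans, strings, ...), not only floats.  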
跨数据类型一致使用，因此这是 pandas 版本的非数字

Pandas
- isnull() / isna() - return True where a value is null
- notnull() / notna() - return True where a value is not null
- dropna() - drops every row or column that contains a null. 默认 axis=0 ，删除有 nan 的整行 row ; axis=1 ，删除有 nan 的整列 column .
- fillna() - fills null values with a given value

In [40]:
scores.isnull()  # element-wise boolean mask: True where the value is missing (NaN)

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,True,False,False,False,False,True,False,False
Player 2,False,False,False,False,True,False,False,False
Player 3,True,False,False,False,False,False,False,False
Player 4,False,True,False,False,False,False,False,False
Player 5,False,False,False,False,True,False,False,False
Player 6,False,False,False,False,True,False,False,False
Player 7,False,False,False,False,True,False,False,False
Player 8,False,False,True,False,False,True,False,False


In [41]:
scores.notnull()  # element-wise boolean mask: True where a value is present (not NaN)

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,False,True,True,True,True,False,True,True
Player 2,True,True,True,True,False,True,True,True
Player 3,False,True,True,True,True,True,True,True
Player 4,True,False,True,True,True,True,True,True
Player 5,True,True,True,True,False,True,True,True
Player 6,True,True,True,True,False,True,True,True
Player 7,True,True,True,True,False,True,True,True
Player 8,True,True,False,True,True,False,True,True


In [42]:
scores.notna()

# Same result as scores.notnull(); the two names are interchangeable.

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,False,True,True,True,True,False,True,True
Player 2,True,True,True,True,False,True,True,True
Player 3,False,True,True,True,True,True,True,True
Player 4,True,False,True,True,True,True,True,True
Player 5,True,True,True,True,False,True,True,True
Player 6,True,True,True,True,False,True,True,True
Player 7,True,True,True,True,False,True,True,True
Player 8,True,True,False,True,True,False,True,True


In [44]:
scores.dropna()   # default axis=0: drop every row that contains a NaN (here every row has one, so nothing is left)

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8


In [45]:
scores.dropna(axis=1)  # axis=1: drop every column that contains a NaN

Unnamed: 0,Round 4,Round 7,Round 8
Player 1,5.0,7.0,8.0
Player 2,8.0,5.0,2.0
Player 3,5.0,9.0,1.0
Player 4,9.0,3.0,7.0
Player 5,2.0,2.0,9.0
Player 6,7.0,1.0,4.0
Player 7,2.0,4.0,6.0
Player 8,8.0,8.0,5.0


In [48]:
scores.fillna(0)    # replace every NaN with 0

Unnamed: 0,Round 1,Round 2,Round 3,Round 4,Round 5,Round 6,Round 7,Round 8
Player 1,0.0,4.0,8.0,5.0,7.0,0.0,7.0,8.0
Player 2,5.0,4.0,8.0,8.0,0.0,6.0,5.0,2.0
Player 3,0.0,6.0,2.0,5.0,1.0,6.0,9.0,1.0
Player 4,3.0,0.0,4.0,9.0,3.0,5.0,3.0,7.0
Player 5,5.0,9.0,7.0,2.0,0.0,9.0,2.0,9.0
Player 6,5.0,2.0,4.0,7.0,0.0,3.0,1.0,4.0
Player 7,2.0,8.0,4.0,2.0,0.0,6.0,4.0,6.0
Player 8,2.0,2.0,0.0,8.0,7.0,0.0,8.0,5.0


## Missing data strategy - discuss with domain expert

- dataset size
    - small dataset - be more careful with the data; every row matters
    - large dataset with few nulls - dropping the rows that contain nulls is more often acceptable
- impute missing value: 推算 nan 值， 有时候可能是 mean()
- domain knowledge - how to fill data
    - e.g. score missing value -> 0
    - e.g. missing value in a column could be filled with mean or median
    - missing values between two points -> interpolate in between
    - regression to fill missing value
- note missing data can negatively impact:
    - data visualization
    - arithmetic computations and summary statistics
    - machine learning algorithms        
                 
- 数据集大小
    - 小数据集 - 更谨慎地处理数据
    - 大数据集且空值很少 - 删除空值的几率更高
- 估算缺失值：推算 nan 值，有时候可能是 mean()
- 领域知识 - 如何填充数据
    - 例如，将缺失值评分为 -> 0
    - 例如，列中的缺失值可以用平均值或中位数填充
    - 两点之间的缺失值 -> 在两者之间进行插值
    - 回归以填充缺失值
- 注意，缺失数据可能会产生负面影响：
    - 数据可视化
    - 算术计算和汇总统计
    - 机器学习算法
