In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

In [3]:
# Render every column of a DataFrame instead of truncating wide frames
pd.set_option('display.max_columns', None)
# Render every row of a DataFrame instead of truncating long frames
pd.set_option('display.max_rows', None)

# Suppress all warning output
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Show several results from a single Jupyter notebook cell
from IPython.display import display
# Then use it like this (the bare string below is evaluated and echoed as the cell's output)
"""
python
display(a)
display(b)
"""

'\npython\ndisplay(a)\ndisplay(b)\n'

# 常用的一些pandas函数

In [5]:
# Three Series with partially overlapping labels: 'one' has no entry for 'd',
# so the assembled frame holds NaN at row 'd', column 'one'.
data = {
    'one': pd.Series([1., 2., 3.], index=list('abc')),
    'two': pd.Series([1., 2., 3., 4.], index=list('abcd')),
    'three': pd.Series([5, 6, 7, 8], index=list('abcd')),
}
df = pd.DataFrame(data)

In [6]:
df

Unnamed: 0,one,two,three
a,1.0,1.0,5
b,2.0,2.0,6
c,3.0,3.0,7
d,,4.0,8


In [7]:
# Emit several outputs from one cell: display() renders df explicitly,
# and the trailing bare expression renders it a second time as the cell result
display(df)
df

Unnamed: 0,one,two,three
a,1.0,1.0,5
b,2.0,2.0,6
c,3.0,3.0,7
d,,4.0,8


Unnamed: 0,one,two,three
a,1.0,1.0,5
b,2.0,2.0,6
c,3.0,3.0,7
d,,4.0,8


+ 检查缺失值位置

In [8]:
df.isnull()

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,True,False,False


In [9]:
df.isnull().sum()

one      1
two      0
three    0
dtype: int64

+ 判断DF中是否含有某些元素

In [10]:
df.isin([6,7])

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,True
c,False,False,True
d,False,False,False


In [11]:
df['three'].isin([6,7])

a    False
b     True
c     True
d    False
Name: three, dtype: bool

In [12]:
# Keep only the rows whose 'three' value is one of [6, 7]
df.loc[df['three'].isin([6,7])]

Unnamed: 0,one,two,three
b,2.0,2.0,6
c,3.0,3.0,7


+ 排序相关

In [13]:
# Rows ordered by column 'three', largest first; returns a new frame and leaves df untouched
df.sort_values(by='three', ascending=False)

Unnamed: 0,one,two,three
d,,4.0,8
c,3.0,3.0,7
b,2.0,2.0,6
a,1.0,1.0,5


+ 计算某一列的取值个数。注意：`value_counts()` 最初只有 Series 提供；从 pandas 1.1.0 起 DataFrame 也有 `value_counts()`，用于统计各行取值组合出现的次数

In [14]:
df['three'].value_counts()

7    1
6    1
5    1
8    1
Name: three, dtype: int64

+ 错行（列）相减  
可以使用`df.shift()`和`df.diff()`两种函数实现

In [15]:
df

Unnamed: 0,one,two,three
a,1.0,1.0,5
b,2.0,2.0,6
c,3.0,3.0,7
d,,4.0,8


In [16]:
df.shift(periods=1)

Unnamed: 0,one,two,three
a,,,
b,1.0,1.0,5.0
c,2.0,2.0,6.0
d,3.0,3.0,7.0


In [17]:
df.diff()

Unnamed: 0,one,two,three
a,,,
b,1.0,1.0,1.0
c,1.0,1.0,1.0
d,,1.0,1.0


# 检查特征取值的平衡问题

In [14]:
# 2x2 base pattern: the first column is constant 1, the second column is 0 then 1
t1 = np.array([[1,0],[1,1]])
t1

array([[1, 0],
       [1, 1]])

In [15]:
# Repeat t1 twice along the rows and once along the columns, giving a 4x2 array
t2 = np.tile(t1, (2,1))
t2

array([[1, 0],
       [1, 1],
       [1, 0],
       [1, 1]])

In [37]:
def rep(n):
    """
    Build a float array of shape (n*5*2, 2) with perfectly balanced columns.

    Column 0 runs through 1..n with each value repeated 10 times. Column 1
    alternates 0, 1 row by row, so every value of column 0 is paired with
    five 0s and five 1s.

    Parameters
    ----------
    n : int
        Number of distinct values in column 0. For n <= 0 the result is an
        empty array of shape (0, 2).

    Returns
    -------
    numpy.ndarray
        float64 array of shape (n*10, 2).
    """
    # Built in one vectorised step rather than growing an array with
    # np.vstack inside a loop, which re-copies the whole result on every
    # iteration and needs an uninitialised sentinel row to start from.
    x = np.repeat(np.arange(1, n + 1, dtype=float), 10)
    y = np.tile([0.0, 1.0], 5 * max(n, 0))
    return np.column_stack((x, y))

In [42]:
# Wrap the balanced array from rep(5) in a frame with columns 'x' and 'y'.
# Note: this rebinds `data`, which an earlier cell used for a dict of Series.
data = pd.DataFrame(rep(5), columns=['x','y'])
data.head()

Unnamed: 0,x,y
0,1.0,0.0
1,1.0,1.0
2,1.0,0.0
3,1.0,1.0
4,1.0,0.0


In [61]:
# Count the rows for every (x, y) combination to check that the values are balanced
data.groupby(['x','y'])['y'].count()
# Alternative:
# data.groupby(['x','y']).size()
# But the following cannot be used here (presumably because every column is a
# grouping key, so no column is left to count -- verify):
# data.groupby(['x','y']).count()

x    y  
1.0  0.0    5
     1.0    5
2.0  0.0    5
     1.0    5
3.0  0.0    5
     1.0    5
4.0  0.0    5
     1.0    5
5.0  0.0    5
     1.0    5
Name: y, dtype: int64

+ 相关系数显示：x和y是完全没有线性关系的

In [43]:
data.corr()

Unnamed: 0,x,y
x,1.0,0.0
y,0.0,1.0


------

# 缺失值处理

In [4]:
def miss_stat(df):
    """
    Summarise the missing values of each column of a DataFrame.

    Only columns that contain at least one missing value are reported.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column with missing values, indexed by column name and
        sorted by ``miss_num`` in descending order, with the columns:

        - ``miss_num``: number of missing values (integer)
        - ``total``: number of rows in ``df``
        - ``miss_percent``: ``miss_num / total * 100``, rounded to 3 decimals
        - ``dtype``: dtype of the column in ``df``

        An empty frame with these four columns is returned when nothing is
        missing.
    """
    miss_num = df.isnull().sum()
    # Only report the features that actually have missing values
    miss_num = miss_num[miss_num > 0]
    total = df.shape[0]
    miss_percent = miss_num / total * 100

    # Build the table directly on the index of the affected columns. An outer
    # join against every column would inject NaN rows, which silently upcasts
    # the integer counts to float even after those rows are dropped again.
    miss_info = pd.DataFrame(
        {
            'miss_num': miss_num,
            'total': total,
            'miss_percent': miss_percent,
            'dtype': df.dtypes.loc[miss_num.index],
        },
        index=miss_num.index,
    )

    # Descending order by count; fix the number of decimals shown
    return miss_info.sort_values(by=['miss_num'], ascending=False).round(decimals=3)