# 描述性统计 Statistical Summary
- 这项工作主要是让我们知道数据的整体状况怎么样，描述这个数据的“样子”
- 这是数据处理中最关键、也是最重要的一步
- 了解数据的概况，有助于后续的数据分析和挖掘

描述性统计的Python工具

PANDAS

NumPy和SciPy

- count 统计非NA的数量
- describe 针对series或者DF的列计算汇总统计
- min max 最小值和最大值
- quantile 样本分位数
- sum 求和
- mean 均值
- median 中位数
- mad 根据均值计算平均绝对离差（注意：pandas 2.0 起已移除该方法）
- var 方差
- std 标准差
- skew 偏度（三阶矩）
- kurt 峰度
- cumsum 累计和
- cumprod 累积乘积
- diff 一阶差分
- pct_change 计算百分比变化
- mode 计算众数
- cov 协方差
- corrcoef 相关系数

In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt # plotting module
from datetime import datetime # date/time module
from IPython.core.interactiveshell import InteractiveShell

# IPython magic functions: embed plots inline in the notebook and
# render them as SVG vector graphics
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Display the result of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# Do not limit the number of columns shown when a DataFrame is printed
# (only the column limit is lifted here; the row limit is left unchanged)
pd.set_option('display.max_columns',None)

In [21]:
# Load the daily quotes from CSV.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
data = pd.read_csv('C:/Users/hp/Desktop/Python/Python-4/000001.csv')

# Parse the 'Day' strings (YYYY-MM-DD) into timestamps and index the frame by date.
data['Day'] = pd.to_datetime(data['Day'], format='%Y-%m-%d')
data = data.set_index('Day')

# Sorting returns a new frame, so the result must be assigned back; a bare
# `data.sort_values(...)` expression leaves `data` unsorted (and, because every
# expression result is displayed in this notebook, prints the table twice).
# The date-range label slices used in later cells rely on a chronologically
# ordered index.
data = data.sort_index(ascending=True)
data

Unnamed: 0_level_0,Preclose,Open,Highest,Lowest,Close,Volume,Money
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-12-19,,96.05,99.98,95.79,99.98,126000,4.940000e+05
1990-12-20,99.98,104.30,104.39,99.98,104.39,19700,8.400000e+04
1990-12-21,104.39,109.07,109.13,103.73,109.13,2800,1.600000e+04
1990-12-24,109.13,113.57,114.55,109.13,114.55,3200,3.100000e+04
1990-12-25,114.55,120.09,120.25,114.55,120.25,1500,6.000000e+03
...,...,...,...,...,...,...,...
2022-07-25,3269.97,3269.71,3273.18,3243.03,3250.39,27124574400,3.480000e+11
2022-07-26,3250.39,3254.19,3282.41,3246.04,3277.44,25946867600,3.340000e+11
2022-07-27,3277.44,3271.78,3282.57,3265.73,3275.76,24913148500,3.400000e+11
2022-07-28,3275.76,3287.50,3305.71,3277.11,3282.58,28805505600,3.960000e+11


Unnamed: 0_level_0,Preclose,Open,Highest,Lowest,Close,Volume,Money
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1990-12-19,,96.05,99.98,95.79,99.98,126000,4.940000e+05
1990-12-20,99.98,104.30,104.39,99.98,104.39,19700,8.400000e+04
1990-12-21,104.39,109.07,109.13,103.73,109.13,2800,1.600000e+04
1990-12-24,109.13,113.57,114.55,109.13,114.55,3200,3.100000e+04
1990-12-25,114.55,120.09,120.25,114.55,120.25,1500,6.000000e+03
...,...,...,...,...,...,...,...
2022-07-25,3269.97,3269.71,3273.18,3243.03,3250.39,27124574400,3.480000e+11
2022-07-26,3250.39,3254.19,3282.41,3246.04,3277.44,25946867600,3.340000e+11
2022-07-27,3277.44,3271.78,3282.57,3265.73,3275.76,24913148500,3.400000e+11
2022-07-28,3275.76,3287.50,3305.71,3277.11,3282.58,28805505600,3.960000e+11


In [22]:
# Keep the rows from January 1995 through July 2022 (label-based slice on the
# date index) and make sure both price columns are numeric.
daily_data = data.loc['1995-01':'2022-07'].copy()
daily_data = daily_data.assign(
    Close=pd.to_numeric(daily_data['Close']),
    Preclose=pd.to_numeric(daily_data['Preclose']),
)

# Daily simple return: Close / Preclose - 1
# Daily log return:    ln(Close) - ln(Preclose)
daily_data = daily_data.assign(
    Raw_return=daily_data['Close'] / daily_data['Preclose'] - 1,
    Log_return=np.log(daily_data['Close']) - np.log(daily_data['Preclose']),
)
daily_data

Unnamed: 0_level_0,Preclose,Open,Highest,Lowest,Close,Volume,Money,Raw_return,Log_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995-01-03,647.87,637.72,647.71,630.53,639.88,23451800,1.806930e+08,-0.012333,-0.012409
1995-01-04,639.88,641.90,655.51,638.86,653.81,42222000,3.069230e+08,0.021770,0.021536
1995-01-05,653.81,655.38,657.52,645.81,646.89,43012300,3.015330e+08,-0.010584,-0.010641
1995-01-06,646.89,642.75,643.89,636.33,640.76,48748200,3.537580e+08,-0.009476,-0.009521
1995-01-09,640.76,637.52,637.55,625.04,626.00,50985100,3.985190e+08,-0.023035,-0.023305
...,...,...,...,...,...,...,...,...,...
2022-07-25,3269.97,3269.71,3273.18,3243.03,3250.39,27124574400,3.480000e+11,-0.005988,-0.006006
2022-07-26,3250.39,3254.19,3282.41,3246.04,3277.44,25946867600,3.340000e+11,0.008322,0.008288
2022-07-27,3277.44,3271.78,3282.57,3265.73,3275.76,24913148500,3.400000e+11,-0.000513,-0.000513
2022-07-28,3275.76,3287.50,3305.71,3277.11,3282.58,28805505600,3.960000e+11,0.002082,0.002080


In [23]:
# Monthly log return: sum the daily log returns inside each 'M' resampling bin.
Month_data = daily_data['Log_return'].resample('M').sum().to_frame()
# Convert the summed log return back into a simple return: exp(r) - 1.
Month_data = Month_data.assign(Raw_return=np.exp(Month_data['Log_return']) - 1)
Month_data

Unnamed: 0_level_0,Log_return,Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1995-01-31,-0.141139,-0.131631
1995-02-28,-0.023979,-0.023694
1995-03-31,0.163651,0.177803
1995-04-30,-0.109315,-0.103552
1995-05-31,0.188901,0.207922
...,...,...
2022-03-31,-0.062604,-0.060685
2022-04-30,-0.065154,-0.063077
2022-05-31,0.044724,0.045739
2022-06-30,0.064468,0.066592


In [24]:
# Yearly log return: sum the daily log returns inside each 'Y' resampling bin.
Year_data = daily_data['Log_return'].resample('Y').sum().to_frame()
# Convert the summed log return back into a simple return: exp(r) - 1.
Year_data = Year_data.assign(Raw_return=np.exp(Year_data['Log_return']) - 1)
Year_data

Unnamed: 0_level_0,Log_return,Raw_return
Day,Unnamed: 1_level_1,Unnamed: 2_level_1
1995-12-31,-0.1542,-0.142899
1996-12-31,0.501639,0.651425
1997-12-31,0.264019,0.302153
1998-12-31,-0.040505,-0.039695
1999-12-31,0.175423,0.19175
2000-12-31,0.416917,0.517277
2001-12-31,-0.230898,-0.20618
2002-12-31,-0.192575,-0.175167
2003-12-31,0.097735,0.10267
2004-12-31,-0.167233,-0.153997


In [25]:
# Year-end closing level: take the last 'Close' value in each 'Y' resampling bin.
# Selecting with a list keeps the result as a one-column DataFrame.
Year_data2 = daily_data[['Close']].resample('Y').last()
Year_data2

Unnamed: 0_level_0,Close
Day,Unnamed: 1_level_1
1995-12-31,555.29
1996-12-31,917.02
1997-12-31,1194.1
1998-12-31,1146.7
1999-12-31,1366.58
2000-12-31,2073.48
2001-12-31,1645.97
2002-12-31,1357.65
2003-12-31,1497.04
2004-12-31,1266.5


## 均值 mean
算术平均：$ A_n = (a_1 + a_2 + a_3 + ... + a_n) / n $ 

In [26]:
# Mean daily simple return over 2000-01 .. 2021-12 (rows and column selected in one .loc lookup)
daily_data.loc['2000-01':'2021-12', 'Raw_return'].mean()

0.0002997925742341234

In [27]:
# Mean monthly simple return over 2000-01 .. 2021-12
Month_data.loc['2000-01':'2021-12', 'Raw_return'].mean()

0.00638659075285674

In [28]:
# Same daily mean, computed through NumPy's mean function instead of the Series method
np.mean(daily_data.loc['2000-01':'2021-12', 'Raw_return'])

0.0002997925742341234

In [29]:
# Same daily mean written out by hand: total of the observations divided by their number
sum(daily_data.loc['2000-01':'2021-12', 'Raw_return']) / daily_data.loc['2000-01':'2021-12', 'Raw_return'].size

0.0002997925742341234

In [30]:
# Summary statistics of the daily simple return for 2000-01 .. 2021-12, rounded to 5 decimals
daily_data.loc['2000-01':'2021-12', 'Raw_return'].describe().round(decimals=5)

count    5332.00000
mean        0.00030
std         0.01522
min        -0.08841
25%        -0.00651
50%         0.00064
75%         0.00740
max         0.09857
Name: Raw_return, dtype: float64

## 分位数 quantile
分位数（Quantile），亦称分位点，是指将一个随机变量的概率分布范围分为几个等份的数值点，常用的有中位数、四分位数、百分位数等。
常见的分类方法：
- 中位数 Median
  - 当N是奇数时 $ m_{0.5} = X_{(N+1)/2} $

  - 当N是偶数时 $ m_{0.5} =( X_{(N/2)} + X_{(N/2+1)} )/2 $
- 四分位数Quartile
- 十分位数（Decile）：常用在股票投资策略中

In [31]:
from statistics import quantiles

# Nine cut points splitting the daily simple returns (2000-01 .. 2021-12) into
# ten groups, using the 'inclusive' method of statistics.quantiles
quantiles(daily_data.loc['2000-01':'2021-12', 'Raw_return'], n=10, method='inclusive')

[-0.016097081828110336,
 -0.008678806004592077,
 -0.004752631067066526,
 -0.001721643766958003,
 0.0006416905475513657,
 0.002879009623246542,
 0.005590864432495524,
 0.009490838188521967,
 0.016934368056506454]

In [32]:
# Quartile cut points of the monthly simple returns (2000-01 .. 2021-12);
# 'exclusive' is the default method of statistics.quantiles, spelled out here
quantiles(Month_data.loc['2000-01':'2021-12', 'Raw_return'], n=4, method='exclusive')

[-0.03759568413564121, 0.0063653439661619515, 0.04587742274975387]

In [33]:
# Decile cut points of the daily simple returns (2000-01 .. 2021-12), this time
# with the 'exclusive' method of statistics.quantiles
quantiles(daily_data.loc['2000-01':'2021-12', 'Raw_return'], n=10, method='exclusive')

[-0.016107902309649737,
 -0.008682174117724895,
 -0.004753759033724048,
 -0.0017217679867727932,
 0.0006416905475513657,
 0.0028793525212044368,
 0.005593687159267247,
 0.009506414116411222,
 0.016939114491093886]