# 新指标特征提取方法

In [51]:
import pandas as pd
###方案一：直接统计法 ：标识震荡的幅度，为1时表示单调，越小震荡越大？
def direct_statistic(df:pd.DataFrame) -> float:
    """Return the ratio of net change to total absolute change of the first column.

    The value is ``|last - first| / sum(|x[i] - x[i-1]|)``. It is 1.0 when the
    series is monotonic and gets closer to 0.0 the more the series oscillates.

    Args:
        df: Frame whose first column holds the series, in order.

    Returns:
        The ratio as a plain float. A series with fewer than two rows, or one
        whose total absolute change is zero (all values equal), has no movement
        to measure and yields 0.0 instead of dividing by zero. NaN values in
        the column propagate to the result.
    """
    values = df.iloc[:, 0].to_numpy(dtype=float)
    if values.size < 2:
        return 0.0
    # Sum of absolute step-to-step changes (the denominator of the ratio).
    total_change = abs(values[1:] - values[:-1]).sum()
    if total_change == 0:
        # Flat series: avoid 0/0, which would otherwise produce NaN.
        return 0.0
    net_change = abs(values[-1] - values[0])
    return float(net_change / total_change)


In [52]:
###方案二：斜率法（numpy的polyfit：用一个多项式去拟合时间序列变化，若是一次幂，取倒数的第二个系数即为斜率k，y=kx+c）
#正数：上升趋势，负数：下降趋势，0：平稳
import numpy as np
def trendline(index: list, data: pd.DataFrame, order: int = 1) -> float:
    """Fit a polynomial to the first column of ``data`` and return its linear coefficient.

    Args:
        index: x-coordinates, one per row of ``data``.
        data: Frame whose first column holds the y-values.
        order: Degree of the fitted polynomial (1 fits ``y = k*x + c``).

    Returns:
        The second-to-last coefficient returned by ``np.polyfit``; for
        ``order=1`` this is the slope ``k`` (positive: rising, negative:
        falling, near zero: flat).
    """
    first_column = data.columns[0]
    y_values = data[first_column].tolist()
    fitted = np.polyfit(index, y_values, order)
    # The last coefficient is the constant term, so [-2] is the x**1 term.
    return fitted[-2]


In [53]:
#方案三：Cox-Stuart趋势检验
# Returns a trend label rather than the p-value itself: "increasing" or "decreasing" when the two-sided p-value is below 0.05, otherwise "no trend".
import scipy.stats as stats


def cos_staut(data:pd.DataFrame, debug=False) -> str:
    """Classify the trend of the first column with the Cox-Stuart sign test.

    The series is split into two halves (the middle element is dropped when
    the length is odd) and each value is paired with the value half a series
    later. Rises and falls among the pairs are counted, ties are ignored, and
    a two-sided binomial p-value is computed from the smaller count.

    Args:
        data: Frame whose first column holds the series, in order.
        debug: When true, print the fall/rise counts and the p-value.

    Returns:
        "increasing" or "decreasing" when the p-value is below 0.05 and the
        corresponding count dominates, otherwise "no trend".
    """
    values = data[data.columns[0]].tolist()
    if len(values) % 2 == 1:
        # Odd length: discard the middle observation so both halves match.
        values.pop((len(values) - 1) // 2)
    half = len(values) // 2

    diffs = [later - earlier for earlier, later in zip(values[:half], values[half:])]
    rises = sum(1 for d in diffs if d > 0)
    falls = sum(1 for d in diffs if d < 0)

    p_value = 2 * stats.binom.cdf(min(rises, falls), rises + falls, 0.5)
    if debug:
        print("fall:%i, rise:%i, p-value:%f" % (falls, rises, p_value))

    if rises > falls and p_value < 0.05:
        return "increasing"
    if falls > rises and p_value < 0.05:
        return "decreasing"
    return "no trend"


In [54]:
import math
from scipy.stats import mstats, norm
#方案四：MK趋势检验
#MK检验：不要求数据服从正态分布，但是要求数据是独立的
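# Returns a trend label rather than the p-value itself: "increasing" or "decreasing" when the test is significant at level alpha (default 0.05), otherwise "no trend".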
def mk_test(data:pd.DataFrame, alpha=0.05) -> str:
    """Classify the trend of the first column with the Mann-Kendall test.

    Args:
        data: Frame whose first column holds the series, in order.
        alpha: Significance level of the two-sided test.

    Returns:
        "increasing" or "decreasing" when the standardised statistic exceeds
        the two-sided critical value for ``alpha``, otherwise "no trend".
    """
    series = data[data.columns[0]].tolist()
    size = len(series)
    observations = np.asarray(series)

    # S statistic: sum of sign(x[j] - x[k]) over every pair with k < j.
    score = 0
    for k in range(size - 1):
        score += np.sign(observations[k + 1:] - observations[k]).sum()

    # Variance of S, with the tie correction applied when values repeat.
    distinct, tie_counts = np.unique(series, return_counts=True)
    variance = size * (size - 1) * (2 * size + 5)
    if len(distinct) != size:
        variance = variance - np.sum(tie_counts * (tie_counts - 1) * (2 * tie_counts + 5))
    variance = variance / 18

    # Standardised statistic; S is moved one unit towards zero before scaling.
    z = 0
    if score > 0:
        z = (score - 1) / np.sqrt(variance)
    elif score < 0:
        z = (score + 1) / np.sqrt(variance)

    significant = abs(z) > norm.ppf(1 - alpha / 2)
    if significant and z < 0:
        return "decreasing"
    if significant and z > 0:
        return "increasing"
    return "no trend"

## 数据预处理

In [55]:
import pandas as pd
from scipy import stats

# Significance level for the normality test below.
alpha = 0.05
# Load the dataset and test the 'load' column with scipy.stats.normaltest
# (null hypothesis H0: the sample comes from a normal distribution).
df=pd.read_csv('../data/data.csv')
k2, p = stats.normaltest(df['load'])
# A p-value below alpha rejects H0; otherwise normality cannot be rejected.
verdict = "拒绝原样本是正态分布的假设" if p < alpha else "不能拒绝正态分布的假设"
print(verdict)


拒绝原样本是正态分布的假设


# 1.选取一天为一组，提取新的指标，数据规模/24


In [56]:
# #选取一天为一组数据，并进行四个指标运算，得到四个结果列表，然后将最终结果存入一个新的dataframe中
# ConcussionShake,Slope,CoxStuart,Mk=[],[],[],[]
# for i in range(0,len(df)-24+1,24):
#     load=df[['load']][i:i+24]
#     #方案一：直接统计法 ：标识震荡的幅度，为1时表示单调，越小震荡越大越不规律
#     ConcussionShake.append(direct_statistic(load))
#     #方案二：斜率法 #正数：上升趋势，负数：下降趋势，0：平稳
#     Slope.append(trendline(load.index, load))
#     #方案三：Cox-Stuart趋势检验（要正态分布） #p_value小于0.05就是显著的，可以判断上升或者下降，大于0.05就是不显著的，默认判断为无趋势
#     CoxStuart.append(cos_staut(load))
#     #方案四：MK趋势检验 #不要求数据服从正态分布，但是要求数据是独立的
#     Mk.append(mk_test(load))

In [57]:
# while(True):  
#     try:
#         time = input('请输入想要预测的时间是每天几点(24h制)？：')
#         time = int(time)
#         if time < 0 or time > 23:
#             print("输入范围应为：0~23")
#             continue
#         break
#     except ValueError:
#         print("输入范围应为：0~23")

# df_new=df.iloc[time::24, :].copy() #第一个冒号省略是到剩下所有行，后面的冒号是取所有列
# df_new.loc[:, 'ConcussionShake'] = ConcussionShake
# df_new.loc[:, 'Slope'] = Slope
# df_new.loc[:, 'CoxStuart'] = CoxStuart
# df_new.loc[:, 'Mk'] = Mk
# df_new.head()

Unnamed: 0,date,load,year,month,hour,day,lowtmep,hightemp,ConcussionShake,Slope,CoxStuart,Mk
8,2016/11/25 8:00,228.001,0.0,0.909091,0.347826,0.666667,0.302326,0.243902,0.087683,0.791108,no trend,no trend
32,2016/11/26 8:00,224.432,0.0,0.909091,0.347826,0.833333,0.325581,0.243902,0.065419,1.81942,no trend,increasing
56,2016/11/27 8:00,223.996,0.0,0.909091,0.347826,1.0,0.325581,0.219512,0.148006,0.690724,no trend,no trend
80,2016/11/28 8:00,235.431,0.0,0.909091,0.347826,0.0,0.302326,0.219512,0.030532,1.574692,no trend,increasing
104,2016/11/29 8:00,237.024,0.0,0.909091,0.347826,0.166667,0.325581,0.317073,0.173919,0.581039,no trend,no trend


In [58]:
# df_new['CoxStuart'].value_counts(),df_new['Mk'].value_counts()

(CoxStuart
 no trend      868
 increasing    223
 decreasing      4
 Name: count, dtype: int64,
 Mk
 no trend      686
 increasing    400
 decreasing      9
 Name: count, dtype: int64)

In [59]:
# df_encoded = pd.get_dummies(df_new, columns=['CoxStuart', 'Mk'],dtype=float) #对分类变量进行one-hot编码
# #每一周某一个时刻的，加上了关于电荷的新指标
# df_encoded.to_csv('../data/data_day_8.csv', index=False)
# #每一天的，加上了关于电荷的新指标

: 

# 2.选取一周某一个时刻为一组，提取新的指标，数据规模/7

In [46]:
#选取一周某一个时刻的数据，并进行四个指标运算，得到四个结果列表，然后将最终结果存入一个新的dataframe中
# For every week and every hour of the day, take the 'load' values recorded at
# that hour on each day of the week and compute the four trend indicators on
# them. Results are appended to four parallel lists in week-then-hour order.
# Assumes df holds one row per hour in chronological order - TODO confirm.
ConcussionShake,Slope,CoxStuart,Mk=[],[],[],[]
for i in range(0,len(df),7*24): # iterate one week at a time
    for j in range(0,24): # each hour of the day
        # Rows for hour j on each day of the week that starts at row i.
        # NOTE(review): the final week yields fewer than 7 rows (possibly none)
        # when len(df) is not a multiple of 7*24.
        df2=df.iloc[j+i:j+i+24*7:24]
        load=df2[['load']]
        # Option 1: direct statistic - 1 means monotonic, smaller means more oscillation.
        ConcussionShake.append(direct_statistic(load))
        # Option 2: slope of a linear fit - positive: rising, negative: falling, 0: flat.
        Slope.append(trendline(load.index, load))
        # Option 3: Cox-Stuart trend test - "increasing"/"decreasing" if significant at 0.05, else "no trend".
        # NOTE(review): with 7 samples only 3 pairs are compared, so the smallest
        # possible p-value is 0.25 and this always yields "no trend".
        CoxStuart.append(cos_staut(load))
        # Option 4: Mann-Kendall trend test - does not need normal data but assumes independent samples.
        Mk.append(mk_test(load))

In [47]:
# Keep the first 24 rows (the first day) of every week; these rows line up
# one-to-one with the week-then-hour order of the four indicator lists.
week_first_days=[df.iloc[i:i+24:1, :] for i in range(0,len(df),7*24)] # one slice per week
# A single pd.concat replaces repeated calls to the private DataFrame._append,
# which recopied the growing frame on every iteration.
df_new=pd.concat(week_first_days) if week_first_days else pd.DataFrame()

# df_new=df_new.reset_index(drop=True)  # would discard the original row labels and renumber
df_new.loc[:, 'ConcussionShake'] = ConcussionShake
df_new.loc[:, 'Slope'] = Slope
df_new.loc[:, 'CoxStuart'] = CoxStuart
df_new.loc[:, 'Mk'] = Mk
df_new[::12].head() # preview every 12th row

Unnamed: 0,date,load,year,month,hour,day,lowtmep,hightemp,ConcussionShake,Slope,CoxStuart,Mk
0,2016/11/25 0:00,193.987,0.0,0.909091,0.0,0.666667,0.302326,0.243902,0.36445,0.136641,no trend,no trend
12,2016/11/25 12:00,225.034,0.0,0.909091,0.521739,0.666667,0.302326,0.243902,0.306394,0.137759,no trend,no trend
168,2016/12/2 0:00,215.311,0.0,1.0,0.0,0.666667,0.418605,0.341463,0.037307,0.042586,no trend,no trend
180,2016/12/2 12:00,220.633,0.0,1.0,0.521739,0.666667,0.418605,0.341463,0.058115,0.080978,no trend,no trend
336,2016/12/9 0:00,211.115,0.0,1.0,0.0,0.666667,0.255814,0.219512,0.148006,-0.033638,no trend,no trend


In [48]:
#df_new['CoxStuart'].value_counts(),df_new['Mk'].value_counts()

In [50]:
# One-hot encode the two categorical trend labels as float indicator columns.
trend_columns = ['CoxStuart', 'Mk']
df_encoded = pd.get_dummies(df_new, columns=trend_columns, dtype=float)
# Weekly same-hour dataset with the new load indicators added; the row index is not written.
df_encoded.to_csv('../data/data_week.csv', index=False)