# 1.导入包及数据

In [2]:
import numpy as np
import matplotlib.pyplot as plt  
import pandas as pd
from sklearn import preprocessing
from vmdpy import VMD  
import akshare as ak #大豆数据集
from scipy.fftpack import fft ##傅里叶级数
import pkuseg   #分词包
import re
from sklearn.feature_extraction.text import CountVectorizer #sklearn的统计词特征包
from sklearn.feature_extraction.text import TfidfTransformer #sklearn的TF-IDF包





#下载数据
Data=pd.read_csv("data.csv")
Data.head()


Unnamed: 0.1,Unnamed: 0,date,open,high,low,close,volume,hold,settle,return,volatility
0,1,2010/1/4,4080,4090,4049,4057,321838,284296,0,0.0,0.0
1,2,2010/1/5,4067,4082,4060,4066,253640,283384,0,0.00222,0.0
2,3,2010/1/6,4066,4194,4057,4154,860812,352830,0,0.02141,0.0
3,4,2010/1/7,4165,4188,3975,4044,705704,297236,0,-0.02684,0.0
4,5,2010/1/8,4040,4050,3947,3981,526594,270074,0,-0.0157,0.0


In [4]:
#数据处理
pd.to_datetime(Data.date)
Data.drop(columns={'Unnamed: 0'},inplace=True)
Data.head()

KeyError: "['Unnamed: 0'] not found in axis"

##### **标准化

In [5]:
#标准化数据
# array=[Data['return'],Data['volatility']]


In [7]:
minmax = preprocessing.MinMaxScaler()
data_minmax = minmax.fit_transform(Data[['return','volatility']])
data_minmax

array([[0.38370969, 0.        ],
       [0.39633695, 0.        ],
       [0.50548888, 0.        ],
       ...,
       [0.39559752, 0.14371025],
       [0.39855526, 0.14600355],
       [0.40737159, 0.14992982]])

# 2.VMD分解 Return&Volatility 

### vmd参数设置及绘制分解模态图

In [None]:
#1.导入文件，可以指定列也可以，就是usecols
filename= '/Volumes/本机/paper/futures voality forcaseting/VMD_BiLSTM_A0/Data/data.csv'
f = pd.read_csv(filename,usecols=[9,10])

#vmd包参数设置并执行VMD
alpha = 5000       #宽带限制，一般为数据点的1.5-2倍  
tau = 0.           # 噪声限制 (no strict fidelity enforcement)  
K = 11             # 分解模态个数  
DC = 0             # 合成信号若无常量则为0；若含常量，则其取值为 1；  
init = 1           # 初始化ω值,当初始化为1时,均匀分布产生的随机数； 
tol = 1e-7         # 控制误差大小常量，决定精度与迭代次数

u, u_hat, omega = VMD(f['return'], alpha, tau, K, DC, init, tol)  
u1,u1_hat,omega1 = VMD(f['volatility'], alpha, tau, K, DC, init, tol)  
#绘制分解模态图
plt.figure(figsize=(10,8))
plt.subplot(2,1,1)
plt.plot(u.T)
plt.title('Decomposed modes of return')
plt.subplot(2,1,2)
plt.plot(u1.T)
plt.title('Decomposed modes of volatility')


### IMF of return

In [None]:
#未分解前的图
fig_return = plt.figure()
plt.plot(f['return'])
fig_return.suptitle('Original input signal and its components')

#循环每个分解模态的
for i in range(K):
    plt.figure(figsize=(5,5), dpi=100)
    plt.subplot(K,1,i+1)
    plt.plot(u[i,:], linewidth=0.2, c='r')
    plt.ylabel('IMF{}'.format(i+1))
    
    #每个模态的中心频率
for i in range(K):
    plt.figure(figsize=(5,5), dpi=100)
    plt.subplot(K,1,i+1)
    plt.plot(abs(fft(u[i,:])))
    plt.ylabel('IMF{}'.format(i+1))



### IMF of volatility

In [None]:
#未分解前的图
fig_volatility = plt.figure()
plt.plot(f['volatility'])
fig_volatility.suptitle('Original input signal and its components OF volatility')

#循环每个分解模态的
for i in range(K):
    plt.figure(figsize=(5,5), dpi=100)
    plt.subplot(K,1,i+1)
    plt.plot(u1[i,:], linewidth=0.2, c='r')
    plt.ylabel('IMF{}'.format(i+1))
    
    #每个模态的中心频率
for i in range(K):
    plt.figure(figsize=(5,5), dpi=100)
    plt.subplot(K,1,i+1)
    plt.plot(abs(fft(u1[i,:])))
    plt.ylabel('IMF{}'.format(i+1))

# 3.文本处理

In [None]:
filepath1='/Volumes/本机/paper/futures voality forcaseting/VMD_BiLSTM_A0/Data/text.csv'
text=pd.read_csv(filepath1)
pd.to_datetime(text.date)

### (1).文本预处理

In [None]:
titles=text.title
titles1=[]

for i in range(len(titles)):
    textgo = re.sub('[^\u4e00-\u9fa5_a-zA-Z0-9]','',str(titles[i]))
    titles1.append(textgo) 


In [None]:
len(titles1)

###  (2).文本分词

In [None]:
type(titles1)

In [None]:
seg = pkuseg.pkuseg() # 以默认配置加载模型

In [None]:
textcut=[]
for j in titles1:  
    textfo=seg.cut( j )  # 进行分词
    textcut.append(textfo)
    
text_cut=pd.Series(textcut)##list换series，然后放入表中


In [None]:
text.insert(loc=1,column='text_cut',value=text_cut)

In [None]:
text

###  (3).去除停用词（采用中文停用词表）

In [None]:
def stopwordslist(filepath):   # 定义函数创建停用词列表
    stopword = [line.strip() for line in open(filepath, 'r').readlines()]    #以行的形式读取停用词表，同时转换为列表
    return stopword

filepath='/Volumes/本机/learning/NLP/stopwords/stopwords-master/cn_stopwords.txt'
stopwords = stopwordslist(filepath)

In [None]:
text_cut[0][1]

In [None]:
type(text_cut)

In [None]:
lastsentences=[]
lastsentences_str=[]##增加列表内是字符串的一项，方便进行向量化处理
for i in range(len(text_cut)):     #for循环遍历分词后的每个词语
    lastsentence=[]
    for word in text_cut[i]:
        if word not in stopwords:     #判断分词后的词语是否在停用词表内
            lastsentence.append(word)
            lastsentence_str=' '.join(lastsentence)
    lastsentences.append(lastsentence)
    lastsentences_str.append(lastsentence_str)
text_cut_stopwords=pd.Series(lastsentences)
text_cut_stopwords_str=pd.Series(lastsentences_str)
text.insert(loc=2,column='text_cut_stopwords',value=text_cut_stopwords)
text.insert(loc=2,column='text_cut_stopwords_str',value=text_cut_stopwords_str)

In [None]:
lastsentences_str

In [None]:
text.text_cut_stopwords[0]

In [None]:
text

###  (4).向量转换（BoW & TF-IDF）

In [None]:
#CountVectorizer和TfidfTransformer默认学习list里的str（在str中以空格为分断），所以对象应该换成list(‘str1’,‘str2’,...)
vectorizer = CountVectorizer(min_df=1,max_df=0.8) ##创建单词表时，忽略占比超过百分之五十的词 以及 出现数少于2的词
transformer = TfidfTransformer() 
X = vectorizer.fit_transform(lastsentences_str)#学习词汇，返回文档术语与矩阵
feature_name = vectorizer.get_feature_names()#显示特征名
tfidf = transformer.fit_transform(X)  #把已经生成的文档矩阵换成TF-IDF矩阵
tfidf.toarray()

### (5).情感分析