In [2]:
import numpy as np
import pandas as pd

# Load the two raw training tables: Data1 holds the news articles,
# Data2 holds the per-stock rows used for the labels further down.
Data1, Data2 = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/TRAINSET_NEWS.csv', './data/TRAINSET_STOCK.csv')
)


import os
import re
import jieba


def get_stopwords():
    '''Load every stop-word list found in ``./stopwords``.

    Each file in the folder is read as UTF-8 and split into lines; every
    line is one stop word.

    Returns:
        set: the union of the lines of all files in the folder.
    '''
    collected = set()
    for name in os.listdir('./stopwords'):
        with open('./stopwords/{}'.format(name),encoding='utf8') as handle:
            collected.update(handle.read().splitlines())
    return collected

# Build the stop-word set once at module level; split_data() reads this
# global when filtering tokens.
stop_words = get_stopwords()

# Date/time expressions removed from the text, ordered from most to least
# specific so that a generic "N日" rule cannot eat part of a longer date
# before the longer rule gets a chance to match. Every rule also swallows an
# optional "当地时间" (local time) prefix.
_DATE_TIME_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'(当地时间)*[0-9]+月[0-9]+日至[0-9]+日',                  # 3月5日至7日
    r'(当地时间)*[0-9]+月[0-9]+日([上下]午)*',                 # 10月12日上午
    r'(当地时间)*([0-9]+日)*([上下]午)*[0-9]+时[0-9]+分(许)*',  # 3日上午10时20分许
    r'(当地时间)*[0-9]+日([上下]午)*',                         # 3日 / 3日下午
))
_ALNUM_PATTERN = re.compile(r'[a-zA-Z0-9]+')
_PUNCT_PATTERN = re.compile(r'[,.，。、!:"\'：《》【】’%‘“”)(（·）—]')
_SPACE_PATTERN = re.compile(r'\s+')


def clear_data(data,is_nan=0):
    '''Clean one piece of raw news text.

    Removes date/time expressions, ASCII letters and digits, and a fixed set
    of punctuation marks, then collapses whitespace runs to a single space.

    Args:
        data: the text to clean. A missing value (``None`` or any NaN/NA
            scalar) is treated as an empty string.
        is_nan: starting value for the number printed in the "empty titles"
            message. Because it is a plain int argument, the count does not
            accumulate across calls (e.g. under ``Series.apply``).

    Returns:
        str: the cleaned text.
    '''
    # Missing cells arrive as NaN floats; an identity test against np.nan
    # misses NaN objects that are not the numpy singleton.
    if not isinstance(data, str) and (data is None or pd.isna(data)):
        data = ''
        is_nan += 1
        print('\rHere are {} empty titles...'.format(is_nan),end='')

    for pattern in _DATE_TIME_PATTERNS:
        data = pattern.sub('', data)
    data = _ALNUM_PATTERN.sub('', data)
    data = _PUNCT_PATTERN.sub('', data)
    data = _SPACE_PATTERN.sub(' ', data)
    return data

def split_data(data):
    '''Tokenise text with jieba and drop stop words.

    Args:
        data: the text to segment.

    Returns:
        list: the jieba tokens that are not in the module-level
        ``stop_words`` set, in their original order.
    '''
    kept = []
    for token in jieba.cut(data):
        # stop words are filtered out at tokenisation time
        if token in stop_words:
            continue
        kept.append(token)

    # A disabled variant in the original returned ' '.join(data) + '\n'
    # (a single string) instead of the token list.
    return kept

def data_process(head,data,save_path=''):
    '''First processing stage: clean and tokenise one text column.

    Args:
        head: name of the column in ``data`` to process.
        data: DataFrame holding that column.
        save_path: accepted but not used by this function.

    Returns:
        pandas.Series: for every row, the token list produced by running
        ``clear_data`` and then ``split_data`` on the cell.
    '''
    return data[head].apply(clear_data).apply(split_data)

#     with open('./{}'.format(save_path),'w',encoding='utf8') as f:
#         f.writelines(splited_title)
#         print('\nTo {} processed.'.format(save_path))

# Create the ./my_data output folder on first run and report it; nothing is
# printed when the folder is already there.
my_data_missing = not os.path.exists('./my_data')
if my_data_missing:
    os.makedirs('./my_data')
    print('创建文件夹({})成功'.format('/my_data'))
        
    
# data_process('title',save_path='./my_data/titles_split.txt')
# data_process('content',save_path='./my_data/contents_split.txt')


from gensim.models.word2vec import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer
from multiprocessing import cpu_count
import numpy as np
import pickle


def get_word2vec(data,n_dims=200,w2v_path='./models/w2v_model.pkl'):
    '''Load the cached word2vec model, or train and cache a new one.

    Args:
        data: iterable of token lists used as the training corpus.
        n_dims: dimensionality of the trained vectors. Ignored when a cached
            model already exists at ``w2v_path``, because that model is
            loaded as-is.
        w2v_path: file the model is loaded from / saved to.

    Returns:
        The loaded or freshly trained ``Word2Vec`` model.
    '''
    if os.path.exists(w2v_path):
        return Word2Vec.load(w2v_path)

    # Create the target folder *before* the long training run so that the
    # final save() cannot fail on a missing directory and waste the work.
    model_dir = os.path.dirname(w2v_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    print('\n# Train the word2vec ')
    # NOTE(review): `size=` is the gensim 3.x keyword; gensim 4 renamed it to
    # `vector_size` -- confirm against the installed gensim version.
    w2v = Word2Vec(size=n_dims,min_count=5,workers=cpu_count())
    w2v.build_vocab(data)
    w2v.train(data,total_examples=w2v.corpus_count,epochs=128)
    w2v.save(w2v_path)
    print('# Finish train the word2vec')

    return w2v

def get_tf_idf(data,model_path='./models/tf_idf_model.pkl'):
    '''Load the cached tf-idf vectorizer, or fit and cache a new one.

    Args:
        data: iterable of documents (strings) to fit the vectorizer on.
        model_path: pickle file the vectorizer is loaded from / saved to.

    Returns:
        The loaded or freshly fitted ``TfidfVectorizer``.
    '''
    if os.path.exists(model_path):
        # SECURITY: pickle.load can execute arbitrary code; only load model
        # files that this notebook produced itself.
        with open(model_path,'rb') as model_file:
            return pickle.load(model_file)

    # The original pattern ended in a stray double quote (r'...\b"'), which
    # only matches tokens directly followed by a `"` character -- and
    # clear_data() strips those, so no token could ever match.
    # NOTE(review): a model cached with the old pattern must be deleted to
    # pick up this fix.
    tf_model = TfidfVectorizer(ngram_range=(1,2),token_pattern=r'(?u)\b\w+\b')
    tf_model.fit(data)

    model_dir = os.path.dirname(model_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)
    with open(model_path,'wb') as model_file:
        pickle.dump(tf_model,model_file)
    print('# Finish train the Tf-idf')

    return tf_model

def build_tfidf_vec(sent,tfidf):
    '''Return the tf-idf vector of a single document.

    Args:
        sent: the document to vectorise.
        tfidf: fitted object exposing ``transform``; it is called with the
            one-element list ``[sent]``.

    Returns:
        The first row of the densified transform result.
    '''
    transformed = tfidf.transform([sent])
    dense_rows = transformed.todense()
    return dense_rows[0]


def build_word_vec(sent,w2v):
    '''Average the word vectors of a tokenised sentence.

    Args:
        sent: iterable of tokens.
        w2v: either a model exposing its vectors as ``w2v.wv`` or any mapping
            from token to vector.

    Returns:
        numpy.ndarray: the element-wise mean of the token vectors. Tokens
        missing from the vocabulary contribute a zero vector (so they pull
        the mean towards zero). An empty sentence yields a zero vector.
    '''
    # Look words up through `.wv` when it exists: indexing the model object
    # itself is not supported by every gensim version, and the previous bare
    # `except` turned that failure into silent all-zero vectors.
    vectors = getattr(w2v, 'wv', w2v)
    # Take the dimensionality from the vectors when available; 200 is the
    # default used when the model is trained in this notebook.
    n_dims = getattr(vectors, 'vector_size', 200)

    vec = []
    for word in sent:
        try:
            vec.append(vectors[word])
        except KeyError:
            # out-of-vocabulary token
            vec.append(np.zeros(n_dims))

    if not vec:
        return np.zeros(n_dims)
    return np.mean(vec,axis=0)


def convert2vec(data):    
    '''Convert the news text into one averaged feature vector per date.

    For every row the 'title' and 'content' columns are cleaned and tokenised
    (``data_process``), each is turned into a word2vec mean vector
    (``build_word_vec``), and the two vectors are joined side by side into
    the new 'allVec' column. Rows are then grouped by 'date'; the per-date
    sum of 'allVec' is divided by the per-date row count.

    Args:
        data: DataFrame with at least the columns 'id', 'title', 'content'
            and 'date'. It is MODIFIED IN PLACE: 'id', 'title' and 'content'
            are dropped and 'allVec' is added, so calling this function a
            second time on the same frame fails on the missing columns.

    Returns:
        DataFrame produced by ``groupby(['date']).sum().reset_index()`` whose
        'allVec' column has been replaced by the per-date mean.

    Side effects:
        May train and save a word2vec model through ``get_word2vec``; prints
        a completion message.
    '''
    # The bare string on the next line is left untouched from the original.
    # Rough English gist: split_data has to be adapted depending on whether
    # word2vec or tf-idf is used; for tf-idf, add `" ".join(data)` inside
    # split_data.
    '''注意：再使用word2vec的时候split_data和使用tfidf的时候方法里面需要进行就该，split_data函数使用tfidf时加入代码" ".join(data) '''
    ## word2vec: token lists for both text columns
    tmp1 = data_process('title',data).apply(list)
    tmp2 = data_process('content',data).apply(list)
    data.drop(['id','title','content'],axis=1,inplace=True)
    # `tmp1 + tmp2` adds the two Series element-wise, i.e. each row's title
    # tokens followed by its content tokens; that is the training corpus.
    w2v = get_word2vec(tmp1 + tmp2)

    ## tf-idf alternative (disabled)
#     tmp1 = data_process('title',data)
#     tmp2 = data_process('content',data)
#     tfidf = get_tf_idf(tmp1 + tmp1)
    
    # tmp_size is the number of rows handled per slice. It happens to equal
    # the 200 used as vector length elsewhere, but the two are unrelated.
    tmp_size = 200
    # Replace the token lists by their vectors, one slice of rows at a time.
    # NOTE(review): the integer slices are presumably positional -- confirm
    # for the pandas version in use.
    for i in range(int(np.ceil(len(tmp1 + tmp2) / tmp_size))):
        tmp1[i * tmp_size: i * tmp_size + tmp_size] = tmp1[i * tmp_size: i * tmp_size + tmp_size].apply(build_word_vec,args=(w2v,))
        tmp2[i * tmp_size: i * tmp_size + tmp_size] = tmp2[i * tmp_size :i * tmp_size + tmp_size].apply(build_word_vec,args=(w2v,))
        
#         tmp1[i * tmp_size: i * tmp_size + tmp_size] = tmp1[i * tmp_size: i * tmp_size + tmp_size].apply(build_tfidf_vec,args=(tfidf,))
#         tmp2[i * tmp_size: i * tmp_size + tmp_size] = tmp2[i * tmp_size :i * tmp_size + tmp_size].apply(build_tfidf_vec,args=(tfidf,))
    
    # Join title and content vectors side by side (axis=1) per row; the
    # result is converted to a list of row arrays before being stored in the
    # DataFrame.
    data['allVec'] = list(np.concatenate([list(tmp1),list(tmp2)],axis=1))  # converted to a list before storing in the df
    
    # Per-date mean = per-date sum / per-date count.
    # NOTE(review): this relies on groupby().sum() adding up an object column
    # of numpy arrays -- verify on the installed pandas version.
    sum_vec = data.groupby(['date']).sum().reset_index()
    count_vec = data.groupby(['date']).count().reset_index()['allVec']
    
    sum_vec['allVec'] = sum_vec['allVec'] / count_vec
    
    print('# Finish build word_vec')
    
    return sum_vec

# Build the per-date news vectors. convert2vec() mutates Data1 in place
# (drops 'id'/'title'/'content', adds 'allVec'), so this line cannot be
# re-run without reloading Data1 first.
new_data = convert2vec(Data1)


import pandas as pd

# Labels and trade dates are handled as strings from here on.
Data2['y'] = Data2['y'].astype(str)
Data2['trade_date'] = Data2['trade_date'].astype(str)

# Summing string labels concatenates them, so each trade date ends up with
# one string holding one character per row of that date.
# NOTE(review): presumably one character per stock, in file order -- verify.
Data = Data2.loc[:,['name','y','trade_date']].groupby('trade_date').sum()['y'].reset_index()
Data.set_index(Data.trade_date,inplace=True)

new_data['date'] = pd.to_datetime(new_data['date'],format='%Y%m%d')
new_data.set_index(new_data['date'],inplace=True)

# Attach to every trade date the mean news vector of a 10-row window of past
# news days: rows -20..-11 of the news available up to that date.
Data['allVec'] = None

for date in Data.index:
    temp_data = new_data.truncate(after=date)[-20:-10]
    # DataFrame.set_value was removed from pandas; .at is the scalar setter
    # that replaces it.
    Data.at[date,'allVec'] = temp_data.allVec.mean()

# Drop incomplete rows (dates without enough news history get NaN above).
Data = Data.dropna()


def fix_y_1(y):
    '''Convert labels to format 1: a two-element one-hot pair per label.

    Args:
        y: iterable of labels.

    Returns:
        list: ``[1,0]`` for every element equal to the string ``'1'`` and
        ``[0,1]`` for anything else.
    '''
    return [[1,0] if label == '1' else [0,1] for label in y]
    
def fix_y_2(y):
    '''Convert labels to format 2: the positions of the positive labels.

    Args:
        y: indexable sequence of labels (accessed as ``y[i]``).

    Returns:
        list: every index ``i`` in ``range(len(y))`` with ``y[i] == '1'``.
    '''
    return [pos for pos in range(len(y)) if y[pos] == '1']

# Split each concatenated label string into a list of ints, one per character.
Data['y_'] = Data['y'].apply(lambda digits: [int(ch) for ch in digits])


# Trade dates for which predictions are written out.
pre_date = ['20190402','20190403','20190404','20190408','20190409']
def set_val_data(date_list,new_data):
    '''Build a frame with one feature value per requested date.

    Args:
        date_list: the dates to create rows for; they become the index of
            the result (previously the index was taken from the global
            ``pre_date`` regardless of this argument).
        new_data: frame with an 'allVec' column, sorted by its index.

    Returns:
        DataFrame indexed by ``date_list`` with a single 'vec' column.
    '''
    val_data = pd.DataFrame(columns=['vec'],index=list(date_list))

    for date in date_list:
        # NOTE(review): the window does not depend on `date` -- every row
        # receives the mean of the same slice, cut at the hard-coded value
        # 20190402. Left as in the original because the intended per-date
        # window is unclear; confirm before relying on this function.
        temp_data = new_data.truncate(before=20190402)[-20:-10]
        # DataFrame.set_value was removed from pandas; .at replaces it.
        val_data.at[date,'vec'] = temp_data.allVec.mean()

    return val_data

# One feature row per prediction date: the mean news vector over a 10-row
# window that is shifted by `offset` rows for each date.
# NOTE(review): the offsets 1,2,3,7,8 presumably mirror the calendar-day gaps
# between the pre_date entries and 20190401 -- confirm.
val_data = pd.DataFrame([
    new_data.allVec[offset - 20: offset - 10].mean()
    for offset in [1,2,3,7,8]
])


# Stack the per-date feature vectors into a 2-D array and take the label of
# the first column as a quick check; the bare expression displays it.
input_x = np.array(Data['allVec'].tolist())
input_y = np.array(Data['y_'].tolist())[:,0]
input_y


from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



Here are 1 empty titles...



# Finish build word_vec




In [9]:
from xgboost.sklearn import XGBClassifier

# The scaler is created here: `ss` was never defined in the visible cells,
# so this cell failed on a fresh kernel.
ss = StandardScaler()

# Features are identical for every stock, so scale them once instead of
# re-fitting the scaler on each loop iteration.
train_x = ss.fit_transform(np.array(list(Data['allVec'])))
# The validation rows must go through the same fitted scaler as the training
# rows; previously they were fed to the model unscaled.
val_x = ss.transform(np.array(val_data))
all_labels = np.array(list(Data['y_']))

result = []
# One binary classifier per stock; column i of the label matrix is used for
# the i-th unique ts_code.
# NOTE(review): this assumes the character order inside Data['y'] matches the
# order of Data2.ts_code.unique() -- verify.
for i,ts in enumerate(Data2.ts_code.unique()):
    input_x = train_x
    input_y = all_labels[:,i]

    # `learn_rate` is not an XGBoost parameter; the name is `learning_rate`.
    model = XGBClassifier(learning_rate=0.1,n_estimators=500,seed=64)
    print('# fit {}...'.format(i))
    model.fit(input_x,input_y)
    pred = model.predict_proba(val_x)

    # Keep the probability of class 1 for every prediction date.
    for _date,_pred in zip(pre_date,pred):
        result.append([ts,_date,_pred[1]])
result = pd.DataFrame(result,columns=['ts_code','trade_date','p'])
result.to_csv('result2_ss.csv',index=False)

# fit 0...
# fit 1...
# fit 2...
# fit 3...
# fit 4...
# fit 5...
# fit 6...
# fit 7...
# fit 8...
# fit 9...
# fit 10...
# fit 11...
# fit 12...
# fit 13...
# fit 14...
# fit 15...
# fit 16...
# fit 17...
# fit 18...
# fit 19...
# fit 20...
# fit 21...
# fit 22...
# fit 23...
# fit 24...
# fit 25...
# fit 26...
# fit 27...
# fit 28...
# fit 29...
# fit 30...
# fit 31...
# fit 32...
# fit 33...
