#  制备训练数据

In [1]:
import pickle
from collections import Counter
import numpy as np
import copy
import random
from sklearn.feature_extraction.text import TfidfTransformer
# k折交叉验证
from sklearn.model_selection import KFold,train_test_split

In [2]:
# Load the pickled corpora from disk.
def get_data(path):
    """Read a pickled pair of corpora from ``path``.

    The file must unpickle to a two-item sequence, which is unpacked and
    returned as ``(positive_c, negative_c)``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading;
    only point this at files from a trusted source.
    """
    with open(path, 'rb') as handle:
        payload = pickle.load(handle)
    positive_c, negative_c = payload
    return positive_c, negative_c

In [4]:
def get_wordlist(txt_data, topnum, min_count, ratio_value):
    """Build token <-> id lookup tables from tokenised sentences.

    Args:
        txt_data: iterable of token sequences; every token is counted.
        topnum: maximum number of corpus tokens added to the vocabulary.
        min_count: a token is kept only if it occurs strictly more than
            this many times.
        ratio_value: accepted for interface compatibility; not used.

    Returns:
        (vocab2id, id2vocab). ``vocab2id`` maps token -> id with '<PAD>'
        at 0 and '<UNK>' at 1; corpus tokens are numbered from 2 in the
        order ``Counter.most_common()`` yields them. ``id2vocab`` is the
        inverse mapping.
    """
    frequencies = Counter()
    for tokens in txt_data:
        frequencies.update(tokens)

    reserved = {'<PAD>': 0, '<UNK>': 1}
    vocab2id = dict(reserved)

    # Tokens frequent enough to keep, most frequent first.
    frequent = (tok for tok, freq in frequencies.most_common() if freq > min_count)
    for rank, tok in enumerate(frequent):
        if rank >= topnum:
            break
        # Ids continue right after the reserved entries.
        vocab2id[tok] = len(reserved) + rank

    id2vocab = dict((idx, tok) for tok, idx in vocab2id.items())
    return vocab2id, id2vocab
        
# vocab_to_int, id2vocab=get_wordlist(positive ,100000,5,ratio_value=0)   

In [18]:
# Bag-of-words model: only raw term counts are used.
# (Original author's note: this variant was written because memory ran out.)
def bow_data_prepare(alldata, vocab2id):
    """Turn tokenised sentences into a dense term-count matrix.

    Args:
        alldata: sequence of token sequences, one per sample.
        vocab2id: mapping token -> column index.

    Returns:
        ``np.ndarray`` of shape ``(len(alldata), len(vocab2id))`` where entry
        ``[i, j]`` is how often the token with id ``j`` occurs in sample ``i``.
        Tokens missing from ``vocab2id`` are ignored.

    The previous implementation filled two half-sized arrays and
    concatenated them, but it wrote to an undefined name, passed the column
    count to ``np.zeros`` as a dtype, and skipped the middle row for
    odd-length input. The split also saved no memory, because
    ``np.concatenate`` allocates the full matrix anyway, so the matrix is
    now allocated once.

    NOTE: a later cell defines ``bow_data_prepare`` again with the same
    signature; once both cells have run, that later definition is the one
    in effect.
    """
    counts = np.zeros((len(alldata), len(vocab2id)))
    for row, tokens in enumerate(alldata):
        for token in tokens:
            if token in vocab2id:
                counts[row, vocab2id[token]] += 1
    return counts

In [5]:
def bow_data_prepare(sent_list, vocab2id):
    """Count vocabulary tokens per sentence.

    Args:
        sent_list: sequence of token sequences, one per sample.
        vocab2id: mapping token -> column index.

    Returns:
        Dense ``np.ndarray`` of shape ``(len(sent_list), len(vocab2id))``
        holding, for each sample, the number of occurrences of every
        vocabulary token. Out-of-vocabulary tokens are skipped.
    """
    n_rows, n_cols = len(sent_list), len(vocab2id)
    counts = np.zeros((n_rows, n_cols))
    for row, tokens in enumerate(sent_list):
        for token in tokens:
            if token not in vocab2id:
                continue
            counts[row, vocab2id[token]] += 1
    return counts

In [6]:

def sent2id(all_train, word_num, vocab2id=None):
    """Encode sentences as fixed-length rows of vocabulary ids.

    Args:
        all_train: sequence of token sequences, one per sample.
        word_num: number of columns; longer sentences are truncated and
            shorter ones are left zero-filled at the end.
        vocab2id: mapping token -> id that contains an '<UNK>' entry.
            When omitted, the notebook-level ``vocab_to_int`` is used, which
            keeps existing two-argument calls working.

    Returns:
        ``np.ndarray`` of shape ``(len(all_train), word_num)``. Tokens not
        in the vocabulary are encoded with the '<UNK>' id.
    """
    if vocab2id is None:
        # Fall back to the global built in the data-preparation cell.
        vocab2id = vocab_to_int

    encoded = np.zeros((len(all_train), word_num))
    for row, tokens in enumerate(all_train):
        # zip() stops after word_num tokens, which truncates long sentences.
        for col, token in zip(range(word_num), tokens):
            if token in vocab2id:
                encoded[row, col] = vocab2id[token]
            else:
                encoded[row, col] = vocab2id['<UNK>']
    return encoded

In [40]:
# Load the pre-tokenised corpora.
positive, negative = get_data('12000_5_8.p')

# Shuffle both lists with a dedicated, seeded generator. Which negative
# samples survive the truncation below depends on this order, so an unseeded
# shuffle would make the whole dataset (and every later result) irreproducible.
shuffler = random.Random(42)
shuffler.shuffle(positive)
shuffler.shuffle(negative)
# Keep at most as many negative samples as there are positive ones.
negative = negative[:len(positive)]

# Record the positive count BEFORE merging. The labels below must cover
# exactly the positive samples; using len(negative) instead mislabels data
# whenever fewer negatives than positives are available.
n_positive = len(positive)
# Append the negatives so that `positive` holds the full data set.
positive.extend(negative)

# Build the token <-> id lookup tables.
vocab_to_int, id2vocab = get_wordlist(positive, 100000, 5, ratio_value=0)
print("vocab_to_int len:", len(vocab_to_int))


# Labels: 1 for positive samples (the first n_positive rows), 0 for negative.
data_label = np.zeros((len(positive), 1))
data_label[:n_positive] = 1


# Random train/test split.
X_train, X_test, Y_train, Y_test = train_test_split(positive, data_label, test_size=0.2, random_state=42)
print("train len:", len(X_train))


# Bag-of-words features.
X_train_bow = bow_data_prepare(X_train, vocab_to_int)
X_test_bow = bow_data_prepare(X_test, vocab_to_int)


# TF-IDF features: fitted on the training split only, then applied to the test split.
transformer = TfidfTransformer(smooth_idf=True)

X_train_tfidf = transformer.fit_transform(X_train_bow)
X_test_tfidf = transformer.transform(X_test_bow)


# Id sequences (20 tokens per sentence) for the embedding-based models.
X_train_embed = sent2id(X_train, 20)
X_test_embed = sent2id(X_test, 20)

print("all the preprocess done")


vocab_to_int len: 8122
train len: 19561
all the preprocess done


In [47]:
# Peek at the first five training labels (1 = positive, 0 = negative).
Y_train[:5]

array([[ 0.],
       [ 1.],
       [ 0.],
       [ 1.],
       [ 0.]])

# 构建分类模型

In [18]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression

import keras
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense,Activation,LSTM,TimeDistributed,Embedding,Dropout,Bidirectional,Dropout,Conv1D,MaxPooling1D,Flatten
from keras.optimizers import Adam
from keras import optimizers

In [10]:
def create_cnn_model():
    """Build and compile the 1-D convolutional sentence classifier.

    Reads the notebook-level ``vocab_to_int`` to size the embedding table and
    expects id sequences of length 20 (the length passed to ``sent2id`` in
    the data-preparation cell).

    Returns:
        A compiled ``Sequential`` model ending in a 2-unit softmax, trained
        with categorical cross-entropy (so labels must be one-hot encoded).
    """
    model = Sequential()
    model.add(Embedding(input_dim=len(vocab_to_int), output_dim=64, input_length=20, trainable=True))
    model.add(Conv1D(input_shape=(20, 64), filters=128, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=3, strides=None, padding='valid'))
    model.add(Flatten())
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    # summary() prints the table itself and returns None; wrapping it in
    # print() only adds a stray "None" line to the output.
    model.summary()
    # Named `optimizer` so the imported `Adam` class is not shadowed.
    optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clipvalue=0.3)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [11]:
#     
def create_rnn_model():
    """Build and compile the bidirectional-LSTM sentence classifier.

    Reads the notebook-level ``vocab_to_int`` to size the embedding table.

    Returns:
        A compiled ``Sequential`` model ending in a 2-unit softmax, trained
        with categorical cross-entropy (so labels must be one-hot encoded).
    """
    model = Sequential()
    model.add(Embedding(input_dim=len(vocab_to_int), output_dim=64))
    # Bidirectional's default merge mode is 'concat'.
    model.add(Bidirectional(LSTM(128, return_sequences=False)))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    # summary() prints the table itself and returns None; wrapping it in
    # print() only adds a stray "None" line to the output.
    model.summary()
    # Named `optimizer` so the imported `Adam` class is not shadowed.
    optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, clipvalue=0.5)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [56]:
def get_oof(clf, _X_train, _y_train, _X_test, kfold=None):
    """Compute out-of-fold predictions for one base classifier (stacking).

    For each fold the classifier is refitted on the remaining folds, its
    class probabilities for the held-out fold are stored, and its
    probabilities for the test set are accumulated into a running average.

    Args:
        clf: estimator exposing ``fit(X, y)`` and ``predict_proba(X)`` with
            two probability columns.
        _X_train: training features, row-indexable with integer index arrays.
        _y_train: training labels; an ``(n, 1)`` column is flattened to 1-D
            before fitting.
        _X_test: test features.
        kfold: splitter exposing ``split(X)`` and ``get_n_splits()``. When
            omitted, the notebook-level ``kf`` is used, which keeps existing
            four-argument calls working.

    Returns:
        ``(oof_train, oof_test)``: arrays of shape ``(n_train, 2)`` and
        ``(n_test, 2)``. Every row of ``oof_train`` is filled exactly once
        if the splitter partitions the training rows; ``oof_test`` is the
        mean of the per-fold test predictions.
    """
    if kfold is None:
        # Fall back to the global splitter created in the training cell.
        kfold = kf
    # Weight each fold by 1 / n_splits instead of a hard-coded 1 / 5, so the
    # test predictions stay a true average for any number of folds.
    n_folds = kfold.get_n_splits()

    labels = np.asarray(_y_train)
    if labels.ndim == 2 and labels.shape[1] == 1:
        # scikit-learn expects 1-D labels; a column vector only triggers a
        # DataConversionWarning on every fit.
        labels = labels.ravel()

    oof_train = np.zeros((_X_train.shape[0], 2))
    oof_test_skf = np.zeros((_X_test.shape[0], 2))

    for i, (train_index, test_index) in enumerate(kfold.split(_X_train)):
        print(i)
        clf.fit(_X_train[train_index], labels[train_index])
        # The held-out fold's predictions fill their own rows; after the
        # last fold the whole array is populated.
        oof_train[test_index] = clf.predict_proba(_X_train[test_index])
        # Accumulate this fold's contribution to the averaged test prediction.
        oof_test_skf += clf.predict_proba(_X_test) / n_folds

    print("valid data shape:", oof_train.shape, oof_test_skf.shape)
    return oof_train, oof_test_skf

In [57]:
def get_oof_nn(clf, _X_train, _y_train, _X_test, kfold=None):
    """Compute out-of-fold predictions for a Keras-wrapped classifier.

    Same procedure as the scikit-learn variant, except that each fold's
    labels are one-hot encoded with ``keras.utils.to_categorical`` before
    fitting.

    Args:
        clf: estimator exposing ``fit(X, y_onehot)`` and ``predict_proba(X)``
            with two probability columns.
        _X_train: training features, row-indexable with integer index arrays.
        _y_train: training labels with values 0/1, indexable the same way.
        _X_test: test features.
        kfold: splitter exposing ``split(X)`` and ``get_n_splits()``. When
            omitted, the notebook-level ``kf`` is used, which keeps existing
            four-argument calls working.

    Returns:
        ``(oof_train, oof_test)``: arrays of shape ``(n_train, 2)`` and
        ``(n_test, 2)``; ``oof_test`` is the mean of the per-fold test
        predictions.
    """
    if kfold is None:
        # Fall back to the global splitter created in the training cell.
        kfold = kf
    # Weight each fold by 1 / n_splits instead of a hard-coded 1 / 5, so the
    # test predictions stay a true average for any number of folds.
    n_folds = kfold.get_n_splits()

    oof_train = np.zeros((_X_train.shape[0], 2))
    oof_test_skf = np.zeros((_X_test.shape[0], 2))
    print("oof_train", oof_train.shape)

    for i, (train_index, test_index) in enumerate(kfold.split(_X_train)):
        print(i)
        # The networks end in a 2-unit softmax, so labels must be one-hot.
        t_label = keras.utils.to_categorical(_y_train[train_index], num_classes=2)

        clf.fit(_X_train[train_index], t_label)
        # The held-out fold's predictions fill their own rows; after the
        # last fold the whole array is populated.
        oof_train[test_index] = clf.predict_proba(_X_train[test_index])
        # Accumulate this fold's contribution to the averaged test prediction.
        oof_test_skf += clf.predict_proba(_X_test) / n_folds

    print("valid data shape:", oof_train.shape, oof_test_skf.shape)
    return oof_train, oof_test_skf

In [58]:
# Naive Bayes classifier.
clf_nb = MultinomialNB()

# Linear-kernel SVM; probability=True is required because the stacking
# helpers call predict_proba on every base model.
clf_svm = svm.SVC(kernel='linear', probability=True, cache_size=1000)

# Both neural wrappers share the same training settings.
nn_train_kwargs = {'epochs': 3, 'batch_size': 1024}

# RNN (bidirectional LSTM) model.
clf_rnn = KerasClassifier(build_fn=create_rnn_model, **nn_train_kwargs)

# CNN model.
clf_cnn = KerasClassifier(build_fn=create_cnn_model, **nn_train_kwargs)


In [None]:
# K-fold splitter, read as the global `kf` by get_oof / get_oof_nn.
# random_state is deliberately not passed: without shuffle=True it has no
# effect on the splits, and recent scikit-learn versions raise a ValueError
# for that combination. The folds are therefore the same contiguous blocks
# as before.
kf = KFold(n_splits=5)


# Out-of-fold predictions of every base model, in the order
# [svm, naive bayes, rnn, cnn].
train_sets = []
test_sets = []

# Classical models are trained on the TF-IDF features.
for clf in [clf_svm, clf_nb]:
    train_set, test_set = get_oof(clf, X_train_tfidf, Y_train, X_test_tfidf)
    print (len(train_set))
    train_sets.append(train_set)
    print (len(train_sets))
    test_sets.append(test_set)

# Neural models are trained on the padded id sequences.
for clf in [clf_rnn, clf_cnn]:
    train_set, test_set = get_oof_nn(clf, X_train_embed, Y_train, X_test_embed)
    train_sets.append(train_set)
    test_sets.append(test_set)


0


  y = column_or_1d(y, warn=True)


In [19]:
# Record the Keras version this notebook was run with.
import keras
print(keras.__version__)

2.1.2


In [27]:
# Inspect the out-of-fold predictions of every base model.
for result_set in train_sets:
    print(result_set)


# Concatenate the per-model probability pairs column-wise, so each sample
# gets one row holding every base model's two class probabilities.
meta_train = np.concatenate([result_set.reshape(-1, 2) for result_set in train_sets], axis=1)
meta_test = np.concatenate([y_test_set.reshape(-1, 2) for y_test_set in test_sets], axis=1)


print(meta_train.shape, meta_test.shape)

# Meta-learner: logistic regression on the stacked probabilities.
# The labels are stored as an (n, 1) column; flatten them, because
# scikit-learn expects 1-D targets and otherwise emits a
# DataConversionWarning on fit.
clf_lr = LogisticRegression()
clf_lr.fit(meta_train, np.ravel(Y_train))

# Accuracy of the stacked model on the held-out test split.
clf_lr.score(meta_test, np.ravel(Y_test))

[[ 0.91011094  0.08988906]
 [ 0.32061934  0.67938066]
 [ 0.89000793  0.10999207]
 ..., 
 [ 0.47891341  0.52108659]
 [ 0.67980324  0.32019676]
 [ 0.55437774  0.44562226]]
[[ 0.73668902  0.26331098]
 [ 0.22875531  0.77124469]
 [ 0.63645469  0.36354531]
 ..., 
 [ 0.51651264  0.48348736]
 [ 0.59638107  0.40361893]
 [ 0.53337654  0.46662346]]
[[ 0.96001935  0.03998067]
 [ 0.24489652  0.75510353]
 [ 0.95557177  0.04442823]
 ..., 
 [ 0.61116683  0.38883322]
 [ 0.59094816  0.40905184]
 [ 0.40123555  0.59876448]]
[[ 0.8874101   0.11258985]
 [ 0.25627583  0.74372417]
 [ 0.59583741  0.40416256]
 ..., 
 [ 0.7275542   0.27244589]
 [ 0.67258638  0.32741368]
 [ 0.42781565  0.57218438]]
(19561, 8) (4891, 8)


  y = column_or_1d(y, warn=True)


0.78675117562870578