In [None]:
# -*- coding: utf-8 -*-
import pandas  as pd
import numpy as np
from  keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import  pad_sequences
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GlobalMaxPool1D,GRU, Embedding,Bidirectional, Flatten,LSTM, BatchNormalization,Conv1D,MaxPooling1D
from keras.models import Model
from keras.layers import GlobalMaxPooling1D
from keras.layers import *
from keras.layers.convolutional import Convolution1D
from keras import optimizers
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K
from keras import regularizers
import re
import jieba
import jieba.posseg
import jieba.analyse
import codecs
from keras.layers import Input, Concatenate

### 读取数据

In [None]:
# Load the labelled training reviews and the unlabelled reviews to be scored.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train_first.csv', '../data/predict_first.csv')
)

### 全局变量

In [None]:
max_features = 80000 ## vocabulary size: used as Tokenizer num_words and as the Embedding input dimension
maxlen = 150  ## sequence length: used by pad_sequences and as the model Input shape
embed_size = 200 # dimension of each embedding vector

### 预处理：缺失值填充+结巴分词+去停用词
###  将词映射为ID

In [None]:
def splitWord(query, stopwords):
    """Segment ``query`` with jieba and drop stop words.

    Each token produced by ``jieba.cut`` has trailing whitespace stripped and
    then any trailing double quotes stripped. Tokens found in ``stopwords``
    are discarded; the remaining tokens are joined with single spaces.

    Args:
        query: text to segment.
        stopwords: container supporting ``in`` (e.g. a dict or set of words).

    Returns:
        The space-separated tokens encoded as UTF-8 (an empty encoded string
        when every token is a stop word).
    """
    kept = []
    for token in jieba.cut(query):
        # Order matters: whitespace first, then trailing quote characters.
        token = token.rstrip().rstrip('"')
        if token in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept).encode('utf-8')
def preprocess(data, stopwords_path='../data/stop.txt'):
    """Add a ``doc`` column holding the segmented, stop-word-free text.

    The stop-word list is read from ``stopwords_path`` (UTF-8, one word per
    line, trailing whitespace stripped). Every value of ``data['Discuss']`` is
    then passed through ``splitWord`` and the result is stored in
    ``data['doc']``.

    Args:
        data: DataFrame with a ``Discuss`` column. It is modified in place.
        stopwords_path: path of the stop-word file; defaults to the location
            the notebook has always used.

    Returns:
        The same ``data`` object, with the ``doc`` column added.
    """
    # Use a context manager so the file handle is closed even if reading fails,
    # and a set because only membership tests are performed on the words.
    with codecs.open(stopwords_path, 'r', 'utf-8') as stop_file:
        stopwords = {line.rstrip() for line in stop_file}
    data['doc'] = data['Discuss'].map(lambda text: splitWord(text, stopwords))
    return data

In [None]:
# Replace missing reviews with a placeholder so every row is a string.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `train.Discuss`: the in-place form operates on an
# intermediate Series, which raises a chained-assignment warning on recent
# pandas and does not update the frame when Copy-on-Write is enabled.
train['Discuss'] = train['Discuss'].fillna('_na_')
test['Discuss'] = test['Discuss'].fillna('_na_')
train = preprocess(train)
test = preprocess(test)

# Fit the vocabulary on the training documents only, then map both sets of
# documents to sequences of word ids with that same tokenizer.
comment_text = train['doc'].values
tok_raw = Tokenizer(num_words=max_features)
tok_raw.fit_on_texts(comment_text)
train['Discuss_seq'] = tok_raw.texts_to_sequences(train['doc'].values)
test['Discuss_seq'] = tok_raw.texts_to_sequences(test['doc'].values)

In [None]:
def get_keras_data(dataset):
    """Build the model input dict for ``dataset``.

    Args:
        dataset: object exposing a ``Discuss_seq`` attribute holding the
            word-id sequences.

    Returns:
        A dict with the single key ``'Discuss_seq'`` mapped to the result of
        ``pad_sequences`` called with the module-level ``maxlen``.
    """
    padded = pad_sequences(dataset.Discuss_seq, maxlen=maxlen)
    return {'Discuss_seq': padded}


### text-cnn 多filter_size

In [None]:
def score(y_true, y_pred):
    """Keras metric computing ``1 / (1 + sqrt(mean((y_true - y_pred)^2)))``.

    The mean is taken over the last axis only.
    NOTE(review): if the targets have a trailing axis of size 1, this reduces
    per sample rather than over the whole batch -- confirm that is intended.
    """
    squared_error = K.square(y_true - y_pred)
    rmse = K.sqrt(K.mean(squared_error, axis=-1))
    return 1.0 / (1.0 + rmse)

def cnn():
    """Build and compile the multi-kernel-size text CNN regressor.

    Architecture: an embedding layer feeds four parallel Conv1D branches
    (kernel sizes 2, 3, 4 and 5, 100 filters each, ReLU). Each branch is
    max-pooled and flattened, the branches are concatenated, and the result
    goes through Dropout(0.5), Dense(32, relu) and a single linear output unit.

    Uses the module-level ``maxlen``, ``max_features`` and ``embed_size``.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam optimizer, metrics ``mae``
        and ``score``).
    """
    # The input name is the key under which get_keras_data supplies the data.
    comment_seq = Input(shape=[maxlen], name='Discuss_seq')

    # Embedding layer (trained from scratch).
    emb_comment = Embedding(max_features, embed_size)(comment_seq)

    # One convolution branch per kernel size.
    convs = []
    filter_sizes = [2, 3, 4, 5]
    for fsz in filter_sizes:
        l_conv = Conv1D(filters=100, kernel_size=fsz, activation='relu')(emb_comment)
        # With Conv1D's default 'valid' padding the feature map is
        # maxlen - fsz + 1 steps long, so this pool covers the whole sequence.
        l_pool = MaxPooling1D(maxlen - fsz + 1)(l_conv)
        l_pool = Flatten()(l_pool)
        convs.append(l_pool)
    merge = concatenate(convs, axis=1)

    out = Dropout(0.5)(merge)
    output = Dense(32, activation='relu')(out)
    output = Dense(units=1, activation='linear')(output)

    model = Model([comment_seq], output)
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    # Pass the configured optimizer object; the string "adam" would build a
    # fresh default optimizer and silently ignore the settings above.
    model.compile(loss="mse", optimizer=adam, metrics=["mae", score])
    return model


### rnn 模型

In [None]:
def rnn1(embedding_weights=None):
    """Build and compile a bidirectional-GRU regressor on pretrained embeddings.

    Architecture: Embedding initialised from a weight matrix, then
    Bidirectional GRU(50) returning sequences, GlobalMaxPool1D,
    Dense(50, relu), Dropout(0.2) and a single linear output unit.

    Uses the module-level ``maxlen``, ``max_features`` and ``embed_size``.

    Args:
        embedding_weights: initial embedding matrix; Keras expects shape
            ``(max_features, embed_size)``. When omitted, the module-level
            ``embedding_matrix`` is used, as before.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam optimizer, metrics ``mae``
        and ``score``).

    Raises:
        NameError: if no weights are passed and ``embedding_matrix`` is not
            defined.
    """
    if embedding_weights is None:
        try:
            embedding_weights = embedding_matrix
        except NameError:
            # Same exception type as before, but with an actionable message
            # raised before any layer is constructed.
            raise NameError(
                "rnn1() needs pretrained embeddings: pass `embedding_weights` "
                "or define a module-level `embedding_matrix`"
            )

    # The input name is the key under which get_keras_data supplies the data.
    comment_seq = Input(shape=[maxlen], name='Discuss_seq')

    # Embedding layer initialised from the supplied matrix.
    emb_comment = Embedding(max_features, embed_size, weights=[embedding_weights])(comment_seq)
    main = Bidirectional(GRU(50, return_sequences=True, dropout=0.2, recurrent_dropout=0.4))(emb_comment)
    main = GlobalMaxPool1D()(main)
    main = Dense(50, activation="relu")(main)
    main = Dropout(0.2)(main)
    main = Dense(units=1, activation='linear')(main)

    model = Model([comment_seq], main)
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    # Pass the configured optimizer object; the string "adam" would build a
    # fresh default optimizer and silently ignore the settings above.
    model.compile(loss="mse", optimizer=adam, metrics=["mae", score])
    return model

### cnn_rnn 模型

In [None]:

def cnn_rnn():
    """Build and compile a stacked Conv1D + bidirectional-GRU regressor.

    Architecture: Embedding, then two rounds of Conv1D(32 filters, kernel
    size 2, 'same' padding, ReLU) followed by MaxPooling1D(2), then
    Bidirectional GRU(32) and a single linear output unit.

    Uses the module-level ``maxlen``, ``max_features`` and ``embed_size``.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam optimizer, metrics ``mae``
        and ``score``).
    """
    # Named 'Discuss_seq' like the other models so the dict produced by
    # get_keras_data can be fed to it; the previous name 'Discuss_single_seq'
    # matched no key in that dict.
    comment_seq = Input(shape=[maxlen], name='Discuss_seq')

    # Embedding layer (trained from scratch).
    main = Embedding(max_features, embed_size)(comment_seq)
    main = Conv1D(filters=32, kernel_size=2, padding='same', activation='relu')(main)
    main = MaxPooling1D(pool_size=2)(main)
    main = Conv1D(filters=32, kernel_size=2, padding='same', activation='relu')(main)
    main = MaxPooling1D(pool_size=2)(main)
    main = Bidirectional(GRU(32))(main)
    main = Dense(units=1, activation='linear')(main)

    model = Model([comment_seq], main)
    adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    # Pass the configured optimizer object; the string "adam" would build a
    # fresh default optimizer and silently ignore the settings above.
    model.compile(loss="mse", optimizer=adam, metrics=["mae", score])
    return model

In [None]:
# Padded model inputs for both sets, plus the regression target for training.
X_train, X_test = get_keras_data(train), get_keras_data(test)
y_train = train['Score'].values

### 训练

In [None]:
# Training configuration.
batch_size = 128
epochs = 20

# Monitor validation loss (lower is better) with a patience of 2 epochs.
early_stopping = EarlyStopping(monitor="val_loss", mode="min", patience=2)
callbacks_list = [early_stopping]

# Train the text-CNN model, holding out 10% of the training data for validation.
model = cnn()
model.summary()
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=callbacks_list,
    validation_split=0.1,
    shuffle=True
)

### 预测

In [None]:
# Score the prediction set and write an (Id, Score) file with no index column
# and no header row.
preds = model.predict(X_test)
submission = pd.DataFrame({'Id': test['Id'].values})
submission['Score'] = preds
submission.to_csv('../result/cnn-baseline.csv', index=None, header=None)