In [1]:
import pickle
import numpy as np
import pandas as pd

from sklearn.model_selection import GridSearchCV, KFold
from keras.wrappers.scikit_learn import KerasRegressor

import tensorflow as tf

import keras
import keras.backend as K

from keras.layers.core import Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer,text_to_word_sequence

import word2vecReader as w2v_reader

from helpers import *


# Show up to 150 characters per column when displaying DataFrames.
pd.set_option('display.max_colwidth', 150)

# Create a TensorFlow session whose GPU memory allocation grows on demand,
# and register it as the session used by the Keras backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
K.set_session(sess)

def plot_loss(history, show=True, title=None):
    """Plot the training and validation loss curves of a fitted model.

    Args:
        history: object exposing a ``history`` mapping with ``'loss'`` and
            ``'val_loss'`` sequences (presumably a Keras ``History`` object).
        show: when truthy, call ``plt.show()`` after drawing.
        title: figure title; ``'model loss'`` is used when ``None``.
    """
    # Draw the training curve first, then the validation curve, so the
    # legend labels below line up with the plotted series.
    for series_key in ('loss', 'val_loss'):
        plt.plot(history.history[series_key])

    plt.title('model loss' if title is None else title)
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')

    if show:
        plt.show()
    
    
def pearson_loss(y_true, y_pred):
    """Negative Pearson correlation between targets and predictions.

    Both inputs are centred on their own mean; the result is the negated sum
    of their element-wise product divided by the product of the two centred
    norms. ``K.epsilon()`` is added to the denominator to avoid division by
    zero.
    """
    true_centered = y_true - K.mean(y_true)
    pred_centered = y_pred - K.mean(y_pred)

    covariance = K.sum(true_centered * pred_centered)
    true_norm = K.sqrt(K.sum(K.square(true_centered)))
    pred_norm = K.sqrt(K.sum(K.square(pred_centered)))

    # Negated so that a higher correlation gives a lower loss value.
    return -covariance / (true_norm * pred_norm + K.epsilon())

Using TensorFlow backend.


In [2]:
# Load the Twitter word2vec model from its binary file.
word2vec = w2v_reader.Word2Vec.load_word2vec_format(
    'word_embeddings/word2vec/word2vec_twitter_model/word2vec_twitter_model.bin',
    binary=True,
)

In [3]:
main_emotions = ['anger', 'fear', 'joy', 'sadness']

# full_data: emotion -> whatever get_emotion_data returns for it
#            (presumably a list of DataFrames with 'text' and 'intensity'
#            columns -- TODO confirm against helpers).
# full_Y:    emotion -> list of the 'intensity' columns of those frames.
full_data = {}
full_Y = {}
for emotion in main_emotions:
    emotion_frames = get_emotion_data(emotion)
    for frame in emotion_frames:
        # Add a cleaned version of each text, produced by text_to_wordlist
        # with the loaded word2vec model.
        frame['cleaned_text'] = frame['text'].map(
            lambda raw_text: text_to_wordlist(raw_text, w2v=word2vec)
        )
    full_data[emotion] = emotion_frames
    full_Y[emotion] = [frame['intensity'] for frame in emotion_frames]

# Основная идея

У нас есть оценки лишь для небольшого количества слов. Обучим нейронную сеть, которая будет прогнозировать оценки для слов, отсутствующих в лексиконе. В основе будут лежать word2vec-представления слов.  
Расширим AFINN

In [4]:
# 5-fold shuffled cross-validation with a fixed seed.
# The arguments are passed by keyword: scikit-learn treats `shuffle` and
# `random_state` as keyword-only, so the positional form KFold(5, True, 42)
# fails on current versions.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# '!' and '#' are not in the filter set, so they survive tokenization.
tokenizer = Tokenizer(filters='"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')

# Fit the tokenizer vocabulary on the cleaned texts of all four emotions.
all_frames = full_data['anger'] + full_data['fear'] + full_data['joy'] + full_data['sadness']
tokenizer.fit_on_texts(np.hstack([data['cleaned_text'] for data in all_frames]))

# AFINN

In [5]:
# Read the tab-separated AFINN lexicon; the file has no header row, so the
# column names are supplied here.
afinn_lexicon = pd.read_csv(
    'lexicons/AFINN-en-165.txt',
    sep='\t',
    header=None,
    names=['term', 'score'],
)
print(afinn_lexicon.shape)
afinn_lexicon.head(3)

(3382, 2)


Unnamed: 0,term,score
0,abandon,-2
1,abandoned,-2
2,abandons,-2


## Сформируем обучающую выборку

In [6]:
# Map each lexicon term to its row index in afinn_lexicon.
afinn_word2index = dict(zip(afinn_lexicon['term'], list(afinn_lexicon.index)))

# One 400-dimensional row per lexicon entry. The matrix is sized by the
# lexicon itself (not by the dict) so it always lines up with the score
# column, even if a term were to appear more than once.
X_all = np.zeros((len(afinn_lexicon), 400))
has_embedding = np.zeros(len(afinn_lexicon), dtype=bool)

for word, index in afinn_word2index.items():
    if word in word2vec:
        X_all[index, :] = word2vec[word]
        has_embedding[index] = True

# Keep only the entries that have a word2vec vector. Entries without one
# (e.g. multi-word phrases) would otherwise be all-zero inputs paired with
# arbitrary scores, which only adds noise to training.
# NOTE: afinn_word2index indexes rows of X_all / afinn_lexicon, not rows of X.
X = X_all[has_embedding]
Y = afinn_lexicon['score'].values[has_embedding]

In [7]:
def make_model(optimizer='adam', loss='mse', activation='relu', layer_size=200, use_third_layer=False, dropout = 0):
    """Build and compile the feed-forward regressor over 400-d word vectors.

    Architecture: Dense(layer_size) -> Dropout(dropout) -> Dense(layer_size)
    -> optional Dense(25) -> Dense(1).

    Args:
        optimizer: optimizer passed to ``model.compile``.
        loss: loss passed to ``model.compile``.
        activation: activation used by every hidden Dense layer.
        layer_size: number of units in the first two hidden layers.
        use_third_layer: when truthy, add an extra 25-unit hidden layer
            before the output layer.
        dropout: dropout rate applied after the first hidden layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(layer_size, activation=activation, input_shape=(400,)))
    # Use the caller-supplied rate. It was previously hard-coded to 0.2, so
    # the `dropout` argument (and any grid search over it) had no effect.
    model.add(Dropout(dropout))
    model.add(Dense(layer_size, activation=activation))
    if use_third_layer:
        model.add(Dense(25, activation=activation))
    # Single linear output unit for the regression target.
    model.add(Dense(1))
    model.compile(optimizer, loss)

    return model

In [8]:
# Hyper-parameter grid: only `use_third_layer` and `dropout` vary; the other
# entries are single-valued and are forwarded unchanged.
search_space = {
    'optimizer': ['adam'],
    'loss': ['mse'],
    'activation': ['relu'],
    'layer_size': [200],
    'use_third_layer': [True, False],
    'dropout': [0.1, 0.2, 0.3],
    'epochs': [60],
}

my_regressor = KerasRegressor(make_model, batch_size=32)

# Cross-validated search scored by negative MSE, using the folds in `kfold`.
validator = GridSearchCV(
    my_regressor,
    param_grid=search_space,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    return_train_score=False,
    cv=kfold,
)
validator.fit(X, Y, verbose=0)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
       error_score='raise',
       estimator=<keras.wrappers.scikit_learn.KerasRegressor object at 0x7f8ebc6aa400>,
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'optimizer': ['adam'], 'loss': ['mse'], 'activation': ['relu'], 'layer_size': [200], 'use_third_layer': [True, False], 'dropout': [0.1, 0.2, 0.3], 'epochs': [60]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
       scoring='neg_mean_squared_error', verbose=0)

## Обучим модель

![title](images/lex_extend_arch.png)

In [14]:
# Train the final regressor on the whole AFINN training set for 60 epochs,
# with progress output silenced.
model = make_model(dropout=0.3)
model.fit(x=X, y=Y, epochs=60, verbose=0)

<keras.callbacks.History at 0x7f8e7cb2ea58>

In [15]:
# term -> score lookup for the original lexicon.
afinn_map = dict(afinn_lexicon[['term', 'score']].values)

# Extended lexicon over the tokenizer vocabulary: words already in AFINN keep
# their original score, and every other word that has a word2vec vector gets
# a score predicted by the model. Words without a vector are skipped.
extended_afinn = {}
words_to_predict = []
for word in tokenizer.word_index.keys():
    if word not in word2vec:
        continue
    if word in afinn_map:
        extended_afinn[word] = afinn_map[word]
    else:
        # Reserve the slot now so the dict keeps the tokenizer's word order;
        # the value is filled in after the batched prediction below.
        extended_afinn[word] = None
        words_to_predict.append(word)

# Predict all unknown words in one batched call instead of one
# model.predict() call per word, which is far slower on a large vocabulary.
if words_to_predict:
    unknown_vectors = np.vstack([word2vec[word] for word in words_to_predict]).reshape(-1, 400)
    predicted_scores = model.predict(unknown_vectors).reshape(-1)
    for word, score in zip(words_to_predict, predicted_scores):
        extended_afinn[word] = score
# pickle.dump(extended_afinn, open( "features/extended_afinn.p", "wb" ) )