In [5]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import nltk
from keras.layers import Dense,Activation, Embedding
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Merge
import matplotlib.pyplot as plt

In [6]:
with open("../glove.6B/glove.6B.50d.txt", "rb") as lines:
    w2v = {line.split()[0]: np.array(map(float, line.split()[1:]))
           for line in lines}

In [7]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.itervalues().next())

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.mean([w2v[word] if word in w2v else np.random.rand(self.dim)
                        for word in nltk.word_tokenize(X) ],axis=0)

In [8]:
meanVec = MeanEmbeddingVectorizer(w2v)

In [9]:
df = pd.read_csv("data/training/data_train_normalize.csv",encoding='utf-8')

In [24]:
rmse = lambda y_pred, y_true: np.sqrt(mean_squared_error(y_pred,y_val))
chars_count = lambda sens: len(sens)
# ARI = 4.71 * (chars/words) + 0.5 * (words/sen) - 21.43
ARI = lambda sens:np.ceil(4.71 * (len(sens)/len(nltk.word_tokenize(sens))) + 0.5 * (len(nltk.word_tokenize(sens))/len(nltk.sent_tokenize(sens))) - 21.43)
# jaccard_unigram = lambda title,desc: 1.0  * len(title.intersection(desc)) / (len(title.union(desc)))
def jaccard_unigram(title,desc):
    a = set(nltk.word_tokenize(title))
    b = set(nltk.word_tokenize(desc))
    return 1.0  * len(a.intersection(b)) / (len(a.union(b)))
# punc_count = lambda sens: 

In [11]:
def make_continous_features(df):
    df_onehot = pd.get_dummies(df[['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type']]
                               ,columns=['country','category_lvl_1','category_lvl_2','category_lvl_3','product_type'])

    df_real = df[['price','len_title','len_desc']].copy()

    df_real_addtional = df[['title','short_desc_strip']].copy()
    df_real_addtional.fillna('aa',inplace=True)

    df_real_addtional['title_char_count'] = df_real_addtional['title'].map(chars_count)
    df_real_addtional['title_ari'] = df_real_addtional['title'].map(ARI)
    df_real_addtional['desc_char_count'] = df_real_addtional['short_desc_strip'].fillna('aa').map(chars_count)
    df_real_addtional['desc_ari'] = df_real_addtional['short_desc_strip'].fillna('aa').map(ARI)

    jaccard_title_desc = df_real_addtional[['title','short_desc_strip']].apply(
        lambda row: jaccard_unigram(row[0],row[1]),axis=1)

    df_real_addtional['jaccard'] = jaccard_title_desc
    df_real_addtional.drop(['title','short_desc_strip'],axis=1,inplace=True)
    return hstack([df_onehot,df_real,df_real_addtional])
def sen2vec(df,stack=True):
    df_title = []
    for title in df['title']:
        df_title.append(meanVec.transform(title))
    df_desc = []
    for desc in df['short_desc_strip'].fillna('as'):
        df_desc.append(meanVec.transform(desc))
    if stack == True:
        return np.hstack([np.array(df_title),np.array(df_desc)])
    else:
        return np.array(df_title),np.array(df_desc)

In [12]:
continous_dat = make_continous_features(df)

In [13]:
word_dat = sen2vec(df)

In [14]:
total_df = hstack([continous_dat,word_dat])

# Clarity Prediction

In [15]:
X_train, X_val, y_train, y_val = train_test_split(total_df.toarray()
                                                  ,df['clarity'],stratify=df['clarity'],random_state=41)

# Model Definition

In [17]:
deep = Sequential()
deep.add(Dense(input_dim=total_df.shape[1],output_dim=200,activation='relu'))
deep.add(Dense(512, activation='relu'))
deep.add(Dense(256, activation='relu'))
deep.add(Dense(1, activation='sigmoid'))

In [21]:
deep.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy','precision']
)

In [23]:
history = deep.fit(X_train,y_train,validation_split=0.8,
          batch_size=16,nb_epoch=10,verbose=2)

Train on 5442 samples, validate on 21770 samples
Epoch 1/10
23s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 0.9125 - val_acc: 0.9428 - val_precision: 0.9428
Epoch 2/10
24s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 0.9125 - val_acc: 0.9428 - val_precision: 0.9428
Epoch 3/10
17s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 0.9125 - val_acc: 0.9428 - val_precision: 0.9428
Epoch 4/10
16s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 0.9125 - val_acc: 0.9428 - val_precision: 0.9428
Epoch 5/10
19s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 0.9125 - val_acc: 0.9428 - val_precision: 0.9428
Epoch 6/10
21s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 0.9125 - val_acc: 0.9428 - val_precision: 0.9428
Epoch 7/10
24s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 0.9125 - val_acc: 0.9428 - val_precision: 0.9428
Epoch 8/10
22s - loss: 0.8642 - acc: 0.9458 - precision: 0.9458 - val_loss: 

KeyboardInterrupt: 

In [182]:
%matplotlib
loss = history.history['loss']
val_loss = history.history['val_loss']
ax = plt.subplot()
ax.plot(loss,'-',label='train_loss')
ax.plot(val_loss,'--',label='val_loss')
plt.legend()
plt.show()

Using matplotlib backend: Qt4Agg
