In [1]:
import pandas as pd
import numpy as np
import keras
from keras_tqdm import TQDMNotebookCallback
import gensim
import re
from tqdm import tqdm,tqdm_notebook,tqdm_pandas
import gc

from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Directory that holds the input CSVs and the word2vec model; the prediction CSV is written here too.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
DATA_PATH = 'E:/Kaggle/Avito/'
# When True, training rows with a missing description are dropped right after loading.
REMOVE_NA_DESC = False

In [3]:
num_words = 100000  # vocabulary cap passed to the Keras tokenizers and used to size the embedding matrices
max_len = 150  # padded length (in tokens) of the description and title sequences
seed = 32  # seed for NumPy's RNG and for the optional train/validation split

## Load word2vec dictionary

In [4]:
# Load the gensim word2vec model that supplies the pretrained word vectors for the embedding layers.
# NOTE(review): presumably 300-dimensional skip-gram vectors, going by the file name and the
# embed_dim of 300 used further down — confirm against how the model was trained.
word2vec = gensim.models.Word2Vec.load(DATA_PATH+'avito300_sg.w2v')

## Load Data

In [5]:
# Read the raw train and test tables from DATA_PATH.
train = pd.read_csv(DATA_PATH + 'train.csv')
test = pd.read_csv(DATA_PATH + 'test.csv')

# Optionally discard training rows that have no description.
if REMOVE_NA_DESC:
    has_description = train['description'].notnull()
    train = train[has_description]

In [6]:
# Stack train on top of test and renumber the rows 0..n-1 so both go through one feature pipeline.
complete_data = pd.concat((train, test), ignore_index=True)

## Feature Engineering

In [7]:
# Build one 'params' text field from the three param_* columns.
# NOTE(review): astype(str) renders missing values as the literal 'nan' and the pieces are joined
# without a separator, so neighbouring values fuse into a single token (and the fillna below can
# never trigger for 'params'). Left unchanged because fixing it alters the features — confirm intent.
complete_data['params'] = (complete_data['param_1'].astype(str)
                           + complete_data['param_2'].astype(str)
                           + complete_data['param_3'].astype(str))

# Replace missing text with a sentinel token and force plain strings.
# The result is assigned back instead of calling fillna(inplace=True) on the column selection:
# the in-place form is chained assignment, which recent pandas deprecates and which does not
# modify the frame once copy-on-write is enabled.
for col_text in ['params','title','description']:
    complete_data[col_text] = complete_data[col_text].fillna('thisismissing').astype(str)

In [8]:
# Missing image classes get the sentinel -999 in all three frames.
# Assigned back rather than fillna(inplace=True) on a column selection, which is chained
# assignment (deprecated in recent pandas and ineffective under copy-on-write).
for frame in (complete_data, train, test):
    frame['image_top_1'] = frame['image_top_1'].fillna(-999)

# Image classes seen in test but never in train are collapsed onto the same sentinel.
# TODO (from the original author): map them to the closest known image class instead.
new_image_top = set(test['image_top_1'])-set(train['image_top_1'])
complete_data['image_top_1']=np.where(complete_data['image_top_1'].isin(new_image_top),
                                      -999,complete_data['image_top_1'])

In [9]:
# log(1 + x) versions of the price and of the item sequence number.
# Rows without a price keep NaN in 'log_price'; the train/predict split further down relies on that.
for source_col, target_col in (('price', 'log_price'), ('item_seq_number', 'log_item_seq')):
    complete_data[target_col] = np.log(1 + complete_data[source_col])

#### Categoricals as One Hot
* Missing values need better handling.
* Deal with low-frequency categories and categories not in the test set: either impute them with a close category that exists in train, or map them to a new class.
* Try some form of embedding instead.

In [10]:
# Columns that are label-encoded and fed to the network through one embedding layer each.
categoricals_embs = ['region','parent_category_name','category_name','user_type','city',
                     'image_top_1','user_id','param_1']
# Numeric columns that are min-max scaled and fed to the network directly.
dense = ['log_item_seq']

In [11]:
# Label-encode every categorical column (overwriting it in complete_data) and record its
# cardinality; catembs[i] later becomes the input_dim of the i-th categorical embedding.
catembs = []
le=LabelEncoder()
for cat in categoricals_embs: #Must deal with cities not in train set
    # Fill on a new Series instead of fillna(inplace=True) on a column selection: the in-place
    # form is chained assignment, deprecated in recent pandas and a no-op under copy-on-write.
    filled = complete_data[cat].fillna(-999)
    complete_data[cat] = le.fit_transform(filled.astype(str))
    # Codes run 0..max, so the number of classes is max + 1 (stored as a plain int).
    catembs.append(int(complete_data[cat].max())+1)
# Cast to float32 so the codes can share one matrix with the other features. float32 holds
# integers exactly only up to 2**24, which is enough for the cardinalities printed below.
X_categoricals = complete_data[categoricals_embs].values.astype('float32')

In [12]:
# Number of distinct codes per categorical column, in the order of categoricals_embs.
print(catembs) 

[28, 9, 47, 3, 1752, 3063, 1009909, 372]


#### Text stuff

In [13]:
# Stop-word set used for text cleaning: NLTK's Russian list merged with its English list.
# Despite its name, russian_stopwords therefore contains both languages.
english_stopwords = set(stopwords.words('english'))
russian_stopwords = set(stopwords.words('russian')) | english_stopwords
def preprocess(x,stop_words=None):
    """Split a text into word tokens and optionally drop stop words.

    Tokenisation is done by Keras' ``text_to_word_sequence`` with its default settings.

    Parameters
    ----------
    x : str
        Text to tokenise.
    stop_words : collection of str, optional
        Words to remove from the token list. ``None`` or an empty collection disables filtering.

    Returns
    -------
    list of str
        The tokens, in order, without any word contained in ``stop_words``.
    """
    tokens = keras.preprocessing.text.text_to_word_sequence(x)
    if not stop_words:
        return tokens
    # Filter against the collection that was passed in. The previous version only used the
    # argument as an on/off flag and always filtered against the global russian_stopwords.
    return [word for word in tokens if word not in stop_words]

In [14]:
def make_token_matrix(data,text_col,num_words,max_len,stop_words=None):
    """Turn one text column into a padded matrix of token ids.

    A Keras ``Tokenizer`` is fitted on the raw (unfiltered) texts. Each text is then tokenised
    with ``preprocess`` (which may drop stop words) and mapped to ids. Sequences are
    left-padded and, when too long, cut at the end so that every row has ``max_len`` entries.

    Parameters
    ----------
    data : DataFrame holding the text column.
    text_col : name of the column to tokenise.
    num_words : vocabulary cap handed to the tokenizer.
    max_len : number of columns of the returned matrix.
    stop_words : optional stop-word collection forwarded to ``preprocess``.

    Returns
    -------
    (matrix, tokenizer) : the padded id matrix and the fitted tokenizer.
    """
    def _tokenise(text):
        return preprocess(text, stop_words)

    print('Create Tokenizer...',end=' ')
    raw_texts = data[text_col].astype(str)
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=num_words,lower=True)
    tokenizer.fit_on_texts(raw_texts)

    print('Preprocess Text...',end=' ')
    token_lists = raw_texts.apply(_tokenise)

    print('Create Matrix...',end=' ')
    id_sequences = tokenizer.texts_to_sequences(token_lists)
    padded = keras.preprocessing.sequence.pad_sequences(id_sequences,
                                                        maxlen=max_len,
                                                        padding='pre',
                                                        truncating='post')

    print('Done !')
    return padded, tokenizer

In [15]:
# Ensure the description column holds plain strings before tokenising.
# NOTE(review): the feature-engineering cell already casts this column to str, so this
# looks redundant — kept as a harmless safeguard.
complete_data['description'] = complete_data['description'].astype(str)

In [16]:
# Token-id matrix for the descriptions (stop words removed) plus its word -> index vocabulary.
X_desc, tokenizer_desc = make_token_matrix(
    data=complete_data,
    text_col='description',
    num_words=num_words,
    max_len=max_len,
    stop_words=russian_stopwords,
)
word2idx = tokenizer_desc.word_index

Create Tokenizer... Preprocess Text... Create Matrix... Done !


In [17]:
# Forward (word -> index) and reverse (index -> word) vocabularies of the description tokenizer.
word2idx = tokenizer_desc.word_index
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [18]:
# Sanity check: decode the first padded description (skipping the 0 padding id) and show it
# next to the original text.
decoded_words = [idx2word[token_id] for token_id in X_desc[0] if token_id != 0]
print(''.join(word + ' ' for word in decoded_words), end='')
complete_data.iloc[0]['description']

кокон сна малыша пользовались меньше месяца цвет серый 

'Кокон для сна малыша,пользовались меньше месяца.цвет серый'

In [19]:
# Token-id matrix for the titles, built like the description matrix (same vocabulary cap,
# same padded length, stop words removed), plus its word -> index vocabulary.
X_title, tokenizer_title = make_token_matrix(
    data=complete_data,
    text_col='title',
    num_words=num_words,
    max_len=max_len,
    stop_words=russian_stopwords,
)
word2idx_title = tokenizer_title.word_index

Create Tokenizer... Preprocess Text... Create Matrix... Done !


In [20]:
# Token-id matrix for the combined params text; no stop-word filtering here.
# NOTE(review): params_len is the longest params string measured in characters, yet it is
# used as the padded length in tokens, so it is only a generous upper bound.
params_len = complete_data['params'].str.len().max()
X_params, tokenizer_params = make_token_matrix(
    data=complete_data,
    text_col='params',
    num_words=num_words,
    max_len=params_len,
)
word2idx_params = tokenizer_params.word_index

Create Tokenizer... Preprocess Text... Create Matrix... Done !


In [21]:
# Padded width of the params matrix.
print(params_len)

63


## Dense features

In [22]:
# Scaler for the numeric features (MinMaxScaler with its default settings).
scaler = MinMaxScaler()

In [23]:
# Scale the dense columns; the scaler is fitted on train and test rows together.
X_dense = scaler.fit_transform(complete_data[dense]).astype('float32')

In [24]:
# Number of dense feature columns; sizes the dense input of the model.
dense_len = X_dense.shape[1]
print(dense_len)

1


## Concatenate and Splits

In [25]:
# Column layout of the combined feature matrix, as cumulative offsets:
# description | title | params | dense | one column per categorical embedding input.
segment_widths = [max_len, max_len, params_len, dense_len] + [1] * len(catembs)
slices = [0]
for segment_width in segment_widths:
    slices.append(slices[-1] + segment_width)

# (start, stop) column bounds of every model input, in the order the model expects them.
slices_bounds = list(zip(slices[:-1], slices[1:]))

In [26]:
# Glue all feature groups side by side; the column order has to match `slices`.
X = np.hstack((X_desc, X_title, X_params, X_dense, X_categoricals))

In [27]:
# The target of this notebook is the log price, so the split is by price availability and
# not by source file: rows with a known price are used for fitting, rows without one
# (test_index / X_board) only receive a prediction.
train_index = complete_data['log_price'].notna()
test_index = ~train_index

X_tr = X[train_index].astype('float32')
X_board = X[test_index].astype('float32')

y = complete_data.loc[train_index, 'log_price']

## Make model

In [28]:
def make_pretrain_embedding(w2idx,word2vec,embed_dim,num_words):
    """Build an embedding matrix of unit-length pretrained vectors for a tokenizer vocabulary.

    Parameters
    ----------
    w2idx : dict
        Maps word -> integer index (indices start at 1; 0 is the padding token).
    word2vec : object
        Source of word vectors: either something indexable by word that raises ``KeyError``
        for unknown words, or a model exposing such an object as its ``.wv`` attribute.
    embed_dim : int
        Dimensionality of the word vectors.
    num_words : int
        Largest word index that receives a row; words with a higher index are ignored.

    Returns
    -------
    embeddings : ndarray of shape (num_words + 1, embed_dim)
        Row ``i`` is the L2-normalised vector of the word with index ``i``. Row 0, rows of
        unknown words and rows of zero-norm vectors stay all-zero.
    unknown_words : list of str
        In-range words for which no vector was found.
    """
    # Newer gensim models are no longer indexable themselves; their vectors live under `.wv`.
    vectors = getattr(word2vec, 'wv', word2vec)

    unknown_words = []
    embeddings = np.zeros((num_words+1,embed_dim))  #0 is a special token
    for word,idx in w2idx.items(): #starts at 1
        if idx>num_words:
            # Skip instead of stopping, so the result does not depend on the dict being
            # iterated in index order.
            continue
        try:
            vect = vectors[word]
        except KeyError:
            unknown_words.append(word)
            continue
        norm = np.linalg.norm(vect)
        if norm > 0:  # a zero vector cannot be normalised; keep its row at zero rather than NaN
            embeddings[idx] = vect/norm
    print('Number of words with no embeddings',len(unknown_words))

    return embeddings, unknown_words

In [29]:
# Pretrained embedding matrices for the description and the title vocabularies
# (300 is the dimensionality of the word2vec vectors).
pretrained_desc, unknown_words1 = make_pretrain_embedding(
    w2idx=word2idx, word2vec=word2vec, embed_dim=300, num_words=num_words)
pretrained_title, unknown_words2 = make_pretrain_embedding(
    w2idx=word2idx_title, word2vec=word2vec, embed_dim=300, num_words=num_words)

  


Number of words with no embeddings 211
Number of words with no embeddings 7831


In [32]:
def _text_cnn_branch(token_input, pretrained, embed_dim, seq_len, dropout, trainable, conv_size):
    """Embed a padded token sequence with pretrained weights and reduce it to 32 features
    through three parallel Conv1D + global-max-pooling paths (kernel sizes 1, 3 and 5)."""
    embedded = keras.layers.Embedding(input_dim=pretrained.shape[0],
                                      output_dim=embed_dim,
                                      input_length=seq_len,
                                      weights=[pretrained],
                                      trainable=trainable)(token_input)
    embedded = keras.layers.SpatialDropout1D(dropout+0.1)(embedded)

    pooled = []
    for kernel_size in (1, 3, 5):
        conv = keras.layers.Conv1D(conv_size,kernel_size=kernel_size,activation='relu')(embedded)
        pooled.append(keras.layers.GlobalMaxPooling1D()(conv))

    features = keras.layers.Concatenate()(pooled)
    features = keras.layers.Dropout(dropout+0.1)(features)
    return keras.layers.Dense(32,activation='relu')(features)


def make_cnn_model(desc_len,
                   dense_len,
                   params_len,
                   catembs,
                   embed_dim,
                   pretrained_desc,pretrained_title,#pretrained_params,
                   dropout=0,
                   trainable_embeddings=False,
                   conv_size=128,
                   params_vocab_size=None):
    """Build the (uncompiled) multi-input regression network.

    Parameters
    ----------
    desc_len : int
        Padded token length of both the description and the title inputs.
    dense_len : int
        Number of dense (numeric) features.
    params_len : int
        Padded token length of the params input.
    catembs : sequence of int
        Cardinality of each categorical feature; one single-column input and one
        16-dimensional trainable embedding is created per entry.
    embed_dim : int
        Dimensionality of the pretrained word vectors.
    pretrained_desc, pretrained_title : ndarray
        Initial embedding matrices for the description and the title vocabularies.
    dropout : float
        Base dropout rate; the text branches and the first merged layer use ``dropout + 0.1``.
    trainable_embeddings : bool
        Whether the pretrained word embeddings are updated during training.
    conv_size : int
        Number of filters of every Conv1D layer.
    params_vocab_size : int, optional
        Size of the params embedding vocabulary. When omitted, it is taken from the
        notebook-level ``tokenizer_params`` (``len(word_index) + 1``), as before.

    Returns
    -------
    keras.Model
        Model with inputs ordered ``[description, title, params, dense, *categoricals]``
        and a single linear output unit.
    """
    if params_vocab_size is None:
        # Backward-compatible fallback to the notebook-level params tokenizer.
        params_vocab_size = len(tokenizer_params.word_index)+1

    desc_input = keras.layers.Input(shape=(desc_len,))
    title_input = keras.layers.Input(shape=(desc_len,))
    params_input = keras.layers.Input(shape=(params_len,))
    dense_input = keras.layers.Input(shape=(dense_len,))

    # Description and title share one architecture. The sequence length now comes from the
    # desc_len argument instead of the notebook-level max_len.
    desc_features = _text_cnn_branch(desc_input, pretrained_desc, embed_dim, desc_len,
                                     dropout, trainable_embeddings, conv_size)
    title_features = _text_cnn_branch(title_input, pretrained_title, embed_dim, desc_len,
                                      dropout, trainable_embeddings, conv_size)

    #Params: trainable embedding followed by a GRU whose outputs are averaged over time
    #NOTE: CuDNNGRU is the GPU-only GRU layer, so this model needs a CUDA-capable device
    embedded_params = keras.layers.Embedding(input_dim=params_vocab_size,
                                      output_dim=100,
                                      input_length=params_len,trainable=True)(params_input)
    params_features = keras.layers.CuDNNGRU(32,return_sequences=True)(embedded_params)
    params_features = keras.layers.GlobalAveragePooling1D()(params_features)

    #Dense: linear projection (no activation) of the numeric features
    dense_features = keras.layers.Dense(dense_len)(dense_input)

    #Categoricals: one single-column input and one 16-d embedding per feature
    cat_embs_inputs = []
    cat_embs_embeded = []
    for cardinality in catembs:
        cat_input = keras.layers.Input(shape=(1,))
        cat_embs_inputs.append(cat_input)
        cat_embs_embeded.append(keras.layers.Embedding(input_dim=cardinality,
                                                      output_dim=16,
                                                      input_length=1,trainable=True)(cat_input))

    cat_emb_features = keras.layers.Concatenate()(cat_embs_embeded)
    cat_emb_features = keras.layers.Flatten()(cat_emb_features)
    cat_emb_features = keras.layers.Dropout(dropout)(cat_emb_features)
    cat_emb_features = keras.layers.Dense(128,activation='relu')(cat_emb_features)
    cat_emb_features = keras.layers.Dropout(dropout)(cat_emb_features)
    cat_emb_features = keras.layers.Dense(32,activation='relu')(cat_emb_features)

    #Concatenate Features and regress
    merged = keras.layers.Concatenate()([cat_emb_features,desc_features,title_features,params_features,dense_features])
    merged = keras.layers.Dropout(dropout+0.1)(merged)
    merged = keras.layers.Dense(256,activation='relu')(merged)
    merged = keras.layers.Dropout(dropout)(merged)
    merged = keras.layers.Dense(128,activation='relu')(merged)
    merged = keras.layers.Dense(32,activation='tanh')(merged)

    y_hat = keras.layers.Dense(1)(merged)

    cnn_model = keras.Model(inputs=[desc_input,title_input,params_input,dense_input,*cat_embs_inputs],outputs=y_hat)

    return cnn_model

In [36]:
gc.collect()
np.random.seed(seed)
# NOTE(review): only NumPy's RNG is seeded here; the Keras backend has its own RNG, so weight
# initialisation and dropout may still differ between runs.


def _model_inputs(matrix):
    """Cut a combined feature matrix into the list of per-input arrays the model expects."""
    return [matrix[:,s[0]:s[1]] for s in slices_bounds]


# VALID=True : hold out 10% of the priced rows and report the RMSE on them.
# VALID=False: fit on every priced row (this is the model used for the saved predictions).
VALID=False

# Build and compile once for both modes. The session is reset first so that re-running this
# cell does not pile up graphs from earlier runs.
keras.backend.clear_session() #Reset
cnn_model = make_cnn_model(max_len,
                           dense_len,
                           params_len,
                           catembs,
                           300,
                           pretrained_desc,pretrained_title,
                           dropout=0.3,
                           trainable_embeddings=False, conv_size=128) #If allows train of embedding words, will have to restrict vocab to the train set
Adam = keras.optimizers.Adam(0.001)
cnn_model.compile(optimizer=Adam,loss='mean_squared_error')

if VALID:
    X_tr_tr, X_val, y_tr, y_val = train_test_split(X_tr,y,test_size=0.1,random_state=seed)
    epochs = 20 #Beware of overfit
    cnn_model.fit(_model_inputs(X_tr_tr),
              y_tr,
              validation_data=(_model_inputs(X_val), y_val),
              batch_size=512,epochs=epochs,verbose = 0,callbacks=[TQDMNotebookCallback(leave_inner=True)],)
    predictions = cnn_model.predict(_model_inputs(X_val)).flatten()
    # The target is a log price, not a probability: clipping predictions to [0, 1] (as the
    # previous version did) distorts the metric, so the raw predictions are scored.
    r = np.sqrt(np.mean((predictions-y_val)**2))
    print(f'RMSE : {r:.4f}')

else:
    epochs = 2 #Beware of overfit
    cnn_model.fit(_model_inputs(X_tr),
              y,
              batch_size=512,epochs=epochs,verbose = 0,callbacks=[TQDMNotebookCallback(leave_inner=True)],)

HBox(children=(IntProgress(value=0, description='Training', max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1895915), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1895915), HTML(value='')))

In [37]:
# Result table: one row per item (train and test), with a placeholder prediction column.
guess_price = complete_data[['item_id']].assign(Guess_price=0)

In [41]:
# Predict the log price for every row of X (train and test) in chunks, to bound the memory
# taken by the per-input slices. The number of chunks is derived from X, so no row is left
# at its placeholder value whatever the data size.
PREDICT_CHUNK = 100000
chunk_predictions = []
for start in range(0, X.shape[0], PREDICT_CHUNK):
    X_batch = X[start:start+PREDICT_CHUNK,:]
    predictions_price = cnn_model.predict([X_batch[:,s[0]:s[1]] for s in slices_bounds])
    chunk_predictions.append(predictions_price.flatten())

# Assign the whole column at once (as float64) instead of writing float chunks into the
# integer placeholder column, which recent pandas flags as an incompatible-dtype assignment.
guess_price['Guess_price'] = np.concatenate(chunk_predictions).astype('float64')

In [44]:
# Save item_id and predicted log price, without the index. The real log price is attached
# only after this point, so it is not part of the file.
guess_price.to_csv(DATA_PATH+'price_guess.csv',index=False)

In [45]:
# Attach the known log price (NaN where the price is missing) for a side-by-side check.
# Both frames share the same 0..n-1 index, so the values line up row by row.
guess_price['real_log_price']=complete_data['log_price']

In [46]:
# Show only the first rows instead of rendering the whole frame.
guess_price.head(10)

Unnamed: 0,item_id,Guess_price,real_log_price
0,b912c3c6a6ad,6.510823,5.993961
1,2dac0150717d,7.904475,8.006701
2,ba83aefab5dc,7.189164,8.294300
3,02996f1dd2ea,7.594705,7.696667
4,7c90be56d2ab,10.731431,10.596660
5,51e0962387f7,7.174815,7.170888
6,c4f260a2b48a,8.181076,9.305741
7,6b71309d6a8a,6.173020,6.216606
8,c5b969cb63a2,6.380401,6.216606
9,b1570962e68c,6.002148,5.993961
