In [1]:
import pandas as pd
import numpy as np
import keras
from keras_tqdm import TQDMNotebookCallback
import gensim
import re
from tqdm import tqdm,tqdm_notebook,tqdm_pandas
import gc

from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, Imputer
from sklearn.metrics import mean_squared_error

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def rmse(y, y_pred):
    return keras.backend.sqrt(keras.backend.mean(keras.backend.square(y_pred - y)))*10

In [3]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
        `float64` type to `float32`
        `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

## Load Data

In [4]:
DATA_PATH = 'E:/Kaggle/Avito/'

num_words = 100000 #None to deactivate
max_len = 150
seed = 32
FOLDS = 5
min_class_cat = 50

In [5]:
train = pd.read_csv(DATA_PATH+'train.csv')
test = pd.read_csv(DATA_PATH+'test.csv')
guess_image = pd.read_csv(DATA_PATH+'image_guess.csv')
price_guess  = pd.read_csv(DATA_PATH+'price_guess.csv')
param_guess  = pd.read_csv(DATA_PATH+'param_guess.csv')
param2_guess  = pd.read_csv(DATA_PATH+'param2_guess.csv')

images0 = pd.read_csv(DATA_PATH+'image0_features.csv')
images1 = pd.read_csv(DATA_PATH+'image1_features.csv')
images2 = pd.read_csv(DATA_PATH+'image2_features.csv')
images3 = pd.read_csv(DATA_PATH+'image3_features.csv')
images4 = pd.read_csv(DATA_PATH+'image4_features.csv')
imagestest = pd.read_csv(DATA_PATH+'imagetest_features.csv')
imagesdata = pd.concat([images0,images1,images2,images3,images4,imagestest],axis=0)
del images0, images1, images2, images3, images4, imagestest

images_pred0 = pd.read_csv(DATA_PATH+'Image_preds0.csv')
images_pred1 = pd.read_csv(DATA_PATH+'Image_preds1.csv')
images_pred2 = pd.read_csv(DATA_PATH+'Image_preds2.csv')
images_pred3 = pd.read_csv(DATA_PATH+'Image_preds3.csv')
images_pred4 = pd.read_csv(DATA_PATH+'Image_preds4.csv')
imagestest_pred = pd.read_csv(DATA_PATH+'Image_predstest.csv')
imagesdata_pred = pd.concat([images_pred0,images_pred1,images_pred2,images_pred3,images_pred4,imagestest_pred],axis=0)
del images_pred0, images_pred1, images_pred2, images_pred3, images_pred4, imagestest_pred

user_agg = pd.read_csv(DATA_PATH+'user_aggregated_features.csv')

In [6]:
kf = KFold(FOLDS,random_state=seed,shuffle=True)
Fold = 0
for train_index, test_index in kf.split(train):
    train.loc[test_index,'Fold']=Fold
    Fold += 1

In [7]:
complete_data = pd.concat([train,test],axis=0,ignore_index=True)
complete_data = pd.merge(complete_data,guess_image,how='left')
complete_data = pd.merge(complete_data,price_guess,how='left')
complete_data = pd.merge(complete_data,param_guess,how='left')
complete_data = pd.merge(complete_data,param2_guess,how='left')
complete_data = pd.merge(complete_data,imagesdata,how='left')
complete_data = pd.merge(complete_data,imagesdata_pred,how='left')
complete_data = pd.merge(complete_data,user_agg,how='left',on='user_id')
del guess_image, price_guess, param_guess, param2_guess, imagesdata, imagesdata_pred, user_agg

## Load word2vec dictionary

In [8]:
word2vec = gensim.models.Word2Vec.load(DATA_PATH+'avito300_sg.w2v')

## Feature Engineering

In [9]:
complete_data['missing_param_1'] = complete_data['param_1'].isnull().astype(int)
complete_data['missing_param_2'] = complete_data['param_2'].isnull().astype(int)
complete_data['missing_param_3'] = complete_data['param_3'].isnull().astype(int)
complete_data['missing_desc'] = complete_data['description'].isnull().astype(int)
complete_data['missing_price'] = complete_data['price'].isnull().astype(int)
complete_data['missing_image'] = complete_data['image'].isnull().astype(int)
complete_data['number_missings'] = complete_data['missing_param_1']+complete_data['missing_param_2']+complete_data['missing_image']+\
                                    complete_data['missing_param_3']+complete_data['missing_desc']+complete_data['missing_price'] 
del complete_data['missing_param_1'] 
del complete_data['missing_param_2'] 
del complete_data['missing_param_3']
del complete_data['missing_desc'] 
del complete_data['missing_price'] 
del complete_data['missing_image']

In [10]:
temp = complete_data.groupby(['user_id'],as_index=False).agg({'item_id':'count'}).rename(columns={'item_id':'count_item'})
big_users = set(temp[temp['count_item']>=20]['user_id'])
complete_data['user_id'] = np.where(complete_data['user_id'].isin(big_users),complete_data['user_id'],'SmallUser')

In [11]:
complete_data['log_price']=np.log(complete_data['price']+0.01)

In [12]:
complete_data['city_counts']=complete_data.groupby(['city'])['item_id'].transform('count')

In [13]:
complete_data['log_price'].fillna(-1,inplace=True)

In [14]:
complete_data['price_as_char']=complete_data['price']
complete_data['price_as_char']=complete_data['price_as_char'].fillna(-999).astype(int).astype(str)
tokenizer_price = keras.preprocessing.text.Tokenizer(lower=True,char_level=True)
tokenizer_price.fit_on_texts(complete_data['price_as_char'])
X_price = tokenizer_price.texts_to_sequences(complete_data['price_as_char'])
X_price = keras.preprocessing.sequence.pad_sequences(X_price,padding='pre',truncating='pre',maxlen=10)
pricechar_len = X_price.shape[1]

In [15]:
complete_data['log_item_seq']=np.log(0.01+complete_data['item_seq_number'])

In [16]:
complete_data['param_1'].fillna('Unknown',inplace=True)
complete_data['param_2'].fillna('Unknown',inplace=True)
complete_data['param_3'].fillna('Unknown',inplace=True)
complete_data['image_top_1'].fillna('Unknown',inplace=True)

In [17]:
temp = train.groupby(['city'],as_index=False).agg({'item_id':'count'}).rename(columns={'item_id':'count_item'})
big_cities = set(temp[temp['count_item']>=min_class_cat]['city'])
complete_data['city_clean'] = np.where(complete_data['city'].isin(big_cities),complete_data['city'],'SmallCity')

In [18]:
uppercase_regex = re.compile(r'[A-ZА-Я]')
symbols_regex = re.compile(r'[^a-zA-ZА-Я0-9а-я]')
digits_regex = re.compile(r'[0-9]')

In [19]:
complete_data['title_number_uppercase'] = complete_data['title'].str.count(uppercase_regex)
complete_data['title_number_symbols'] = complete_data['title'].str.count(symbols_regex)
complete_data['title_number_digits'] = complete_data['title'].str.count(digits_regex)
complete_data['title_len_chars'] = complete_data['title'].apply(lambda x: len(str(x)))
complete_data['title_len_words'] = complete_data['title'].str.split().apply(lambda x: len(str(x)))
complete_data['title_unique'] = complete_data['title'].str.split().apply(lambda x: len(set(str(x))))
complete_data['share_unique_title'] = complete_data['title_unique']/complete_data['title_len_words']

In [20]:
complete_data['desc_number_uppercase'] = complete_data['description'].str.count(uppercase_regex)
complete_data['desc_number_symbols'] = complete_data['description'].str.count(symbols_regex)
complete_data['desc_number_digits'] = complete_data['description'].str.count(digits_regex) #should do similar stuff for param
complete_data['desc_len_char']=complete_data['description'].apply(lambda x: len(str(x)))
complete_data['desc_len_words']=complete_data['description'].str.split().apply(lambda x: len(str(x)))
complete_data['desc_unique'] = complete_data['description'].str.split().apply(lambda x: len(set(str(x))))
complete_data['share_unique_desc'] = complete_data['desc_unique']/complete_data['desc_len_words']
complete_data['desc_rows'] = complete_data['description'].astype(str).apply(lambda x: x.count('/\n'))
complete_data['r_title_desc'] = complete_data['title_len_chars']/(complete_data['desc_len_char']+1)
complete_data['desc_number_uppercase'].fillna(-1,inplace=True)
complete_data['desc_number_symbols'].fillna(-1,inplace=True)
complete_data['desc_number_digits'].fillna(-1,inplace=True)
complete_data['desc_len_char'].fillna(-1,inplace=True)
complete_data['desc_len_words'].fillna(-1,inplace=True)
complete_data['desc_unique'].fillna(-1,inplace=True)
complete_data['share_unique_desc'].fillna(-1,inplace=True)
complete_data['desc_rows'].fillna(-1,inplace=True)
complete_data['r_title_desc'].fillna(-1,inplace=True)

In [21]:
for var in ['img_size', 'lightness','darkness','pixel_width','avg_red','avg_green','avg_blue','width','height','blurness']:
    complete_data[var].fillna(-1,inplace=True)
for var in ['usermean_days_up_sum','usermean_days_up_count','usermean_days_up_avg','usermean_days_until_activation_sum','usermean_days_until_activation_avg',
            'userstd_days_up_sum','userstd_days_up_count','userstd_days_up_avg','userstd_days_until_activation_sum','userstd_days_until_activation_avg','usermedian_days_up_sum',
            'usermedian_days_up_count','usermedian_days_up_avg','usermedian_days_until_activation_sum','usermedian_days_until_activation_avg','n_user_items']:
    complete_data[var].fillna(-1,inplace=True)

#### Text stuff

In [22]:
english_stopwords = set(stopwords.words('english'))
russian_stopwords = set(stopwords.words('russian'))
russian_stopwords = russian_stopwords.union(english_stopwords)
def preprocess(x,stop_words=None):
    x = keras.preprocessing.text.text_to_word_sequence(x)
    if stop_words:
        return [word for word in x if word not in russian_stopwords]
    else:
        return x

In [23]:
def make_token_matrix(data,text_col,num_words,max_len,stop_words=None):
    
    print('Create Tokenizer...',end=' ')
    
    texts = data[text_col].astype(str)
    tokenizer = keras.preprocessing.text.Tokenizer(num_words=num_words,lower=True)
    tokenizer.fit_on_texts(texts)
    
    print('Preprocess Text...',end=' ')
    texts = texts.apply(lambda x: preprocess(x,stop_words))
    
    print('Create Matrix...',end=' ')
    X = tokenizer.texts_to_sequences(texts)
    X = keras.preprocessing.sequence.pad_sequences(X,padding='pre',truncating='post',maxlen=max_len)
    
    print('Done !')
    return X,tokenizer

In [24]:
complete_data['description'] = complete_data['description'].astype(str)

In [25]:
X_desc, tokenizer_desc = make_token_matrix(complete_data,'description',num_words,max_len,stop_words=russian_stopwords)
word2idx = tokenizer_desc.word_index

Create Tokenizer... Preprocess Text... Create Matrix... Done !


In [26]:
word2idx = tokenizer_desc.word_index
idx2word = {i:w for w,i in word2idx.items()}

In [27]:
for i in X_desc[0]:
    if i!=0:
        print(idx2word[i],end=' ')
complete_data.iloc[0]['description']

кокон сна малыша пользовались меньше месяца цвет серый 

'Кокон для сна малыша,пользовались меньше месяца.цвет серый'

In [28]:
X_title, tokenizer_title = make_token_matrix(complete_data,'title',num_words,max_len,stop_words=russian_stopwords)    
word2idx_title = tokenizer_title.word_index

Create Tokenizer... Preprocess Text... Create Matrix... Done !


In [29]:
complete_data['labels_all'] = (complete_data['Res50_label1'].astype(str)+' '+complete_data['Xcept_label1'].astype(str)+' '+complete_data['Incept_label1'].astype(str)+' '+
                               complete_data['Res50_label2'].astype(str)+' '+complete_data['Xcept_label2'].astype(str)+' '+complete_data['Incept_label2'].astype(str)+' '+
                               complete_data['Res50_label3'].astype(str)+' '+complete_data['Xcept_label3'].astype(str)+' '+complete_data['Incept_label3'].astype(str))

In [56]:
X_labels, tokenizer_labels = make_token_matrix(complete_data,'labels_all',num_words,9)    
word2idx_labels = tokenizer_labels.word_index

Create Tokenizer... Preprocess Text... Create Matrix... Done !


In [58]:
categoricals = ['param_1','image_top_1','city_clean','user_type','category_name']
categoricals += ['param_2','param_3']
#categoricals += ['user_id']

numericals = ['log_price','log_item_seq','number_missings']
#numericals += ['title_number_uppercase','title_number_symbols','title_number_digits','title_len_chars','title_len_words','title_unique','share_unique_title']
#numericals += ['desc_number_uppercase','desc_number_symbols','desc_number_digits','desc_len_char','desc_len_words','desc_unique','share_unique_desc','desc_rows','r_title_desc']
#numericals += [ 'lightness','darkness','avg_red','avg_green','avg_blue','width','height','blurness']
numericals += ['usermean_days_up_sum','n_user_items']

In [32]:
catembs = []
le = LabelEncoder()
for cat in categoricals: #Must deal with cities not in train set
    complete_data[cat].fillna(-999,inplace=True)
    complete_data[cat] = le.fit_transform(complete_data[cat].astype(str))
    catembs.append(complete_data[cat].max()+1)

In [33]:
X_cat = complete_data[categoricals].values.astype('int32')

In [34]:
print(catembs)

[372, 3064, 674, 3, 47, 9, 278, 1277, 3890]


In [59]:
scaler = StandardScaler()

In [60]:
X_dense = scaler.fit_transform(complete_data[numericals]).astype('float32')

In [61]:
dense_len = X_dense.shape[1]
print(dense_len)

6


## Concatenate and Splits

In [62]:
slices = [0, 
          max_len, #Description
          max_len+max_len, #Title
          max_len+max_len+dense_len, #Dense
          *[max_len+max_len+dense_len+i+1 for i in range(len(catembs))]] #Categoricals as embeding layer 

slices_bounds = [(slices[i],slices[i+1]) for i,s in enumerate(slices) if i<len(slices)-1]

In [63]:
X = np.concatenate([X_desc,X_title,X_dense,X_cat],axis=1)

In [64]:
X_tr = X[:len(train),:].astype('float32')
X_board = X[len(train):,:].astype('float32')

y = train['deal_probability']
del X

In [65]:
slices_bounds

[(0, 150),
 (150, 300),
 (300, 306),
 (306, 307),
 (307, 308),
 (308, 309),
 (309, 310),
 (310, 311),
 (311, 312),
 (312, 313),
 (313, 314),
 (314, 315)]

## Make model

In [45]:
def make_pretrain_embedding(w2idx,word2vec,embed_dim,num_words):
    unknown_words = []
    embeddings = np.zeros((num_words+1,embed_dim))  #0 is a special token
    for word,idx in w2idx.items(): #starts at 1    
        if idx>num_words:
            break 
        try:
            vect = word2vec[word]
            embeddings[idx]=vect/np.linalg.norm(vect)
        except KeyError:
            unknown_words.append(word)
    print('Number of words with no embeddings',len(unknown_words))
    
    return embeddings, unknown_words

In [46]:
pretrained_desc, unknown_words1 = make_pretrain_embedding(word2idx,word2vec,300,num_words)
pretrained_title, unknown_words2 = make_pretrain_embedding(word2idx_title,word2vec,300,num_words)

  


Number of words with no embeddings 210
Number of words with no embeddings 7831


In [76]:
def make_nn_model(desc_len,title_len,dense_len,catembs):
    
    desc_input = keras.layers.Input(shape=(desc_len,))
    title_input = keras.layers.Input(shape=(title_len,))
    dense_input = keras.layers.Input(shape=(dense_len,))
    
    desc_embedded = keras.layers.Embedding(input_dim=pretrained_desc.shape[0],
                                  output_dim=300,
                                  input_length=desc_len,
                                  weights=[pretrained_desc],
                                  trainable=False)(desc_input)
    desc_features = keras.layers.Bidirectional(keras.layers.CuDNNGRU(64,return_sequences = True))(desc_embedded)
    desc_features1 = keras.layers.GlobalAveragePooling1D()(desc_features)
    desc_features2 = keras.layers.GlobalMaxPooling1D()(desc_features)
    desc_features = keras.layers.Concatenate()([desc_features1,desc_features2])
    
    
    title_embedded = keras.layers.Embedding(input_dim=pretrained_title.shape[0],
                                  output_dim=300,
                                  input_length=title_len,
                                  weights=[pretrained_title],
                                  trainable=False)(title_input)
    title_features = keras.layers.Bidirectional(keras.layers.CuDNNGRU(64,return_sequences = True))(title_embedded)
    title_features1 = keras.layers.GlobalAveragePooling1D()(title_features)
    title_features2 = keras.layers.GlobalMaxPooling1D()(title_features)
    title_features = keras.layers.Concatenate()([title_features1,title_features2])
    
    
    
    #############################################
    labels_input = keras.layers.Input(shape=(9,))
    labels_embedded = keras.layers.Embedding(input_dim=1050,
                                  output_dim=64,
                                  input_length=9,
                                  trainable=True)(labels_input)
    labels_features = keras.layers.CuDNNGRU(32,return_sequences = False)(labels_embedded)
    #############################################
    
    
    
    cat_embs_inputs = []
    cat_embs_embeded = []
    for i in range(len(catembs)):
        inp = keras.layers.Input(shape=(1,))
        cat_embs_inputs.append(inp)
        embed = keras.layers.Embedding(input_dim=catembs[i],output_dim=min(64,catembs[i]//2),input_length=1,trainable=True)(inp)
        embed = keras.layers.Dropout(0.3)(embed)
        embed = keras.layers.Flatten()(embed)
        cat_embs_embeded.append(embed)
        
    dense_features = keras.layers.Concatenate()(cat_embs_embeded+[dense_input])
    dense_features = keras.layers.Dense(512,kernel_initializer='he_normal')(dense_features)
    dense_features = keras.layers.PReLU()(dense_features)
    dense_features = keras.layers.BatchNormalization()(dense_features)
    dense_features = keras.layers.Dropout(0.3)(dense_features)
    dense_features = keras.layers.Concatenate()([dense_features,title_features,desc_features])
    dense_features = keras.layers.Dense(512,kernel_initializer='he_normal')(dense_features)
    dense_features = keras.layers.PReLU()(dense_features)
    dense_features = keras.layers.BatchNormalization()(dense_features)
    dense_features = keras.layers.Dropout(0.3)(dense_features)
    dense_features = keras.layers.Dense(128,kernel_initializer='he_normal')(dense_features)
    dense_features = keras.layers.PReLU()(dense_features)
    dense_features = keras.layers.BatchNormalization()(dense_features)
    dense_features = keras.layers.Dropout(0.3)(dense_features)
    dense_features = keras.layers.Dense(64,kernel_initializer='he_normal')(dense_features)
    dense_features = keras.layers.PReLU()(dense_features)
    dense_features = keras.layers.BatchNormalization()(dense_features)
    dense_features = keras.layers.Dropout(0.3)(dense_features)
    dense_features = keras.layers.Dense(64,kernel_initializer='he_normal')(dense_features)
    dense_features = keras.layers.PReLU()(dense_features)
    dense_features = keras.layers.BatchNormalization()(dense_features)
    dense_features = keras.layers.Dropout(0.3)(dense_features)
    dense_features = keras.layers.Dense(32,kernel_initializer='he_normal')(dense_features)
    dense_features = keras.layers.PReLU()(dense_features)
    dense_features = keras.layers.BatchNormalization()(dense_features)
    dense_features = keras.layers.Dropout(0.3)(dense_features)
    dense_features = keras.layers.Dense(32,kernel_initializer='he_normal')(dense_features)
    dense_features = keras.layers.PReLU()(dense_features)
    dense_features = keras.layers.BatchNormalization()(dense_features)
            
    y_hat = keras.layers.Dense(1,activation='sigmoid')(dense_features)
    
    nn_model = keras.Model(inputs=[desc_input,title_input,dense_input,*cat_embs_inputs, labels_input],outputs=y_hat)
    
    return nn_model

In [77]:
gc.collect()
np.random.seed(seed)

X_lab_tr = X_labels[:len(train),:].astype('float32')

#X_tr_tr, X_val, y_tr, y_val = train_test_split(X_tr,y,test_size=0.1,random_state=seed)
X_tr_tr, X_val, X_labels_tr, X_labels_val, y_tr, y_val = train_test_split(X_tr,X_lab_tr,y,test_size=0.1,random_state=seed)

keras.backend.clear_session() #Reset   
nn_model = make_nn_model(max_len,max_len,dense_len,catembs)
Adam = keras.optimizers.Adam(0.001)
nn_model.compile(optimizer=Adam,loss='mean_squared_error',metrics=[rmse])
epochs = 10 #Beware of overfit
nn_model.fit([X_tr_tr[:,s[0]:s[1]] for s in slices_bounds]+[X_labels_tr], y_tr,
          validation_data=([X_val[:,s[0]:s[1]] for s in slices_bounds]+[X_labels_val], y_val),
          batch_size=1024,epochs=epochs,verbose = 0,callbacks=[TQDMNotebookCallback(leave_inner=True)])
predict_val = nn_model.predict([X_val[:,s[0]:s[1]] for s in slices_bounds]+[X_labels_val]).flatten().clip(0.0,1.0)
r = mean_squared_error(y_val,predict_val)**0.5
print(f'RMSE : {r:.4f}')

HBox(children=(IntProgress(value=0, description='Training', max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 0', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 1', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 2', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 3', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 4', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 5', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 6', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 7', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 8', max=1353081), HTML(value='')))

HBox(children=(IntProgress(value=0, description='Epoch 9', max=1353081), HTML(value='')))

RMSE : 0.2189


In [None]:
predictions = cnn_model.predict([X_board[:,s[0]:s[1]] for s in slices_bounds])
test['deal_probability']=predictions.clip(0.0,1.0)
test[['item_id','deal_probability']].to_csv(DATA_PATH+f'{r:.5f}_'+'Predictions_NNv2.csv',index=False)

In [None]:
X_labels