In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from  sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import layers
from tensorflow.keras import Model
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the preprocessed training data; the "id" column becomes the row index

df = pd.read_csv(
    "train_processed.csv",
    index_col=["id"],
)
df.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,log_price,name_processed,brand_name_processed,category_name_preprocessed,Tier_2,Tier_3,item_description_processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
17,"Hold Alyssa Frye Harness boots 12R, Sz 7",3,Women/Shoes/Boots,Frye,79,1,Good used condition Women's Fyre harness boots...,4.382027,hold alyssa frye harness boots 12r sz 7,frye,women/shoe/boots,shoe,boots,good used condition women fyre harness boots l...
19,Steve Madden booties,3,Women/Shoes/Boots,Steve Madden,31,0,"The brand is actually ""Steven"" by Steve Madden...",3.465736,steve madden booties,steve madden,women/shoe/boots,shoe,boots,brand actually steven steve madden steve madde...
42,BCBG Tan Booties,1,Women/Shoes/Boots,,48,0,Brand new! Does not include the box.,3.89182,bcbg tan booties,bcbg,women/shoe/boots,shoe,boots,brand new include box
45,NWT Sorel Caribou boots size 8.5,1,Women/Shoes/Boots,,85,0,New in box. Size 8.5,4.454347,nwt sorel caribou boots size 85,sorel,women/shoe/boots,shoe,boots,new box size 85
58,NIB Hunter Tiffany Mint Boots Size 5,1,Women/Shoes/Boots,Hunter,200,0,Brand new never worn only flaw is as you can s...,5.303305,nib hunter tiffany mint boots size 5,hunter,women/shoe/boots,shoe,boots,brand new never worn flaw see picture color we...


In [3]:
# Read the preprocessed test data; the "id" column becomes the row index

df_test = pd.read_csv(
    "test_processed.csv",
    index_col=["id"],
)
df_test.head()

Unnamed: 0_level_0,name,item_condition_id,category_name,brand_name,shipping,item_description,name_processed,brand_name_processed,category_name_preprocessed,Tier_2,Tier_3,item_description_processed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
46,Corral boots,2,Women/Shoes/Boots,,0,Corral boots in excellent condition barely used,corral boots,missing,women/shoe/boots,shoe,boots,corral boots excellent condition barely used
88,Vince Camuto Riding boots size 6,2,Women/Shoes/Boots,Vince Camuto,0,super cute brown or cognac knee high riding bo...,vince camuto riding boots size 6,vince camuto,women/shoe/boots,shoe,boots,super cute brown cognac knee high riding boots...
212,Brand new UGG boots,1,Women/Shoes/Boots,UGG Australia,0,New in box,brand new ugg boots,ugg australia,women/shoe/boots,shoe,boots,new box
289,"LL Bean Boots 8"" Red sz 7M",3,Women/Shoes/Boots,L.L. Bean,0,Made to withstand winter climate.,bean boots 8 red sz 7m,ll bean,women/shoe/boots,shoe,boots,made withstand winter climate
299,Black UGGS cargo boot,3,Women/Shoes/Boots,UGG Australia,1,"Unique, super cute and warm! EUC. Only selling...",black uggs cargo boot,ugg australia,women/shoe/boots,shoe,boots,unique super cute warm euc selling lining insi...


In [4]:
# Hold out 10% of "df" as the validation dataframe (fixed random_state for a repeatable split)

df_train, df_val = train_test_split(
    df,
    test_size=0.1,
    random_state=3,
)

In [5]:
# Report the number of rows / columns in each split
for split_label, split_frame in (
    ("Train Shape = ", df_train),
    ("Validation Shape = ", df_val),
):
    print(split_label, split_frame.shape)

Train Shape =  (61560, 14)
Validation Shape =  (6840, 14)


In [6]:
# Item-condition id column of each split
train_item_cond = df_train["item_condition_id"]
val_item_cond = df_val["item_condition_id"]
test_item_cond = df_test["item_condition_id"]

In [7]:
# Shipping flag column of each split
train_shipping = df_train["shipping"]
val_shipping = df_val["shipping"]
test_shipping = df_test["shipping"]

In [8]:
# Tokenizing and Padding

def text_vectorizer(feature):
    """Tokenize one text column and pad the train / validation sequences.

    A fresh ``Tokenizer`` is fitted on ``df_train[feature]`` only (every value is
    cast with ``str`` first) and then used to encode both ``df_train`` and
    ``df_val``. Sequences are post-padded to ``max_length``.

    Parameters
    ----------
    feature : str
        Name of the column to vectorize; it is looked up in the module-level
        ``df_train`` and ``df_val`` dataframes.

    Returns
    -------
    tuple
        ``(tokenizer, max_length, vocab_size, train_pad, val_pad)`` where
        ``max_length`` is the largest whitespace-separated word count found in
        the training column and ``vocab_size`` is ``len(word_index) + 1``.
    """
    # Cast once and reuse: the same string view feeds fitting, encoding and the
    # word-count below.
    train_text = df_train[feature].apply(str)
    val_text = df_val[feature].apply(str)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_text)

    train_seq = tokenizer.texts_to_sequences(train_text)
    val_seq = tokenizer.texts_to_sequences(val_text)

    # Longest training entry, measured in whitespace-separated words
    max_length = train_text.apply(lambda text: len(text.split())).max()
    # One extra slot on top of the learned word indices
    vocab_size = len(tokenizer.word_index) + 1

    train_pad = pad_sequences(train_seq, padding="post", maxlen=max_length)
    val_pad = pad_sequences(val_seq, padding="post", maxlen=max_length)

    return tokenizer, max_length, vocab_size, train_pad, val_pad

In [9]:
# Tokenize and pad brand_name_processed for the train and validation datasets

(
    tk_brand_name,
    max_length_brand_name,
    vocab_size_brand_name,
    train_brand_name_pad,
    val_brand_name_pad,
) = text_vectorizer("brand_name_processed")

In [10]:
# Sanity check: show the container type returned for the padded train sequences
type(train_brand_name_pad)

numpy.ndarray

In [11]:
# Summary of the brand_name_processed encoding
for stat_label, stat_value in (
    ("Train Shape = ", train_brand_name_pad.shape),
    ("Validation Shape = ", val_brand_name_pad.shape),
    ("Max Length = ", max_length_brand_name),
    ("Vocal Size= ", vocab_size_brand_name),
):
    print(stat_label, stat_value)

Train Shape =  (61560, 5)
Validation Shape =  (6840, 5)
Max Length =  5
Vocal Size=  1390


In [12]:
# Tokenizing Brand_name_processed and padding for test data
# Values are cast with str, exactly as text_vectorizer does for train/validation,
# so a non-string cell (e.g. NaN read from the CSV) is encoded instead of raising.

test_brand_name_pad = pad_sequences(
    tk_brand_name.texts_to_sequences(df_test.brand_name_processed.apply(str)),
    maxlen=max_length_brand_name,
    padding="post",
)
test_brand_name_pad.shape

(31789, 5)

In [13]:
# Tokenize and pad Tier_2 for the train and validation datasets

(
    tk_tier2,
    max_length_tier2,
    vocab_size_tier2,
    train_tier2_pad,
    val_tier2_pad,
) = text_vectorizer("Tier_2")

In [14]:
# Summary of the Tier_2 encoding
for stat_label, stat_value in (
    ("Train Shape =", train_tier2_pad.shape),
    ("Validation Shape =", val_tier2_pad.shape),
    ("Max Length = ", max_length_tier2),
    ("Vocal Size= ", vocab_size_tier2),
):
    print(stat_label, stat_value)

Train Shape = (61560, 2)
Validation Shape = (6840, 2)
Max Length =  2
Vocal Size=  15


In [15]:
# Tokenizing and padding tier2 for test dataset
# Values are cast with str, exactly as text_vectorizer does for train/validation,
# so a non-string cell (e.g. NaN read from the CSV) is encoded instead of raising.

test_tier2_pad = pad_sequences(
    tk_tier2.texts_to_sequences(df_test.Tier_2.apply(str)),
    maxlen=max_length_tier2,
    padding="post",
)
test_tier2_pad.shape

(31789, 2)

In [16]:
# Tokenize and pad Tier_3 for the train and validation datasets

(
    tk_tier3,
    max_length_tier3,
    vocab_size_tier3,
    train_tier3_pad,
    val_tier3_pad,
) = text_vectorizer("Tier_3")

In [17]:
# Summary of the Tier_3 encoding
for stat_label, stat_value in (
    ("Train Shape = ", train_tier3_pad.shape),
    ("Validation Shape = ", val_tier3_pad.shape),
    ("Max Length =", max_length_tier3),
    ("Vocal Size =", vocab_size_tier3),
):
    print(stat_label, stat_value)

Train Shape =  (61560, 3)
Validation Shape =  (6840, 3)
Max Length = 3
Vocal Size = 33


In [18]:
# Tokenizing and padding tier3 for test dataset
# Values are cast with str, exactly as text_vectorizer does for train/validation,
# so a non-string cell (e.g. NaN read from the CSV) is encoded instead of raising.

test_tier3_pad = pad_sequences(
    tk_tier3.texts_to_sequences(df_test.Tier_3.apply(str)),
    maxlen=max_length_tier3,
    padding="post",
)
test_tier3_pad.shape

(31789, 3)

In [19]:
# Tokenize and pad name_processed for the train and validation datasets

(
    tk_name_processed,
    max_length_name_processed,
    vocab_size_name_processed,
    train_name_processed_pad,
    val_name_processed_pad,
) = text_vectorizer("name_processed")

In [20]:
# Summary of the name_processed encoding
for stat_label, stat_value in (
    ("Train Shape = ", train_name_processed_pad.shape),
    ("Validation Shape = ", val_name_processed_pad.shape),
    ("Max Length = ", max_length_name_processed),
    ("Vocal Size= ", vocab_size_name_processed),
):
    print(stat_label, stat_value)

Train Shape =  (61560, 10)
Validation Shape =  (6840, 10)
Max Length =  10
Vocal Size=  12137


In [21]:
# Tokenizing and padding name_processed for test dataset
# Values are cast with str, exactly as text_vectorizer does for train/validation,
# so a non-string cell (e.g. NaN read from the CSV) is encoded instead of raising.

test_name_processed_pad = pad_sequences(
    tk_name_processed.texts_to_sequences(df_test.name_processed.apply(str)),
    maxlen=max_length_name_processed,
    padding="post",
)
test_name_processed_pad.shape

(31789, 10)

In [22]:
# Tokenize and pad item_description_processed for the train and validation datasets

(
    tk_desc,
    max_len_desc,
    vocab_size_desc,
    train_desc_pad,
    val_desc_pad,
) = text_vectorizer("item_description_processed")

In [23]:
# Summary of the item_description_processed encoding
for stat_label, stat_value in (
    ("Trian Shape = ", train_desc_pad.shape),
    ("Validation Shape = ", val_desc_pad.shape),
    ("Max Length = ", max_len_desc),
    ("Vocal Size= ", vocab_size_desc),
):
    print(stat_label, stat_value)

Trian Shape =  (61560, 115)
Validation Shape =  (6840, 115)
Max Length =  115
Vocal Size=  18484


In [24]:
# Tokenizing and padding item_description for test dataset
# Values are cast with str, exactly as text_vectorizer does for train/validation,
# so a non-string cell (e.g. NaN read from the CSV) is encoded instead of raising.

test_item_desc_pad = pad_sequences(
    tk_desc.texts_to_sequences(df_test.item_description_processed.apply(str)),
    maxlen=max_len_desc,
    padding="post",
)
test_item_desc_pad.shape

(31789, 115)

In [25]:
# Regression targets: the log_price column of each labelled split

y_train = df_train["log_price"]
y_val = df_val["log_price"]

In [26]:
# Collect the features of each split into a list, one entry per model input:
# item condition, shipping, brand name, tier 2, tier 3, name, description

x_train = [
    train_item_cond,
    train_shipping,
    train_brand_name_pad,
    train_tier2_pad,
    train_tier3_pad,
    train_name_processed_pad,
    train_desc_pad,
]

x_val = [
    val_item_cond,
    val_shipping,
    val_brand_name_pad,
    val_tier2_pad,
    val_tier3_pad,
    val_name_processed_pad,
    val_desc_pad,
]

x_test = [
    test_item_cond,
    test_shipping,
    test_brand_name_pad,
    test_tier2_pad,
    test_tier3_pad,
    test_name_processed_pad,
    test_item_desc_pad,
]

In [27]:
tf.keras.backend.clear_session()

# Every text branch takes its sequence length from the max_length value computed
# when that feature was padded, so the Input shapes always agree with the arrays
# fed to the model. Shapes are written as real tuples -- "(1)" is just the int 1.
# Embedding is not given input_length: the length is already fixed by the Input
# layer, and a value that disagrees with it is misleading at best.

# Item_condition_id: one integer id -> 10-dim embedding (input_dim = 6)
inp1 = layers.Input(shape=(1,))
emb1 = layers.Embedding(6, 10)(inp1)
flat1 = layers.Flatten()(emb1)


# Shipping: one numeric value -> small dense projection
inp2 = layers.Input(shape=(1,))
d2 = layers.Dense(10, activation="relu")(inp2)


# Brand_name_processed
inp3 = layers.Input(shape=(int(max_length_brand_name),))
emb3 = layers.Embedding(vocab_size_brand_name, 16)(inp3)
flat3 = layers.Flatten()(emb3)


# Tier_2
inp5 = layers.Input(shape=(int(max_length_tier2),))
emb5 = layers.Embedding(vocab_size_tier2, 16)(inp5)
flat5 = layers.Flatten()(emb5)

# Tier_3
inp6 = layers.Input(shape=(int(max_length_tier3),))
emb6 = layers.Embedding(vocab_size_tier3, 16)(inp6)
flat6 = layers.Flatten()(emb6)

# Name_processed: embedding -> GRU (all timesteps kept) -> flatten
inp7 = layers.Input(shape=(int(max_length_name_processed),))
emb7 = layers.Embedding(vocab_size_name_processed, 20)(inp7)
gru7 = layers.GRU(64, return_sequences=True)(emb7)
flat7 = layers.Flatten()(gru7)

# Item_description_processed: embedding -> GRU (all timesteps kept) -> flatten
inp8 = layers.Input(shape=(int(max_len_desc),))
emb8 = layers.Embedding(vocab_size_desc, 40)(inp8)
gru8 = layers.GRU(64, return_sequences=True)(emb8)
flat8 = layers.Flatten()(gru8)

# Concatenate all feature branches into one vector
concat = layers.Concatenate()([flat1, d2, flat3, flat5, flat6, flat7, flat8])

# Regression head: three Dense + Dropout stages with increasing dropout rate.
# Each dropout layer gets its own name so no intermediate tensor is shadowed.
dense1 = layers.Dense(512, activation="relu")(concat)
drop1 = layers.Dropout(0.2)(dense1)
dense2 = layers.Dense(256, activation="relu")(drop1)
drop2 = layers.Dropout(0.3)(dense2)
dense3 = layers.Dense(128, activation="relu")(drop2)
drop3 = layers.Dropout(0.4)(dense3)
# Batchnorm layer
bn2 = layers.BatchNormalization()(drop3)
# Single linear unit: the predicted log price
dense4 = layers.Dense(1, activation="linear")(bn2)

# Model (input order must match the order of the feature lists passed to fit/predict)
model = Model(inputs=[inp1, inp2, inp3, inp5, inp6, inp7, inp8], outputs=dense4)

# Schedule
def schedule(epoch, lr):
    """Learning-rate rule handed to ``LearningRateScheduler``.

    Parameters
    ----------
    epoch : int
        Epoch index supplied by the callback.
    lr : float
        Learning rate supplied by the callback.

    Returns
    -------
    float
        ``lr`` unchanged while ``epoch <= 2``; ``lr * 0.1`` for any later epoch.
    """
    return lr if epoch <= 2 else lr*0.1

# Callbacks
# Apply schedule() at the start of every epoch
lr = tf.keras.callbacks.LearningRateScheduler(schedule, verbose=1)

# Write weights to "best.h5" only when the validation RMSE improves
save = tf.keras.callbacks.ModelCheckpoint(
    "best.h5",
    monitor="val_root_mean_squared_error",
    mode="min",
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

# Stop once the validation RMSE has not improved by at least 0.01 for 2 epochs
earlystop = tf.keras.callbacks.EarlyStopping(
    monitor="val_root_mean_squared_error",
    min_delta=0.01,
    patience=2,
    mode="min",
)

# Compiling model: MSE loss, RMSE tracked as the metric
model.compile(
    optimizer="adam",
    loss="mse",
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [28]:
# Train for at most 10 epochs, validating on the held-out split after each one

model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=100,
    callbacks=[save, lr, earlystop],
)

Train on 61560 samples, validate on 6840 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 00001: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 1/10
Epoch 00001: val_root_mean_squared_error improved from inf to 0.58201, saving model to best.h5

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 2/10
Epoch 00002: val_root_mean_squared_error improved from 0.58201 to 0.49718, saving model to best.h5

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0010000000474974513.
Epoch 3/10
Epoch 00003: val_root_mean_squared_error did not improve from 0.49718

Epoch 00004: LearningRateScheduler reducing learning rate to 0.00010000000474974513.
Epoch 4/10
Epoch 00004: val_root_mean_squared_error improved from 0.49718 to 0.49618, saving model to best.h5


<tensorflow.python.keras.callbacks.History at 0x226be257b00>

In [29]:
# Restoring the checkpointed weights: "best.h5" is the file written by the
# ModelCheckpoint callback (save_best_only=True, save_weights_only=True)
model.load_weights("best.h5")

In [30]:
# Evaluating the model on the validation split.
# The result is [loss, metric], i.e. [mse, root mean squared error] as set in model.compile
model.evaluate(x_val,y_val,batch_size=1000)



[0.24619017733119383, 0.49617553]

In [31]:
# Converting the log price to actual price

def log_to_actual(log):
    """Map a log-scale price back to the original scale.

    Computes ``exp(log) - 1`` (the inverse of ``log(1 + price)``) with
    ``np.expm1``, which stays accurate for values close to zero where
    ``np.exp(log) - 1`` loses precision to cancellation.

    Parameters
    ----------
    log : float or array-like
        Log-scale value(s).

    Returns
    -------
    float or numpy.ndarray
        ``exp(log) - 1``, element-wise for array input.
    """
    return np.expm1(log)

In [32]:
# Predicting the results for test dataset.
# The model was fitted on log_price, so these predictions are on the log scale.

x_test_pred = model.predict(x_test,batch_size=100,verbose=1)



In [33]:
# Convert the log-scale predictions to prices and label each row with its test id
test_predict = pd.DataFrame(
    log_to_actual(x_test_pred),
    columns=["price"],
    index=df_test.index,
)
test_predict.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
46,44.822487
88,32.020103
212,54.267452
289,35.186333
299,42.049774


In [34]:
# Storing the predicted values in "submission.csv"
# (the dataframe index, taken from df_test, is written along with the price column)

test_predict.to_csv("submission.csv")

In [35]:
# Persist the trained model to "model.h5"
model.save('model.h5')

In [36]:
# Results (RMSE on log_price; the validation value matches model.evaluate above)
# Training error : 0.4122
# Validation error : 0.4962