In [None]:
import math
from subprocess import check_output

import numpy as np
import pandas as pd
from keras.layers import MaxPooling1D, Conv1D
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

# `sklearn.cross_validation` was removed in scikit-learn 0.20. Prefer the
# `model_selection` module and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split


In [None]:
# Read the tab-separated train and test tables, then report (rows, columns) for each.
trainData = pd.read_csv('../input/train.tsv', sep='\t')
testData = pd.read_csv('../input/test.tsv', sep='\t')
print(trainData.shape, testData.shape, sep="\n")

In [None]:
#HANDLE MISSING VALUES
print("Handling missing values...")
def handle_missing(dataset):
    """Replace missing text fields with the placeholder string "missing".

    The columns ``category_name``, ``brand_name`` and ``item_description``
    are filled. The frame is modified in place and also returned, so both
    ``handle_missing(df)`` and ``df = handle_missing(df)`` work.

    Args:
        dataset: DataFrame containing the three columns listed above.

    Returns:
        The same DataFrame object, with no missing values in those columns.
    """
    # Assign the filled column back instead of calling
    # `dataset.<col>.fillna(..., inplace=True)`: the in-place form operates on
    # an intermediate Series and does not update the frame under pandas
    # Copy-on-Write.
    for column in ("category_name", "brand_name", "item_description"):
        dataset[column] = dataset[column].fillna("missing")
    return dataset

# Fill the gaps in both tables (train first), then print their shapes again.
trainData, testData = handle_missing(trainData), handle_missing(testData)
print(trainData.shape, testData.shape, sep="\n")

In [None]:
# Split "a/b/c[/d[/e]]" category paths into five level columns. Levels 4 and 5,
# when present, are folded into category3 as "c&d" or "c&d&e".
category_columns = ['category1', 'category2', 'category3', 'category4', 'category5']
sep = "&"

# n=4 caps the split at five parts, so an unexpectedly deep path cannot yield
# more columns than there are names. reindex pads shallower data with empty
# columns. expand=True keeps trainData's index, so the concat below aligns
# row-for-row even if the frame does not have a default RangeIndex.
categoryAll = (
    trainData.category_name.str.split('/', n=4, expand=True)
    .reindex(columns=range(len(category_columns)))
)
categoryAll.columns = category_columns

# Vectorised merge of the deeper levels. This replaces the per-row loop with
# chained assignment (`trainData['category3'][i] = ...`), which is slow and is
# not guaranteed to write through to the frame.
for extra in ('category4', 'category5'):
    present = categoryAll[extra].notna()
    if present.any():
        categoryAll.loc[present, 'category3'] = (
            categoryAll.loc[present, 'category3'] + sep + categoryAll.loc[present, extra]
        )

# Drop level columns left over from a previous run so that re-executing this
# cell does not create duplicate column names.
trainData = pd.concat(
    [trainData.drop(columns=category_columns, errors='ignore'), categoryAll], axis=1
)

In [None]:
# Split "a/b/c[/d[/e]]" category paths into five level columns. Levels 4 and 5,
# when present, are folded into category3 as "c&d" or "c&d&e".
category_columns = ['category1', 'category2', 'category3', 'category4', 'category5']
sep = "&"

# n=4 caps the split at five parts, so an unexpectedly deep path cannot yield
# more columns than there are names. reindex pads shallower data with empty
# columns. expand=True keeps testData's index, so the concat below aligns
# row-for-row even if the frame does not have a default RangeIndex.
categoryAll = (
    testData.category_name.str.split('/', n=4, expand=True)
    .reindex(columns=range(len(category_columns)))
)
categoryAll.columns = category_columns

# Vectorised merge of the deeper levels. This replaces the per-row loop with
# chained assignment (`testData['category3'][i] = ...`), which is slow and is
# not guaranteed to write through to the frame.
for extra in ('category4', 'category5'):
    present = categoryAll[extra].notna()
    if present.any():
        categoryAll.loc[present, 'category3'] = (
            categoryAll.loc[present, 'category3'] + sep + categoryAll.loc[present, extra]
        )

# Drop level columns left over from a previous run so that re-executing this
# cell does not create duplicate column names.
testData = pd.concat(
    [testData.drop(columns=category_columns, errors='ignore'), categoryAll], axis=1
)

In [None]:
# Items with fewer than three category levels have empty level columns. Give
# them the "missing" placeholder in both tables.
# The filled column is assigned back explicitly: calling
# `frame.<col>.fillna(..., inplace=True)` acts on an intermediate Series and
# does not update the frame under pandas Copy-on-Write.
for frame in (trainData, testData):
    for level in ('category1', 'category2', 'category3'):
        frame[level] = frame[level].fillna("missing")

In [None]:
# Integer-encode every categorical column. The encoder is refit per column on
# the stacked train+test values, so both tables share the same code for a
# given label.
le = LabelEncoder()
for column in ('category1', 'category2', 'category3', 'brand_name'):
    le.fit(np.hstack([trainData[column], testData[column]]))
    trainData[column] = le.transform(trainData[column])
    testData[column] = le.transform(testData[column])
del le

In [None]:
from keras.preprocessing.text import Tokenizer
# The vocabulary is fitted on the lower-cased training descriptions and names only.
raw_text = np.hstack([trainData["item_description"].str.lower(), trainData["name"].str.lower()])

print("Tokenizing...")
tok_raw = Tokenizer()
tok_raw.fit_on_texts(raw_text)
print("Transforming...")

# Convert each text column to token-id sequences, train then test, using the
# same fitted tokenizer for both tables.
for text_column, seq_column in (("item_description", "seq_item_description"),
                                ("name", "seq_name")):
    trainData[seq_column] = tok_raw.texts_to_sequences(trainData[text_column].str.lower())
    testData[seq_column] = tok_raw.texts_to_sequences(testData[text_column].str.lower())

In [None]:
# Longest tokenised name / description (in tokens) across train and test.
max_name_seq = np.max([
    trainData.seq_name.map(len).max(),
    testData.seq_name.map(len).max(),
])
max_seq_item_description = np.max([
    trainData.seq_item_description.map(len).max(),
    testData.seq_item_description.map(len).max(),
])

In [None]:
#EMBEDDINGS MAX VALUE
#Based on the histograms, we select the following lengths
MAX_NAME_SEQ = 10
MAX_ITEM_DESC_SEQ = 50
# The text embedding table must be larger than every token id that can be
# looked up. Scan every token of every sequence: `Series.max()` on a column of
# lists compares whole lists and returns a single sequence, so taking the
# maximum of that one sequence can miss the largest id and leave the table
# too small. `default=0` covers empty sequences and empty frames.
MAX_TEXT = max(
    (
        max(tokens, default=0)
        for frame in (trainData, testData)
        for column in ("seq_name", "seq_item_description")
        for tokens in frame[column]
    ),
    default=0,
) + 2
# Label-encoded columns are 0-based, so the table size is the largest code + 1.
MAX_CATEGORY1 = np.max([trainData.category1.max(), testData.category1.max()])+1
MAX_CATEGORY2 = np.max([trainData.category2.max(), testData.category2.max()])+1
MAX_CATEGORY3 = np.max([trainData.category3.max(), testData.category3.max()])+1

MAX_BRAND = np.max([trainData.brand_name.max(), testData.brand_name.max()])+1
MAX_CONDITION = np.max([trainData.item_condition_id.max(), testData.item_condition_id.max()])+1

In [None]:
# Regress on log(price + 1), rescaled to the range [-1, 1].
trainData["target"] = np.log(trainData.price+1)
target_scaler = MinMaxScaler(feature_range=(-1, 1))
# `Series.reshape` no longer exists in pandas: reshape the underlying ndarray
# to the (n_samples, 1) layout the scaler expects, then flatten the result
# back to 1-D so it is stored as an ordinary column.
trainData["target"] = target_scaler.fit_transform(
    trainData["target"].values.reshape(-1, 1)
).ravel()

In [None]:
#EXTRACT DEVELOPMENT SET
# Keep 99% of the rows for training; the remainder becomes the validation set.
dtrain, dvalid = train_test_split(trainData, train_size=0.99, random_state=123)
print(dtrain.shape, dvalid.shape, sep="\n")

In [None]:
#KERAS DATA DEFINITION
from keras.preprocessing.sequence import pad_sequences

def get_keras_data(dataset):
    """Assemble the dict of named input arrays from a prepared frame.

    The two token-sequence columns are passed through ``pad_sequences`` with
    ``maxlen`` set to ``MAX_NAME_SEQ`` / ``MAX_ITEM_DESC_SEQ``. The encoded
    categorical columns are converted to arrays as they are, and ``shipping``
    is kept two-dimensional under the key ``num_vars``.

    Args:
        dataset: DataFrame holding the ``seq_*``, category, brand, condition
            and ``shipping`` columns.

    Returns:
        dict mapping input name to numpy array.
    """
    inputs = {}
    inputs['name'] = pad_sequences(dataset.seq_name, maxlen=MAX_NAME_SEQ)
    inputs['item_desc'] = pad_sequences(dataset.seq_item_description, maxlen=MAX_ITEM_DESC_SEQ)

    # (input key, source column) pairs for the single-valued categorical inputs.
    categorical_sources = (
        ('brand_name', 'brand_name'),
        ('category1', 'category1'),
        ('category2', 'category2'),
        ('category3', 'category3'),
        ('item_condition', 'item_condition_id'),
    )
    for key, column in categorical_sources:
        inputs[key] = np.array(dataset[column])

    inputs['num_vars'] = np.array(dataset[["shipping"]])
    return inputs

# Build the model inputs for the training, validation and test tables (in that order).
X_train, X_valid, X_test = (
    get_keras_data(part) for part in (dtrain, dvalid, testData)
)

In [None]:
#KERAS MODEL DEFINITION
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation, concatenate, GRU, Embedding, Flatten, BatchNormalization
from keras.models import Model
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras import backend as K
def get_model():
    """Build and compile the multi-input Keras price-regression network.

    The text inputs (``name``, ``item_desc``) are embedded and each passed
    through one Conv1D + MaxPooling1D stage. The categorical inputs
    (``brand_name``, ``category1``-``category3``, ``item_condition``) are
    embedded directly. All branches are flattened, concatenated with the raw
    ``num_vars`` input and fed through two Dense + Dropout layers into a
    single linear output unit.

    Reads the module-level ``X_train`` (for input widths) and the ``MAX_*``
    embedding-size constants, so those must be defined before calling.

    Returns:
        A ``Model`` compiled with MSE loss, the Adam optimizer and an MAE metric.
    """
    #params
    dr_r = 0.1  # dropout rate applied after each hidden Dense layer
    
    #Inputs
    # Widths of the sequence and numeric inputs are taken from the arrays
    # already built in X_train; every categorical input is a single id.
    name = Input(shape=[X_train["name"].shape[1]], name="name")
    item_desc = Input(shape=[X_train["item_desc"].shape[1]], name="item_desc")
    brand_name = Input(shape=[1], name="brand_name")
    category1 = Input(shape=[1], name="category1")
    category2 = Input(shape=[1], name="category2")
    category3 = Input(shape=[1], name="category3")
    item_condition = Input(shape=[1], name="item_condition")
    num_vars = Input(shape=[X_train["num_vars"].shape[1]], name="num_vars")
    
    #Embeddings layers
    # First argument is the table size (MAX_* constant), second the vector width.
    # name and item_desc use separate embedding tables of the same size.
    emb_name = Embedding(MAX_TEXT, 10)(name)
    emb_item_desc = Embedding(MAX_TEXT, 50)(item_desc)
    emb_brand_name = Embedding(MAX_BRAND, 10)(brand_name)
    emb_category1 = Embedding(MAX_CATEGORY1, 10)(category1)
    emb_category2 = Embedding(MAX_CATEGORY2, 10)(category2)
    emb_category3 = Embedding(MAX_CATEGORY3, 10)(category3)
    emb_item_condition = Embedding(MAX_CONDITION, 5)(item_condition)
    
    # Convolution (kernel size 2) + max pooling (pool size 2) over each text
    # branch: 64 filters for the description, 32 for the name.
    cnn_layer1 = Conv1D(64,2,activation = 'relu', strides=1, padding='valid')(emb_item_desc)
    pooling_layer1 = MaxPooling1D(2)(cnn_layer1)
    cnn_layer2 = Conv1D(32,2,activation = 'relu', strides=1, padding='valid')(emb_name)
    pooling_layer2 = MaxPooling1D(2)(cnn_layer2)
    #main layer
    # Flatten every branch to 2-D and join them with the raw numeric input.
    main_l = concatenate([
        Flatten() (emb_brand_name)
        , Flatten() (emb_category1)
        , Flatten() (emb_category2)
        , Flatten() (emb_category3)
        , Flatten() (emb_item_condition)
        ,Flatten() (pooling_layer1)
        ,Flatten() (pooling_layer2)
        , num_vars
    ])
    # NOTE(review): no activation is passed to these Dense layers, so with the
    # Keras default they are presumably linear - confirm this is intended.
    main_l = Dropout(dr_r) (Dense(128) (main_l))
    main_l = Dropout(dr_r) (Dense(64) (main_l))
    
    #output
    output = Dense(1, activation="linear") (main_l)
    
    #model
    # The input list order here does not matter for fitting with a dict of
    # arrays keyed by the Input names above.
    model = Model([name, item_desc, brand_name
                   , category1 , category2 , category3, item_condition, num_vars], output)
    model.compile(loss="mse", optimizer="adam", metrics=["mae"])
    return model

    
# Build one instance and print its layer-by-layer summary.
model = get_model()
model.summary()

In [None]:
#FITTING THE MODEL
BATCH_SIZE = 20000
epochs = 5

# Rebuild the network, then train it while reporting loss/metrics on the
# validation split after each epoch.
model = get_model()
model.fit(
    x=X_train,
    y=dtrain.target,
    batch_size=BATCH_SIZE,
    epochs=epochs,
    verbose=1,
    validation_data=(X_valid, dvalid.target),
)

In [None]:
# Predict in the scaled target space, undo the min-max scaling, then invert
# the log(price + 1) transform to get prices.
predict = np.exp(
    target_scaler.inverse_transform(model.predict(X_test, batch_size=BATCH_SIZE))
) - 1


In [None]:
# Take an explicit copy: adding a column to the bare `testData[["test_id"]]`
# selection triggers pandas' SettingWithCopyWarning.
submission = testData[["test_id"]].copy()
# `predict` is expected to be (n_rows, 1), since the model has a single output
# unit. Flatten it so the price column receives a plain 1-D array.
submission["price"] = np.ravel(predict)
submission.to_csv("cnn_submission.csv", index=False)