In [1]:
%%time
import warnings
warnings.filterwarnings("ignore")
import math
import time
import pickle
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
from tqdm import tqdm,tqdm_notebook
from contextlib import contextmanager
import os
import re
import gc
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input , Dropout, Flatten,concatenate,LSTM
from tensorflow.keras.layers import Embedding
import tensorflow.keras
from tensorflow.keras.models import Model,load_model
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import ModelCheckpoint,TensorBoard,ReduceLROnPlateau, EarlyStopping
from tensorflow.keras import backend as K
from tensorflow.keras import optimizers

CPU times: user 1.53 s, sys: 206 ms, total: 1.73 s
Wall time: 1.91 s


In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# https://gist.github.com/sebleier/554280
def decontracted(phrase):
    """Expand common English contractions in ``phrase`` and return the result.

    The substitutions are applied one after another in the order listed
    below: the two irregular forms first, then the generic suffixes.
    ``n't`` is handled before the bare ``'t`` so that "don't" becomes
    "do not" rather than "don not".
    """
    rewrites = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in rewrites:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

# Rebinds the name `stopwords` (imported above from nltk.corpus) to the English
# stop-word collection; clean_text_data reads this module-level name.
stopwords = stopwords.words('english')

def clean_text_data(data):
    """Normalise an iterable of text strings and return a list of cleaned strings.

    Per string, in order:
      1. expand contractions with ``decontracted``;
      2. blank out the literal two-character sequences backslash-r,
         backslash-quote and backslash-n;
      3. replace every run of characters outside ``[A-Za-z0-9]`` with one space;
      4. drop tokens found in the module-level ``stopwords`` collection
         (this comparison happens before lower-casing, so it is case-sensitive);
      5. lower-case and strip the result.

    A tqdm progress bar is shown while iterating.
    """
    # Build the lookup once per call: set membership is constant-time, whereas
    # testing each token against the raw stop-word sequence scans it every time.
    stop_words = set(stopwords)

    preprocessed = []
    # tqdm is for printing the status bar
    for sentance in tqdm(data):
        sent = decontracted(sentance)
        sent = sent.replace('\\r', ' ')
        sent = sent.replace('\\"', ' ')
        sent = sent.replace('\\n', ' ')
        sent = re.sub('[^A-Za-z0-9]+', ' ', sent)
        # https://gist.github.com/sebleier/554280
        sent = ' '.join(e for e in sent.split() if e not in stop_words)
        preprocessed.append(sent.lower().strip())
    return preprocessed

In [2]:
def clean_cat_data(cat_column):
    '''Return a list of normalised category strings, one per value in ``cat_column``.

    Every run of characters outside ``[A-Za-z0-9]`` (spaces, '&', '-', '/', ...)
    is collapsed to a single underscore and the result is lower-cased, so
    differently punctuated spellings of the same category (for example
    "computer&mobiles" and "computer mobiles") map to the same token.

    Leading or trailing separators are NOT trimmed: "a/b " becomes "a_b_".
    That matches what the previous implementation produced (its ``strip()``
    ran after spaces had already become underscores and so removed nothing).
    NOTE(review): presumably the pickled category vectorizer was fitted on
    strings cleaned this way, so the output is deliberately left unchanged -
    confirm before trimming underscores.
    '''
    cat_list = []
    for raw in tqdm(list(cat_column)):
        # One substitution replaces the old "regex -> ' '" followed by
        # "' ' -> '_'"; the later '-' and ' & ' replacements could never match
        # because the regex had already removed those characters.
        cat_list.append(re.sub('[^A-Za-z0-9]+', '_', raw).lower())

    return cat_list

In [3]:
def concat(x):
    """Build the text/categorical feature frame used by the sparse models.

    Missing values in the string columns are replaced with '' before joining.

    Parameters
    ----------
    x : pandas.DataFrame
        Must provide the columns ``name``, ``brand_name``, ``item_description``,
        ``category_name``, ``shipping`` and ``item_condition_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns, in this order:
        ``name``  = name + ' ' + brand_name,
        ``text``  = item_description + ' ' + (combined name) + ' ' + category_name,
        ``cat``   = category_name + ' ' + brand_name,
        ``shipping`` and ``item_condition_id`` (passed through).

    The input frame is left untouched. Previously this function overwrote
    ``x['name']`` and added ``text``/``cat`` to the caller's frame, so any
    later step reading ``x['name']`` saw the brand already appended, and
    calling it twice on the same frame appended the brand twice.
    """
    brand = x['brand_name'].fillna('')
    category = x['category_name'].fillna('')
    name = x['name'].fillna('') + ' ' + brand
    text = x['item_description'].fillna('') + ' ' + name + ' ' + category
    cat = category + ' ' + brand
    return pd.DataFrame({
        'name': name,
        'text': text,
        'cat': cat,
        'shipping': x['shipping'],
        'item_condition_id': x['item_condition_id'],
    })

In [4]:
def text_encoder(test,col):
    """Transform ``test`` with a pickled, pre-fitted vectorizer.

    ``col == 'name'`` loads ``name.pkl``; any other value loads ``text.pkl``.
    The loaded object's ``transform(test)`` result is returned as-is.

    NOTE: ``pickle.load`` can execute arbitrary code contained in the file;
    only load artefacts produced by a trusted training run.
    """
    path = 'name.pkl' if col == 'name' else 'text.pkl'
    # Context manager so the file handle is closed deterministically.
    with open(path, 'rb') as handle:
        vectorizer = pickle.load(handle)  # pre trained vectorizer on train data
    return vectorizer.transform(test)

In [5]:
def cat_encoder(col):
    """Transform ``col`` with the pickled, pre-fitted vectorizer in ``cat.pkl``.

    Returns whatever the loaded object's ``transform(col)`` returns.

    NOTE: ``pickle.load`` can execute arbitrary code contained in the file;
    only load artefacts produced by a trusted training run.
    """
    # Context manager so the file handle is closed deterministically.
    with open('cat.pkl', 'rb') as handle:
        vectorizer = pickle.load(handle)  # pre trained vectorizer on train data
    return vectorizer.transform(col)

In [6]:
def dummy_encoder(test, categories=None):
    """One-hot encode ``item_condition_id`` and ``shipping`` into a CSR matrix.

    Parameters
    ----------
    test : pandas.DataFrame
        Must contain the columns ``item_condition_id`` and ``shipping``.
    categories : dict, optional
        Maps a column name to the full list of levels that column should be
        encoded against, e.g. ``{'shipping': [0, 1]}``. Columns not listed are
        encoded from the values present in ``test``.

    Returns
    -------
    scipy.sparse.csr_matrix
        One row per input row, one column per dummy indicator.

    Without ``categories`` the number of output columns depends on which
    values (or categorical levels) occur in ``test``: a single-row frame with
    freshly cast ``category`` columns yields just two columns.
    NOTE(review): downstream models presumably expect the width seen during
    training - pass the training levels via ``categories`` to pin it.
    """
    frame = test[["item_condition_id", "shipping"]]
    if categories is not None:
        frame = frame.copy()  # never touch the caller's frame
        for column in list(frame.columns):
            if column in categories:
                # Fixed level list -> get_dummies emits a column per level,
                # including levels that do not occur in this batch.
                frame[column] = pd.Categorical(frame[column], categories=categories[column])
    dummies = pd.get_dummies(frame, sparse = True)
    return scipy.sparse.csr_matrix(dummies.values)

In [7]:
# Looks up pre-computed log-price statistics for each row's
# category/brand/shipping group and rescales them with a pickled scaler.
def generate_category_features(cv):
    """Return scaled group-level price features for every row of ``cv``.

    Reads the per-group statistics from the CSV file ``train_cbs`` (columns
    used: ``cat_brand_ship``, ``cbs_log_price_mean``, ``cbs_log_price_std``),
    derives a price band of mean +/- 2 standard deviations (the lower bound is
    clipped at 1.0), left-joins those onto ``cv`` by ``cat_brand_ship`` and
    fills unmatched groups with 0.

    Parameters
    ----------
    cv : pandas.DataFrame
        Must contain a ``cat_brand_ship`` column.

    Returns
    -------
    The result of ``scaler.transform`` applied to the (n_rows, 3) array of
    ``[mean, min_expected, max_expected]`` values, where ``scaler`` is the
    object unpickled from ``scaler.pkl``.

    NOTE: ``pickle.load`` can execute arbitrary code contained in the file;
    only load artefacts produced by a trusted training run.
    """
    feature_cols = ['cbs_log_price_mean',
                    'cbs_min_expected_log_price',
                    'cbs_max_expected_log_price']

    df_group = pd.read_csv('train_cbs')
    # Groups with a single observation have no standard deviation; treat as 0.
    log_std = df_group['cbs_log_price_std'].fillna(0)
    log_mean = df_group['cbs_log_price_mean']
    df_group['cbs_min_expected_log_price'] = (log_mean - (log_std*2)).clip(lower=1.0)
    df_group['cbs_max_expected_log_price'] = (log_mean + (log_std*2))

    # Only the join key and the three emitted features are merged; the
    # coefficient-of-variation and log-count columns that used to be computed
    # here were never part of the output.
    df_group_stats = cv.merge(df_group[['cat_brand_ship'] + feature_cols],
                              how = 'left',
                              on = 'cat_brand_ship')[feature_cols].fillna(0).values

    with open('scaler.pkl', 'rb') as handle:
        scaler = pickle.load(handle)
    cbs_feats_scaled = scaler.transform(df_group_stats)
    return cbs_feats_scaled

In [8]:
def category_encoding(cv,col):
    """Map each value in ``cv`` to its stored integer rank.

    The value -> rank table is read from ``lstm_ship.csv`` when
    ``col == 'ship'`` and from ``lstm_cond.csv`` otherwise; both files must
    have the columns ``unique`` (category value) and ``cat_rank`` (its rank).
    Values that do not appear in ``unique`` are encoded as 0.

    Returns a numpy array with one entry per element of ``cv``.

    Fixes over the previous version: the result list is now initialised, the
    encoding loop runs for both ``'ship'`` and the other branch (it used to be
    nested under ``else``), and the rank is looked up by category value
    instead of using the value as a positional index into ``cat_rank``.
    """
    path = 'lstm_ship.csv' if col == 'ship' else 'lstm_cond.csv'
    l_cat = pd.read_csv(path)
    # Pair each category value with the rank stored on the same row.
    rank_of = dict(zip(l_cat['unique'].tolist(), l_cat['cat_rank'].tolist()))
    encoded_cv = [rank_of.get(category, 0) for category in cv]
    return np.asarray(encoded_cv)

In [9]:
def text_concat(df):
    """Join name, brand, description and category into one text column.

    Each of the four columns is cast with ``astype(str)`` and then passed
    through ``fillna('')``; the pieces are joined with single spaces in the
    order name, brand_name, item_description, category_name. The result is
    stored in ``df['text']`` (the frame is modified in place) and that column
    is returned.

    NOTE(review): the string cast happens before ``fillna``; on pandas
    versions where ``astype(str)`` renders missing values as the text "nan"
    the ``fillna`` then has nothing to fill. Presumably the tokenizer was
    fitted on text built the same way - confirm before changing the order.
    """
    fields = ('name', 'brand_name', 'item_description', 'category_name')
    pieces = [df[field].astype(str).fillna('') for field in fields]

    combined = pieces[0]
    for piece in pieces[1:]:
        combined = combined + ' ' + piece

    df['text'] = combined
    return df['text']

In [10]:
# Converts texts to token sequences with the tokenizer object stored in tokenizer.pkl.
def tokenize(cv):
    """Return ``texts_to_sequences(cv)`` from the pickled tokenizer.

    The tokenizer is loaded from ``tokenizer.pkl`` on every call; only the
    tokenised sequences are returned (not the tokenizer itself).

    NOTE: ``pickle.load`` can execute arbitrary code contained in the file;
    only load artefacts produced by a trusted training run.
    """
    # Context manager so the file handle is closed deterministically.
    with open('tokenizer.pkl', 'rb') as handle:
        tok = pickle.load(handle)
    return tok.texts_to_sequences(cv)

In [11]:
# Pre-pads the given token sequences to a caller-supplied length.
def padding(cv,sl):
    """Return ``cv`` passed through ``pad_sequences`` with ``maxlen=sl`` and ``padding='pre'``."""
    return pad_sequences(cv, padding='pre', maxlen=sl)

In [12]:
def lstm_data(x):
    """Prepare the four inputs fed to the LSTM model.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain ``name``, ``brand_name``, ``item_description``,
        ``category_name``, ``shipping`` and ``item_condition_id``.

    Returns
    -------
    list
        ``[padded_text, shipping_codes, condition_codes, group_features]``:
        the padded token sequences, the encoded ``shipping`` and
        ``item_condition_id`` values, and the output of
        ``generate_category_features``.

    The work is done on a copy of ``x`` so the helper columns created here
    (``text`` and ``cat_brand_ship``) do not leak into the caller's frame.
    """
    # from our basic analysis, as most of the sentences has around 100 length, we take maximum length as 100
    max_len = 100

    x = x.copy()

    x['text'] = text_concat(x)
    x_text = tokenize(clean_text_data(x['text']))
    x_text_pad = padding(x_text, max_len)

    x_ship = category_encoding(x['shipping'].astype('category'), 'ship')
    x_cond = category_encoding(x['item_condition_id'].astype('category'), 'cond')

    # merge all category features into a single key column used for the group lookup
    x['cat_brand_ship'] = (x['category_name'].astype(str) + "_" +
                           x['brand_name'].astype(str) + "_" +
                           x['shipping'].astype(str))
    x_cbs_feats = generate_category_features(x)

    return [x_text_pad, np.array(x_ship), np.array(x_cond), x_cbs_feats]
           


Final function 1: takes the items' features as a pandas DataFrame with the following columns:
["train_id"-numerical,
"name" - string,
"item_condition_id" - numerical,
"category_name" - string,
"brand_name" -string,
"shipping" - numerical,
"item_description" - string]

It returns the predicted price for each row of the given DataFrame.

In [42]:
def final_fun_1(x):
    """Predict a price for every row of ``x`` with the ridge/MLP/LSTM ensemble.

    Parameters
    ----------
    x : pandas.DataFrame
        Item features: ``name``, ``item_condition_id``, ``category_name``,
        ``brand_name``, ``shipping`` and ``item_description``.

    Returns
    -------
    numpy.ndarray
        Weighted blend ``0.23*ridge + 0.61*mlp + 0.16*lstm`` of the three
        models' predictions, each mapped back through
        ``y_scaler.inverse_transform`` and ``np.expm1``.

    Reads these artefacts from the working directory: ``y_scaler.pkl``,
    ``ridge.pkl``, ``mlp.h5``, ``lstm.h5`` (plus the files used by the
    encoder helpers).

    NOTE: ``pickle.load`` can execute arbitrary code contained in the file;
    only load artefacts produced by a trusted training run.
    NOTE(review): the width of the dummy-encoded block depends on the values
    present in ``x``; presumably it has to match what the models were trained
    on - confirm for small inputs.
    """
    def _load_pickle(path):
        # Open/close explicitly so no file handle is left dangling.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # cleaning and preprocessing the data
    concatenated_x = concat(x)
    cleaned_text = clean_text_data(concatenated_x['text'])
    cleaned_name = clean_text_data(concatenated_x['name'])
    cleaned_cat = clean_cat_data(concatenated_x['cat'])

    final_name = text_encoder(cleaned_name, 'name')
    final_text = text_encoder(cleaned_text, 'text')
    # was assigned to `finl_cat`, which made the hstack below raise NameError
    final_cat = cat_encoder(cleaned_cat)
    final_dummies = dummy_encoder(pd.DataFrame({"shipping" : x["shipping"].astype("category"),
                                                "item_condition_id" : x["item_condition_id"].astype("category")}))

    # final data with sparse vectors, shared by the mlp and ridge models
    final_data = scipy.sparse.hstack((final_name, final_text, final_cat, final_dummies)).tocsr().astype('float32')

    # preprocess data for the lstm model
    final_lstm_data = lstm_data(x)

    # scaler used to invert the target scaling applied to the predictions
    y_scaler = _load_pickle('y_scaler.pkl')

    def _to_price(scaled_preds):
        # Undo the target scaling, then invert log1p.
        return np.expm1(y_scaler.inverse_transform(scaled_preds.reshape(-1, 1))[:, 0])

    # ridge model
    ridge = _load_pickle('ridge.pkl')
    preds_ridge = _to_price(ridge.predict(final_data)[:, 0])
    # mlp model
    mlp = load_model('mlp.h5')
    preds_mlp = _to_price(mlp.predict(final_data)[:, 0])
    # lstm model
    lstm = load_model('lstm.h5')
    preds_lstm = _to_price(lstm.predict(final_lstm_data)[:, 0])

    # final prediction by combining all the above results;
    # these are the best weights we got after experimenting with test data
    final_prediction = 0.23*preds_ridge + 0.61*preds_mlp + 0.16*preds_lstm

    return final_prediction

In [21]:
# Demo: predict the price of a single item taken from train.tsv.
train = pd.read_csv('train.tsv',sep = '\t')
# Take an independent one-row frame: code downstream may assign columns, and
# doing that on a bare slice of `train` is chained assignment.
example = train.iloc[1:2].copy()
predicted_price = final_fun_1(example)
print('Predicted Price for ',example.values,"\n is \n",predicted_price)

Predicted Price for  [[1 '25 pcs NEW 7.5"x12" Kraft Bubble Mailers' 1
  'Other/Office supplies/Shipping Supplies' nan 1
  '25 pcs NEW 7.5"x12" Kraft Bubble Mailers Lined with bubble wrap for protection Self Sealing (peel-and-seal), adhesive keeps contents secure and tamper proof Durable and lightweight Kraft material helps save on postage Approved by UPS, FedEx, and USPS.']] 
 is 
 [35.37330328]


In [43]:
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between true and predicted values.

    Parameters
    ----------
    y : array-like
        True values.
    y_pred : array-like
        Predicted values; negatives are treated as 0 before taking the log
        (https://www.kaggle.com/c/bike-sharing-demand/discussion/18942).

    Returns
    -------
    float
        ``sqrt(mean((log(1 + y_pred) - log(1 + y)) ** 2))``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in length.

    Unlike the previous version this does not modify ``y_pred`` in place and
    accepts plain lists as well as numpy arrays.
    """
    actual = np.asarray(y, dtype=float).ravel()
    # np.clip returns a new array, so the caller's predictions stay untouched.
    predicted = np.clip(np.asarray(y_pred, dtype=float).ravel(), 0, None)

    if actual.size != predicted.size:
        raise ValueError("y and y_pred must have the same length")
    if actual.size == 0:
        raise ValueError("y and y_pred must not be empty")

    squared_log_error = (np.log1p(predicted) - np.log1p(actual)) ** 2
    return math.sqrt(float(np.mean(squared_log_error)))

Final function 2: takes the items' features as a pandas DataFrame with the following columns:
["train_id"-numerical,
"name" - string,
"item_condition_id" - numerical,
"category_name" - string,
"brand_name" -string,
"shipping" - numerical,
"item_description" - string]

Given these features and the items' actual prices, it returns the RMSLE between the predicted and the actual prices.

In [44]:
def final_fun_2(x, y):
    """Score the ensemble on ``x``: RMSLE between ``y`` (true prices) and ``final_fun_1(x)``."""
    return rmsle(y, final_fun_1(x))

In [28]:
# Demo: score the prediction for a single item against a known price.
train = pd.read_csv('train.tsv',sep = '\t')
# Take an independent one-row frame: code downstream may assign columns, and
# doing that on a bare slice of `train` is chained assignment.
example = train.iloc[1:2].copy()
true_price = [31]
rmsle_score = final_fun_2(example,true_price)
print('RMSLE score for given inputs is ',rmsle_score)

RMSLE score for given inputs is  0.12809917638064006
