# Assignmet 3


In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import * # Dense, Embedding, LSTM
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

import pandas as pd
import numpy as np
import datetime
import seaborn as sns 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
 


Using TensorFlow backend.


Lets look at the data in each dataset in the input.

In [2]:
train = pd.read_csv('../input/train.csv',encoding='latin1')
train.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67


In [3]:
test = pd.read_csv('../input/test.csv',encoding='latin1')
test.head()

Unnamed: 0,id,product_uid,product_title,search_term
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,90 degree bracket
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,metal l brackets
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,simpson sku able
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong ties
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,simpson strong tie hcc668


In [4]:
product_descriptions = pd.read_csv('../input/product_descriptions.csv',encoding='latin1')
product_descriptions.head()

Unnamed: 0,product_uid,product_description
0,100001,"Not only do angles make joints stronger, they ..."
1,100002,BEHR Premium Textured DECKOVER is an innovativ...
2,100003,Classic architecture meets contemporary design...
3,100004,The Grape Solar 265-Watt Polycrystalline PV So...
4,100005,Update your bathroom with the Delta Vero Singl...


In [5]:
attributes = pd.read_csv('../input/attributes.csv',encoding='latin1')
attributes.head()

Unnamed: 0,product_uid,name,value
0,100001.0,Bullet01,Versatile connector for various 90Â° connectio...
1,100001.0,Bullet02,Stronger than angled nailing or screw fastenin...
2,100001.0,Bullet03,Help ensure joints are consistently straight a...
3,100001.0,Bullet04,Dimensions: 3 in. x 3 in. x 1-1/2 in.
4,100001.0,Bullet05,Made from 12-Gauge steel


## Preprocessing

In order to have the products corresponding product description, we merge the train and test datasets with the product_description table.

In [6]:
mergedTrain = pd.merge(train, product_descriptions, how='inner', on='product_uid')
mergedTrain.search_term = mergedTrain.search_term.apply(lambda x: x.lower())
mergedTrain.product_description = mergedTrain.product_description.apply(lambda x: x.lower())
mergedTrain.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,3.0,"not only do angles make joints stronger, they ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,2.5,"not only do angles make joints stronger, they ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,deck over,3.0,behr premium textured deckover is an innovativ...
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,rain shower head,2.33,update your bathroom with the delta vero singl...
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,shower only faucet,2.67,update your bathroom with the delta vero singl...


In [7]:
# move to end
mergedTest= pd.merge(test, product_descriptions, how='inner', on='product_uid')
mergedTest.search_term = mergedTrain.search_term.apply(lambda x: x.lower())
mergedTest.product_description = mergedTrain.product_description.apply(lambda x: x.lower())
mergedTest.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,angle bracket,"not only do angles make joints stronger, they ..."
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,l bracket,"not only do angles make joints stronger, they ..."
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,deck over,behr premium textured deckover is an innovativ...
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,rain shower head,update your bathroom with the delta vero singl...
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,shower only faucet,update your bathroom with the delta vero singl...


We convert the product_description and search_term features to series of characters.

In [8]:
search_term_chars = []
product_description_chars = []
search_term_chars = mergedTrain.search_term.apply(lambda x: search_term_chars + list(x))
product_description_chars = mergedTrain.product_description.apply(lambda x: product_description_chars + list(x))
search_term_chars = [item for sublist in search_term_chars for item in sublist]
product_description_chars = [item for sublist in product_description_chars for item in sublist]


In [9]:
search_term_char_set = sorted(set(search_term_chars))
product_description_char_set = sorted(set(product_description_chars))
search_term_char_to_int = dict((c, i) for i, c in enumerate(search_term_char_set))
search_term_int_to_char = dict((i, c) for i, c in enumerate(search_term_char_set))
product_description_char_to_int = dict((c, i) for i, c in enumerate(product_description_char_set))
product_description_int_to_char = dict((i, c) for i, c in enumerate(product_description_char_set))

In [10]:
n_chars = len(search_term_chars)
n_vocab = len(search_term_char_set)
print("search_term Total Characters: ", n_chars)
print("search_term Total Vocab: ", n_vocab)

search_term Total Characters:  1407926
search_term Total Vocab:  51


In [11]:
n_chars2 = len(product_description_chars)
n_vocab2 = len(product_description_char_set)
print("product_description Total Characters: ", n_chars2)
print("product_description Total Vocab: ", n_vocab2)

product_description Total Characters:  65598457
product_description Total Vocab:  67


In [12]:
mergedTrain.search_term = mergedTrain.search_term.apply(lambda x: list(x))
mergedTrain.product_description = mergedTrain.product_description.apply(lambda x: list(x))
mergedTrain.head()

Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,"[a, n, g, l, e, , b, r, a, c, k, e, t]",3.0,"[n, o, t, , o, n, l, y, , d, o, , a, n, g, ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,"[l, , b, r, a, c, k, e, t]",2.5,"[n, o, t, , o, n, l, y, , d, o, , a, n, g, ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,"[d, e, c, k, , o, v, e, r]",3.0,"[b, e, h, r, , p, r, e, m, i, u, m, , t, e, ..."
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,"[r, a, i, n, , s, h, o, w, e, r, , h, e, a, d]",2.33,"[u, p, d, a, t, e, , y, o, u, r, , b, a, t, ..."
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,"[s, h, o, w, e, r, , o, n, l, y, , f, a, u, ...",2.67,"[u, p, d, a, t, e, , y, o, u, r, , b, a, t, ..."


In [13]:
# move to end
mergedTest.search_term = mergedTrain.search_term.apply(lambda x: list(x))
mergedTest.product_description = mergedTrain.product_description.apply(lambda x: list(x))
mergedTest.head()

Unnamed: 0,id,product_uid,product_title,search_term,product_description
0,1,100001,Simpson Strong-Tie 12-Gauge Angle,"[a, n, g, l, e, , b, r, a, c, k, e, t]","[n, o, t, , o, n, l, y, , d, o, , a, n, g, ..."
1,4,100001,Simpson Strong-Tie 12-Gauge Angle,"[l, , b, r, a, c, k, e, t]","[n, o, t, , o, n, l, y, , d, o, , a, n, g, ..."
2,5,100001,Simpson Strong-Tie 12-Gauge Angle,"[d, e, c, k, , o, v, e, r]","[b, e, h, r, , p, r, e, m, i, u, m, , t, e, ..."
3,6,100001,Simpson Strong-Tie 12-Gauge Angle,"[r, a, i, n, , s, h, o, w, e, r, , h, e, a, d]","[u, p, d, a, t, e, , y, o, u, r, , b, a, t, ..."
4,7,100001,Simpson Strong-Tie 12-Gauge Angle,"[s, h, o, w, e, r, , o, n, l, y, , f, a, u, ...","[u, p, d, a, t, e, , y, o, u, r, , b, a, t, ..."


In [14]:
def createData(char_to_int, char_arr):
    #seq_length = 100
    dataX = []
    for i in range(0,len(char_arr)):
        dataX.append(char_to_int[char_arr[i]])
    return dataX

In [15]:
mergedTrain.search_term = mergedTrain.search_term.apply(lambda x: createData(search_term_char_to_int, x))
mergedTrain.product_description = mergedTrain.product_description.apply(lambda x: createData(product_description_char_to_int, x))
mergedTrain.head()


Unnamed: 0,id,product_uid,product_title,search_term,relevance,product_description
0,2,100001,Simpson Strong-Tie 12-Gauge Angle,"[24, 37, 30, 35, 28, 0, 25, 41, 24, 26, 34, 28...",3.0,"[52, 53, 58, 0, 53, 52, 50, 63, 0, 42, 53, 0, ..."
1,3,100001,Simpson Strong-Tie 12-Gauge Angle,"[35, 0, 25, 41, 24, 26, 34, 28, 43]",2.5,"[52, 53, 58, 0, 53, 52, 50, 63, 0, 42, 53, 0, ..."
2,9,100002,BEHR Premium Textured DeckOver 1-gal. #SC-141 ...,"[27, 28, 26, 34, 0, 38, 45, 28, 41]",3.0,"[40, 43, 46, 56, 0, 54, 56, 43, 51, 47, 59, 51..."
3,16,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,"[41, 24, 32, 37, 0, 42, 31, 38, 46, 28, 41, 0,...",2.33,"[59, 54, 42, 39, 58, 43, 0, 63, 53, 59, 56, 0,..."
4,17,100005,Delta Vero 1-Handle Shower Only Faucet Trim Ki...,"[42, 31, 38, 46, 28, 41, 0, 38, 37, 35, 48, 0,...",2.67,"[59, 54, 42, 39, 58, 43, 0, 63, 53, 59, 56, 0,..."


In [17]:
mergedTrain.shape

(74067, 6)