In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from gensim.models import *
import string
from nltk.stem.snowball import SnowballStemmer
from scipy.stats.stats import pearsonr
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

stemmer = SnowballStemmer('english')
printable = set(string.printable)
stop_words = set(stopwords.words('english'))

alphanum = []
for x in printable:
    if x.isalnum():
        alphanum.append(x)
        
alphanum = set(alphanum)



In [2]:
# read dataframes
attributes = pd.read_csv('data/attributes.csv')
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
description = pd.read_csv('data/product_descriptions.csv')

In [12]:
# delete columns
train.drop('Unnamed: 0', 1, inplace=True)
description.drop('Unnamed: 0', 1, inplace=True)

# fill na in attributes dataframe
attributes = attributes.fillna("")

In [4]:
# Find all brands in attributes
brand = attributes[ attributes.name == 'MFG Brand Name' ]
brand.columns = ['product_uid', 'name', 'brand_name']

brand.drop('name', 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [5]:
# Find all colors in attributes
color = attributes[ (attributes.name == 'Color/Finish') | (attributes.name == 'Color') ]
color.columns = ['product_uid', 'name', 'color']

color.drop('name', 1, inplace = True)

color.drop_duplicates(['product_uid'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [6]:
# Find all materials in attributes
material = attributes[ attributes.name == 'Material' ]
material.columns = ['product_uid', 'name', 'material']

material.drop('name', 1, inplace = True)

material.drop_duplicates(['product_uid'], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [7]:
# Merge train and test with product descriptions
train = train.set_index('product_uid').join(description.set_index('product_uid'), how = 'left')\
    .reset_index()
test = test.set_index('product_uid').join(description.set_index('product_uid'), how = 'left')\
    .reset_index()

# Merge train and test with brand
train = train.set_index('product_uid').join(brand.set_index('product_uid'), how = 'left')\
    .reset_index()
test = test.set_index('product_uid').join(brand.set_index('product_uid'), how = 'left')\
    .reset_index()

# Merge train and test with color
train = train.set_index('product_uid').join(color.set_index('product_uid'), how = 'left')\
    .reset_index()
test = test.set_index('product_uid').join(color.set_index('product_uid'), how = 'left')\
    .reset_index()

# Merge train and test with material
train = train.set_index('product_uid').join(material.set_index('product_uid'), how = 'left')\
    .reset_index().set_index('id')
test = test.set_index('product_uid').join(material.set_index('product_uid'), how = 'left')\
    .reset_index().set_index('id')

In [8]:
# fill NaN with ""
train = train.fillna("")
test = test.fillna("")

In [9]:
# calc minimum distance between string text2 and substring of text1
# dynamic programming
# delete one char costs 1
# insert one char costs 1
# change one char costs 1
# return value in range [0, 1]
# if return 0 then string text1 apears in text2
# if return 1 then any chars from text1 not apears in text2
def get_dist(text1, text2):
    d = [np.ones(len(text2) + 1) * int(1e9) for i in range(len(text1) + 1)]
    for i in range(len(text1) + 1):
        d[i][0] = 0
        
    for i in range(len(text1) + 1):
        for j in range(len(text2) + 1):
            dv = d[i][j]
            if dv == int(1e9):
                continue
            
            if i < len(text1) and j < len(text2):
                d[i + 1][j + 1] = min(d[i + 1][j + 1], dv + int(text1[i] != text2[j]))
            if i < len(text1):
                d[i + 1][j] = min(d[i + 1][j], dv + 1)
            if j < len(text2):
                d[i][j + 1] = min(d[i][j + 1], dv + 1)
                    
    result = len(text2)
    for i in range(len(text1) + 1):
        result = min(result, d[i][len(text2)])
        
    return float(result) / len(text2)

In [8]:
# create map <product_uid, concatenating of all bulletXX attributes for this product_uid>
bullet = {}

for i in range(attributes.shape[0]):
    id = attributes.index[i]
    
    if str(attributes['name'][id]) == "":
        continue
    
    if str(attributes['name'][id])[0:6] != "Bullet":
        continue
        
    pid = attributes['product_uid'][id]
    
    bullet[pid] = bullet.get(pid, "") +  " " + str(attributes['value'][id])
    
print len(bullet)

86263


In [13]:
# string to words list
# delete all non-ascii symbols
# convert all alpha to lower case
# tokenize with nltk.word_tokenize()
# stem each word with snowball stemmer
def string_to_normalized_word_list(s):
    s = str(filter(lambda x: x in printable, s)).lower()
    return [stemmer.stem(word) for word in nltk.word_tokenize(s)]

# delete all non-alphanumeric symbols
def filter_string_alphanum_only(s):
    return str(filter(lambda x: x in alphanum, s))

In [17]:
# calc feature for brand_name with function get_dist()
def calc_brand_feature(data):
    result = []
    
    for i in range(data.shape[0]):
        text = filter_string_alphanum_only(data.iloc[i]['brand_name']).lower()
        query = filter_string_alphanum_only(data.iloc[i]['search_term']).lower()

        if text == "":
            result.append(0)
        else:
            cur = get_dist(query, text)
            result.append(cur)
        
    return result

In [18]:
# calc feature for product_title with function get_dist()
def calc_title_feature(data):
    result = []
    
    for i in range(data.shape[0]):
        text = filter_string_alphanum_only(data.iloc[i]["product_title"]).lower()
        query = filter_string_alphanum_only(data.iloc[i]['search_term']).lower()

        if text == "":
            result.append(0)
        else:
            cur = get_dist(query, text)
            result.append(cur)
        
    return result

In [19]:
# calc two features for 'param_name'
# for each row in dataframe data: get parameters 'param_name' and 'search_term', normalized them, deleted stop-words 
# and join words of parameter 'param_name' by space
# then finding occurrences of each word (1-gram) of 'search_term' in string of 'param_name' (result1)
#                      and of each pair of adjacent words (bi-gram) of 'search_term' in string of 'param_name' (result2)
# and finally each values divide by number of words in 'search_term'
# so we calc ratio of words (and pairs of adjacent words) that appears in 'param_name' as substrings
def calc_features(data, param_name):
    result1 = []
    result2 = []
    
    for i in range(data.shape[0]):
        text = " ".join(filter(lambda word: word not in stop_words, string_to_normalized_word_list(data.iloc[i][param_name])))
        query = filter(lambda word: word not in stop_words, string_to_normalized_word_list(data.iloc[i]['search_term']))
        
        if len(query) == 0:
            result1.append(0)
        else:        
            cnt1 = 0
            for word in query:
                if text.find(word) != -1:
                    cnt1 += 1
                        
            result1.append(float(cnt1) / len(query))
        
        if len(query) <= 1:
            result2.append(0)
        else:
            cnt2 = 0
            for j in range(len(query) - 1):
                if text.find(query[j] + " " + query[j + 1]) != -1:
                    cnt2 += 1
                        
            result2.append(float(cnt2) / (len(query) - 1))

    return result1, result2

In [25]:
# all as in function calc_features(), but only for bullet attributes instead of 'param_name'
def calc_bullet_features(data):
    result1 = []
    result2 = []
    
    for i in range(data.shape[0]):
        text = " ".join(filter(lambda word: word not in stop_words, 
                               string_to_normalized_word_list(bullet.get(data.iloc[i]['product_uid'], ""))))
        query = filter(lambda word: word not in stop_words, string_to_normalized_word_list(data.iloc[i]['search_term']))
        
        if len(query) == 0:
            result1.append(0)
        else:        
            cnt1 = 0
            for word in query:
                for word2 in text:
                    if word == word2:
                        cnt1 += 1
                        break
                        
            result1.append(float(cnt1) / len(query))
        
        if len(query) <= 1:
            result2.append(0)
        else:
            cnt2 = 0
            for j in range(len(query) - 1):
                for k in range(len(text) - 1):
                    if query[j] == text[k] and query[j + 1] == text[k + 1]:
                        cnt2 += 1
                        break
                        
            result2.append(float(cnt2) / (len(query) - 1))

    return result1, result2

In [21]:
# calc dataframe of features for dataframe data and list param_names of 'param_name's
def calc_X (data, param_names):
    frame_dict = {'ones': np.ones(len(data)).tolist()} # add column with 1s for linear models
    for name in param_names:
        cur = calc_features(data, name)
        id = 1
        for vec in cur:
            frame_dict[name + "_" + str(id)] = vec
            id += 1

    return pd.DataFrame(frame_dict)

In [22]:
# creating dataframes with count features normalized by count of words for parameters 'brand_name', 
# 'product_title', 'product_description', 'color', 'material'
X_train = calc_X(train, ["brand_name", "product_title", "product_description", "color", "material"])
X_test = calc_X(test, ["brand_name", "product_title", "product_description", "color", "material"])

In [23]:
# creating features for 'brand_name' and 'product_title' with function get_dist()
X_train['brand'] = calc_brand_feature(train)
X_test['brand'] = calc_brand_feature(test)

X_train['title'] = calc_title_feature(train)
X_test['title'] = calc_title_feature(test)

In [26]:
# creating features for 'bullet' attributes
cur = calc_bullet_features(train)
X_train['bullet_1'] = cur[0]
X_train['bullet_2'] = cur[1]

cur = calc_bullet_features(test)
X_test['bullet_1'] = cur[0]
X_test['bullet_2'] = cur[1]

In [27]:
# creating boolean features for 'brand_name', 'color', 'material' and 'bullet' attributes
# 1 if we have that value for that product, 0 - otherwise
b1_train = []
b2_train = []
b3_train = []
b4_train = []

for i in range(len(train)):
    b1_train.append(int(train.iloc[i]['brand_name'] == ""))
    b2_train.append(int(train.iloc[i]['color'] == ""))
    b3_train.append(int(train.iloc[i]['material'] == ""))
    b4_train.append(int(bullet.get(train.iloc[i]['product_uid'], "") == ""))
    
    
b1_test = []
b2_test = []
b3_test = []
b4_test = []

for i in range(len(test)):
    b1_test.append(int(test.iloc[i]['brand_name'] == ""))
    b2_test.append(int(test.iloc[i]['color'] == ""))
    b3_test.append(int(test.iloc[i]['material'] == ""))
    b4_test.append(int(bullet.get(test.iloc[i]['product_uid'], "") == ""))
    
    
X_train['b1'] = b1_train
X_train['b2'] = b2_train
X_train['b3'] = b3_train
X_train['b4'] = b4_train

X_test['b1'] = b1_test
X_test['b2'] = b2_test
X_test['b3'] = b3_test
X_test['b4'] = b4_test

In [28]:
# creating word2vec model from 'product_description' with 100 features

documents = [nltk.word_tokenize(text.lower()) for text in description['product_description'].tolist()]# + \
            #[nltk.word_tokenize(text.lower()) for text in train['search_term'].tolist()] + \
            #[nltk.word_tokenize(text.lower()) for text in test['search_term'].tolist()]

word2vec_model = Word2Vec(documents, 100)

In [29]:
# calc average vector of word2vec vectors for words from list words
def sum_vector(words):
    res = np.zeros(100)
    cnt = 0
    for word in words:
        if word in word2vec_model.wv:
            res += np.array(word2vec_model.wv[word])
            cnt += 1
            
    if cnt > 0:
        res /= cnt
            
    return res

In [30]:
# calc cosine similarity between two vectors 
def similarity(vec1, vec2):
    res = 0
    len1 = 0
    len2 = 0
    for i in range(len(vec1)):
        res += vec1[i] * vec2[i]
        len1 += vec1[i] ** 2
        len2 += vec2[i] ** 2
        
    if len1 == 0 or len2 == 0:
        return 1
        
    res /= math.sqrt(len1)
    res /= math.sqrt(len2)
    
    return res

In [31]:
# calc feature: cosine similarity between average word2vec vectors of words 'param_name' and 'search_term' 
def calc_word2vec_features(data, param_name):
    result = []
    
    for i in range(data.shape[0]):
        text = string_to_normalized_word_list(data.iloc[i][param_name])
        query = string_to_normalized_word_list(data.iloc[i]['search_term'])
        
        result.append(similarity(sum_vector(text), sum_vector(query)))

    return result

In [33]:
# calc word2vec cosine similarity functions for 'product_descriptions'
X_train['word2vec_description'] = calc_word2vec_features(train, "product_description")
X_test['word2vec_description'] = calc_word2vec_features(test, "product_description")

In [35]:
# create tf-idf vectorizer for 'product_description'
# normalize 'product_description' at first
docs = map(lambda x: " ".join(string_to_normalized_word_list(x)), 
           description['product_description'].tolist())

# analyze by words, tokenize with nltk.word_tokenize(), deleting stop-words, all alpha to lower case 
tf_idf = TfidfVectorizer(docs, analyzer='word', tokenizer=nltk.word_tokenize, stop_words='english', lowercase=True)

tf_idf_description = tf_idf.fit_transform(docs)
tf_idf_X_train = tf_idf.transform(map(lambda x: " ".join(string_to_normalized_word_list(x)),
                                      train['search_term'].tolist()))
tf_idf_X_test = tf_idf.transform(map(lambda x: " ".join(string_to_normalized_word_list(x)),
                                     test['search_term'].tolist()))


desc_map = {}
for i in range(description.shape[0]):
    desc_map[ description.iloc[i]['product_uid'] ] = i

In [36]:
# create tf_idf features 

tf_idf_train = []
for i in range(train.shape[0]):
    pid = train.iloc[i]['product_uid']
    tf_idf_train.append(cosine_similarity(tf_idf_X_train[i], tf_idf_description[ desc_map[pid] ])[0][0])
     
X_train['tf_idf'] = tf_idf_train


tf_idf_test = []
for i in range(test.shape[0]):
    pid = test.iloc[i]['product_uid']
    tf_idf_test.append(cosine_similarity(tf_idf_X_test[i], tf_idf_description[ desc_map[pid] ])[0][0])
     
X_test['tf_idf'] = tf_idf_test

In [39]:
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')