In [1]:
"""
This is config for HomeDepot Project
Competition: HomeDepot Search Relevance
"""


import os


DATA_DIR= "/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/dataset"
PROCESSINGTEXT_DIR= "/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/processing_text"
FEATURES_DIR= "/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/features"
SAVEDMODELS_DIR= "/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/saved_models"
MODELS_DIR= "/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/models"
MODELSENSEMBLE_DIR= "/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/models_ensemble"
FEATURESETS_DIR="/Users/Hermione/MasterUCL/Info retrieval and data mining/coursework/feature_sets"


if not os.path.exists(PROCESSINGTEXT_DIR):
    os.mkdir(PROCESSINGTEXT_DIR)
if not os.path.exists(FEATURES_DIR):
    os.mkdir(FEATURES_DIR)
if not os.path.exists(SAVEDMODELS_DIR):
    os.mkdir(SAVEDMODELS_DIR)
if not os.path.exists(MODELS_DIR):
    os.mkdir(MODELS_DIR)
if not os.path.exists(MODELSENSEMBLE_DIR):
    os.mkdir(MODELSENSEMBLE_DIR)   


In [42]:
import numpy as np
import pandas as pd
from time import time
import re
import csv
import os
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
stoplist.append('till')  # add 'till' to stoplist

# 'can' also might mean 'a container' like in 'trash can' 
# so we create a separate stop list without 'can' to be used for query and product title
stoplist_wo_can=stoplist[:]
stoplist_wo_can.remove('can')


from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
from nltk.corpus import wordnet_ic
brown_ic = wordnet_ic.ic('ic-brown.dat')
import difflib



In [31]:
def simple_parser(s):
    s = re.sub('&amp;', '&', s)
    s = re.sub('&nbsp;', '', s)
    s = re.sub('&#39;', '', s)
    s = s.replace("-"," ")
    s = s.replace("+"," ")
    s = re.sub(r'(?<=[a-zA-Z])\/(?=[a-zA-Z])', ' ', s)
    s = re.sub(r'(?<=\))(?=[a-zA-Z0-9])', ' ', s) # add space between parentheses and letters
    s = re.sub(r'(?<=[a-zA-Z0-9])(?=\()', ' ', s) # add space between parentheses and letters
    s = re.sub(r'(?<=[a-zA-Z][\.\,])(?=[a-zA-Z])', ' ', s) # add space after dot or colon between letters
    s = re.sub('[^a-zA-Z0-9\n\ ]', '', s)
    return s

In [12]:
####### PREPROCESSING ######################


### load train and test ###################
df_train = pd.read_csv(DATA_DIR+'/train.csv', encoding="ISO-8859-1")
df_test = pd.read_csv(DATA_DIR+'/test.csv', encoding="ISO-8859-1")
df_all = pd.concat((df_train, df_test), axis=0, ignore_index=True)

### load product attributes ###############
df_attr = pd.read_csv(DATA_DIR+'/attributes.csv', encoding="ISO-8859-1")


In [16]:
### find unique brands from the attributes file
### for a few product_uids there are at least two different names in "MFG Brand Name"
### in such cases we keep only one of the names
df_all = pd.merge(df_all, df_attr[df_attr['name']=="MFG Brand Name"][['product_uid','value']], how='left', on='product_uid')
df_all['brand']=df_all['value'].fillna("").map(lambda x: x.encode("utf-8"))
df_all=df_all.drop('value',axis=1)
#df_all= df_all.decode('GBK')

In [34]:
### Create a list of words with lowercase and uppercase letters 
### Examples: 'InSinkErator', 'EpoxyShield'
### They are words from brand names or words from product title.
### The dict is used to correct product description which contins concatenated 
### lines of text without separators : 
### ---View lawn edgings and brick/ paver edgingsUtility stakes can be used for many purposes---
### Here we need to replace 'edgingsUtility' with 'edgings utility'. 
### But we don't need to replace 'InSinkErator' with 'in sink erator'
add_space_stop_list=[]
uniq_brands=list(set(list(df_all['brand'])))
for i in range(0,len(uniq_brands)):
    uniq_brand = uniq_brands[i].decode("utf-8") 
    uniq_brands[i]=simple_parser(uniq_brand)
    if re.search(r'[a-z][A-Z][a-z]',uniq_brands[i])!=None:
        for word in uniq_brands[i].split():
            if re.search(r'[a-z][A-Z][a-z]',word)!=None:
                add_space_stop_list.append(word.lower())
add_space_stop_list=list(set(add_space_stop_list))      
print(len(add_space_stop_list)," words from brands in add_space_stop_list")
                
uniq_titles=list(set(list(df_all['product_title'])))
for i in range(0,len(uniq_titles)):
    uniq_title = uniq_titles[i]
    uniq_titles[i]=simple_parser(uniq_title)
    if re.search(r'[a-z][A-Z][a-z]',uniq_titles[i])!=None:
        for word in uniq_titles[i].split():
            if re.search(r'[a-z][A-Z][a-z]',word)!=None:
                add_space_stop_list.append(word.lower())    
add_space_stop_list=list(set(add_space_stop_list))      
print(len(add_space_stop_list) ," total words from brands and product titles in add_space_stop_list\n")

458  words from brands in add_space_stop_list
1722  total words from brands and product titles in add_space_stop_list



In [37]:
from google_dict import *

df_all['search_term'] = df_all['search_term'].map(lambda x: google_dict[x] if x in google_dict.keys() else x)

In [38]:
def is_word_in_string(word, s):
    return word in s.split()


def create_bigrams(s):
    lst = [word for word in s.split() if len(re.sub('[^0-9]', '', word)) == 0 and len(word) > 2]
    output = ""
    i = 0
    if len(lst) >= 2:
        while i < len(lst) - 1:
            output += " " + lst[i] + "_" + lst[i + 1]
            i += 1
    return output


In [39]:
df_all['product_title_simpleparsed'] = df_all['product_title'].map(lambda x: simple_parser(x).lower())
df_all['search_term_simpleparsed'] = df_all['search_term'].map(lambda x: simple_parser(x).lower())

str_title = " ".join(list(df_all['product_title'].map(lambda x: simple_parser(x).lower())))
str_query = " ".join(list(df_all['search_term'].map(lambda x: simple_parser(x).lower())))

# create bigrams
bigrams_str_title = " ".join(list(df_all['product_title'].map(lambda x: create_bigrams(simple_parser(x).lower()))))
bigrams_set = set(bigrams_str_title.split())


In [40]:
### count word frequencies for query and product title
my_dict = {}
str1 = str_title + " " + str_query
for word in list(set(list(str1.split()))):
    my_dict[word] = {"title": 0, "query": 0, 'word': word}
for word in str_title.split():
    my_dict[word]["title"] += 1
for word in str_query.split():
    my_dict[word]["query"] += 1

In [43]:
#Return ratio and scaled ratio from difflib.SequenceMatcher()

def seq_matcher(s1, s2):
    seq = difflib.SequenceMatcher(None, s1, s2)
    rt = round(seq.ratio(), 7)
    l1 = len(s1)
    l2 = len(s2)
    if len(s1) == 0 or len(s2) == 0:
        rt = 0
        rt_scaled = 0
    else:
        rt_scaled = round(rt * max(l1, l2) / min(l1, l2), 7)
    return rt, rt_scaled


In [44]:
### 1. Process words without digits
### Potential errors: words that appear only in query
### Correct words: 5 or more times in product_title
errors_dict = {}
correct_dict = {}
for word in my_dict.keys():
    if len(word) >= 3 and len(re.sub('[^0-9]', '', word)) == 0:
        if my_dict[word]["title"] == 0:
            if len(wn.synsets(word)) > 0 \
                    or (word.endswith('s') and (word[:-1] in my_dict.keys()) and my_dict[word[:-1]]["title"] > 0) \
                    or (word[-1] != 's' and (word + 's' in my_dict.keys()) and my_dict[word + 's']["title"] > 0):
                1
            else:
                errors_dict[word] = my_dict[word]
        elif my_dict[word]["title"] >= 5:
            correct_dict[word] = my_dict[word]

### for each error word try finding a good match in bigrams, matched products, all products
cnt = 0
NN = len(errors_dict.keys())
t0 = time()
for i in range(0, len(errors_dict.keys())):
    word = sorted(errors_dict.keys())[i]
    cnt += 1
    lst = []
    lst_tuple = []
    suggested = False
    suggested_word = ""
    rt_max = 0

    # if only one word in query, use be more selective in choosing a correction
    min_query_len = min(df_all['search_term_simpleparsed'][
                            df_all['search_term_simpleparsed'].map(lambda x: is_word_in_string(word, x))].map(
        lambda x: len(x.split())))
    delta = 0.05 * int(min_query_len < 2)

    words_from_matched_titles = [item for item in \
                                 " ".join(list(set(df_all['product_title_simpleparsed'][
                                                       df_all['search_term_simpleparsed'].map(
                                                           lambda x: is_word_in_string(word, x))]))).split() \
                                 if len(item) > 2 and len(re.sub('[^0-9]', '', item)) == 0]
    words_from_matched_titles = list(set(words_from_matched_titles))
    words_from_matched_titles.sort()

    source = ""
    for bigram in bigrams_set:
        if bigram.replace("_", "") == word:
            suggested = True
            suggested_word = bigram.replace("_", " ")
            source = "from bigrams"

    if source == "":
        for correct_word in words_from_matched_titles:
            rt, rt_scaled = seq_matcher(word, correct_word)
            # print correct_word, rt,rt_scaled

            if rt > 0.75 + delta or (len(word) < 6 and rt > 0.68 + delta):
                lst.append(correct_word)
                lst_tuple.append((correct_word, my_dict[correct_word]["title"]))
                if rt > rt_max:
                    rt_max = rt
                    suggested = True
                    source = "from matched products"
                    suggested_word = correct_word
                elif rt == rt_max and seq_matcher("".join(sorted(word)), "".join(sorted(correct_word)))[0] > \
                        seq_matcher("".join(sorted(word)), "".join(sorted(suggested_word)))[0]:
                    suggested_word = correct_word
                elif rt == rt_max:
                    suggested = False
                    source = ""

    if source == "" and len(lst) == 0:
        source = "from all products"
        for correct_word in correct_dict.keys():
            rt, rt_scaled = seq_matcher(word, correct_word)
            # print correct_word, rt,rt_scaled
            if correct_dict[correct_word]["title"] > 10 and (rt > 0.8 + delta or (len(word) < 6 and rt > 0.73 + delta)):
                # print correct_word, rt,rt_scaled
                lst.append(correct_word)
                lst_tuple.append((correct_word, correct_dict[correct_word]["title"]))
                if rt > rt_max:
                    rt_max = rt
                    suggested = True
                    suggested_word = correct_word
                elif rt == rt_max and seq_matcher("".join(sorted(word)), "".join(sorted(correct_word)))[0] > \
                        seq_matcher("".join(sorted(word)), "".join(sorted(suggested_word)))[0]:
                    suggested_word = correct_word
                elif rt == rt_max:
                    suggested = False

    if suggested == True:
        errors_dict[word]["suggestion"] = suggested_word
        errors_dict[word]["others"] = lst_tuple
        errors_dict[word]["source"] = source
    else:
        errors_dict[word]["suggestion"] = ""
        errors_dict[word]["others"] = lst_tuple
        errors_dict[word]["source"] = source
        # print(cnt, word, errors_dict[word]["query"], errors_dict[word]["suggestion"], source, errors_dict[word]["others"])
        # if (cnt % 20)==0:
        #    print cnt, " out of ", NN, "; ", round((time()-t0),1) ,' sec'


In [45]:
for word in my_dict.keys():
    if my_dict[word]['query'] > 0 and my_dict[word]['title'] == 0 \
            and len(re.sub('[^0-9]', '', word)) != 0 and len(re.sub('[^a-z]', '', word)) != 0:
        srch = re.search(r'(?<=^)[a-z][a-z][a-z]+(?=[0-9])', word)
        if srch != None and len(wn.synsets(srch.group(0))) > 0 \
                and len(re.sub('[^aeiou]', '', word)) > 0 and word[-1] in '0123456789':
            errors_dict[word] = my_dict[word]
            errors_dict[word]["source"] = "added space before digit"
            errors_dict[word]["suggestion"] = re.sub(r'(?<=^)' + srch.group(0) + r'(?=[a-zA-Z0-9])',
                                                     srch.group(0) + ' ', word)
            # print word, re.sub(r'(?<=^)'+srch.group(0)+r'(?=[a-zA-Z0-9])',srch.group(0)+' ',word)

### save dictionary
corrections_df = pd.DataFrame(errors_dict).transpose()
#corrections_df.to_csv(PROCESSINGTEXT_DIR + "/automatically_generated_word_corrections.csv")

print('building spell checker time:', round((time() - t0) / 60, 1), 'minutes\n')


building spell checker time: 10.6 minutes



In [46]:
"""
There are many non-unique queries and products. To save time, in some cases we processed only unique entries.
The following function applies str_parser() function to unique entries only.
"""


def col_parser(clmn, automatic_spell_check_dict={}, remove_from_brackets=False, parse_material=False,
               add_space_stop_list=[]):
    t0 = time()
    aa = list(set(list(clmn)))
    my_dict = {}
    for i in range(0, len(aa)):
        my_dict[aa[i]] = str_parser(aa[i], automatic_spell_check_dict=automatic_spell_check_dict,
                                    remove_from_brackets=remove_from_brackets, \
                                    parse_material=parse_material, add_space_stop_list=add_space_stop_list)
        if (i % 10000) == 0:
            print
            "parsed " + str(i) + " out of " + str(len(aa)) + " unique values; " + str(
                round((time() - t0) / 60, 1)) + " minutes"
    return clmn.map(lambda x: my_dict[x])


In [48]:


def str_parser(s, automatic_spell_check_dict={}, remove_from_brackets=False, parse_material=False,
               add_space_stop_list=[]):
    # the following three replacements are shared on the forum
    s = s.replace("craftsm,an", "craftsman")
    s = re.sub(r'depot.com/search=', '', s)
    s = re.sub(r'pilers,needlenose', 'pliers, needle nose', s)

    s = re.sub(r'\bmr.', 'mr ', s)
    s = re.sub(r'&amp;', '&', s)
    s = re.sub('&nbsp;', '', s)
    s = re.sub('&#39;', '', s)
    s = re.sub(r'(?<=[0-9]),[\ ]*(?=[0-9])', '', s)
    s = s.replace(";", ".")
    s = s.replace(",", ".")
    s = s.replace(":", ". ")
    s = s.replace("+", " ")
    s = re.sub(r'\bU.S.', 'US ', s)
    s = s.replace(" W x ", " ")
    s = s.replace(" H x ", " ")
    s = re.sub(' [\#]\d+[\-\d]*[\,]*', '', s)
    s = re.sub('(?<=[0-9\%])(?=[A-Z][a-z])', '. ', s)  # add dot between number and cap letter
    s = re.sub(r'(?<=\))(?=[a-zA-Z0-9])', ' ', s)  # add space between parentheses and letters
    s = re.sub(r'(?<=[a-zA-Z0-9])(?=\()', ' ', s)  # add space between parentheses and letters

    if parse_material:
        replace_dict = {'Medium Density Fiberboard (MDF)': 'mdf', 'High Density Fiberboard (HDF)': 'hdf', \
                        'Fibre Reinforced Polymer (FRP)': 'frp', 'Acrylonitrile Butadiene Styrene (ABS)': 'abs', \
                        'Cross-Linked Polyethylene (PEX)': 'pex', 'Chlorinated Poly Vinyl Chloride (CPVC)': 'cpvc', \
                        'PVC (vinyl)': 'pvc', 'Thermoplastic rubber (TPR)': 'tpr', 'Poly Lactic Acid (PLA)': 'pla', \
                        '100% Polyester': 'polyester', '100% UV Olefin': 'olefin',
                        '100% BCF Polypropylene': 'polypropylene', \
                        '100% PVC': 'pvc'}

        if s in replace_dict.keys():
            s = replace_dict[s]

    s = re.sub('[^a-zA-Z0-9\n\ \%\$\-\#\@\&\/\.\'\*\(\)]', ' ', s)
    s = " ".join(s.split())

    s = s.replace("-", " ")

    if len(add_space_stop_list) > 0:
        s = " ".join(
            [re.sub('(?<=[a-z])(?=[A-Z][a-z\ ])', '. ', word) if word.lower() not in add_space_stop_list else word for
             word in s.split()])

    s = s.lower()
    s = re.sub('\.(?=[a-z])', '. ', s)  # dots before words -> replace with spaces
    # s = re.sub('(?<=[a-z])(?=[A-Z][a-z\ ])', ' ', s) # add space if uppercase after lowercase
    s = re.sub('(?<=[a-z][a-z][a-z])(?=[0-9])', ' ', s)  # add cpase if number after at least three letters
    ##s = re.sub('(?<=[a-zA-Z])\.(?=\ |$)', '', s) #remove dots at the end of string
    # s = re.sub('(?<=[0-9])\.(?=\ |$)', '', s) # dot after digit before space
    s = re.sub('^\.\ ', '', s)  # dot at the beginning before space

    if len(automatic_spell_check_dict.keys()) > 0:
        s = spell_correction(s, automatic_spell_check_dict=automatic_spell_check_dict)

    if remove_from_brackets == True:
        s = re.sub('(?<=\()[a-zA-Z0-9\n\ \%\$\-\#\@\&\/\.\'\*\(\)]*(?=\))', '', s)
    else:
        s = s.replace(" (", ". ")
        s = re.sub('(?<=[a-zA-Z0-9\%\$])\(', '. ', s)
        s = s.replace(" )", ". ")
        s = s.replace(")", ". ")
        s = s.replace("  ", " ")
        s = re.sub('\ \.', '\.', s)

    #######s = re.sub('(?<=[0-9\%])(?=[a-wyz])', ' ', s) # add space between number and text (except letter x)
    # s = re.sub('(?<=[a-zA-Z])-(?=[a-zA-Z])', ' ', s) # replace '-' in words with space
    s = s.replace("at&t", "att")
    s = s.replace("&", " and ")
    s = s.replace("*", " x ")
    s = re.sub('(?<=[a-z\ ])\/(?=[a-z\ ])', ' ', s)  # replace "/" between words with space
    s = re.sub('(?<=[a-z])\\\\(?=[a-z])', ' ', s)  # replace "/" between words with space
    s = s.replace("  ", " ")
    s = s.replace("  ", " ")

    # s=re.sub('(?<=\ [a-ux-z])\ (?=[0-9])', '', s)   #remove spaces
    # s=re.sub('(?<=^[a-z])\ (?=[0-9])', '', s)   #remove spaces




    #####################################
    ### thesaurus replacement in all vars
    s = replace_in_parser(s)

    s = re.sub('half(?=\ inch)', '1/2', s)
    s = re.sub('\ba half\b', '1/2', s)
    # s = re.sub('half\ ', 'half-', s)

    s = re.sub(r'(?<=\')s\b', '', s)
    s = re.sub('(?<=[0-9])\'\'', ' in ', s)
    s = re.sub('(?<=[0-9])\'', ' in ', s)

    s = re.sub(r'(?<=[0-9])[\ ]*inch[es]*\b', '-in ', s)
    s = re.sub(r'(?<=[0-9])[\ ]*in\b', '-in ', s)

    s = re.sub(r'(?<=[0-9])[\-|\ ]*feet[s]*\b', '-ft ', s)
    s = re.sub(r'(?<=[0-9])[\ ]*foot[s]*\b', '-ft ', s)
    s = re.sub(r'(?<=[0-9])[\ ]*ft[x]*\b', '-ft ', s)

    s = re.sub('(?<=[0-9])[\ ]*volt[s]*(?=\ |$|\.)', '-V ', s)
    s = re.sub('(?<=[0-9])[\ ]*v(?=\ |$|\.)', '-V ', s)

    s = re.sub('(?<=[0-9])[\ ]*wat[t]*[s]*(?=\ |$|\.)', '-W ', s)
    s = re.sub('(?<=[0-9])[\ ]*w(?=\ |$|\.)', '-W ', s)

    s = re.sub('(?<=[0-9])[\ ]*kilo[\ ]*watt[s]*(?=\ |$|\.)', '-KW ', s)
    s = re.sub('(?<=[0-9])[\ ]*kw(?=\ |$|\.)', '-KW ', s)

    s = re.sub('(?<=[0-9])[\ ]*amp[s]*(?=\ |$|\.)', '-A ', s)
    # s = re.sub('(?<=[0-9]) a(?=\ |$|\.)', '-A. ', s)
    s = re.sub('(?<=[0-9])a(?=\ |$|\.)', '-A ', s)

    s = re.sub('(?<=[0-9])[\ ]*gallon[s]*(?=\ |$|\.)', '-gal ', s)
    s = re.sub('(?<=[0-9])[\ ]*gal(?=\ |$|\.)', '-gal ', s)

    s = re.sub('(?<=[0-9])[\ ]*pound[s]*(?=\ |$|\.)', '-lb ', s)
    s = re.sub('(?<=[0-9])[\ ]*lb[s]*(?=\ |$|\.)', '-lb ', s)

    s = re.sub('(?<=[0-9])[\ ]*mi[l]+imet[er]*[s]*(?=\ |$|\.)', '-mm ', s)
    s = re.sub('(?<=[0-9])[\ ]*mm(?=\ |$|\.)', '-mm ', s)

    s = re.sub('(?<=[0-9])[\ ]*centimeter[s]*(?=\ |$|\.)', '-cm ', s)
    s = re.sub('(?<=[0-9])[\ ]*cm(?=\ |$|\.)', '-cm ', s)

    s = re.sub('(?<=[0-9])[\ ]*ounce[s]*(?=\ |$|\.)', '-oz ', s)
    s = re.sub('(?<=[0-9])[\ ]*oz(?=\ |$|\.)', '-oz ', s)

    s = re.sub('(?<=[0-9])[\ ]*liter[s]*(?=\ |$|\.)', '-L ', s)
    s = re.sub('(?<=[0-9])[\ ]*litre[s]*(?=\ |$|\.)', '-L ', s)
    s = re.sub('(?<=[0-9])[\ ]*l(?=\ |$|\.)', '-L. ', s)

    s = re.sub('(?<=[0-9])[\ ]*square feet[s]*(?=\ |$|\.)', '-sqft ', s)
    s = re.sub('(?<=[0-9])square feet[s]*(?=\ |$|\.)', '-sqft ', s)
    s = re.sub('(?<=[0-9])[\ ]*sq[\ |\.|\.\ ]*ft(?=\ |$|\.)', '-sqft ', s)
    s = re.sub('(?<=[0-9])[\ ]*sq. ft(?=\ |$|\.)', '-sqft', s)
    s = re.sub('(?<=[0-9])[\ ]*sq.ft(?=\ |$|\.)', '-sqft', s)

    s = re.sub('(?<=[0-9])[\ ]*cubic f[e]*t[s]*(?=\ |$|\.)', '-cuft ', s)
    s = re.sub('(?<=[0-9])[\ ]*cu[\ |\.|\.\ ]*ft(?=\ |$|\.)', '-cuft ', s)
    s = re.sub('(?<=[0-9])[\ ]*cu[\.]*[\ ]*ft(?=\ |$|\.)', '-cuft', s)

    # remove 'x'
    s = re.sub('(?<=[0-9]) x (?=[0-9])', '-X ', s)
    s = re.sub('(?<=[0-9])x (?=[0-9])', '-X ', s)
    s = re.sub('(?<=[0-9]) x(?=[0-9])', '-X ', s)
    s = re.sub('(?<=[0-9])x(?=[0-9])', '-X ', s)

    # s=s.replace("..",".")
    s = s.replace("\n", " ")
    s = s.replace("  ", " ")

    words = s.split()

    if s.find("-X") >= 0:
        for cnt in range(0, len(words) - 1):
            if words[cnt].find("-X") >= 0:
                if words[cnt + 1].find("-X") and cnt < len(words) - 2:
                    cntAdd = 2
                else:
                    cntAdd = 1
                to_replace = re.search(r'(?<=[0-9]\-)\w+\b', words[cnt + cntAdd])
                if not (to_replace == None):
                    words[cnt] = words[cnt].replace("-X", "-" + to_replace.group(0) + "")
                else:
                    words[cnt] = words[cnt].replace("-X", "x")
    s = " ".join([word for word in words])

    s = re.sub('[^a-zA-Z0-9\ \%\$\-\@\&\/\.]', '', s)  # remove "'" and "\n" and "#" and characters
    ##s = re.sub('(?<=[a-zA-Z])[\.|\/](?=\ |$)', '', s) #remove dots at the end of string
    s = re.sub('(?<=[0-9])x(?=\ |$)', '', s)  # remove
    s = re.sub('(?<=[\ ])x(?=[0-9])', '', s)  # remove
    s = re.sub('(?<=^)x(?=[0-9])', '', s)
    # s = re.sub('[\ ]\.(?=\ |$)', '', s) #remove dots
    s = s.replace("  ", " ")
    s = s.replace("..", ".")
    s = re.sub('\ \.', '', s)

    s = re.sub('(?<=\ [ch-hj-np-su-z][a-z])\ (?=[0-9])', '', s)  # remove spaces
    s = re.sub('(?<=^[ch-hj-np-su-z][a-z])\ (?=[0-9])', '', s)  # remove spaces

    s = re.sub('(?<=\ )\.(?=[0-9])', '0.', s)
    s = re.sub('(?<=^)\.(?=[0-9])', '0.', s)
    return " ".join([word for word in s.split()])

In [50]:
def spell_correction(s, automatic_spell_check_dict={}):
    s = s.replace("ttt", "tt")
    s = s.replace("lll", "ll")
    s = s.replace("nnn", "nn")
    s = s.replace("rrr", "rr")
    s = s.replace("sss", "ss")
    s = s.replace("zzz", "zz")
    s = s.replace("ccc", "cc")
    s = s.replace("eee", "ee")

    s = s.replace("hinges with pishinges with pins", "hinges with pins")
    s = s.replace("virtue usa", "virtu usa")
    s = re.sub('outdoor(?=[a-rt-z])', 'outdoor ', s)
    s = re.sub(r'\bdim able\b', "dimmable", s)
    s = re.sub(r'\blink able\b', "linkable", s)
    s = re.sub(r'\bm aple\b', "maple", s)
    s = s.replace("aire acondicionado", "air conditioner")
    s = s.replace("borsh in dishwasher", "bosch dishwasher")
    s = re.sub(r'\bapt size\b', 'appartment size', s)
    s = re.sub(r'\barm[e|o]r max\b', 'armormax', s)
    s = re.sub(r' ss ', ' stainless steel ', s)
    s = re.sub(r'\bmay tag\b', 'maytag', s)
    s = re.sub(r'\bback blash\b', 'backsplash', s)
    s = re.sub(r'\bbum boo\b', 'bamboo', s)
    s = re.sub(r'(?<=[0-9] )but\b', 'btu', s)
    s = re.sub(r'\bcharbroi l\b', 'charbroil', s)
    s = re.sub(r'\bair cond[it]*\b', 'air conditioner', s)
    s = re.sub(r'\bscrew conn\b', 'screw connector', s)
    s = re.sub(r'\bblack decker\b', 'black and decker', s)
    s = re.sub(r'\bchristmas din\b', 'christmas dinosaur', s)
    s = re.sub(r'\bdoug fir\b', 'douglas fir', s)
    s = re.sub(r'\belephant ear\b', 'elephant ears', s)
    s = re.sub(r'\bt emp gauge\b', 'temperature gauge', s)
    s = re.sub(r'\bsika felx\b', 'sikaflex', s)
    s = re.sub(r'\bsquare d\b', 'squared', s)
    s = re.sub(r'\bbehring\b', 'behr', s)
    s = re.sub(r'\bcam\b', 'camera', s)
    s = re.sub(r'\bjuke box\b', 'jukebox', s)
    s = re.sub(r'\brust o leum\b', 'rust oleum', s)
    s = re.sub(r'\bx mas\b', 'christmas', s)
    s = re.sub(r'\bmeld wen\b', 'jeld wen', s)
    s = re.sub(r'\bg e\b', 'ge', s)
    s = re.sub(r'\bmirr edge\b', 'mirredge', s)
    s = re.sub(r'\bx ontrol\b', 'control', s)
    s = re.sub(r'\boutler s\b', 'outlets', s)
    s = re.sub(r'\bpeep hole', 'peephole', s)
    s = re.sub(r'\bwater pik\b', 'waterpik', s)
    s = re.sub(r'\bwaterpi k\b', 'waterpik', s)
    s = re.sub(r'\bplex[iy] glass\b', 'plexiglass', s)
    s = re.sub(r'\bsheet rock\b', 'sheetrock', s)
    s = re.sub(r'\bgen purp\b', 'general purpose', s)
    s = re.sub(r'\bquicker crete\b', 'quikrete', s)
    s = re.sub(r'\bref ridge\b', 'refrigerator', s)
    s = re.sub(r'\bshark bite\b', 'sharkbite', s)
    s = re.sub(r'\buni door\b', 'unidoor', s)
    s = re.sub(r'\bair tit\b', 'airtight', s)
    s = re.sub(r'\bde walt\b', 'dewalt', s)
    s = re.sub(r'\bwaterpi k\b', 'waterpik', s)
    s = re.sub(r'\bsaw za(ll|w)\b', 'sawzall', s)
    s = re.sub(r'\blg elec\b', 'lg', s)
    s = re.sub(r'\bhumming bird\b', 'hummingbird', s)
    s = re.sub(r'\bde ice(?=r|\b)', 'deice', s)
    s = re.sub(r'\bliquid nail\b', 'liquid nails', s)

    s = re.sub(r'\bdeck over\b', 'deckover', s)
    s = re.sub(r'\bcounter sink(?=s|\b)', 'countersink', s)
    s = re.sub(r'\bpipes line(?=s|\b)', 'pipeline', s)
    s = re.sub(r'\bbook case(?=s|\b)', 'bookcase', s)
    s = re.sub(r'\bwalkie talkie\b', '2 pair radio', s)
    s = re.sub(r'(?<=^)ks\b', 'kwikset', s)
    s = re.sub('(?<=[0-9])[\ ]*ft(?=[a-z])', 'ft ', s)
    s = re.sub('(?<=[0-9])[\ ]*mm(?=[a-z])', 'mm ', s)
    s = re.sub('(?<=[0-9])[\ ]*cm(?=[a-z])', 'cm ', s)
    s = re.sub('(?<=[0-9])[\ ]*inch(es)*(?=[a-z])', 'in ', s)

    s = re.sub(r'(?<=[1-9]) pac\b', 'pack', s)

    s = re.sub(r'\bcfl bulbs\b', 'cfl light bulbs', s)
    s = re.sub(r' cfl(?=$)', ' cfl light bulb', s)
    s = re.sub(r'candelabra cfl 4 pack', 'candelabra cfl light bulb 4 pack', s)
    s = re.sub(r'\bthhn(?=$|\ [0-9]|\ [a-rtuvx-z])', 'thhn wire', s)
    s = re.sub(r'\bplay ground\b', 'playground', s)
    s = re.sub(r'\bemt\b', 'emt electrical metallic tube', s)
    s = re.sub(r'\boutdoor dining se\b', 'outdoor dining set', s)

    if "a/c" in s:
        if ('unit' in s) or ('frost' in s) or ('duct' in s) or ('filt' in s) or ('vent' in s) or ('clean' in s) or (
            'vent' in s) or ('portab' in s):
            s = s.replace("a/c", "air conditioner")
        else:
            s = s.replace("a/c", "ac")

    external_data_dict = {'airvents': 'air vents',
                          'antivibration': 'anti vibration',
                          'autofeeder': 'auto feeder',
                          'backbrace': 'back brace',
                          'behroil': 'behr oil',
                          'behrwooden': 'behr wooden',
                          'brownswitch': 'brown switch',
                          'byefold': 'bifold',
                          'canapu': 'canopy',
                          'cleanerakline': 'cleaner alkaline',
                          'colared': 'colored',
                          'comercialcarpet': 'commercial carpet',
                          'dcon': 'd con',
                          'doorsmoocher': 'door smoocher',
                          'dreme': 'dremel',
                          'ecobulb': 'eco bulb',
                          'fantdoors': 'fan doors',
                          'gallondrywall': 'gallon drywall',
                          'geotextile': 'geo textile',
                          'hallodoor': 'hallo door',
                          'heatgasget': 'heat gasket',
                          'ilumination': 'illumination',
                          'insol': 'insulation',
                          'instock': 'in stock',
                          'joisthangers': 'joist hangers',
                          'kalkey': 'kelkay',
                          'kohlerdrop': 'kohler drop',
                          'kti': 'kit',
                          'laminet': 'laminate',
                          'mandoors': 'main doors',
                          'mountspacesaver': 'mount space saver',
                          'reffridge': 'refrigerator',
                          'refrig': 'refrigerator',
                          'reliabilt': 'reliability',
                          'replaclacemt': 'replacement',
                          'searchgalvanized': 'search galvanized',
                          'seedeater': 'seed eater',
                          'showerstorage': 'shower storage',
                          'straitline': 'straight line',
                          'subpumps': 'sub pumps',
                          'thromastate': 'thermostat',
                          'topsealer': 'top sealer',
                          'underlay': 'underlayment',
                          'vdk': 'bdk',
                          'wallprimer': 'wall primer',
                          'weedbgon': 'weed b gon',
                          'weedeaters': 'weed eaters',
                          'weedwacker': 'weed wacker',
                          'wesleyspruce': 'wesley spruce',
                          'worklite': 'work light'}

    for word in external_data_dict.keys():
        s = re.sub(r'\b' + word + r'\b', external_data_dict[word], s)

    ############ replace words from dict
    for word in automatic_spell_check_dict.keys():
        s = re.sub(r'\b' + word + r'\b', automatic_spell_check_dict[word], s)

    return s


In [52]:
def replace_in_parser(s):
    # the first three shared on forum
    s = s.replace("acccessories", "accessories")
    s = re.sub(r'\bscott\b', 'scotts', s)  # brand
    s = re.sub(r'\borgainzer\b', 'organizer', s)

    # the others are not shared
    s = re.sub(r'\aluminuum\b', 'aluminum', s)
    s = re.sub(r'\bgeneral electric', 'ge', s)
    s = s.replace("adaptor", "adapter")
    s = re.sub(r'\bfibre', 'fiber', s)
    s = re.sub(r'\bbuilt in\b', 'builtin', s)
    s = re.sub(r'\bshark bite\b', 'sharkbite', s)
    s = re.sub('barbeque', 'barbecue', s)
    s = re.sub(r'\bbbq\b', 'barbecue', s)
    s = re.sub(r'\bbathroom[s]*\b', 'bath', s)
    s = re.sub(r'\bberkeley\b', 'berkley', s)
    s = re.sub(r'\bbookshelves\b', 'book shelf', s)
    s = re.sub(r'\bbookshelf\b', 'book shelf', s)
    s = re.sub(r'\bin line ', ' inline ', s)
    s = re.sub(r'round up\b', ' roundup', s)
    s = re.sub(r'\blg electronics\b', 'lg', s)
    s = re.sub(r'\bhdtv\b', 'hd tv', s)
    s = re.sub(r'black [and ]*decker', 'black and decker', s)
    s = re.sub(r'backer board[s]*', 'backerboard', s)
    s = re.sub(r'\bphillips\b', 'philips', s)
    s = re.sub(r'\bshower head[s]*\b', 'showerhead', s)
    s = re.sub(r'\bbull nose\b', 'bullnose', s)
    s = re.sub(r'\bflood light\b', 'floodlight', s)
    s = re.sub(r'\barrester\b', 'arrestor', s)
    s = re.sub(r'\bbi fold\b', 'bifold', s)
    s = re.sub(r'\bfirepit[s]*\b', 'fire pit', s)
    s = re.sub(r'\bbed bug[s]*\b', 'bedbug', s)
    s = re.sub(r'\bhook up[s]*\b', 'hookup', s)
    s = re.sub(r'\bjig saw[s]*\b', 'jigsaw', s)
    s = re.sub(r'\bspacesav(?=er[s]*|ing)', 'space sav', s)
    s = re.sub(r'\bwall paper', 'wallpaper', s)
    s = re.sub(r'\bphotocell', 'photo cells', s)
    s = re.sub(r'\bplasti dip\b', 'plastidip', s)
    s = re.sub(r'\bflexi dip\b', 'flexidip', s)
    s = re.sub(r'\bback splash', 'backsplash', s)
    s = re.sub(r'\bbarstool(?=\b|s)', 'bar stool', s)
    s = re.sub(r'\blampholder(?=\b|s)', 'lamp holder', s)
    s = re.sub(r'\brainsuit(?=\b|s)', 'rain suit', s)
    s = re.sub(r'\bback up\b', 'backup', s)
    s = re.sub(r'\bwheel barrow', 'wheelbarrow', s)
    s = re.sub(r'\bsaw horse', 'sawhorse', s)
    s = re.sub(r'\bscrew driver', 'screwdriver', s)
    s = re.sub(r'\bnut driver', 'nutdriver', s)
    s = re.sub(r'\bflushmount', 'flush mount', s)
    s = re.sub(r'\bcooktop(?=\b|s\b)', 'cook top', s)
    s = re.sub(r'\bcounter top(?=s|\b)', 'countertop', s)
    s = re.sub(r'\bbacksplash', 'back splash', s)
    s = re.sub(r'\bhandleset', 'handle set', s)
    s = re.sub(r'\bplayset', 'play set', s)
    s = re.sub(r'\bsidesplash', 'side splash', s)
    s = re.sub(r'\bdownlight', 'down light', s)
    s = re.sub(r'\bbackerboard', 'backer board', s)
    s = re.sub(r'\bshoplight', 'shop light', s)
    s = re.sub(r'\bdownspout', 'down spout', s)
    s = re.sub(r'\bpowerhead', 'power head', s)
    s = re.sub(r'\bnightstand', 'night stand', s)
    s = re.sub(r'\bmicro fiber[s]*\b', 'microfiber', s)
    s = re.sub(r'\bworklight', 'work light', s)
    s = re.sub(r'\blockset', 'lock set', s)
    s = re.sub(r'\bslatwall', 'slat wall', s)
    s = re.sub(r'\btileboard', 'tile board', s)
    s = re.sub(r'\bmoulding', 'molding', s)
    s = re.sub(r'\bdoorstop', 'door stop', s)
    s = re.sub(r'\bwork bench\b', 'workbench', s)
    s = re.sub(r'\bweed[\ ]*eater', 'weed trimmer', s)
    s = re.sub(r'\bweed[\ ]*w[h]*acker', 'weed trimmer', s)
    s = re.sub(r'\bnightlight(?=\b|s)', 'night light', s)
    s = re.sub(r'\bheadlamp(?=\b|s)', 'head lamp', s)
    s = re.sub(r'\bfiber board', 'fiberboard', s)
    s = re.sub(r'\bmail box', 'mailbox', s)

    replace_material_dict = {'aluminium': 'aluminum',
                             'medium density fiberboard': 'mdf',
                             'high density fiberboard': 'hdf',
                             'fiber reinforced polymer': 'frp',
                             'cross linked polyethylene': 'pex',
                             'poly vinyl chloride': 'pvc',
                             'thermoplastic rubber': 'tpr',
                             'poly lactic acid': 'pla',
                             'acrylonitrile butadiene styrene': 'abs',
                             'chlorinated poly vinyl chloride': 'cpvc'}
    for word in replace_material_dict.keys():
        if word in s:
            s = s.replace(word, replace_material_dict[word])

    return s


In [53]:
########################################
##### load words for spell checker
spell_check_dict = {}
for word in errors_dict.keys():
    if errors_dict[word]['suggestion'] != "":
        spell_check_dict[word] = errors_dict[word]['suggestion']

"""
spell_check_dict={}
with open(PROCESSINGTEXT_DIR+'/automatically_generated_word_corrections.csv') as csvfile:
     reader = csv.DictReader(csvfile)
     for row in reader:
         if row['suggestion']!="":
             spell_check_dict[row['word']]=row['suggestion']
"""

###############################################
### parse query and product title
df_all['search_term_parsed'] = col_parser(df_all['search_term'], automatic_spell_check_dict=spell_check_dict, \
                                          add_space_stop_list=[]).map(lambda x: x.encode('utf-8'))
df_all['search_term_parsed_wospellcheck'] = col_parser(df_all['search_term'], automatic_spell_check_dict={}, \
                                                       add_space_stop_list=[]).map(lambda x: x.encode('utf-8'))
print('search_term parsing time:', round((time() - t0) / 60, 1), 'minutes\n')

t0 = time()


search_term parsing time: 36.2 minutes

