<h1>Recommendation Feature Creation<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Define-Functions" data-toc-modified-id="Define-Functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Define Functions</a></span></li><li><span><a href="#Creating-Features" data-toc-modified-id="Creating-Features-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Creating Features</a></span><ul class="toc-item"><li><span><a href="#Recommendation-Token" data-toc-modified-id="Recommendation-Token-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Recommendation Token</a></span><ul class="toc-item"><li><span><a href="#Patterns" data-toc-modified-id="Patterns-2.1.1"><span class="toc-item-num">2.1.1&nbsp;&nbsp;</span>Patterns</a></span></li><li><span><a href="#Extract-Patterns" data-toc-modified-id="Extract-Patterns-2.1.2"><span class="toc-item-num">2.1.2&nbsp;&nbsp;</span>Extract Patterns</a></span></li></ul></li><li><span><a href="#POS-Token" data-toc-modified-id="POS-Token-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>POS Token</a></span></li><li><span><a href="#Brand-Token" data-toc-modified-id="Brand-Token-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Brand Token</a></span></li><li><span><a href="#Merge-feature-for-initial-product" data-toc-modified-id="Merge-feature-for-initial-product-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Merge feature for initial product</a></span></li><li><span><a href="#Final-Tags" data-toc-modified-id="Final-Tags-2.5"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Final Tags</a></span></li><li><span><a href="#Lemmatization-&amp;-Remove-Stopwords" data-toc-modified-id="Lemmatization-&amp;-Remove-Stopwords-2.6"><span class="toc-item-num">2.6&nbsp;&nbsp;</span>Lemmatization &amp; Remove Stopwords</a></span></li><li><span><a href="#Remove-Special-Words" data-toc-modified-id="Remove-Special-Words-2.7"><span class="toc-item-num">2.7&nbsp;&nbsp;</span>Remove Special Words</a></span></li><li><span><a href="#Drop-Duplicate-words-in-Recommendation-doc" 
data-toc-modified-id="Drop-Duplicate-words-in-Recommendation-doc-2.8"><span class="toc-item-num">2.8&nbsp;&nbsp;</span>Drop Duplicate words in Recommendation doc</a></span></li><li><span><a href="#Remove-Extra-Space-and-Output-file" data-toc-modified-id="Remove-Extra-Space-and-Output-file-2.9"><span class="toc-item-num">2.9&nbsp;&nbsp;</span>Remove Extra Space and Output file</a></span></li></ul></li></ul></div>

In [None]:
import pandas as pd
import re
import spacy
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
# Load the spaCy "en_core_web_lg" pipeline once; product_POS and recom_POS
# call it to obtain the part-of-speech tag of every token.
nlp = spacy.load("en_core_web_lg")

In [None]:
# Load the raw product table; later cells read its product_id, brand, name,
# description and details columns.
product = pd.read_excel('Behold+product+data+04262021.xlsx')

# Define Functions

In [None]:
def remove_n(col):
    '''Return col as text with missing values blanked and newlines replaced by spaces.

    Missing entries are filled with a single space, every value is cast to
    str, and each newline character is then swapped for one space.
    '''
    def _flatten(value):
        return re.sub('\n', ' ', value)

    as_text = col.fillna(' ').astype(str)
    return as_text.apply(_flatten)

In [None]:
def remove_comma(col):
    '''Return col with every comma in each string value replaced by one space.'''
    def _swap_commas(value):
        return re.sub(',', ' ', value)

    return col.apply(_swap_commas)

In [None]:
def decomposite_list(col):
    '''Turn each list of strings in col into one space-separated string.'''
    def _glue(tokens):
        return ' '.join(tokens)

    return col.apply(_glue)

In [None]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    '''Map an NLTK POS tag to the matching wordnet POS constant.

    Tags starting with J, V, N or R map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; any other tag yields None.
    '''
    prefix_to_attr = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attr in prefix_to_attr:
        if nltk_tag.startswith(prefix):
            # resolve the constant only when a prefix actually matches
            return getattr(wordnet, attr)
    return None

In [None]:
def lemmatize_sentence(sentence):
    '''Lemmatize every token of sentence with NLTK and return the re-joined text.

    The sentence is tokenized and POS-tagged; tokens whose tag maps to a
    wordnet POS are lemmatized with that POS, all other tokens are kept as is.
    The resulting tokens are joined with single spaces.
    '''
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(sentence)
    pieces = []
    for token, pos in nltk.pos_tag(tokens):
        wn_tag = nltk_tag_to_wordnet_tag(pos)
        # no usable wordnet tag: keep the surface form unchanged
        pieces.append(token if wn_tag is None else lemmatizer.lemmatize(token, wn_tag))
    return ' '.join(pieces)

In [None]:
def remove_sw(text):
    '''Remove English stop words from text.

    The text is tokenized with NLTK's word_tokenize, every token found in
    NLTK's English stop-word list is dropped, and the remaining tokens are
    joined back together with single spaces.

    text: the string to filter.
    Returns the filtered string.
    '''
    from nltk.corpus import stopwords
    from nltk import word_tokenize

    # a set gives constant-time membership tests instead of scanning a list per token
    stopword_set = set(stopwords.words('english'))

    # NOTE(review): matching is case-sensitive, exactly as before, so tokens
    # such as "The" are kept if the stop-word list is lowercase - confirm
    # whether that is intended.
    kept = [token for token in word_tokenize(text) if token not in stopword_set]
    return ' '.join(kept)

In [None]:
def textClean(text):
    '''Lemmatize text first, then strip stop words from the lemmatized result.'''
    lemmatized = lemmatize_sentence(text)
    return remove_sw(lemmatized)

In [None]:
def product_POS(text):
    '''Return the text of every token in text that the nlp pipeline tags as ADJ or NOUN.

    Used for product features, which keep both nouns and adjectives.
    '''
    wanted = ('ADJ', 'NOUN')
    return [token.text for token in nlp(text) if token.pos_ in wanted]


In [None]:
def recom_POS(text):
    '''Return the text of every token in text that the nlp pipeline tags as ADJ.

    Used for recommendation features, which keep adjectives only.
    '''
    return [token.text for token in nlp(text) if token.pos_ == 'ADJ']

In [None]:
def removeExtraSpace(doc):
    '''Collapse every run of whitespace in doc to one space and trim both ends.'''
    # split() with no argument already discards leading, trailing and repeated whitespace
    return ' '.join(doc.split())

In [None]:
def remove_special_word(col):
    '''Strip every character that is not a letter, digit, %, underscore or space.

    Applied to each string value of col; returns the cleaned Series.
    '''
    disallowed = re.compile(r'[^A-Za-z0-9%_ ]+')
    return col.apply(lambda value: disallowed.sub('', value))

# Creating Features

Preprocess the description and details columns: replace each newline character ('\n') with a space.

In [None]:
# Flatten newlines (and blank out missing values) in both free-text columns
for text_column in ('description', 'details'):
    product[text_column] = remove_n(product[text_column])

## Recommendation Token

- We analyzed the description and details columns and identified several patterns that imply potential outfit recommendation styles and combinations. The signal phrases include: with, allow for, style tip, pair, under, wear over, tuck in, and continue.

### Patterns

In [None]:
# create regex patterns for recommendation tokens
#
# Every pattern captures one clause built around a signal word:
#   [\w ,-]+?   leading context (word characters, spaces, commas, hyphens), lazy
#   (?:...)     the signal word(s)
#   [\w ,]+     trailing context (word characters, spaces, commas)
#
# The hyphen sits last in the leading character class so that it is a literal
# hyphen. Written as ` -,` it is a range from space to comma, which leaves the
# hyphen out (cutting "high-waisted" in half) and lets in !"#$%&'()*+ instead.
#
# In for_pattern and style_pattern the signal word is a plain non-capturing
# group. A stray `?:` after the group would make the signal word optional and
# require a literal colon, so "allows for ..." would never match.

with_pattern = r'([\w ,-]+?(?:with\b)[\w ,]+)'
for_pattern = r'([\w ,-]+?(?:allow\b|allows\b)[\w ,]+(?:\bfor\b)[\w ,]+)'
style_pattern = r'([\w ,-]+?(?:style\b|Style\b)[\w ,]+(?:\bTip\b|\btip\b|trick\b)[\w ,]+)'
pair_pattern = r'([\w ,-]+?(?:pair)[\w ,]+)'
under_pattern = r'([\w ,-]+?(?:under\b)[\w ,]+)'
over_pattern = r'([\w ,-]+?(?:wear|worn)[\w ,]+?(?:over\b)[\w ,]+)'
tuck_in_pattern = r'([\w ,-]+?(?:\btuck\b)[\w ,]+?(?:\bin\b)[\w ,]+)'
continue_pattern = r'([\w ,-]+?(?:\bcontinu)[\w ,]+)'

### Extract Patterns

In [None]:
## extract patterns from the Description and Details columns
# Each (column label, regex) pair is applied to both text columns. Columns
# derived from 'details' carry a '2' suffix. Every column holds the list
# returned by re.findall for that row. Each pattern is evaluated once per
# source column (the 'With' pattern used to be computed twice).
signal_patterns = (
    ('With Pattern', with_pattern),
    ('Continue Pattern', continue_pattern),
    ('Tuck Pattern', tuck_in_pattern),
    ('Over Pattern', over_pattern),
    ('Under Pattern', under_pattern),
    ('Pair Pattern', pair_pattern),
    ('Style Pattern', style_pattern),
    ('For Pattern', for_pattern),
)

for source_column, suffix in (('description', ''), ('details', '2')):
    source_text = product[source_column].astype(str)
    for label, pattern in signal_patterns:
        # bind pattern as a default so the lambda does not depend on the loop variable
        product[label + suffix] = source_text.apply(
            lambda x, pattern=pattern: re.findall(pattern, x)
        )

In [None]:
## combine the extracted patterns and remove signal words (e.g. with)

# Concatenate the per-pattern match lists row by row, in this fixed order
pattern_columns = [
    'Continue Pattern', 'Tuck Pattern', 'Over Pattern', 'Under Pattern',
    'Pair Pattern', 'Style Pattern', 'With Pattern', 'For Pattern',
    'Continue Pattern2', 'Tuck Pattern2', 'Over Pattern2', 'Under Pattern2',
    'Pair Pattern2', 'Style Pattern2', 'With Pattern2', 'For Pattern2',
]
combined_matches = product[pattern_columns[0]]
for column in pattern_columns[1:]:
    combined_matches = combined_matches + product[column]

# Flatten each row's list of matched clauses into one space-separated string
product['Recommendation pattern'] = decomposite_list(combined_matches)

# Strip the signal words as whole words only. Without \b the substitution also
# removes them from inside other words ("comfort" -> "com t", "cover" -> "c ",
# "sportswear" -> "sports "), and `allow|allows` leaves a stray "s" behind
# because the shorter alternative always wins.
signal_word_re = r'\b(?:with|allows?|for|under|wear|over|continue)\b'
product['Recommendation pattern'] = product['Recommendation pattern'].apply(
    lambda x: re.sub(signal_word_re, ' ', x)
)

## POS Token

- Here we extract nouns and adjectives from details and description to add to the product feature
- We extract adjectives from details and description to add to the recommendation feature: the outfit that the user is looking for may overlap substantially in its adjectives

In [None]:
# Run both POS extractors over the two free-text columns
# ('1' = description, '2' = details)
for column_prefix, extractor in (('POS_product', product_POS), ('POS_recom', recom_POS)):
    for suffix, source_column in (('1', 'description'), ('2', 'details')):
        product[column_prefix + suffix] = product[source_column].apply(extractor)

# Product names: flatten newlines, then extract their adjectives
product['name'] = remove_n(product['name'])
name_text = product['name'].fillna(' ').astype(str)
product['POS_name'] = name_text.apply(recom_POS)

# Concatenate the per-source token lists row by row
product['POS_product'] = product['POS_product1'] + product['POS_product2']
product['POS_recom'] = product['POS_recom1'] + product['POS_recom2'] + product['POS_name']

# Flatten each token list into one space-separated string
for token_column in ('POS_product', 'POS_recom'):
    product[token_column] = decomposite_list(product[token_column])

## Brand Token

- To avoid stopword removal in brand, we created a single brand token

In [None]:
def _brand_as_text(brand):
    '''Return brand unchanged if it is a string or missing, otherwise its str() form.'''
    if isinstance(brand, str) or pd.isna(brand):
        return brand
    # numeric brands (Python or NumPy numbers) are stored as text
    return str(brand)


def _brand_token(brand):
    '''Build the single-token form BRAND_<brand without spaces>_TOKEN.'''
    # a missing brand contributes an empty name instead of raising on .split
    text = '' if pd.isna(brand) else str(brand)
    return 'BRAND_' + text.replace(' ', '') + '_TOKEN'


# Vectorised over the whole column, so it does not depend on the DataFrame
# having a default 0..n-1 index.
product['brand'] = product['brand'].apply(_brand_as_text)
product['Brand_token'] = product['brand'].apply(_brand_token)

In [None]:
# Brand text with every non-word character replaced by a space.
# NOTE: 'brand_token' (lower-case b) is a separate column from 'Brand_token'.
# The regex is a raw string: '\W' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
product['brand_token'] = product['brand'].astype(str).apply(lambda x: re.sub(r'\W', ' ', x))

## Merge feature for initial product 

- Import cleaned data from feature extraction and merge useful features to a single token

In [None]:
# Load the previously extracted features and replace missing values with empty strings
feature_df = pd.read_csv('all_features.csv').fillna('')

## Final Tags

- Select the target columns that will be used in similarity calculation and recommendation providing
- Combine all the target columns into one dataframe called 'final'

In [None]:
# Start the output table from the identifying columns. Take an explicit copy
# so that the columns added to `final` later are written to an independent
# DataFrame and do not raise pandas' SettingWithCopyWarning.
final = product[['product_id', 'brand', 'name']].copy()

In [None]:
# Copy the general category over from the extracted-features table.
# NOTE(review): the assignment aligns on index labels, so this presumably
# relies on feature_df and product having the same rows in the same order - confirm.
final['general_category']=feature_df['general_category']

In [None]:
# Build the recommendation document: the listed text columns of product and
# feature_df joined row by row, with one space between consecutive parts
recommendation_parts = [
    product['brand_token'],
    product['POS_recom'],
    product['Recommendation pattern'],
    feature_df['style'],
    feature_df['pattern'],
    feature_df['color'],
    feature_df['occasion'],
    feature_df['material'],
    feature_df['material_percent'],
    feature_df['trend'],
]
recommendation_text = recommendation_parts[0]
for part in recommendation_parts[1:]:
    recommendation_text = recommendation_text + ' ' + part
final['recommendation_doc'] = recommendation_text


In [None]:
# Build the product document: name, noun/adjective tokens, both brand tokens
# and the combined feature text, joined row by row with single spaces
product_parts = [
    product['name'],
    product['POS_product'],
    product['brand_token'],
    product['Brand_token'],
    feature_df['all_features'],
]
product_text = product_parts[0]
for part in product_parts[1:]:
    product_text = product_text + ' ' + part
final['product_doc'] = product_text

In [None]:
# Replace commas with spaces in both document columns
for doc_column in ('recommendation_doc', 'product_doc'):
    final[doc_column] = remove_comma(final[doc_column])

## Lemmatization & Remove Stopwords

In [None]:
# Lemmatize and drop stop words in both document columns
for doc_column in ("product_doc", "recommendation_doc"):
    final[doc_column] = final[doc_column].apply(textClean)

## Remove Special Words

In [None]:
# Strip special characters from both document columns
for doc_column in ('product_doc', 'recommendation_doc'):
    final[doc_column] = remove_special_word(final[doc_column])

## Drop Duplicate words in Recommendation doc

In [None]:
# Keep only the first occurrence of each word, preserving the original order.
# dict.fromkeys de-duplicates in insertion order in a single pass, and
# Series.apply works on any index rather than assuming labels 0..n-1.
final['recommendation_doc'] = final['recommendation_doc'].apply(
    lambda doc: ' '.join(dict.fromkeys(doc.split()))
)

In [None]:
# Delete every occurrence of the text 'unknown' from the product document
strip_unknown = re.compile('unknown').sub
final['product_doc'] = final['product_doc'].apply(lambda doc: strip_unknown('', doc))

## Remove Extra Space and Output file

In [None]:
# Collapse repeated whitespace in both document columns
for doc_column in ('product_doc', 'recommendation_doc'):
    final[doc_column] = final[doc_column].apply(removeExtraSpace)

In [None]:
# Export the final feature table to final_feature.csv; index=False leaves the
# DataFrame index out of the file.
final.to_csv('final_feature.csv',index=False)