In [1]:
# load the packages
import pandas as pd
import re
import en_core_web_md
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

import warnings
warnings.filterwarnings("ignore")

# load medium-sized corpus of English documents
nlp = en_core_web_md.load()



In [2]:
# define stemming function
def stem_text(text):
    porter=PorterStemmer()
    tokens = text.split()
    stemmed_tokens = [porter.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [3]:
# read outfits file
outfit = pd.read_csv('https://dso-560-nlp-text-analytics.s3.amazonaws.com/outfit_combinations.csv')
# replace accessory1/2/3 with accessory
outfit['outfit_item_type'] = outfit.outfit_item_type.str.replace(r'(accessory\d)', r'accessory')
# read full data file
full_data = pd.read_csv('https://dso-560-nlp-text-analytics.s3.amazonaws.com/Full+data.csv')
# select necessary columns
full_data = full_data.loc[:,['product_id', 'description']] 
# perform inner join
combined = pd.merge(outfit, full_data, on = 'product_id')

col = ['outfit_item_type', 'brand', 'product_full_name', 'description'] 
# lower-case the columns
for columns in col:
    combined[columns] = combined[columns].str.lower() 
combined.sort_values(by='outfit_id', inplace = True)
combined.reset_index(drop = True, inplace = True)

# combine features
combined['freeformtext'] = combined['brand'] + combined['description'] 

# define stopwords
stopwords_gensim = list(STOPWORDS)
stopwords_NLTK = list(stopwords.words("english"))
stopwords_combined = list(set(stopwords_gensim+stopwords_NLTK)) # to remove duplicates
negatives = ['not','nor','no','neither', 'never', 'bottom', 'top'] # took out the negative words for a more accurate analysis
stopwords_combined = list(filter(lambda x: x not in negatives, stopwords_combined))
stopwords_combined.sort()
stopwords_expression = '|'.join(stopwords_combined)
stopwords_pattern = f'({stopwords_expression})'

# clean, stem and remove stopwords from the freeformtext
combined['freeformtext'] = combined['freeformtext'].astype(str)
combined['freeformtext'] = combined['freeformtext'].str.replace(r'[^\w\s]',' ')
combined['freeformtext'] = combined['freeformtext'].str.replace(rf'\b{stopwords_pattern}\b','')
combined['freeformtext'] = combined['freeformtext'].apply(stem_text)

In [4]:
def outfit_recommendation(outfit_type: str(), input_value: str(), isproductid=True):
    
    # need to add details in docstring if it exists
    '''
    This function takes as an input three arguments:
    1) outfit_type: string; indicates type of the outfit that a user is looking for 
    (should be one of the following: ['bottom','top','accessory','shoe','onepiece'])
    2) input_value: string; represents the string (either product ID or free form text such as brand and description)
    for which the recommendations are returned
    3) isproductid: boolean; if True, input_value is considered a product ID; 
    else, input_value is treated as a free form text. By default is equal to 'True'
    
    For an input outfit, the function returns a data frame with the product recommendations
    '''
     # if input_value represents a product ID
    if isproductid: 
        # if input product ID is found in existing product ID-s
        if input_value in combined.product_id.values:
            # match info for that product ID
            output = combined.loc[combined.product_id == input_value,:]
            print('Returning the product with the first-matched exact product ID')
        # if input product ID is misspelled/not found in data
        else:
            # find the closest product ID using fuzzymatching
            highestfuzz = process.extractOne(input_value, combined.product_id.values)
            # match info for that product ID
            output = combined.loc[combined.product_id == highestfuzz[0],:]
            print('Returning the product with the first-matched closest product ID')
        # print the first match
        display(output.head(1))
        # find all outfits belonging to same outfit ID 
        target = outfit[outfit.outfit_id == output.values[0][0]]
        
        # making sure the matched item appears on top of the recommendation
        p_id = output.values[0][1]
        match = target[target.product_id == p_id]
        recommendation = target[target.product_id != p_id]
        target = match.append(recommendation)
        target.reset_index(inplace = True, drop = True)
    
    else: 
        # filter for the user-defined outfit type
        description = list(combined[combined.outfit_item_type == outfit_type].freeformtext) # making the freeformtext into a list;
        description = [str(i) for i in description] # making all the elements into string
        description = list(set(description))
        # initialize an empty list
        description_vectors = []
        # perform vectorization of freeformtext using pretrained word2vec embeddings
        for i in description:
            temp_description = nlp(i)
            description_vectors.append(temp_description.vector)
        
        # clean, stem and remove stopwords from the input text
        input_clean = input_value.lower() 
        input_clean= re.sub(r'[^\w\s]',' ',input_clean)
        input_clean = re.sub(rf'\b{stopwords_pattern}\b','',input_clean)
        input_clean = stem_text(input_clean)

        # append cleaned input to the list of existing strings
        description.append(input_clean)

        # initialize an empty list
        input_clean_vectors = []
        # perform vectorization of the cleaned input
        temp_input_clean = nlp(input_clean)
        # append to list of vectorized items
        description_vectors.append(temp_input_clean.vector)

        # create a string-by-string 'matrix'
        vector_df =pd.DataFrame(description_vectors)
        vector_df["description"] = description
        vector_df.set_index("description", inplace=True)
        
        # calculate cosine similarity between two vectors
        similarities = pd.DataFrame(cosine_similarity(vector_df.values), columns=description, index=description)

        # find most similar pairs
        top_similarities = similarities.unstack().reset_index()
        top_similarities.columns = ["input_clean", "original_description", "similarity"]
        top_similarities = top_similarities.sort_values(by="similarity", ascending=False)
        top_similarities = top_similarities[top_similarities["similarity"] < 0.9999]

        # find most similar vectors to the input vector
        match = top_similarities.loc[top_similarities['input_clean']== input_clean,:].reset_index(drop = True)

        # retrieve the freeformtext of the closest matched vector
        input_clean_item = match.loc[0,'original_description']
        # find the information corresponding to that freeformtext
        output = combined.loc[combined.freeformtext == input_clean_item,:]
        # find the outfit_id of the closest match 
        print('Returning the product with the closest matched description')
        display(output)
        
        # return all outfits in that outfit_id
        target = outfit[outfit.outfit_id == output.values[0][0]]
        
        # making sure the matched item appears on top of the recommendation
        p_id = output.values[0][1]
        match = target[target.product_id == p_id]
        recommendation = target[target.product_id != p_id]

        target = match.append(recommendation)
        target.reset_index(inplace = True, drop = True)
    
    print('Returning the recommendations for the identified product')
    return target

## Example 1 (Product ID, Exact Match)

In [5]:
outfit_recommendation('shoe', '01DMBRYVA2ZFDYRYY5TRQZJTBD')

Returning the product with the first-matched exact product ID


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description,freeformtext
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,tory burch,penelope mid cap toe pump,a nice shoe,tori burcha nice shoe


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory,kate spade new york,medium margaux leather satchel


## Example 2 (Product ID, Mismatch)

In [6]:
outfit_recommendation('shoe', '01DMBRYVA2ZFDYRYY5TRQZJTBA')

Returning the product with the first-matched closest product ID


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description,freeformtext
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,tory burch,penelope mid cap toe pump,a nice shoe,tori burcha nice shoe


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory,kate spade new york,medium margaux leather satchel


## Examples 3 and 4 (Product Description)

In [7]:
outfit_recommendation('bottom', 'Sexy silky, a-line mini skirt zipper Benson skirt', isproductid=False)

Returning the product with the closest matched description


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description,freeformtext
14,01DQ63P636Q4BQVCKT6Z4S41G5,01DPKMGJ33SDFXM7XHGPQJWQ12,bottom,reformation,benson skirt,sexy silky. this is an a-line mini skirt with ...,reformationsexi silki line mini skirt center z...


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DQ63P636Q4BQVCKT6Z4S41G5,01DPKMGJ33SDFXM7XHGPQJWQ12,bottom,Reformation,Benson Skirt
1,01DQ63P636Q4BQVCKT6Z4S41G5,01DPCRZWX4S2Z8Q5HYDFM4HNEG,shoe,J.Crew,Pointed-toe flats in suede
2,01DQ63P636Q4BQVCKT6Z4S41G5,01DPET2NWSA221STZF740BZ9SW,top,Veronica Beard,Ashlynn Blouse


In [8]:
outfit_recommendation('bottom', 'slim fitting, straight leg pant with a center back zipper and slightly cropped leg', isproductid=False)

Returning the product with the closest matched description


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description,freeformtext
26,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKMH0D252JKMAA27MFCT5GM,bottom,reformation,marlon pant,let your pants do the talking. this is a slim ...,reformationlet pant talk slim fit straight leg...


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKMH0D252JKMAA27MFCT5GM,bottom,Reformation,Marlon Pant
1,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPEHS0XH9PDD1GH5ZE4P43A2,accessory,Sole Society,Cassi Belt Bag
2,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKN20Q3J0BE3CS896DQB6ER,top,Reformation,Jane Sweater
3,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKNHQDG6GPTKV97CFQRJDHE,shoe,Reformation,Giulia Satin Heel
