In [1]:
import pandas as pd
import re
import en_core_web_md
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from fuzzywuzzy import process

import warnings
warnings.filterwarnings("ignore")

nlp = en_core_web_md.load()

In [10]:

outfit = pd.read_csv('https://dso-560-nlp-text-analytics.s3.amazonaws.com/outfit_combinations.csv')
outfit['outfit_item_type'] = outfit.outfit_item_type.str.replace(r'(accessory\d)', r'accessory')
full_data = pd.read_csv('https://dso-560-nlp-text-analytics.s3.amazonaws.com/Full+data.csv')
full_data = full_data.loc[:,['product_id', 'description']]

combined = pd.merge(outfit,
                full_data,
                on = 'product_id')

In [32]:
def outfit_recommendation_updated(outfit_type: str(), input_value: str(), isproductid = True):
    
    '''
    This function takes as an input three arguments:
    1) outfit_type: string; indicates type of the outfit that a user is looking for 
    (should be one of the following: ['bottom','top','accessory','shoe','onepiece'])
    2) input_value: string; represents the string (either product ID or product description)
    for which the recommendations are returned
    3) isproductid: boolean; if True, input_value is considered a product ID; 
    else, input_value is treated as a description. By default is equal to 'True'
    
    For an input outfit, the function returns a data frame with the product recommendations
    
    Note that the runtime of this function is fairly slow because the data has to be retrieved whenever it is called
    '''
    
#     outfit = pd.read_csv('https://dso-560-nlp-text-analytics.s3.amazonaws.com/outfit_combinations.csv')
#     ## replaced accessory1/2/3 with accessory
#     outfit['outfit_item_type'] = outfit.outfit_item_type.str.replace(r'(accessory\d)', r'accessory')
#     full_data = pd.read_csv('https://dso-560-nlp-text-analytics.s3.amazonaws.com/Full+data.csv')
#     full_data = full_data.loc[:,['product_id', 'description']]
    
#     combined = pd.merge(outfit,
#                     full_data,
#                     on = 'product_id')
    
    if isproductid: 
        if input_value in combined.product_id.values:
            output = combined.loc[combined.product_id == input_value,:]
            print('Returning the product with the first-matched exact product ID')
        else:
            highestfuzz = process.extractOne(input_value, combined.product_id.values)
            output = combined.loc[combined.product_id == highestfuzz[0],:]
            print('Returning the product with the first-matched closest product ID')
        display(output.head(1))
        target = outfit[outfit.outfit_id == output.values[0][0]]
    
    else: 
        input_value = input_value.upper()
        if combined.loc[combined['product_id']==input_value,'product_id'].any() == input_value:
            input_descr = combined.loc[combined['product_id']==input_value,'description'].values[0]
        else:
            input_descr = input_value


        col = ['outfit_item_type', 'brand', 'product_full_name', 'description']

        for columns in col:
            combined[columns] = combined[columns].str.lower() 
        combined.sort_values(by='outfit_id', inplace = True)
        combined.reset_index(drop = True, inplace = True)

        stopwords_gensim = list(STOPWORDS)
        stopwords_NLTK = list(stopwords.words("english"))
        stopwords_combined = list(set(stopwords_gensim+stopwords_NLTK)) #to remove duplicates
        negatives = ['not','nor','no','neither', 'never', 'bottom', 'top'] #took out the negative words for a more accurate analysis
        stopwords_combined = list(filter(lambda x: x not in negatives, stopwords_combined))
        stopwords_combined.sort()
        stopwords_expression = '|'.join(stopwords_combined)
        stopwords_pattern = f'({stopwords_expression})'

        def stem_text(text):
            porter=PorterStemmer()
            tokens = text.split()
            stemmed_tokens = [porter.stem(token) for token in tokens]
            return ' '.join(stemmed_tokens)

        combined['description'] = combined['description'].astype(str)
        combined['description'] = combined['description'].str.replace(r'[^\w\s]',' ')
        combined['description'] = combined['description'].str.replace(rf'\b{stopwords_pattern}\b','')
        combined['description'] = combined['description'].apply(stem_text)

    
        ## UPDATED - filtered for the user-defined outfit type
        description = list(combined[combined.outfit_item_type == outfit_type].description) #making the description into a list;
        description = [str(i) for i in description] #making all the elements into string
        description = list(set(description))
        description_vectors = []
        for i in description:
            temp_description = nlp(i)
            description_vectors.append(temp_description.vector)

        input_clean = input_descr.lower() 
        input_clean= re.sub(r'[^\w\s]',' ',input_clean)
        input_clean = re.sub(rf'\b{stopwords_pattern}\b','',input_clean)
        input_clean = stem_text(input_clean)


        description.append(input_clean)

        input_clean_vectors = []
        temp_input_clean = nlp(input_clean)
        description_vectors.append(temp_input_clean.vector)


        vector_df =pd.DataFrame(description_vectors)
        vector_df["description"] = description

        vector_df.set_index("description", inplace=True)

        similarities = pd.DataFrame(cosine_similarity(vector_df.values), columns=description, index=description)

        top_similarities = similarities.unstack().reset_index()
        top_similarities.columns = ["input_clean", "original_description", "similarity"]
        top_similarities = top_similarities.sort_values(by="similarity", ascending=False)
        top_similarities = top_similarities[top_similarities["similarity"] < .9999]

        match = top_similarities.loc[top_similarities['input_clean']== input_clean,:].reset_index(drop = True)

        input_clean_item = match.loc[0,'original_description']
        output = combined.loc[combined.description == input_clean_item,:]
        ## UPDATED - find the outfit_id of the closest match, and return all outfits in that outfit_id
        print('Returning the product with the closest matched description')
        display(output)
        target = outfit[outfit.outfit_id == output.values[0][0]]
    print('Returning the recommendations for the identified product')
    return target

## Example 1 (Product ID, Exact Match)

In [38]:
outfit_recommendation_updated('shoe', '01DMBRYVA2ZFDYRYY5TRQZJTBD', isproductid = True)

Returning the product with the first-matched exact product ID


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description
6,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump,A nice shoe


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump


## Example 2 (Product ID, Mismatch)

In [34]:
outfit_recommendation_updated('shoe', '01DMBRYVA2ZFDYRYY5TRQZJTBA')

Returning the product with the first-matched closest product ID


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,tory burch,penelope mid cap toe pump,nice shoe


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
0,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2P5H24WK0HTK4R0A1,bottom,Eileen Fisher,Slim Knit Skirt
1,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2PEPWFTT7RMP5AA1T,top,Eileen Fisher,Rib Mock Neck Tank
2,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2S5T9W793F4CY41HE,accessory1,kate spade new york,medium margaux leather satchel
3,01DDBHC62ES5K80P0KYJ56AM2T,01DMBRYVA2ZFDYRYY5TRQZJTBD,shoe,Tory Burch,Penelope Mid Cap Toe Pump


## Examples 3 and 4 (Product Description)

In [35]:
outfit_recommendation_updated('bottom', 'Sexy silky, a-line mini skirt zipper Benson skirt', isproductid = False)

Returning the product with the closest matched description


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description
16,01DQ63P636Q4BQVCKT6Z4S41G5,01DPKMGJ33SDFXM7XHGPQJWQ12,bottom,reformation,benson skirt,sexi silki line mini skirt center zipper benso...


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
14,01DQ63P636Q4BQVCKT6Z4S41G5,01DPCRZWX4S2Z8Q5HYDFM4HNEG,shoe,J.Crew,Pointed-toe flats in suede
15,01DQ63P636Q4BQVCKT6Z4S41G5,01DPET2NWSA221STZF740BZ9SW,top,Veronica Beard,Ashlynn Blouse
16,01DQ63P636Q4BQVCKT6Z4S41G5,01DPKMGJ33SDFXM7XHGPQJWQ12,bottom,Reformation,Benson Skirt


In [36]:
outfit_recommendation_updated('bottom', 'slim fitting, straight leg pant with a center back zipper and slightly cropped leg',\
                               isproductid = False)

Returning the product with the closest matched description


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name,description
24,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKMH0D252JKMAA27MFCT5GM,bottom,reformation,marlon pant,let pant talk slim fit straight leg pant cente...


Returning the recommendations for the identified product


Unnamed: 0,outfit_id,product_id,outfit_item_type,brand,product_full_name
24,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPEHS0XH9PDD1GH5ZE4P43A2,accessory1,Sole Society,Cassi Belt Bag
25,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKMH0D252JKMAA27MFCT5GM,bottom,Reformation,Marlon Pant
26,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKN20Q3J0BE3CS896DQB6ER,top,Reformation,Jane Sweater
27,01DQ8ME3M3QS9MQGZCQHXDHE1R,01DPKNHQDG6GPTKV97CFQRJDHE,shoe,Reformation,Giulia Satin Heel
