In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import re
from gensim.parsing.preprocessing import STOPWORDS
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
start = pd.datetime.now()

  """Entry point for launching an IPython kernel.


## Preprocessing

In [6]:
# load full_data from the dataset of SQL
df_SQL = pd.read_csv('/content/drive/My Drive/NLP/full_data.csv')
df_SQL.drop(columns=['created_at', 'updated_at', 'deleted_at', 'bc_product_id'], inplace=True)
for columns in df_SQL.columns:
    df_SQL[columns] = df_SQL[columns].str.lower() 
df_SQL.drop_duplicates(subset= ['product_id'], keep = 'first', inplace=True)
df_SQL.shape

(48072, 9)

In [7]:
# load tagged_product_attributes from the dataset of SQL
tag_SQL = pd.read_csv('/content/drive/My Drive/NLP/tagged_product_attributes.csv')

for columns in tag_SQL.columns:
    tag_SQL[columns] = tag_SQL[columns].str.lower() 
    
remove = [' ', '_', '(', ')', '-', ',', '&', '"', '"', '/']
for i in remove:
    tag_SQL['attribute_name'] = tag_SQL['attribute_name'].str.replace(i, '')
    tag_SQL['attribute_value'] = tag_SQL['attribute_value'].str.replace(i, '')

tag_SQL.drop(columns='file', inplace=True)
tag_SQL.drop_duplicates(keep='first', inplace=True) #only removed duplicates that have same data in all of the columns
tag_SQL.shape

(97950, 4)

In [8]:
# inner join
df_join = pd.merge(df_SQL, tag_SQL.drop(columns='product_color_id'), how='inner', on='product_id')
focus_attribute = ['style', 'occasion', 'category', 'fit', 'embellishment']
df_messy = df_join[df_join.attribute_name.isin(focus_attribute)].reset_index(drop=True)

for att in focus_attribute:
    df_messy[att] = np.where(df_messy.attribute_name==att, df_messy.attribute_value, None)
df_messy.replace(np.nan, '', regex=True, inplace=True)
df_messy.drop_duplicates(inplace=True)
df_messy.reset_index(inplace=True)
df_messy.shape

(26944, 17)

In [0]:
# lemmatize
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    tokens = text.split()
    stemmed_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(stemmed_tokens)

In [0]:
# preprocess
def preprocessing(data):
    '''inputdata should have 5 columns: brand, product_full_name, description, brand_category, and details'''
    
    data.replace(np.nan, '', regex=True, inplace=True)

    # regex cleaning
    for i in range(0,len(data)):
        text = data.loc[i,'description']
        text = re.sub(r'([0-9]+)', '', text)
        text = re.sub(r'\b(jeans|pants|skirt|shorts|leggings|trousers)\b', 'bottom', text)
        text = re.sub(r'\b(sweater|shirt|jacket|tshirt|coat|blazer|cardigan|hoodie)\b', 'top', text)
        text = re.sub(r'\b(sneakers|boots|flats|heels|slippers|sandals)\b', 'shoe', text)
        text = re.sub(r'\b(dress|one piece|jumpsuit)\b', 'onepiece', text)  
        data.loc[i,'description'] = text
    
    data['product_full_name'] = data['product_full_name'].str.replace(r'([0-9]+)','')
    data['details'] = data['details'].str.replace(r'([0-9]+)','')
    data['brand_category'] = data['brand_category'].str.replace(r'([0-9]+)','')

    # define stopwords
    stopwords_gensim = list(STOPWORDS)
    stopwords_NLTK = list(stopwords.words("english"))
    stopwords_combined = list(set(stopwords_gensim+stopwords_NLTK)) #to remove duplicates
    negatives = ['not','nor','no','neither', 'never', 'bottom', 'top'] #took out the negative words for a more accurate analysis
    stopwords_combined = list(filter(lambda x: x not in negatives, stopwords_combined))
    stopwords_combined.sort()
    stopwords_expression = '|'.join(stopwords_combined)
    stopwords_pattern = f'({stopwords_expression})'

    # remove stopwords & lemmatization
    for i in ['product_full_name', 'description', 'brand_category', 'details']:
        data[i] = data[i].astype(str)
        data[i] = data[i].str.replace(r'[^\w\s]',' ')
        data[i] = data[i].str.replace(r'\n', ' ')
        data[i] = data[i].str.replace(rf'\b{stopwords_pattern}\b','')
        data[i] = data[i].apply(lemmatize)

    # combine features into one text variable
    data['vars'] = data['brand']+' '+data['product_full_name']+' '+data['description']+' '+data['brand_category']+' '+data['details']

    return data

In [0]:
df = preprocessing(df_messy)
df.drop(columns=['mpn','brand_canonical_url','labels'], inplace=True)
#df.to_csv('/content/drive/My Drive/NLP/df.csv')
#df = pd.read_csv('/content/drive/My Drive/NLP/df.csv', index_col=0)

In [12]:
df.shape

(26944, 15)

In [0]:
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [0]:
df['category'].unique()

array([nan, 'bottom', 'top', 'sweater', 'onepiece', 'blazerscoatsjackets',
       'shoe', 'accessory', 'sweatshirthoodie'], dtype=object)

In [0]:
# adjust categories to the list in the excel
df['category'] = np.where(df['category']=='sweater', 'top', np.where(df['category']=='blazerscoatsjackets', 'top', np.where(df['category']=='sweatshirthoodie', 'top', df['category'])))

In [0]:
cols = ['brand','product_full_name','description','brand_category','details']
for col in cols:
    df[col].fillna('',inplace=True)

In [0]:
df.shape

(26944, 15)

In [0]:
df.isnull().sum()

index                    0
product_id               0
brand                    0
product_full_name        0
description              0
brand_category           0
details                  0
attribute_name           0
attribute_value          0
style                16430
occasion             17890
category             22968
fit                  23904
embellishment        26584
vars                     0
dtype: int64

## Feature Engineering

### Word2vec

In [0]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
# train word2vec using all 5 columns
cols = ['brand','product_full_name','description','brand_category','details']
embedding_size=100
docs=[]
for col in cols:
    docs += [word_tokenize(i) for i in df[col]] 
wordvec = Word2Vec(docs, size=embedding_size, min_count=1)
print(wordvec)

Word2Vec(vocab=6813, size=100, alpha=0.025)


In [0]:
vocab = list(wordvec.wv.vocab)
wordvec_num = [wordvec.wv[i] for i in vocab]
wordvec_dict = dict(zip(vocab, wordvec_num))

# Deep NN

In [0]:
from random import randint
from numpy import array, argmax, asarray, zeros
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers import Flatten, Masking
from keras import regularizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, History
import tensorflow as tf

Using TensorFlow backend.


In [0]:
# prepare inputs for NN
def preparemodel(data, embedding_size):
    tokenizer = Tokenizer(num_words=None, oov_token="UNKNOWN_TOKEN")
    tokenizer.fit_on_texts(data['vars'])
    encoded_docs = tokenizer.texts_to_sequences(data['vars'])

    max_sequence_len = 180
    # Used the code below to find out the max_sequence_len on the df (173)
    #max_sequence_len = 0
    #for i in encoded_docs:
        #if len(i) > max_sequence_len:
            #max_sequence_len=len(i)

    padded_docs = pad_sequences(encoded_docs, maxlen=max_sequence_len, padding='post')
    
    vocab_size = 6900
    # Used the code below to find out the max vocab_size on the df (6807)
    #vocab_size = len(tokenizer.word_index)+1
    
    embedding_matrix = zeros((vocab_size, embedding_size))
    for word, i in tokenizer.word_index.items():
        embedding_vector = wordvec_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    
    return padded_docs, vocab_size, max_sequence_len, embedding_matrix

In [0]:
# deep nn with softmax
def deepnn_multiclass(X, y, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=10, val_data:tuple=None, val_split=0):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix], input_length=max_sequence_len, trainable=False))
    model.add(Flatten())
    model.add(Dense(node, kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dense(len(y[0]), activation='softmax'))
    
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = model.fit(X, y, validation_data=val_data, validation_split=val_split, epochs=30, verbose=0, callbacks= [EarlyStopping(patience = 5), ModelCheckpoint(filepath = "/content/drive/My Drive/NLP/weights.hdf5", save_best_only= True), History()])
    model_new = tf.keras.models.load_model(filepath = "/content/drive/My Drive/NLP/weights.hdf5")
    return model_new

In [0]:
# deep nn with sigmoid
def deepnn_binary(X, y, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=10, val_data:tuple=None, val_split=0):
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix], input_length=max_sequence_len, trainable=False))
    model.add(Flatten())
    model.add(Dense(node, kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    history = model.fit(X, y, validation_data=val_data, validation_split=val_split, epochs=30, verbose=0, callbacks= [EarlyStopping(patience = 5), ModelCheckpoint(filepath = "/content/drive/My Drive/NLP/weights.hdf5", save_best_only= True), History()])
    model_new = tf.keras.models.load_model(filepath = "/content/drive/My Drive/NLP/weights.hdf5")
    return model_new

In [0]:
# cross validation for multiclass attributes
def modelcv_multiclass(labelname, n_splits=5):
    data = df.dropna(subset=[labelname])
    encoder = LabelEncoder()
    labels = to_categorical(encoder.fit_transform(data[labelname]))
    
    temp = preparemodel(data, embedding_size)
    padded_docs, vocab_size, max_sequence_len, embedding_matrix = temp[0], temp[1], temp[2], temp[3]

    kf = KFold(n_splits=n_splits, shuffle = True)
    kf.get_n_splits(padded_docs)

    maximum, node_final, train_acc_final, test_acc_final = 0,0,0,0
    for node in [10,20,30]:
        train_acc = 0
        test_acc = 0
        for train_index, test_index in kf.split(padded_docs):
            X_train, X_test, y_train, y_test = padded_docs[train_index], padded_docs[test_index], labels[train_index], labels[test_index]
            model = deepnn_multiclass(X_train, y_train, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=node, val_data=(X_test,y_test))
            loss_train, accuracy_train = model.evaluate(X_train, y_train, verbose=0)
            loss_test, accuracy_test = model.evaluate(X_test, y_test, verbose=0)
            train_acc += accuracy_train
            test_acc += accuracy_test
        if maximum < test_acc/n_splits:
            maximum = test_acc/n_splits
            node_final = node
            train_acc_final = train_acc/n_splits
            test_acc_final = test_acc/n_splits
          
    return node_final, train_acc_final, test_acc_final

In [0]:
# cross validation for binary attributes
def modelcv_binary(labelname, uniquename, n_splits=5):
    data = df.dropna(subset=[labelname]).copy()
    data['temp'] = np.where(data[labelname]==uniquename, 0, 1)
    data.sort_values(by=['product_id','temp'], inplace=True)
    data.drop_duplicates(subset=['product_id'], keep='first', inplace=True)
    data.reset_index(drop=True, inplace=True)
    labels = np.where(data[labelname]==uniquename,1,0)

    temp = preparemodel(data, embedding_size)
    padded_docs, vocab_size, max_sequence_len, embedding_matrix = temp[0], temp[1], temp[2], temp[3] 

    kf = KFold(n_splits=n_splits, shuffle = True)
    kf.get_n_splits(padded_docs)

    maximum, node_final, train_acc_final, test_acc_final = 0,0,0,0
    for node in [10,20,30]:
        train_acc = 0
        test_acc = 0
        for train_index, test_index in kf.split(padded_docs):
            X_train, X_test, y_train, y_test = padded_docs[train_index], padded_docs[test_index], labels[train_index], labels[test_index]
            model = deepnn_binary(X_train, y_train, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=node, val_data=(X_test,y_test))
            loss_train, accuracy_train = model.evaluate(X_train, y_train, verbose=0)
            loss_test, accuracy_test = model.evaluate(X_test, y_test, verbose=0)
            train_acc += accuracy_train
            test_acc += accuracy_test
        if maximum < test_acc/n_splits:
            maximum = test_acc/n_splits
            node_final = node
            train_acc_final = train_acc/n_splits
            test_acc_final = test_acc/n_splits
          
    return node_final, train_acc_final, test_acc_final

### Cross Validation 
- hyperparameter: number of nodes

In [0]:
target, best_node, train_accuracy, test_accuracy = [],[],[],[]
for col in ['category','fit']:
    temp = modelcv_multiclass(col)
    target.append(col)
    best_node.append(temp[0])
    train_accuracy.append(temp[1])
    test_accuracy.append(temp[2])
result_multiclass = pd.DataFrame(zip(target,best_node,train_accuracy,test_accuracy), columns=['target','best_node','train_accuracy','test_accuracy'])

In [0]:
target, best_node, train_accuracy, test_accuracy = [],[],[],[]
for col in ['style','occasion','embellishment']:
    for name in df.dropna(subset=[col])[col].unique():
        temp = modelcv_binary(col, name)
        target.append(col+'-'+name)
        best_node.append(temp[0])
        train_accuracy.append(temp[1])
        test_accuracy.append(temp[2])
result_binary = pd.DataFrame(zip(target,best_node,train_accuracy,test_accuracy), columns=['target','best_node','train_accuracy','test_accuracy'])

In [0]:
# Accuracy table on the category
result_multiclass

Unnamed: 0,target,best_node,train_accuracy,test_accuracy
0,category,10,0.997107,0.942404
1,fit,30,0.645806,0.491118


In [0]:
# Accuracy table on the style, occasion, embellishment
result_binary

Unnamed: 0,target,best_node,train_accuracy,test_accuracy
0,style-modern,20,0.789839,0.69433
1,style-businesscasual,30,0.852973,0.756894
2,style-classic,20,0.739658,0.653721
3,style-casual,20,0.878381,0.755874
4,style-androgynous,20,0.867531,0.8238
5,style-boho,20,0.897344,0.880491
6,style-retro,30,0.947524,0.943565
7,style-edgy,30,0.831142,0.799285
8,style-glam,20,0.902004,0.891219
9,style-romantic,10,0.869318,0.857001


In [0]:
#result_multiclass.to_csv('/content/drive/My Drive/NLP/result_multiclass.csv')
#result_binary.to_csv('/content/drive/My Drive/NLP/result_binary.csv')

## Implementing the Final Model

In [0]:
#result_multiclass = pd.read_csv('/content/drive/My Drive/NLP/result_multiclass.csv', index_col=0)
#result_binary = pd.read_csv('/content/drive/My Drive/NLP/result_binary.csv', index_col=0)

In [0]:
# Retrain the final model on the whole dataset and then make prediction
def implement(padded_docs_input, labelname:str):
    '''labelname should be in ['category', 'style', 'occasion', 'embellishment'] '''
    data = df.dropna(subset=[labelname]).copy()

    if labelname=='category':
        encoder = LabelEncoder()
        labels = to_categorical(encoder.fit_transform(data[labelname]))
        temp = preparemodel(data, embedding_size)
        padded_docs_train, vocab_size, max_sequence_len, embedding_matrix = temp[0], temp[1], temp[2], temp[3]
        model_multiclass = deepnn_multiclass(padded_docs_train, labels, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=int(result_multiclass[result_multiclass.target==labelname]['best_node']), val_split=0.1)

        score = pd.DataFrame(model_multiclass.predict(padded_docs_input), columns=category_label)
        result = score.idxmax(axis=1)
        return result
    
    else:
        uniquenames = df.dropna(subset=[labelname])[labelname].unique()

        table = pd.DataFrame()
        for uniquename in uniquenames:    
            data['temp'] = np.where(data[labelname]==uniquename, 0, 1)
            data.sort_values(by=['product_id','temp'], inplace=True)
            data.drop_duplicates(subset=['product_id'], keep='first', inplace=True)
            data.reset_index(drop=True, inplace=True)
            labels = np.where(data[labelname]==uniquename,1,0)
            temp = preparemodel(data, embedding_size)
            padded_docs_train, vocab_size, max_sequence_len, embedding_matrix = temp[0], temp[1], temp[2], temp[3]
            model_binary = deepnn_binary(padded_docs_train, labels, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=int(result_binary[result_binary.target==labelname+'-'+uniquename]['best_node']), val_split=0.1)

            score = model_binary.predict(padded_docs_input).flatten()
            table[uniquename] = np.where(score>0.5, uniquename,'')
        
        result = table[uniquenames].agg(' '.join, axis=1)
        return result

In [14]:
# subsetting the full_data
inputdata = df_SQL[df_SQL.product_id.isin(df.product_id.unique())].reset_index(drop=True)
inputdata.shape

(3970, 9)

In [0]:
# create the columns with predicted label
encoder = LabelEncoder()
category_label = pd.DataFrame(zip(df.dropna(subset=['category'])['category'], encoder.fit_transform(df.dropna(subset=['category'])['category']))).drop_duplicates()
category_label = category_label.sort_values(by=1)[0].tolist()

data_clean = preprocessing(inputdata)
temp = preparemodel(data_clean, embedding_size)
padded_docs, vocab_size, max_sequence_len, embedding_matrix = temp[0], temp[1], temp[2], temp[3]

tags = ['category','style','occasion','embellishment']
result = pd.DataFrame()
for tag in tags:
    result[tag] = implement(padded_docs, labelname=tag)
    print(f'{tag} is done')

#result.to_csv('/content/drive/My Drive/NLP/result.csv')

ok
category is done
style is done
occasion is done
embellishment is done


In [0]:
result

Unnamed: 0,category,style,occasion,embellishment
0,bottom,,,
1,top,,,
2,bottom,modern,work,
3,top,modern,,
4,top,,,
...,...,...,...,...
3965,bottom,,,
3966,bottom,,work,trim
3967,bottom,,work,
3968,onepiece,,work,


In [0]:
subsetdata = pd.concat([inputdata, result], axis=1, sort=False)

In [17]:
#subsetdata.to_csv('/content/drive/My Drive/NLP/subsetdata.csv')
subsetdata

Unnamed: 0.1,product_id,brand,mpn,product_full_name,description,brand_category,brand_canonical_url,details,labels,Unnamed: 0,category,style,occasion,embellishment
0,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton & linen pants,high-rise trousers tailored from a cool italia...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,"true to size. high rise.\n31"" inseam; 14"" leg ...",[],0,bottom,,,
1,01dseczpagjjc1edc79jrbf4wk,banana republic,492444,mock-neck sweater top,"designed to be worn with high-waisted bottoms,...",unknown,https://bananarepublic.gap.com/browse/product....,"designed to be worn with high-waisted bottoms,...","{""needs review""}",1,top,,,
2,01e607bhrqajdz76mjfn7rprk1,simon miller,5450059,rost belted shorts,cinched at the natural waist and pleated for f...,unknown,https://shop.nordstrom.com/s/simon-miller-rost...,"true to size. xs=0-2, s=4-6, m=6-8, l=8-10, xl...",[],2,bottom,modern,work,
3,01e5zxj6g03r7177x723ct04w0,a.l.c.,5526479,minelli silk sleeveless top,painterly brushes of color that convey the flu...,unknown,https://shop.nordstrom.com/s/a-l-c-minelli-sil...,"true to size.\n25 1/2"" length (size medium)\nf...",[{'value': 'unsure'}],3,top,modern,,
4,01e6074pqa697jz1sbm6nm8tbg,simon miller,5450071,nepa mismatched button rib cardigan,the west coast–based label channels beachy vib...,unknown,https://shop.nordstrom.com/s/simon-miller-nepa...,"true to size. xs=0-2, s=4-6, m=6-8, l=8-10, xl...",[],4,top,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3965,01e2kz045bct1q2z2pbc0cwrgh,atm anthony thomas melillo,5441202,micro twill pull on pants,"these casual trousers are cut for an easy, dra...",unknown,https://shop.nordstrom.com/s/atm-anthony-thoma...,"true to size. xs=0, s=2-4, m=6-8, l=10. high r...",[],3965,bottom,,,
3966,01e2kxz0wnq96s64tdze3wb7cx,atm anthony thomas melillo,5504588,brushed twill crop wide leg pants,brushed twill gives these cropped wide-leg pan...,unknown,https://shop.nordstrom.com/s/atm-anthony-thoma...,true to size. high rise.,[],3966,bottom,,work,trim
3967,01e2kxwwy6jagened7p62090np,atm anthony thomas melillo,4496777,slim crop pants,"tailored from a soft, dense knit with cropped,...",unknown,https://shop.nordstrom.com/s/atm-anthony-thoma...,mid rise. true to size.,[],3967,bottom,,work,
3968,01e2kztepe7qk1v8thasf7hwnr,atm anthony thomas melillo,5289605,camo print silk skirt,flatlock-stitched bias seams enhance the fluid...,unknown,https://shop.nordstrom.com/s/atm-anthony-thoma...,"true to size. xs=0, s=2-4, m=6-8, l=10.",[],3968,onepiece,,work,


In [0]:
end = pd.datetime.now()
print(end-start)

0:32:23.384532


  """Entry point for launching an IPython kernel.


We had out of vocabular errors, so we just performed predictions on the  subsetted labelled data as professor mentioned in the slack. 