In [1]:
# Mount Google Drive at /content/drive; the CSV input and the model checkpoint
# used further down both live under "/content/drive/My Drive/NLP".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import re
from datetime import datetime

import numpy as np
import pandas as pd

In [3]:
# Wall-clock start of the run, subtracted from the end time in the last cell.
# `pd.datetime` was only an alias of `datetime.datetime`; it was deprecated in
# pandas 1.0 and removed in 2.0, so the standard-library class is used directly.
start = datetime.now()

  """Entry point for launching an IPython kernel.


In [0]:
# Load df_clean.csv from the mounted Drive; the first CSV column becomes the index.
df = pd.read_csv('/content/drive/My Drive/NLP/df_clean.csv', index_col=0)

In [5]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,index,product_id,brand,mpn,product_full_name,description,brand_category,brand_canonical_url,details,labels,attribute_name,attribute_value,style,occasion,category,fit
0,0,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],style,modern,modern,,,
1,1,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],style,businesscasual,businesscasual,,,
2,2,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],style,classic,classic,,,
3,3,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],occasion,work,,work,,
4,4,01e5zxp5h0btezt9qd2hrzj47a,a.l.c.,5529544,lennox high waist cotton linen pant,high rise bottom tailored cool italian cotton ...,unknown,https://shop.nordstrom.com/s/a-l-c-lennox-high...,true size high rise inseam leg opening rise ri...,[],category,bottom,,,bottom,


In [6]:
# Inspect the distinct values of the `category` label (NaN marks rows without one).
df['category'].unique()

array([nan, 'bottom', 'top', 'sweater', 'onepiece', 'blazerscoatsjackets',
       'shoe', 'accessory', 'sweatshirthoodie'], dtype=object)

In [0]:
# Collapse the sweater, blazers/coats/jackets and sweatshirt/hoodie labels into
# 'top'; every other value, NaN included, is carried over unchanged.
df['category'] = np.where(
    (df['category'] == 'sweater')
    | (df['category'] == 'blazerscoatsjackets')
    | (df['category'] == 'sweatshirthoodie'),
    'top',
    df['category'],
)

In [8]:
# Inspect the distinct values of the `occasion` label (NaN marks rows without one).
df['occasion'].unique()

array([nan, 'work', 'daytonight', 'weekend', 'vacation', 'nightout',
       'coldweather', 'workout'], dtype=object)

In [0]:
# Discard the columns that none of the later cells read.
df = df.drop(
    columns=[
        'mpn',
        'brand_canonical_url',
        'labels',
    ]
)

In [0]:
# Replace missing text with empty strings so the columns can be tokenized and
# concatenated later.
# The result is assigned back to the frame instead of calling
# `df[col].fillna('', inplace=True)`: that form mutates a column selection, which
# raises a chained-assignment warning in pandas 2.x and leaves `df` untouched once
# Copy-on-Write is enabled.
cols = ['brand','product_full_name','description','brand_category','details']
df[cols] = df[cols].fillna('')

In [11]:
# Count the remaining missing values per column.
df.isnull().sum()

index                    0
product_id               0
brand                    0
product_full_name        0
description              0
brand_category           0
details                  0
attribute_name           0
attribute_value          0
style                16070
occasion             17530
category             22608
fit                  23544
dtype: int64

## Feature Engineering

### Word2vec

In [12]:
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from gensim.models import Word2Vec

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
# Train Word2Vec embeddings on the tokenized contents of the free-text columns.
cols = ['brand', 'product_full_name', 'description', 'brand_category', 'details']
embedding_size = 100

# One token list per cell, taken column after column, so the corpus order is the
# same as appending one per-column list after another.
docs = [word_tokenize(text) for col in cols for text in df[col]]

# min_count=1 keeps every token that occurs at least once in the vocabulary.
wordvec = Word2Vec(docs, size=embedding_size, min_count=1)
print(wordvec)

Word2Vec(vocab=6813, size=100, alpha=0.025)


In [0]:
# Build a plain token -> embedding-vector lookup from the trained model.
vocab = [token for token in wordvec.wv.vocab]
wordvec_num = [wordvec.wv[token] for token in vocab]
wordvec_dict = {token: vector for token, vector in zip(vocab, wordvec_num)}

### Combining features

In [0]:
# Join the five text columns, separated by single spaces, into one field per row.
df['vars'] = (
    df['brand']
    + ' ' + df['product_full_name']
    + ' ' + df['description']
    + ' ' + df['brand_category']
    + ' ' + df['details']
)

# Deep NN

In [16]:
from random import randint
from numpy import array, argmax, asarray, zeros
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers.recurrent import SimpleRNN
from keras.layers import Flatten, Masking
from keras import regularizers
from keras.callbacks import ModelCheckpoint, EarlyStopping, History
import tensorflow as tf

Using TensorFlow backend.


In [0]:
def deepnn_multiclass(X, y, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=10, val_data:tuple=None, val_split=0, checkpoint_path="/content/drive/My Drive/NLP/weights.hdf5"):
    """Fit a small softmax classifier on top of frozen, pre-trained embeddings.

    Architecture: Embedding (initialised from ``embedding_matrix``, not
    trainable) -> Flatten -> Dense(``node``, ReLU, L2 0.01) ->
    Dense(``len(y[0])``, softmax). The model is trained with Adam on categorical
    cross-entropy for at most 30 epochs; training stops after 5 epochs without
    improvement of the monitored loss and the best epoch is written to
    ``checkpoint_path``.

    Args:
        X: Training inputs handed to ``model.fit`` (the callers in this
            notebook pass post-padded token-id sequences).
        y: Training targets; ``len(y[0])`` sets the number of output units, so
            each row is expected to be a one-hot vector.
        vocab_size: Input dimension of the embedding layer.
        embedding_size: Output dimension of the embedding layer.
        embedding_matrix: Initial (frozen) embedding weights.
        max_sequence_len: ``input_length`` of the embedding layer.
        node: Number of units in the hidden dense layer.
        val_data: Optional ``(X_val, y_val)`` tuple forwarded as
            ``validation_data``.
        val_split: Fraction forwarded as ``validation_split``.
        checkpoint_path: File the best model is saved to and reloaded from.
            Defaults to the Drive location this notebook has always used.

    Returns:
        The model loaded back from ``checkpoint_path``.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix], input_length=max_sequence_len, trainable=False))
    model.add(Flatten())
    model.add(Dense(node, kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dense(len(y[0]), activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Without validation data Keras never logs `val_loss`; the callbacks would
    # then be skipped, no checkpoint would be written, and the load below would
    # fail or silently return a stale model from an earlier run. Fall back to
    # the training loss in that case.
    has_validation = val_data is not None or bool(val_split)
    monitor = 'val_loss' if has_validation else 'loss'
    callbacks = [
        EarlyStopping(monitor=monitor, patience=5),
        ModelCheckpoint(filepath=checkpoint_path, monitor=monitor, save_best_only=True),
    ]
    # fit() records a History object on its own, so no explicit History callback is needed.
    model.fit(X, y, validation_data=val_data, validation_split=val_split, epochs=30, verbose=0, callbacks=callbacks)

    # NOTE(review): the model is built with the standalone `keras` package but
    # reloaded through `tf.keras`; confirm the installed versions are compatible.
    return tf.keras.models.load_model(filepath=checkpoint_path)

In [0]:
def modelcv_multiclass(labelname, n_splits=5, nodes=(10, 20, 30), random_state=None):
    """Cross-validate the multiclass network and pick the best hidden-layer size.

    Reads the module-level ``df`` (its ``vars`` text column and the
    ``labelname`` column), ``embedding_size`` and ``wordvec_dict``. Rows with a
    missing label are dropped, the text is tokenized and post-padded to the
    longest sequence, and labels are one-hot encoded. Each candidate in
    ``nodes`` is trained with ``deepnn_multiclass`` on every fold, and the
    candidate with the highest mean held-out accuracy wins.

    Args:
        labelname: Name of the label column in ``df``.
        n_splits: Number of cross-validation folds.
        nodes: Hidden-layer sizes to compare.
        random_state: Seed for the fold shuffle; ``None`` keeps it random.

    Returns:
        ``(best_node, mean_train_accuracy, mean_test_accuracy)`` for the winning
        candidate, or ``(0, 0, 0)`` if no candidate produced a comparable score.
    """
    # NOTE(review): rows are not de-duplicated per product here (modelcv_binary
    # does so); if df holds several rows with identical text, the same document
    # can land in both the training and the held-out fold - confirm.
    data = df.dropna(subset=[labelname])
    tokenizer = Tokenizer(num_words=None, oov_token="UNKNOWN_TOKEN")
    tokenizer.fit_on_texts(data['vars'])
    encoded_docs = tokenizer.texts_to_sequences(data['vars'])

    # Pad every document to the length of the longest one.
    max_sequence_len = max((len(seq) for seq in encoded_docs), default=0)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_sequence_len, padding='post')

    labels = to_categorical(LabelEncoder().fit_transform(data[labelname]))

    # Row i of the matrix holds the Word2Vec vector of the token with index i;
    # tokens without a pre-trained vector (and the padding index 0) stay zero.
    vocab_size = len(tokenizer.word_index)+1
    embedding_matrix = zeros((vocab_size, embedding_size))
    for word, i in tokenizer.word_index.items():
        embedding_vector = wordvec_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Materialise the folds once so every candidate is scored on the same
    # splits; calling split() per candidate with an unseeded shuffle would give
    # each one different folds and make the comparison unfair.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = list(kf.split(padded_docs))

    # -inf (rather than 0) so that a candidate scoring exactly 0.0 is still recorded.
    best_score = float('-inf')
    node_final, train_acc_final, test_acc_final = 0, 0, 0
    for node in nodes:
        train_acc = 0
        test_acc = 0
        for train_index, test_index in folds:
            X_train, X_test, y_train, y_test = padded_docs[train_index], padded_docs[test_index], labels[train_index], labels[test_index]
            # NOTE(review): the held-out fold is also passed as validation data,
            # so early stopping and checkpoint selection see the data the "test"
            # accuracy is measured on; the reported score is optimistic.
            model = deepnn_multiclass(X_train, y_train, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=node, val_data=(X_test,y_test))
            loss_train, accuracy_train = model.evaluate(X_train, y_train, verbose=0)
            loss_test, accuracy_test = model.evaluate(X_test, y_test, verbose=0)
            train_acc += accuracy_train
            test_acc += accuracy_test
        mean_train_acc = train_acc/n_splits
        mean_test_acc = test_acc/n_splits
        if best_score < mean_test_acc:
            best_score = mean_test_acc
            node_final = node
            train_acc_final = mean_train_acc
            test_acc_final = mean_test_acc

    return node_final, train_acc_final, test_acc_final

In [0]:
def deepnn_binary(X, y, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=10, val_data:tuple=None, val_split=0, checkpoint_path="/content/drive/My Drive/NLP/weights.hdf5"):
    """Fit a small binary classifier on top of frozen, pre-trained embeddings.

    Architecture: Embedding (initialised from ``embedding_matrix``, not
    trainable) -> Flatten -> Dense(``node``, ReLU, L2 0.01) ->
    Dense(1, sigmoid). The model is trained with Adam on binary cross-entropy
    for at most 30 epochs; training stops after 5 epochs without improvement of
    the monitored loss and the best epoch is written to ``checkpoint_path``.

    Args:
        X: Training inputs handed to ``model.fit`` (the callers in this
            notebook pass post-padded token-id sequences).
        y: Binary training targets (0/1).
        vocab_size: Input dimension of the embedding layer.
        embedding_size: Output dimension of the embedding layer.
        embedding_matrix: Initial (frozen) embedding weights.
        max_sequence_len: ``input_length`` of the embedding layer.
        node: Number of units in the hidden dense layer.
        val_data: Optional ``(X_val, y_val)`` tuple forwarded as
            ``validation_data``.
        val_split: Fraction forwarded as ``validation_split``.
        checkpoint_path: File the best model is saved to and reloaded from.
            Defaults to the Drive location this notebook has always used.

    Returns:
        The model loaded back from ``checkpoint_path``.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_size, weights=[embedding_matrix], input_length=max_sequence_len, trainable=False))
    model.add(Flatten())
    model.add(Dense(node, kernel_regularizer=regularizers.l2(0.01), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Without validation data Keras never logs `val_loss`; the callbacks would
    # then be skipped, no checkpoint would be written, and the load below would
    # fail or silently return a stale model from an earlier run. Fall back to
    # the training loss in that case.
    has_validation = val_data is not None or bool(val_split)
    monitor = 'val_loss' if has_validation else 'loss'
    callbacks = [
        EarlyStopping(monitor=monitor, patience=5),
        ModelCheckpoint(filepath=checkpoint_path, monitor=monitor, save_best_only=True),
    ]
    # fit() records a History object on its own, so no explicit History callback is needed.
    model.fit(X, y, validation_data=val_data, validation_split=val_split, epochs=30, verbose=0, callbacks=callbacks)

    # NOTE(review): the model is built with the standalone `keras` package but
    # reloaded through `tf.keras`; confirm the installed versions are compatible.
    return tf.keras.models.load_model(filepath=checkpoint_path)

In [0]:
def modelcv_binary(labelname, uniquename, n_splits=5, nodes=(10, 20, 30), random_state=None):
    """Cross-validate a one-vs-rest network for a single label value.

    Reads the module-level ``df`` (its ``vars`` and ``product_id`` columns and
    the ``labelname`` column), ``embedding_size`` and ``wordvec_dict``. Rows
    with a missing label are dropped and the frame is reduced to one row per
    ``product_id``, preferring a row whose label equals ``uniquename``. The
    target is 1 where the kept label equals ``uniquename`` and 0 otherwise.
    Each candidate in ``nodes`` is trained with ``deepnn_binary`` on every
    fold, and the candidate with the highest mean held-out accuracy wins.

    Args:
        labelname: Name of the label column in ``df``.
        uniquename: Label value treated as the positive class.
        n_splits: Number of cross-validation folds.
        nodes: Hidden-layer sizes to compare.
        random_state: Seed for the fold shuffle; ``None`` keeps it random.

    Returns:
        ``(best_node, mean_train_accuracy, mean_test_accuracy)`` for the winning
        candidate, or ``(0, 0, 0)`` if no candidate produced a comparable score.
    """
    data = df.dropna(subset=[labelname]).copy()
    # temp == 0 marks rows carrying the positive label, so sorting on it puts
    # them first within each product and keep='first' retains them.
    data['temp'] = np.where(data[labelname]==uniquename, 0, 1)
    data = (
        data.sort_values(by=['product_id','temp'])
        .drop_duplicates(subset=['product_id'], keep='first')
        .reset_index(drop=True)
    )

    tokenizer = Tokenizer(num_words=None, oov_token="UNKNOWN_TOKEN")
    tokenizer.fit_on_texts(data['vars'])
    encoded_docs = tokenizer.texts_to_sequences(data['vars'])

    # Pad every document to the length of the longest one.
    max_sequence_len = max((len(seq) for seq in encoded_docs), default=0)
    padded_docs = pad_sequences(encoded_docs, maxlen=max_sequence_len, padding='post')

    labels = np.where(data[labelname]==uniquename,1,0)

    # Row i of the matrix holds the Word2Vec vector of the token with index i;
    # tokens without a pre-trained vector (and the padding index 0) stay zero.
    vocab_size = len(tokenizer.word_index)+1
    embedding_matrix = zeros((vocab_size, embedding_size))
    for word, i in tokenizer.word_index.items():
        embedding_vector = wordvec_dict.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Materialise the folds once so every candidate is scored on the same
    # splits; calling split() per candidate with an unseeded shuffle would give
    # each one different folds and make the comparison unfair.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    folds = list(kf.split(padded_docs))

    # -inf (rather than 0) so that a candidate scoring exactly 0.0 is still recorded.
    best_score = float('-inf')
    node_final, train_acc_final, test_acc_final = 0, 0, 0
    for node in nodes:
        train_acc = 0
        test_acc = 0
        for train_index, test_index in folds:
            X_train, X_test, y_train, y_test = padded_docs[train_index], padded_docs[test_index], labels[train_index], labels[test_index]
            # NOTE(review): the held-out fold is also passed as validation data,
            # so early stopping and checkpoint selection see the data the "test"
            # accuracy is measured on; the reported score is optimistic.
            model = deepnn_binary(X_train, y_train, vocab_size, embedding_size, embedding_matrix, max_sequence_len, node=node, val_data=(X_test,y_test))
            loss_train, accuracy_train = model.evaluate(X_train, y_train, verbose=0)
            loss_test, accuracy_test = model.evaluate(X_test, y_test, verbose=0)
            train_acc += accuracy_train
            test_acc += accuracy_test
        mean_train_acc = train_acc/n_splits
        mean_test_acc = test_acc/n_splits
        if best_score < mean_test_acc:
            best_score = mean_test_acc
            node_final = node
            train_acc_final = mean_train_acc
            test_acc_final = mean_test_acc

    return node_final, train_acc_final, test_acc_final

In [21]:
# Tune and score the multiclass network for each single-valued label column.
for col in ['category', 'fit']:
    result = modelcv_multiclass(col)
    print(f'{col} best node and accuracy (train, test): {result}')

category best node and accuracy (train, test): (30, 0.9967932581901551, 0.9376280665397644)
fit best node and accuracy (train, test): (10, 0.6078947424888611, 0.49572367668151857)


In [22]:
# Tune and score one one-vs-rest network per distinct value of each label column.
for col in ['style', 'occasion']:
    observed_values = df[col].dropna().unique()
    for name in observed_values:
        result = modelcv_binary(col, name)
        print(f'{col}-{name} best node and accuracy (train, test): {result}')

style-modern best node and accuracy (train, test): (20, 0.7823657870292664, 0.6958655834197998)
style-businesscasual best node and accuracy (train, test): (30, 0.8548249006271362, 0.7586822628974914)
style-classic best node and accuracy (train, test): (20, 0.7905983209609986, 0.6603679656982422)
style-casual best node and accuracy (train, test): (30, 0.9023878455162049, 0.755107581615448)
style-androgynous best node and accuracy (train, test): (30, 0.864019775390625, 0.8255874156951905)
style-boho best node and accuracy (train, test): (30, 0.896067452430725, 0.8804900765419006)
style-retro best node and accuracy (train, test): (20, 0.947906231880188, 0.9433119535446167)
style-edgy best node and accuracy (train, test): (20, 0.8313317775726319, 0.7990274786949157)
style-glam best node and accuracy (train, test): (20, 0.9069833278656005, 0.8919820427894593)
style-romantic best node and accuracy (train, test): (10, 0.8739781618118286, 0.8592915058135986)
style-athleisure best node and accu

In [23]:
# Report the total wall-clock time since `start` was recorded at the top.
# `pd.datetime` was only an alias of `datetime.datetime`; it was deprecated in
# pandas 1.0 and removed in 2.0, so the standard-library class is used directly.
end = datetime.now()
print(end-start)

0:15:44.972571


  """Entry point for launching an IPython kernel.
