# Packages and Data

In [1]:
import Preprocessing_Function
import pandas as pd
import numpy as np
import spacy
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings("ignore")

In [2]:
## install if needed

#!python -m spacy download en_core_web_md  

In [3]:
start = pd.datetime.now()

In [4]:
fulldata = Preprocessing_Function.import_data()
data_label = Preprocessing_Function.labelled_data(fulldata)

In [5]:
data = fulldata # full data
training = data_label # data with labels
data1 = data.copy() # copy
training1 = training.copy() # copy

In [6]:
nlp = spacy.load("en_core_web_md")

# Functions

In [7]:
# drop records with missing values for X-s and specified y
def getclean(df: pd.DataFrame(), y: str()):
    temp = df.copy()
    # drop redundant index columns
    temp.drop(['index'], axis = 1, inplace = True)
    # remove records with missing data for X-s and y (here - category)
    temp.dropna(subset=['brand','description','details',y], inplace = True)
    temp.reset_index(inplace = True)
    temp.drop('index', axis = 1, inplace = True)
    return temp

In [8]:
# vectorize descriptions in df using doc2vecs trained on fulL_data descriptions
def doc2vecdesc(df: pd.DataFrame()):
    temp = data1.copy()
    temp.dropna(subset = ['description'], inplace = True)
    temp.reset_index(inplace = True)
    temp.drop(['index'], axis = 1, inplace = True)
    # retrieve description documents
    documents = [TaggedDocument(str(temp.loc[i,'description']), [i]) for i in temp.index]
    # train doc2vec on full data descriptions and vectorize training documents
    model = Doc2Vec(documents, vector_size=300, window=3, workers=4)
    df['VectorizedDesc'] = df.description.apply(lambda x: model.infer_vector([x]))
    description_vectors = list(df.VectorizedDesc.values)
    return description_vectors

In [9]:
# tf-idf doc vectorization for names and details
def tfidfvectorize(x: list()):
    # tf-idf vectorize on all unique tokens in list 'x' and sum the scores for each doc
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(x)
    tf_idf_lookup_table = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
    DOCUMENT_SUM_COLUMN = "DOCUMENT_TF_IDF_SUM"
    tf_idf_lookup_table[DOCUMENT_SUM_COLUMN] = tf_idf_lookup_table.sum(axis=1)
    # create a list of all tokens extracted by tf-idf
    available_tf_idf_scores = tf_idf_lookup_table.columns
    available_tf_idf_scores = list(map(lambda x: x, available_tf_idf_scores))
    # define an empty list
    docvectors = []
    # obtain tf-idf doc embeddings for every element in 'x'
    for idx, x in enumerate(x): 
        tokens = nlp(x)
        total_tf_idf_score_per_document = 0
        running_total_word_embedding = np.zeros(300) 
        for token in tokens:
            if token.has_vector and token.text in available_tf_idf_scores:
                tf_idf_score = tf_idf_lookup_table.loc[idx, token.text]
                running_total_word_embedding += tf_idf_score * token.vector
                total_tf_idf_score_per_document += tf_idf_score
        document_embedding = running_total_word_embedding / total_tf_idf_score_per_document
        docvectors.append(document_embedding)
    # return the embeddings
    return docvectors

In [10]:
## function to perform classification for single-label targets (using softmax as our 'activation')
# preparing the data for xgboost, which is then run in this function 
# takes about 5-7 minutes to run with 100 trees and 5 folds
def softmax(df: pd.DataFrame(), description_vectors: list(), names_vectors: list(), details_vectors: list(),\
            attribute: str(), fold: int(), seed: int()):
    brandsdf = pd.get_dummies(df.brand)
    namesdf = pd.DataFrame(np.array(names_vectors))
    detailsdf = pd.DataFrame(np.array(details_vectors))
    descriptiondf = pd.DataFrame(np.array(description_vectors))
    X = pd.concat([brandsdf,namesdf,detailsdf,descriptiondf], axis = 1) # combine embeddings into a data frame
    X.columns = [f'col{i}' for i in range(X.shape[1])]
    d = {}
    for i in range(len(df[attribute].unique())):
        d.update({df[attribute].unique()[i]: i}) # create a dicionary where a unique attribute from class is stored as key 
                                                 # and is assigned a unique index as value
    y = df[attribute].map(d) # map keys to values
    kf = KFold(n_splits = fold, random_state = seed, shuffle = True) # define the k-fold constructor
    temp = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        # define the constructor, where hyperparameters were tuned using GridSearchCV on 1350 different combinations of 
        # parameters ['max_depth','n_estimators','reg_lambda','learning_rate','colsample_bytree']
        clf = xgb.XGBClassifier(max_depth=8, objective='multi:softprob', n_estimators=100, reg_lambda=0.25,\
                         learning_rate=0.1,colsample_bytree=0.25,num_classes=len(y.unique()))
        clf.fit(X_train, y_train)  
        pred = clf.predict(X_test)
        temp.append(np.mean(pred != y_test)) # misclassification rate 
    accuracy = f'{attribute} accuracy: {(1-np.mean(temp))*100}'
    d = {j:i for i,j in d.items()}
    y_test = y_test.reset_index()
    y_test = pd.Series(y_test[attribute])
    y_test = y_test.map(d) # map back the actual labels
    pred = pd.Series(pred)
    pred = pred.map(d) # map back the actual labels
    temp = pd.DataFrame({f'{attribute}_actual': y_test, f'{attribute}_predicted': pred}) # combine into a df
    results = [accuracy, temp]    # accuracy over all folds, sample (by default last iteration of Kfold) actual labels,
                                  # and sample predicted labels
    return results

In [11]:
## function to perform classification for multi-label targets (using sigmoid as our 'activation')
# preparing the data for xgboost, which is then run in this function
# takes about 5-7 minutes to run with 100 trees and 5 folds
def sigmoid(df: pd.DataFrame(), description_vectors: list(), names_vectors: list(), details_vectors: list(),\
            attribute: str(), fold: int(), seed: int()):
    brandsdf = pd.get_dummies(df.brand)
    namesdf = pd.DataFrame(np.array(names_vectors))
    detailsdf = pd.DataFrame(np.array(details_vectors))
    descriptiondf = pd.DataFrame(np.array(description_vectors))
    X = pd.concat([brandsdf,namesdf,detailsdf,descriptiondf], axis = 1) # combine embeddings into a data frame
    X.columns = [f'col{i}' for i in range(X.shape[1])]
    l = []
    for i in range(X.shape[0]):
        for j in df.loc[i,attribute].split(' '):
            if j not in l:
                l.append(j) # retrieve all unique attributes from class and store them in a list
    results = []
    for subattr in l: # iterate xgboost over each unique attribute - resulting in a separate model for every attribute
        y = df[attribute].apply(lambda x: 1 if subattr in x.split(' ') else 0)
        kf = KFold(n_splits = fold, random_state = seed, shuffle = True)
        temp = []
        for train_index, test_index in kf.split(X):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            # define the constructor, where hyperparameters were tuned using GridSearchCV on 1350 different combinations of 
            # parameters ['max_depth','n_estimators','reg_lambda','learning_rate','colsample_bytree']
            clf = xgb.XGBClassifier(max_depth=8, objective='binary:logistic', n_estimators=100, reg_lambda=0.25,\
                             learning_rate=0.1,colsample_bytree=0.25,num_classes=len(y.unique()))
            clf.fit(X_train, y_train)  
            pred = clf.predict(X_test)
            temp.append(np.mean(pred != y_test)) # misclassification rate
        accuracy = f'{attribute} accuracy: {(1-np.mean(temp))*100}'
        y_test = y_test.reset_index()
        y_test = pd.Series(y_test[attribute])
        y_test = y_test.apply(lambda x: f'{subattr}' if x else f'not {subattr}') # 0 (not a match) is translated 
                                                                                 # into 'not subattribute'
        pred = pd.Series(pred)
        pred = pred.apply(lambda x: f'{subattr}' if x else f'not {subattr}')     # 0 (not a match) is translated 
                                                                                 # into 'not subattribute'
        temp = pd.DataFrame({f'{attribute}_{subattr}_actual': y_test, f'{attribute}_{subattr}_predicted': pred})
        results.append([accuracy, temp])  # accuracy over all folds, sample (by default last iteration of Kfold) actual labels,
                                          # and sample predicted labels for the selected subattribute in an attribute
    return results

# Example for 'Category'

In [12]:
training = getclean(training1, 'category')
training.shape[0] # 3321 records where X-s and y (category) are not NaN

3321

In [13]:
description_vectors = doc2vecdesc(training)

In [14]:
brand_vectors = list(pd.get_dummies(training.brand).values)

In [15]:
names = list(training.name.values)
details = list(training.details.values)

In [16]:
names_vectors = tfidfvectorize(names)
details_vectors = tfidfvectorize(details)

In [17]:
check = [len(description_vectors)-training.shape[0],len(brand_vectors)-training.shape[0],\
         len(names_vectors)-training.shape[0], len(details_vectors)-training.shape[0]]
not any(check) # 'True' here indicates that all embedded objects have the same number of documents as does the 'training' df

True

In [18]:
catlist = softmax(training,description_vectors,names_vectors,details_vectors,'category',2,2) 
# using 2 folds and random.seed = 2

In [19]:
catlist

['category accuracy: 81.8126690990331',
        category_actual category_predicted
 0                  top                top
 1             onepiece           onepiece
 2               bottom             bottom
 3             onepiece           onepiece
 4             onepiece             bottom
 ...                ...                ...
 1655               top                top
 1656               top                top
 1657  sweatshirthoodie                top
 1658            bottom             bottom
 1659            bottom             bottom
 
 [1660 rows x 2 columns]]

# General form

In [20]:
def combine(df: pd.DataFrame(), fold: int(), seed: int()):
    results = []
    for i in ['category','embellishment','style','occasion']: # iterate through each selected class to predict
        training = getclean(df, i) # cleaned data (accounting for NA-s) according to the selected class
        description_vectors = doc2vecdesc(training) # description embeddings
        brand_vectors = list(pd.get_dummies(training.brand).values) # one-hot encoded brands
        names = list(training.name.values) # product names
        details = list(training.details.values) # product details
        names_vectors = tfidfvectorize(names) # product name embeddings
        details_vectors = tfidfvectorize(details) # product details embeddings
        check = [len(description_vectors)-training.shape[0],len(brand_vectors)-training.shape[0],\
                 len(names_vectors)-training.shape[0], len(details_vectors)-training.shape[0]] 
        # any(check) == 'False' here indicates that all embedded objects have the same number of documents 
        # as does the 'training' df
        if any(check):
            return f'Error: Number of embedded rows does not match the number of labeled records for class {i}'
        if i == 'category':
            results.append(softmax(training,description_vectors,names_vectors,details_vectors,i,fold,seed)) # single-label
        else:
            results.append(sigmoid(training,description_vectors,names_vectors,details_vectors,i,fold,seed)) # multi-label
    return results

In [21]:
classlist = combine(training1,5,0) # 5+ fold CV-s are encouraged, but take significantly more time to output

In [22]:
classlist

[['category accuracy: 83.1069843282906',
        category_actual category_predicted
  0                 top                top
  1            onepiece           onepiece
  2            onepiece           onepiece
  3                shoe               shoe
  4                shoe               shoe
  ..                ...                ...
  659               top                top
  660               top                top
  661  sweatshirthoodie                top
  662               top                top
  663               top                top
  
  [664 rows x 2 columns]],
 [['embellishment accuracy: 96.20689655172414',
      embellishment_sequins_actual embellishment_sequins_predicted
   0                   not sequins                     not sequins
   1                   not sequins                     not sequins
   2                   not sequins                     not sequins
   3                   not sequins                     not sequins
   4                   not seq

In [23]:
end = pd.datetime.now()

In [24]:
print(end-start)

0:36:10.410516
