In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
import ast # for string to list
from tqdm.auto import tqdm

warnings.filterwarnings('ignore') # ignore warnings

In [3]:
# Per-customer purchase history ('customer' and 'sequence' columns are read later);
# the CSV's 'Unnamed: 0' column is promoted to the index.
customers_history = pd.read_csv('drive/MyDrive/hm/customer_sequence.csv').set_index('Unnamed: 0')
# Article table; its rows are addressed positionally (.iloc) with KNN neighbour indices.
everthing = pd.read_csv('drive/MyDrive/hm/everthing.csv')
# Customer table filtered by 'customer_id'; the 'feature*' / 'tfrs*' columns are used as embeddings.
customer_all = pd.read_csv('drive/MyDrive/hm/customer_all.csv')

In [4]:
def _load_pickle(path):
  """Unpickle and return the object stored at `path`.

  The file is opened in a `with` block so the handle is closed as soon as the
  object has been read, instead of being left open for the kernel's lifetime.

  NOTE(review): unpickling can execute arbitrary code; only load files that
  come from a trusted source.
  """
  with open(path, 'rb') as handle:
    return pickle.load(handle)


# Objects queried through `.kneighbors(...)` further down; presumably fitted
# nearest-neighbour models, one per embedding family — confirm against the
# notebook that wrote these pickles.
tfrs_knn = _load_pickle('drive/MyDrive/hm/tfrs_knn.pickle')
image_knn = _load_pickle('drive/MyDrive/hm/image_knn.pickle')
text_knn = _load_pickle('drive/MyDrive/hm/text_knn.pickle')
feature_knn = _load_pickle('drive/MyDrive/hm/feature_knn.pickle')
all_knn = _load_pickle('drive/MyDrive/hm/all_knn.pickle')

Making the customers_history dataset smaller because of computation and storage limitations

In [5]:
# (rows, columns) of the full purchase-history table, before any subsetting.
customers_history.shape

(1362281, 2)

In [6]:
# Work on the first 994 customers only to keep the evaluation tractable.
df = customers_history.head(994)

In [7]:
# Parse each customer's stringified purchase history into a list of int article ids.
# Iterating the 'sequence' column directly avoids materialising a full row Series
# with df.iloc[i] on every step, and the distinct names avoid reusing `i` for both
# the row position and the list element.
valid_true = [
  [int(article_id) for article_id in ast.literal_eval(sequence)]
  for sequence in df['sequence']
]

In [8]:
# Single-column frame holding the parsed purchase lists, ready to be joined onto df.
valid_true_df = pd.DataFrame(dict(valid_true=valid_true))

In [9]:
# Join the parsed lists column-wise and discard the raw string column.
# concat pairs rows by index label, so this relies on df's index lining up
# with valid_true_df's default 0..n-1 index.
df = pd.concat([df, valid_true_df], axis=1).drop(columns=['sequence'])
df.head()

Unnamed: 0,customer,valid_true
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[625548001, 176209023, 627759010, 697138006, 5..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[583558001, 639677008, 640244003, 521269001, 6..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[663713001, 541518023, 663713001, 578020002, 7..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[742079001, 732413001]"
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[634249005, 677049001, 698286003, 707704003, 3..."


Performance Measure of Feature Embeddings

In [41]:
def getEmbedsFeature(customer):
  """Return the feature-model embedding columns for one customer.

  Selects the rows of `customer_all` whose `customer_id` equals `customer`
  and keeps only the columns whose names start with 'feature'.

  Returns a DataFrame; it has no rows when no `customer_id` matches.
  """
  is_this_customer = customer_all.customer_id == customer
  matched_rows = customer_all[is_this_customer].drop('customer_id', axis=1)
  return matched_rows.filter(regex='^feature', axis=1)

In [42]:
def getRcmndsFeature(embeds,k=12):
  """Return the article ids of the `k` nearest neighbours found by `feature_knn`.

  `embeds` is passed to `feature_knn.kneighbors` unchanged. Only the
  neighbours of the first query row are used, and their indices are treated
  as row positions in `everthing`.
  """
  neighbours = feature_knn.kneighbors(embeds, k)
  row_positions = neighbours[1][0]  # element 1 of the result, first query row
  return everthing.iloc[row_positions].article_id.values

In [43]:
# Feature-model embedding frame for every customer in df, in row-label order 0..len(df)-1.
list_feature = [
  getEmbedsFeature(df.loc[row_label, "customer"])
  for row_label in range(len(df))
]

In [44]:
# Recommended article ids per customer, derived from the feature embeddings.
preds_Feature = [
  getRcmndsFeature(customer_embeds)
  for customer_embeds in list_feature
]

In [45]:
# Sanity check: number of prediction arrays produced (one per entry of list_feature).
len(preds_Feature)

994

Converting The List of Predictions into a dataframe

In [46]:
# One row per customer; each cell holds that customer's array of recommended article ids.
pred_df = pd.DataFrame(dict(predictions=preds_Feature))
pred_df.head(n=5)

Unnamed: 0,predictions
0,"[662189001, 896368001, 543689001, 591914001, 7..."
1,"[684209025, 854677003, 854678003, 838055003, 8..."
2,"[647373004, 859873004, 606250001, 832051001, 5..."
3,"[678688001, 678691002, 851774001, 852174001, 5..."
4,"[512075001, 678339001, 723243002, 668550001, 7..."


Merging the two datasets

In [47]:
# Put ground truth and feature-based predictions side by side (rows paired by index label).
result_df = pd.concat([df, pred_df], axis='columns')
result_df.head(n=5)

Unnamed: 0,customer,valid_true,predictions
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[625548001, 176209023, 627759010, 697138006, 5...","[662189001, 896368001, 543689001, 591914001, 7..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[583558001, 639677008, 640244003, 521269001, 6...","[684209025, 854677003, 854678003, 838055003, 8..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[663713001, 541518023, 663713001, 578020002, 7...","[647373004, 859873004, 606250001, 832051001, 5..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[742079001, 732413001]","[678688001, 678691002, 851774001, 852174001, 5..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[634249005, 677049001, 698286003, 707704003, 3...","[512075001, 678339001, 723243002, 668550001, 7..."


In [48]:
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/306007
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py


def apk(actual, predicted, k=10):
    """
    Average precision at k for one list of relevant items and one ranked
    list of predictions.

    Parameters
    ----------
    actual : list
             Items that should have been predicted (order is irrelevant)
    predicted : list
                Ranked predictions (order matters); only the first k are scored
    k : int, optional
        Maximum number of predictions taken into account

    Returns
    -------
    score : double
            Sum of precision-at-rank over the ranks where a new relevant item
            appears, divided by min(len(actual), k)

    Notes
    -----
    A repeated prediction is only credited the first time it appears.
    There is deliberately no guard for an empty ``actual``: callers are
    expected to remove such cases in advance, otherwise the final division
    raises ZeroDivisionError.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0
    precision_sum = 0.0

    for rank, item in enumerate(top_k, start=1):
        if item not in actual:
            continue
        # Skip an item already seen at an earlier rank.
        if item in top_k[:rank - 1]:
            continue
        hits += 1
        precision_sum += hits / rank

    return precision_sum / min(len(actual), k)


def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over paired lists of relevant items and
    ranked predictions.

    Parameters
    ----------
    actual : list
             One list of relevant items per case (order inside each list is
             irrelevant)
    predicted : list
                One ranked list of predictions per case (order inside each
                list matters)
    k : int, optional
        Maximum number of predictions scored per case

    Returns
    -------
    score : double
            Mean of ``apk`` over the pairs produced by zipping ``actual`` with
            ``predicted``
    """
    per_case_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_case_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_case_scores)

In [49]:
tqdm.pandas()

# MAP@12 of the feature-embedding recommendations against each customer's purchase list.
mapk(
    actual=result_df['valid_true'],
    predicted=result_df['predictions'],
    k=12,
)

0.09931813870412454

In [50]:
def getEmbedstfrs(customer):
  """Return the tfrs-model embedding columns for one customer.

  Selects the rows of `customer_all` whose `customer_id` equals `customer`
  and keeps only the columns whose names start with 'tfrs'.

  Returns a DataFrame; it has no rows when no `customer_id` matches.
  """
  is_this_customer = customer_all.customer_id == customer
  matched_rows = customer_all[is_this_customer].drop('customer_id', axis=1)
  return matched_rows.filter(regex='^tfrs', axis=1)

In [51]:
def getRcmndstfrs(embeds,k=12):
  """Return the article ids of the `k` nearest neighbours found by `tfrs_knn`.

  `embeds` is passed to `tfrs_knn.kneighbors` unchanged. Only the neighbours
  of the first query row are used, and their indices are treated as row
  positions in `everthing`.
  """
  neighbours = tfrs_knn.kneighbors(embeds, k)
  row_positions = neighbours[1][0]  # element 1 of the result, first query row
  return everthing.iloc[row_positions].article_id.values

In [52]:
# tfrs-model embedding frame for every customer in df, in row-label order 0..len(df)-1.
list_tfrs = [
  getEmbedstfrs(df.loc[row_label, "customer"])
  for row_label in range(len(df))
]

In [53]:
# Recommended article ids per customer, derived from the tfrs embeddings.
preds_tfrs = [
  getRcmndstfrs(customer_embeds)
  for customer_embeds in list_tfrs
]

In [54]:
# Sanity check: number of prediction arrays produced (one per entry of list_tfrs).
len(preds_tfrs)

994

In [55]:
# One row per customer; each cell holds that customer's array of recommended article ids.
pred_df_tfrs = pd.DataFrame(dict(predictions=preds_tfrs))
pred_df_tfrs.head(n=5)

Unnamed: 0,predictions
0,"[627165001, 741543002, 515305001, 640001023, 7..."
1,"[566140015, 559616014, 599580076, 599580083, 8..."
2,"[843642003, 772571005, 849214008, 591833002, 7..."
3,"[685843001, 732413001, 739347003, 742079001, 7..."
4,"[659854018, 851363002, 763843001, 866796001, 8..."


In [57]:
# Put ground truth and tfrs-based predictions side by side (rows paired by index label).
result_df_tfrs = pd.concat([df, pred_df_tfrs], axis='columns')
result_df_tfrs.head(n=5)

Unnamed: 0,customer,valid_true,predictions
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[625548001, 176209023, 627759010, 697138006, 5...","[627165001, 741543002, 515305001, 640001023, 7..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[583558001, 639677008, 640244003, 521269001, 6...","[566140015, 559616014, 599580076, 599580083, 8..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[663713001, 541518023, 663713001, 578020002, 7...","[843642003, 772571005, 849214008, 591833002, 7..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[742079001, 732413001]","[685843001, 732413001, 739347003, 742079001, 7..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[634249005, 677049001, 698286003, 707704003, 3...","[659854018, 851363002, 763843001, 866796001, 8..."


In [58]:
tqdm.pandas()

# MAP@12 of the tfrs-embedding recommendations against each customer's purchase list.
mapk(
    actual=result_df_tfrs['valid_true'],
    predicted=result_df_tfrs['predictions'],
    k=12,
)

0.1653308228485183

In [59]:
def getEmbedsCombined(customer):
  """Return one customer's combined embedding as a single row of values.

  Takes the rows of `customer_all` whose `customer_id` equals `customer`,
  drops the `customer_id` column and returns the values of the first
  remaining row. Indexing with `[0]` raises IndexError when no row matches.

  NOTE(review): unlike the 'feature' / 'tfrs' helpers, no column filter is
  applied here, so every non-id column of `customer_all` ends up in the
  vector — confirm that set and order of columns is what `all_knn` expects.
  """
  matched_rows = customer_all[customer_all.customer_id == customer]
  embedding_rows = matched_rows.drop('customer_id', axis=1)
  return embedding_rows.values[0]

In [60]:
def getRcmndsCombined(embeds,k=12):
  """Return the article ids of the `k` nearest neighbours found by `all_knn`.

  `embeds` is a single embedding vector; it is wrapped in a list so that
  `all_knn.kneighbors` receives one query row. The neighbour indices of that
  row are treated as row positions in `everthing`.
  """
  query_rows = [embeds]
  neighbours = all_knn.kneighbors(query_rows, k)
  row_positions = neighbours[1][0]  # element 1 of the result, first query row
  return everthing.iloc[row_positions].article_id.values

In [61]:
# Combined embedding vector for every customer in df, in row-label order 0..len(df)-1.
list_combined = [
  getEmbedsCombined(df.loc[row_label, "customer"])
  for row_label in range(len(df))
]

In [63]:
# Recommended article ids per customer, derived from the combined embeddings.
preds_combined = [
  getRcmndsCombined(customer_embeds)
  for customer_embeds in list_combined
]

In [64]:
# Sanity check: number of prediction arrays produced (one per entry of list_combined).
len(preds_combined)

994

In [65]:
# One row per customer; each cell holds that customer's array of recommended article ids.
pred_df_combined = pd.DataFrame(dict(predictions=preds_combined))
pred_df_combined.head(n=5)

Unnamed: 0,predictions
0,"[649331004, 599718043, 849419005, 766777015, 7..."
1,"[934312001, 906382001, 906705001, 906372001, 6..."
2,"[574717001, 865454002, 870188001, 890697002, 8..."
3,"[655902001, 542010005, 608789004, 386463006, 9..."
4,"[930578001, 906382001, 873764003, 644322001, 9..."


In [66]:
# Put ground truth and combined-embedding predictions side by side (rows paired by index label).
result_df_combined = pd.concat([df, pred_df_combined], axis='columns')
result_df_combined.head(n=5)

Unnamed: 0,customer,valid_true,predictions
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[625548001, 176209023, 627759010, 697138006, 5...","[649331004, 599718043, 849419005, 766777015, 7..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[583558001, 639677008, 640244003, 521269001, 6...","[934312001, 906382001, 906705001, 906372001, 6..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[663713001, 541518023, 663713001, 578020002, 7...","[574717001, 865454002, 870188001, 890697002, 8..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[742079001, 732413001]","[655902001, 542010005, 608789004, 386463006, 9..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[634249005, 677049001, 698286003, 707704003, 3...","[930578001, 906382001, 873764003, 644322001, 9..."


In [68]:
tqdm.pandas()

# MAP@12 of the combined-embedding recommendations against each customer's purchase list.
# NOTE(review): the recorded output of this cell is exactly 0.0, i.e. no recommended
# article appears in any purchase list — worth checking that the vectors built by
# getEmbedsCombined match the column layout all_knn expects.
mapk(
    actual=result_df_combined['valid_true'],
    predicted=result_df_combined['predictions'],
    k=12,
)

0.0