In [1]:
import sys
import os

import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import lightfm
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k

print("System version: {}".format(sys.version))
print("LightFM version: {}".format(lightfm.__version__))

%matplotlib inline

System version: 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]
LightFM version: 1.16


In [2]:
# default number of recommendations
K = 12
# percentage of data used for testing
TEST_PERCENTAGE = 0.25
# model learning rate
LEARNING_RATE = 0.25
# no of latent factors
NO_COMPONENTS = 20
# no of epochs to fit model
NO_EPOCHS = 20
# no of threads to fit model
NO_THREADS = 8
# regularisation for both user and item features
ITEM_ALPHA=1e-6
USER_ALPHA=1e-6

# seed for pseudonumber generations
SEEDNO = 42

In [3]:
data_path = r'../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv'
customer_data_path = r'../input/h-and-m-personalized-fashion-recommendations/customers.csv'
article_data_path = r'../input/h-and-m-personalized-fashion-recommendations/articles.csv'
submission_data_path = r'../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv'

In [4]:
# Data Extraction
def create_data(datapath, data_type=None):
    if data_type is None:
        df = pd.read_csv(datapath)
    elif data_type == 'transaction':
        df = pd.read_csv(datapath, dtype={'article_id': str}, parse_dates=['t_dat'])
    elif data_type == 'article':
        df = pd.read_csv(datapath, dtype={'article_id': str})
    return df

In [5]:
%%time

# Load all sales data (for 3 years starting from 2018 to 2020)
# ALso, article_id is treated as a string column otherwise it 
# would drop the leading zeros while reading the specific column values
transactions_data=create_data(data_path, data_type='transaction')
print(transactions_data.shape)

# # Unique Attributes
print(str(len(transactions_data['t_dat'].drop_duplicates())) + "-total No of unique transactions dates in data sheet")
print(str(len(transactions_data['customer_id'].drop_duplicates())) + "-total No of unique customers ids in data sheet")
print(str(len(transactions_data['article_id'].drop_duplicates())) + "-total No of unique article ids courses names in data sheet")
print(str(len(transactions_data['sales_channel_id'].drop_duplicates())) + "-total No of unique sales channels in data sheet")

(31788324, 5)
734-total No of unique transactions dates in data sheet
1362281-total No of unique customers ids in data sheet
104547-total No of unique article ids courses names in data sheet
2-total No of unique sales channels in data sheet
CPU times: user 1min 1s, sys: 6.64 s, total: 1min 7s
Wall time: 1min 31s


In [6]:
%%time

# Load all Customers
customer_data=create_data(customer_data_path)
print(customer_data.shape)

print(str(len(customer_data['customer_id'].drop_duplicates())) + "-total No of unique customers ids in customer data sheet")

(1371980, 7)
1371980-total No of unique customers ids in customer data sheet
CPU times: user 4.04 s, sys: 455 ms, total: 4.5 s
Wall time: 6.24 s


In [7]:
%%time

# Load all Customers
article_data=create_data(article_data_path, data_type='article')
print(article_data.shape)

print(str(len(article_data['article_id'].drop_duplicates())) + "-total No of unique article ids in article data sheet")

(105542, 25)
105542-total No of unique article ids in article data sheet
CPU times: user 858 ms, sys: 40.5 ms, total: 899 ms
Wall time: 1.17 s


In [8]:
transactions_data = transactions_data[transactions_data['t_dat'] > '2020-08-21']
transactions_data.shape

(1190911, 5)

In [9]:
transactions_data_tcount = transactions_data.groupby(['customer_id','article_id']).agg({'t_dat':'count'}).reset_index()
transactions_data_tcount = transactions_data_tcount[['customer_id','article_id','t_dat']]
transactions_data_tcount.head()

Unnamed: 0,customer_id,article_id,t_dat
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,1
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,1
2,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,448509014,1
3,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,719530003,1
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,516859008,1


In [10]:
def get_customers_list():
    return np.sort(transactions_data_tcount['customer_id'].unique())

def get_articles_list():
    return np.sort(transactions_data_tcount['article_id'].unique())

def id_mappings(customers_list, articles_list):
    """
    
    Create id mappings to convert user_id and item_id
    
    """
    customer_to_index_mapping = {}
    index_to_customer_mapping = {}
    for customer_index, customer_id in enumerate(customers_list):
        customer_to_index_mapping[customer_id] = customer_index
        index_to_customer_mapping[customer_index] = customer_id
        
    article_to_index_mapping = {}
    index_to_article_mapping = {}
    for article_index, article_id in enumerate(articles_list):
        article_to_index_mapping[article_id] = article_index
        index_to_article_mapping[article_index] = article_id
        
    return customer_to_index_mapping, index_to_customer_mapping, \
           article_to_index_mapping, index_to_article_mapping

In [11]:
customers = get_customers_list()
articles = get_articles_list()

In [12]:
# Generate mapping, LightFM library can't read other than (integer) index
customer_to_index_mapping, index_to_customer_mapping, \
article_to_index_mapping, index_to_article_mapping = id_mappings(customers, articles)

In [13]:
transactions_data_tcount['userID'] = transactions_data_tcount['customer_id'].apply(lambda x:customer_to_index_mapping.get(x))
transactions_data_tcount['itemID'] = transactions_data_tcount['article_id'].apply(lambda x:article_to_index_mapping.get(x))
transactions_data_tcount = transactions_data_tcount[['userID','itemID','t_dat']].rename(columns={'t_dat':'rating'})

In [14]:
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
customer_data_copy = customer_data.copy()
customer_data_copy['user_agebins'] = pd.cut(customer_data_copy['age'], listBin)
customer_data_copy['userID'] = customer_data_copy['customer_id'].apply(lambda x:customer_to_index_mapping.get(x))
customer_data_copy

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,user_agebins,userID
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,"(39, 49]",0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,"(19, 29]",
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,"(19, 29]",1.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,"(49, 59]",
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,"(49, 59]",
...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...,"(19, 29]",256353.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...,"(19, 29]",
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...,"(19, 29]",256354.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...,"(-1, 19]",


In [15]:
transactions_data_copy = transactions_data_tcount.copy()
transactions_data_copy = pd.merge(transactions_data_copy, customer_data_copy[['userID','user_agebins']], how='left', on='userID')
transactions_data_copy

Unnamed: 0,userID,itemID,rating,user_agebins
0,0,2020,1,"(39, 49]"
1,1,12041,1,"(19, 29]"
2,2,683,1,"(19, 29]"
3,2,6707,1,"(19, 29]"
4,3,1173,1,"(49, 59]"
...,...,...,...,...
1051725,256353,13092,1,"(19, 29]"
1051726,256354,5157,1,"(19, 29]"
1051727,256354,9590,1,"(19, 29]"
1051728,256354,12089,1,"(19, 29]"


In [16]:
article_data_copy = article_data.copy()
article_data_copy['itemID'] = article_data_copy['article_id'].apply(lambda x:article_to_index_mapping.get(x))
article_data_copy.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,itemID
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0.0
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",


In [17]:
transactions_data_copy = pd.merge(transactions_data_copy, article_data_copy[['itemID','product_group_name']], how='left', on='itemID')
transactions_data_copy.rename(columns={'product_group_name':'item_groupname'}, inplace=True)
transactions_data_copy

Unnamed: 0,userID,itemID,rating,user_agebins,item_groupname
0,0,2020,1,"(39, 49]",Garment Upper body
1,1,12041,1,"(19, 29]",Garment Upper body
2,2,683,1,"(19, 29]",Garment Lower body
3,2,6707,1,"(19, 29]",Garment Lower body
4,3,1173,1,"(49, 59]",Accessories
...,...,...,...,...,...
1051725,256353,13092,1,"(19, 29]",Garment Upper body
1051726,256354,5157,1,"(19, 29]",Garment Upper body
1051727,256354,9590,1,"(19, 29]",Garment Upper body
1051728,256354,12089,1,"(19, 29]",Garment Upper body


In [18]:
all_user_agebins = transactions_data_copy.user_agebins.unique()
all_user_agebins

[(39.0, 49.0], (19.0, 29.0], (49.0, 59.0], (59.0, 69.0], (29.0, 39.0], (69.0, 119.0], (-1.0, 19.0], NaN]
Categories (7, interval[int64, right]): [(-1, 19] < (19, 29] < (29, 39] < (39, 49] < (49, 59] < (59, 69] < (69, 119]]

In [19]:
all_item_groupnames = transactions_data_copy.item_groupname.unique()
all_item_groupnames

array(['Garment Upper body', 'Garment Lower body', 'Accessories',
       'Unknown', 'Garment Full body', 'Underwear', 'Shoes', 'Nightwear',
       'Swimwear', 'Socks & Tights', 'Items', 'Bags',
       'Garment and Shoe care', 'Cosmetic', 'Fun', 'Underwear/nightwear',
       'Stationery'], dtype=object)

In [20]:
lfm_dataset_tcount = Dataset()
lfm_dataset_tcount.fit(transactions_data_copy['userID'], 
                    transactions_data_copy['itemID'], 
                    item_features=all_item_groupnames,
                    user_features=all_user_agebins)

In [21]:
item_features = lfm_dataset_tcount.build_item_features(
                    (x, [y]) for x,y in zip(transactions_data_copy.itemID, transactions_data_copy.item_groupname)
                )

In [22]:
user_features = lfm_dataset_tcount.build_user_features(
                    (x, [y]) for x,y in zip(transactions_data_copy.userID, transactions_data_copy.user_agebins)
                )

In [23]:
(lfm_interactions, lfm_weights) = lfm_dataset_tcount.build_interactions(transactions_data_copy.iloc[:, 0:3].values)

train_interactions, test_interactions = cross_validation.random_train_test_split(
    lfm_interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEEDNO))

In [24]:
def sample_hyperparameters():
    while True:
        yield {
            "no_components": np.random.randint(16, 64),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "user_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 50),
        }

In [25]:
def random_search(train_interactions, test_interactions, user_features = None, item_features= None, num_samples=20, num_threads=1):
    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams, random_state = np.random.RandomState(SEEDNO))
        if user_features is not None and item_features is not None:
            model.fit(interactions = train_interactions, 
                      user_features = user_features,
                      item_features = item_features,
                      epochs = num_epochs, 
                      num_threads = num_threads)
            
            score = precision_at_k(model, 
                               test_interactions, 
                               train_interactions=train_interactions, 
                               k=12, 
                               user_features = user_features,
                               item_features = item_features,
                               num_threads=num_threads).mean()
        else:
            model.fit(interactions = train_interactions, 
                      epochs = num_epochs, 
                      num_threads = num_threads)
            
            score = precision_at_k(model, 
                               test_interactions, 
                               train_interactions=train_interactions, 
                               k=12, 
                               num_threads=num_threads).mean()
        
        hyperparams["num_epochs"] = num_epochs
        
        print(hyperparams)
        print(score)

        yield (score, hyperparams, model)

In [26]:
(score, hyperparams, model) = max(random_search(train_interactions = train_interactions, 
                                                test_interactions = test_interactions, 
                                                user_features = user_features,
                                                item_features = item_features,
                                                num_threads = 4), key=lambda x: x[0])

{'no_components': 19, 'learning_schedule': 'adagrad', 'loss': 'bpr', 'learning_rate': 0.006181621779225343, 'item_alpha': 2.053054525443846e-09, 'user_alpha': 7.06300208764061e-09, 'max_sampled': 11, 'num_epochs': 44}
0.00070166297
{'no_components': 63, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.016241510547664085, 'item_alpha': 6.093385191890508e-09, 'user_alpha': 2.3263619131111628e-08, 'max_sampled': 7, 'num_epochs': 43}
0.001533602
{'no_components': 21, 'learning_schedule': 'adagrad', 'loss': 'bpr', 'learning_rate': 0.03100805015335148, 'item_alpha': 3.1142289657446034e-08, 'user_alpha': 5.6703013299934025e-09, 'max_sampled': 6, 'num_epochs': 43}
0.0022101242
{'no_components': 20, 'learning_schedule': 'adadelta', 'loss': 'warp-kos', 'learning_rate': 0.13236296442093934, 'item_alpha': 6.652736650262878e-10, 'user_alpha': 2.7209314931734984e-08, 'max_sampled': 9, 'num_epochs': 12}
0.0017124463
{'no_components': 38, 'learning_schedule': 'adadelta', 'loss': 'bpr

In [27]:
print("Best score {} at {}".format(score, hyperparams))

Best score 0.0034134648740291595 at {'no_components': 37, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.02321310851163495, 'item_alpha': 7.404090339430105e-09, 'user_alpha': 1.85472443823004e-08, 'max_sampled': 14, 'num_epochs': 41}


In [28]:
transactions_data_tspent = transactions_data.groupby(['customer_id','article_id']).agg({'price':'sum'}).reset_index()
transactions_data_tspent = transactions_data_tspent[['customer_id','article_id','price']]
transactions_data_tspent.head()

Unnamed: 0,customer_id,article_id,price
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601043,0.050831
1,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,0.061
2,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,448509014,0.042356
3,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,719530003,0.033881
4,00009d946eec3ea54add5ba56d5210ea898def4b46c685...,516859008,0.013542


In [29]:
def get_customers_list():
    return np.sort(transactions_data_tspent['customer_id'].unique())

def get_articles_list():
    return np.sort(transactions_data_tspent['article_id'].unique())

def id_mappings(customers_list, articles_list):
    """
    
    Create id mappings to convert user_id and item_id
    
    """
    customer_to_index_mapping = {}
    index_to_customer_mapping = {}
    for customer_index, customer_id in enumerate(customers_list):
        customer_to_index_mapping[customer_id] = customer_index
        index_to_customer_mapping[customer_index] = customer_id
        
    article_to_index_mapping = {}
    index_to_article_mapping = {}
    for article_index, article_id in enumerate(articles_list):
        article_to_index_mapping[article_id] = article_index
        index_to_article_mapping[article_index] = article_id
        
    return customer_to_index_mapping, index_to_customer_mapping, \
           article_to_index_mapping, index_to_article_mapping

In [30]:
customers = get_customers_list()
articles = get_articles_list()

In [31]:
# Generate mapping, LightFM library can't read other than (integer) index
customer_to_index_mapping, index_to_customer_mapping, \
article_to_index_mapping, index_to_article_mapping = id_mappings(customers, articles)

In [32]:
transactions_data_tspent['userID'] = transactions_data_tspent['customer_id'].apply(lambda x:customer_to_index_mapping.get(x))
transactions_data_tspent['itemID'] = transactions_data_tspent['article_id'].apply(lambda x:article_to_index_mapping.get(x))
transactions_data_tspent = transactions_data_tspent[['userID','itemID','price']].rename(columns={'price':'rating'})

In [33]:
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
customer_data_copy = customer_data.copy()
customer_data_copy['user_agebins'] = pd.cut(customer_data_copy['age'], listBin)
customer_data_copy['userID'] = customer_data_copy['customer_id'].apply(lambda x:customer_to_index_mapping.get(x))
customer_data_copy

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,user_agebins,userID
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...,"(39, 49]",0.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...,"(19, 29]",
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...,"(19, 29]",1.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...,"(49, 59]",
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...,"(49, 59]",
...,...,...,...,...,...,...,...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,,,ACTIVE,NONE,24.0,7aa399f7e669990daba2d92c577b52237380662f36480b...,"(19, 29]",256353.0
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,,,ACTIVE,NONE,21.0,3f47f1279beb72215f4de557d950e0bfa73789d24acb5e...,"(19, 29]",
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,1.0,1.0,ACTIVE,Regularly,21.0,4563fc79215672cd6a863f2b4bf56b8f898f2d96ed590e...,"(19, 29]",256354.0
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,1.0,1.0,ACTIVE,Regularly,18.0,8892c18e9bc3dca6aa4000cb8094fc4b51ee8db2ed14d7...,"(-1, 19]",


In [34]:
transactions_data_copy = transactions_data_tspent.copy()
transactions_data_copy = pd.merge(transactions_data_copy, customer_data_copy[['userID','user_agebins']], how='left', on='userID')
transactions_data_copy

Unnamed: 0,userID,itemID,rating,user_agebins
0,0,2020,0.050831,"(39, 49]"
1,1,12041,0.061000,"(19, 29]"
2,2,683,0.042356,"(19, 29]"
3,2,6707,0.033881,"(19, 29]"
4,3,1173,0.013542,"(49, 59]"
...,...,...,...,...
1051725,256353,13092,0.025407,"(19, 29]"
1051726,256354,5157,0.010017,"(19, 29]"
1051727,256354,9590,0.025407,"(19, 29]"
1051728,256354,12089,0.014322,"(19, 29]"


In [35]:
article_data_copy = article_data.copy()
article_data_copy['itemID'] = article_data_copy['article_id'].apply(lambda x:article_to_index_mapping.get(x))
article_data_copy.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,itemID
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,0.0
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.,
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde...",


In [36]:
transactions_data_copy = pd.merge(transactions_data_copy, article_data_copy[['itemID','product_group_name']], how='left', on='itemID')
transactions_data_copy.rename(columns={'product_group_name':'item_groupname'}, inplace=True)
transactions_data_copy

Unnamed: 0,userID,itemID,rating,user_agebins,item_groupname
0,0,2020,0.050831,"(39, 49]",Garment Upper body
1,1,12041,0.061000,"(19, 29]",Garment Upper body
2,2,683,0.042356,"(19, 29]",Garment Lower body
3,2,6707,0.033881,"(19, 29]",Garment Lower body
4,3,1173,0.013542,"(49, 59]",Accessories
...,...,...,...,...,...
1051725,256353,13092,0.025407,"(19, 29]",Garment Upper body
1051726,256354,5157,0.010017,"(19, 29]",Garment Upper body
1051727,256354,9590,0.025407,"(19, 29]",Garment Upper body
1051728,256354,12089,0.014322,"(19, 29]",Garment Upper body


In [37]:
all_user_agebins = transactions_data_copy.user_agebins.unique()
all_user_agebins

[(39.0, 49.0], (19.0, 29.0], (49.0, 59.0], (59.0, 69.0], (29.0, 39.0], (69.0, 119.0], (-1.0, 19.0], NaN]
Categories (7, interval[int64, right]): [(-1, 19] < (19, 29] < (29, 39] < (39, 49] < (49, 59] < (59, 69] < (69, 119]]

In [38]:
all_item_groupnames = transactions_data_copy.item_groupname.unique()
all_item_groupnames

array(['Garment Upper body', 'Garment Lower body', 'Accessories',
       'Unknown', 'Garment Full body', 'Underwear', 'Shoes', 'Nightwear',
       'Swimwear', 'Socks & Tights', 'Items', 'Bags',
       'Garment and Shoe care', 'Cosmetic', 'Fun', 'Underwear/nightwear',
       'Stationery'], dtype=object)

In [39]:
lfm_dataset_tspent = Dataset()
lfm_dataset_tspent.fit(transactions_data_copy['userID'], 
                    transactions_data_copy['itemID'], 
                    item_features=all_item_groupnames,
                    user_features=all_user_agebins)

In [40]:
item_features = lfm_dataset_tspent.build_item_features(
                    (x, [y]) for x,y in zip(transactions_data_copy.itemID, transactions_data_copy.item_groupname)
                )

In [41]:
user_features = lfm_dataset_tspent.build_user_features(
                    (x, [y]) for x,y in zip(transactions_data_copy.userID, transactions_data_copy.user_agebins)
                )

In [42]:
(lfm_interactions, lfm_weights) = lfm_dataset_tspent.build_interactions(transactions_data_copy.iloc[:, 0:3].values)

train_interactions, test_interactions = cross_validation.random_train_test_split(
    lfm_interactions, test_percentage=TEST_PERCENTAGE,
    random_state=np.random.RandomState(SEEDNO))

In [43]:
(score, hyperparams, model) = max(random_search(train_interactions = train_interactions, 
                                                test_interactions = test_interactions, 
                                                user_features = user_features,
                                                item_features = item_features,
                                                num_threads = 4), key=lambda x: x[0])

{'no_components': 63, 'learning_schedule': 'adagrad', 'loss': 'warp', 'learning_rate': 0.057611449535388326, 'item_alpha': 7.588906829463319e-09, 'user_alpha': 2.206149735487375e-09, 'max_sampled': 7, 'num_epochs': 7}
0.0016433086
{'no_components': 34, 'learning_schedule': 'adagrad', 'loss': 'warp-kos', 'learning_rate': 0.04599959211326446, 'item_alpha': 8.336039914891672e-10, 'user_alpha': 1.3914841842880575e-08, 'max_sampled': 6, 'num_epochs': 29}
0.0021581277
{'no_components': 32, 'learning_schedule': 'adadelta', 'loss': 'bpr', 'learning_rate': 0.024069433448681503, 'item_alpha': 3.49374102987793e-09, 'user_alpha': 1.7702021689906144e-08, 'max_sampled': 8, 'num_epochs': 20}
0.00073651754
{'no_components': 25, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.10226226667362757, 'item_alpha': 1.074222002054221e-08, 'user_alpha': 1.263668648225221e-09, 'max_sampled': 13, 'num_epochs': 13}
0.0019655707
{'no_components': 18, 'learning_schedule': 'adadelta', 'loss': 'war

In [44]:
print("Best score {} at {}".format(score, hyperparams))

Best score 0.003327185520902276 at {'no_components': 46, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.08851832346949123, 'item_alpha': 1.438670360744596e-09, 'user_alpha': 1.7759840751285706e-08, 'max_sampled': 12, 'num_epochs': 43}
