In [1]:
!pip install lightfm


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.16.tar.gz (310 kB)
[K     |████████████████████████████████| 310 kB 7.4 MB/s 
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.16-cp37-cp37m-linux_x86_64.whl size=705398 sha256=1229fdbfb9de2e38e24fa23606d9f802facc30faaefc2f6de4f4cb93493a4462
  Stored in directory: /root/.cache/pip/wheels/f8/56/28/5772a3bd3413d65f03aa452190b00898b680b10028a1021914
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.16


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
#from implicit.als import AlternatingLeastSquares
#from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from lightfm import LightFM

# Функции из 1-ого вебинара
import os, sys

# Put the notebook's parent directory on the import path so that modules located there
# can be imported (presumably where metrics.py / utils.py live — confirm).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [4]:
# LightFM's built-in ranking metrics.
from lightfm.evaluation import precision_at_k, recall_at_k

# Project-local helpers.
# NOTE(review): this import rebinds `recall_at_k`, so the lightfm.evaluation version imported
# above is no longer reachable under that name. `precision_at_k` still refers to LightFM's
# function; the project-local precision is available as `custom_precision`.
from metrics import precision_at_k as custom_precision, recall_at_k
from utils import prefilter_items

In [5]:
# Raw inputs: purchase transactions plus item and household attribute tables.
data = pd.read_csv('data/retail_train.csv')

item_features = pd.read_csv('data/product.csv')
user_features = pd.read_csv('data/hh_demographic.csv')

# Column processing: lower-case all names, then rename the key columns so they match
# the `item_id` / `user_id` columns used by the transaction table.
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features = item_features.rename(columns={'product_id': 'item_id'})
user_features = user_features.rename(columns={'household_key': 'user_id'})

# Time-based train/test split on the week number.
# NOTE: because the test mask uses `>=`, the hold-out covers week numbers
# split_week..max inclusive, i.e. test_size_weeks + 1 distinct values. The comparison is
# left unchanged so the recorded results stay comparable.
test_size_weeks = 3
split_week = data['week_no'].max() - test_size_weeks

# Explicit copies: later cells add columns to these frames, and writing into a
# boolean-mask slice of `data` triggers pandas' SettingWithCopyWarning.
data_train = data[data['week_no'] < split_week].copy()
data_test = data[data['week_no'] >= split_week].copy()

data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [6]:
# Ground truth for evaluation: one row per test user, with the array of distinct
# items that user bought in the hold-out period stored in the `actual` column.
result = (
    data_test
    .groupby('user_id')['item_id']
    .unique()
    .reset_index(name='actual')
)
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[821867, 834484, 856942, 865456, 889248, 90795..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963..."


In [7]:
# Preview of the item attribute table after the column renaming.
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [8]:
# Preview of the household (user) attribute table after the column renaming.
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [9]:
# Distinct age brackets present in the user attribute table.
user_features['age_desc'].unique()

array(['65+', '45-54', '25-34', '35-44', '19-24', '55-64'], dtype=object)

In [10]:
# Distinct marital status codes present in the user attribute table.
user_features['marital_status_code'].unique()

array(['A', 'U', 'B'], dtype=object)

In [11]:
# Distinct household size labels (stored as strings, e.g. '5+').
user_features['household_size_desc'].unique()

array(['2', '3', '4', '1', '5+'], dtype=object)

## 1. Filter items

In [12]:
# First rows of the train split, before item filtering.
data_train.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0
2,2375,26984851472,1,1036325,1,0.99,364,-0.3,1631,1,0.0,0.0
3,2375,26984851472,1,1082185,1,1.21,364,0.0,1631,1,0.0,0.0
4,2375,26984851472,1,8160430,1,1.5,364,-0.39,1631,1,0.0,0.0


In [13]:
# Number of distinct items before filtering, for the report printed below.
n_items_before = data_train['item_id'].nunique()

# Project helper from utils.py, asked to keep the 5000 most popular items.
# NOTE(review): the recorded output shows 5001 ids, with 999999 by far the most frequent —
# presumably a placeholder id the helper assigns to all remaining items; confirm in utils.py.
data_train_filtered = prefilter_items(data_train, take_n_popular=5000, item_features=item_features)

n_items_after = data_train_filtered['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['sales_value'] / (np.maximum(data['quantity'], 1))


Decreased # items from 86865 to 5001


In [18]:
# Number of train rows per item id after filtering, most frequent first.
data_train_filtered['item_id'].value_counts()

999999      303002
1029743      10378
1106523       7185
5569230       4134
916122        3893
             ...  
5570974         13
859809          12
898448          10
12696099        10
1058629          7
Name: item_id, Length: 5001, dtype: int64

# 2. Prepare data set

## 2.1 Prepare csr train matrix

In [19]:

# Dense user x item table: each cell counts the filtered train rows (with a non-null
# quantity) for that (user, item) pair; pairs that never occur are filled with 0.
# The table is cast to float, the matrix dtype expected by the factorisation libraries.
user_item_matrix = data_train_filtered.pivot_table(
    index='user_id',
    columns='item_id',
    values='quantity',  # other value columns / aggregations can be tried here
    aggfunc='count',
    fill_value=0,
).astype(float)

# Same table in sparse CSR format.
sparse_user_item = csr_matrix(user_item_matrix)

user_item_matrix.head(2)

item_id,117847,818981,819255,819308,819400,819487,819590,819594,819840,819845,...,15926775,15926844,15926886,15972074,15972298,15972565,15972790,16100266,16729299,16729415
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2.2 Prepare CSR test matrix

In [20]:
# Keep only the test purchases whose item also occurs in the (unfiltered) train split.
seen_in_train = data_test['item_id'].isin(data_train['item_id'].unique())
data_test = data_test[seen_in_train]

# User x item purchase-count table for the test period, built the same way as the
# train table and cast to float.
test_user_item_matrix = data_test.pivot_table(
    index='user_id',
    columns='item_id',
    values='quantity',  # other value columns / aggregations can be tried here
    aggfunc='count',
    fill_value=0,
).astype(float)

In [23]:
# Original ids in the order they appear as rows / columns of the train matrix.
userids = user_item_matrix.index.values
itemids = user_item_matrix.columns.values

# Positional (0-based) row / column indices of the matrix.
matrix_userids = np.arange(userids.shape[0])
matrix_itemids = np.arange(itemids.shape[0])

# Original id -> matrix position.
userid_to_id = {uid: pos for uid, pos in zip(userids, matrix_userids)}
itemid_to_id = {iid: pos for iid, pos in zip(itemids, matrix_itemids)}

# Matrix position -> original id.
id_to_userid = {pos: uid for pos, uid in zip(matrix_userids, userids)}
id_to_itemid = {pos: iid for pos, iid in zip(matrix_itemids, itemids)}

## 3. Prepare user and item features

In [24]:
# Attribute table aligned row-for-row with the train matrix: start from the matrix's
# user ids and left-join the demographics, so users without demographics keep a NaN row.
user_feat = (
    pd.DataFrame(user_item_matrix.index)
    .merge(user_features, on='user_id', how='left')
    .set_index('user_id')
)
user_feat.head(2)

Unnamed: 0_level_0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,65+,A,35-49K,Homeowner,2 Adults No Kids,2.0,None/Unknown
2,,,,,,,


In [27]:
# (number of users in the train matrix, number of attribute columns)
user_feat.shape

(2497, 7)

In [28]:
# Attribute table aligned column-for-column with the train matrix: start from the
# matrix's item ids and left-join the product attributes.
item_feat = (
    pd.DataFrame(user_item_matrix.columns)
    .merge(item_features, on='item_id', how='left')
    .set_index('item_id')
)

item_feat.head(2)

Unnamed: 0_level_0,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
117847,450.0,NUTRITION,National,REFRIGERATED,SOY/RICE MILK,64 OZ
818981,194.0,GROCERY,National,COLD CEREAL,ALL FAMILY CEREAL,10.4 OZ


In [29]:
# (number of items in the train matrix, number of attribute columns)
item_feat.shape

(5001, 6)

## Encoding features

In [30]:
# One-hot encode every attribute column of both tables.
# Rows whose attributes are all NaN (no match in the left merge) get no indicator set,
# i.e. they become all-zero feature rows — see user 2 in the preview of the encoded frame.
user_feat_lightfm = pd.get_dummies(user_feat, columns=user_feat.columns.tolist())
item_feat_lightfm = pd.get_dummies(item_feat, columns=item_feat.columns.tolist())

In [None]:
# Preview of the one-hot encoded user features.
user_feat_lightfm.head(2)

Unnamed: 0_level_0,age_desc_19-24,age_desc_25-34,age_desc_35-44,age_desc_45-54,age_desc_55-64,age_desc_65+,marital_status_code_A,marital_status_code_B,marital_status_code_U,income_desc_100-124K,...,hh_comp_desc_Unknown,household_size_desc_1,household_size_desc_2,household_size_desc_3,household_size_desc_4,household_size_desc_5+,kid_category_desc_1,kid_category_desc_2,kid_category_desc_3+,kid_category_desc_None/Unknown
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Init model

In [31]:
# LightFM hybrid factorisation model with 40 latent components and the BPR loss.
# NOTE(review): per the LightFM documentation, `k` and `n` are used only by the
# 'warp-kos' loss and `max_sampled` only during WARP fitting, so with loss='bpr'
# these three arguments should have no effect — confirm before tuning them.
model = LightFM(no_components=40,
                loss='bpr', # "logistic","bpr", "warp"
                learning_rate=0.01, 
                item_alpha=0.4,
                user_alpha=0.1, 
                random_state=42,
                k=5,
                n=15,
                max_sampled=100)

## Train

In [32]:
# Binary interactions (1 where the user bought the item at least once), with the
# purchase counts from the dense table passed as per-interaction sample weights.
interactions = (sparse_user_item > 0) * 1
purchase_weights = coo_matrix(user_item_matrix)

# One-hot user / item features as sparse CSR matrices.
user_feat_csr = csr_matrix(user_feat_lightfm.values)
item_feat_csr = csr_matrix(item_feat_lightfm.values)

model.fit(
    interactions,
    sample_weight=purchase_weights,
    user_features=user_feat_csr,
    item_features=item_feat_csr,
    epochs=20,
    num_threads=20,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [00:41<00:00,  2.05s/it]


<lightfm.lightfm.LightFM at 0x7f6a36eef5d0>

# Predict

In [42]:
# Matrix row / column indices for every (user, item) row of the filtered train frame,
# in the same order as the frame's rows.
users_ids_row = data_train_filtered['user_id'].map(lambda uid: userid_to_id[uid]).to_numpy().astype(int)
items_ids_row = data_train_filtered['item_id'].map(lambda iid: itemid_to_id[iid]).to_numpy().astype(int)

In [None]:
# First ten user row indices, as a quick sanity check.
users_ids_row[:10]

array([2371, 1363, 1363, 1363, 1363, 1171, 1171, 1171, 1171, 1171])

In [43]:
# The model returns one affinity score per (user, item) pair passed in; the same
# feature matrices that were used for fitting are supplied again.
user_feat_csr = csr_matrix(user_feat_lightfm.values)
item_feat_csr = csr_matrix(item_feat_lightfm.values)

predictions = model.predict(
    user_ids=users_ids_row,
    item_ids=items_ids_row,
    user_features=user_feat_csr,
    item_features=item_feat_csr,
    num_threads=10,
)

In [44]:
# Attach the predicted score to the train frame (one score per row, in row order).
# Note: this adds / overwrites the `score` column of data_train_filtered in place.
data_train_filtered['score'] = predictions

In [45]:
# Train rows with the new `price` and `score` columns.
data_train_filtered.head()

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc,price,score
7,2375,26984851516,1,1085983,1,2.99,364,-0.4,1642,1,0.0,0.0,2.99,-0.239088
11,1364,26984896261,1,999999,1,2.19,31742,0.0,1520,1,0.0,0.0,2.19,-4.76654
12,1364,26984896261,1,999999,1,2.99,31742,-0.4,1520,1,0.0,0.0,2.99,-4.76654
13,1364,26984896261,1,999999,1,3.09,31742,0.0,1520,1,0.0,0.0,3.09,-4.76654
14,1364,26984896261,1,937406,1,2.5,31742,-0.99,1520,1,0.0,0.0,2.5,-4.95304


In [46]:
# Build the prediction frame: for every user, the array of item ids ordered by descending score.
# Rows with item id 999999 are excluded from the candidates.
is_real_item = data_train_filtered.item_id != 999999
scored_pairs = data_train_filtered.loc[is_real_item, ['user_id', 'item_id', 'score']].drop_duplicates()
ranked_pairs = scored_pairs.sort_values(by=['user_id', 'score'], ascending=False)
predict_result = ranked_pairs.groupby('user_id')['item_id'].unique().reset_index()

In [47]:
# Ranked item lists for the first few users.
predict_result.head()

Unnamed: 0,user_id,item_id
0,1,"[1029743, 6034857, 1088462, 838867, 7431408, 7..."
1,2,"[1106523, 1075368, 899624, 6919458, 952163, 96..."
2,3,"[1106523, 983584, 5585510, 899624, 866211, 946..."
3,4,"[1029743, 1075368, 1052294, 7431408, 970760, 1..."
4,5,"[1126899, 1029743, 6034991, 825659, 1112387, 9..."


In [48]:
# Join the test ground truth with the predictions to compute precision.
# The inner join keeps only users present in both frames, so test users without
# any prediction are left out of the metric.
df_result_for_metrics = result.merge(predict_result, on='user_id', how='inner')

In [49]:
# `actual` = items bought in the test period, `item_id` = ranked recommendations.
df_result_for_metrics.head()

Unnamed: 0,user_id,actual,item_id
0,1,"[821867, 834484, 856942, 865456, 889248, 90795...","[1029743, 6034857, 1088462, 838867, 7431408, 7..."
1,3,"[835476, 851057, 872021, 878302, 879948, 90963...","[1106523, 983584, 5585510, 899624, 866211, 946..."
2,6,"[920308, 926804, 946489, 1006718, 1017061, 107...","[1070820, 1029743, 1126899, 1121393, 9524291, ..."
3,7,"[840386, 889774, 898068, 909714, 929067, 95347...","[1029743, 1126899, 1106523, 7147142, 1072494, ..."
4,8,"[835098, 872137, 910439, 924610, 992977, 10412...","[1106523, 1070820, 1029743, 5585510, 6034857, ..."


### Test with custom precision func

In [50]:
# Mean precision@5 over the users kept by the merge, using the project's own metric.
# NOTE(review): the call passes (recommended list, actual purchases); presumably this
# matches the signature of metrics.precision_at_k — verify in metrics.py.
precision = df_result_for_metrics.apply(lambda row: custom_precision(row['item_id'], row['actual'],k=5), axis=1).mean()
print(f"Precision: {precision}")

Precision: 0.14170771756978476


# Домашнее задание

#### 1) Прочитать статьи про BPR, WARP loss

#### 2) Сделать грид серч текущей модели, смотрите на метрику precision@5, считаем на тесте нашей функцией
    Подбор параметров:
    - Loss
    - no_components
    - regularization

In [80]:
# Candidate values for each hyper-parameter of the grid search.
n_comp = [40, 80, 150]
losses = ["bpr", "warp"]
learning_rates = [0.1, 0.01]
item_alphas = [0.5, 0.3]
user_alphas = [0.1, 0.05]
max_samples = [100, 150]

# Result log: one independent list per column, appended to once per trained model.
precisions = {
    column: []
    for column in (
        'n_comp',
        'losses',
        'learning_rates',
        'item_alphas',
        'user_alphas',
        'max_samples',
        'precisions',
    )
}


In [81]:

from itertools import product

# Inputs that do not depend on the hyper-parameters are built once, outside the loop.
train_interactions = (sparse_user_item > 0) * 1  # binary user-item matrix
train_weights = coo_matrix(user_item_matrix)     # purchase counts as sample weights
user_feat_csr = csr_matrix(user_feat_lightfm.values)
item_feat_csr = csr_matrix(item_feat_lightfm.values)

# Matrix indices of every (user, item) row of the filtered train frame.
users_ids_row = data_train_filtered['user_id'].apply(lambda x: userid_to_id[x]).values.astype(int)
items_ids_row = data_train_filtered['item_id'].apply(lambda x: itemid_to_id[x]).values.astype(int)

# Candidate items exclude id 999999; the mask is the same for every run.
is_real_item = data_train_filtered.item_id != 999999

# Full Cartesian grid, in the same order as nested loops would produce (last list varies fastest).
# NOTE(review): per the LightFM documentation `max_sampled` is only used during WARP fitting,
# so 'bpr' runs that differ only in `ms` should differ just by multi-threaded training noise.
param_grid = list(product(n_comp, losses, learning_rates, item_alphas, user_alphas, max_samples))

for n_c, loss_name, lr, ia, ua, ms in param_grid:
    model = LightFM(no_components=n_c,
                    loss=loss_name,  # "bpr", "warp"
                    learning_rate=lr,
                    item_alpha=ia,
                    user_alpha=ua,
                    random_state=42,
                    k=5,
                    n=15,
                    max_sampled=ms)
    model.fit(train_interactions,
              sample_weight=train_weights,
              user_features=user_feat_csr,
              item_features=item_feat_csr,
              epochs=10,
              num_threads=20,
              verbose=True)

    # Score every train (user, item) pair with the freshly trained model.
    predictions = model.predict(user_ids=users_ids_row,
                                item_ids=items_ids_row,
                                user_features=user_feat_csr,
                                item_features=item_feat_csr,
                                num_threads=10)

    data_train_filtered['score'] = predictions

    # Per-user item lists ordered by descending score.
    predict_result = (
        data_train_filtered.loc[is_real_item, ['user_id', 'item_id', 'score']]
        .drop_duplicates()
        .sort_values(by=['user_id', 'score'], ascending=False)
        .groupby('user_id')['item_id']
        .unique()
        .reset_index()
    )

    # precision@5 on the test ground truth, using the project's own metric.
    df_result_for_metrics = result.merge(predict_result, on='user_id', how='inner')
    precision = df_result_for_metrics.apply(lambda row: custom_precision(row['item_id'], row['actual'],k=5), axis=1).mean()

    precisions['n_comp'].append(n_c)
    precisions['losses'].append(loss_name)
    precisions['learning_rates'].append(lr)
    precisions['item_alphas'].append(ia)
    precisions['user_alphas'].append(ua)
    precisions['max_samples'].append(ms)
    precisions['precisions'].append(precision)

    # Progress: the total is derived from the grid instead of being hard-coded.
    print(len(precisions['precisions']), 'Из', len(param_grid))

Epoch: 100%|██████████| 10/10 [00:20<00:00,  2.01s/it]


1 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


2 Из 96


Epoch: 100%|██████████| 10/10 [00:20<00:00,  2.00s/it]


3 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.98s/it]


4 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  2.00s/it]


5 Из 96


Epoch: 100%|██████████| 10/10 [00:23<00:00,  2.36s/it]


6 Из 96


Epoch: 100%|██████████| 10/10 [00:20<00:00,  2.00s/it]


7 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


8 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


9 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.98s/it]


10 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


11 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


12 Из 96


Epoch: 100%|██████████| 10/10 [00:20<00:00,  2.00s/it]


13 Из 96


Epoch: 100%|██████████| 10/10 [00:20<00:00,  2.08s/it]


14 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.99s/it]


15 Из 96


Epoch: 100%|██████████| 10/10 [00:19<00:00,  1.97s/it]


16 Из 96


Epoch: 100%|██████████| 10/10 [00:25<00:00,  2.52s/it]


17 Из 96


Epoch: 100%|██████████| 10/10 [00:24<00:00,  2.43s/it]


18 Из 96


Epoch: 100%|██████████| 10/10 [00:25<00:00,  2.52s/it]


19 Из 96


Epoch: 100%|██████████| 10/10 [00:26<00:00,  2.63s/it]


20 Из 96


Epoch: 100%|██████████| 10/10 [00:24<00:00,  2.49s/it]


21 Из 96


Epoch: 100%|██████████| 10/10 [00:25<00:00,  2.58s/it]


22 Из 96


Epoch: 100%|██████████| 10/10 [00:24<00:00,  2.49s/it]


23 Из 96


Epoch: 100%|██████████| 10/10 [00:25<00:00,  2.53s/it]


24 Из 96


Epoch: 100%|██████████| 10/10 [00:26<00:00,  2.64s/it]


25 Из 96


Epoch: 100%|██████████| 10/10 [00:29<00:00,  2.91s/it]


26 Из 96


Epoch: 100%|██████████| 10/10 [00:26<00:00,  2.64s/it]


27 Из 96


Epoch: 100%|██████████| 10/10 [00:27<00:00,  2.77s/it]


28 Из 96


Epoch: 100%|██████████| 10/10 [00:24<00:00,  2.46s/it]


29 Из 96


Epoch: 100%|██████████| 10/10 [00:25<00:00,  2.57s/it]


30 Из 96


Epoch: 100%|██████████| 10/10 [00:26<00:00,  2.63s/it]


31 Из 96


Epoch: 100%|██████████| 10/10 [00:27<00:00,  2.78s/it]


32 Из 96


Epoch: 100%|██████████| 10/10 [00:50<00:00,  5.05s/it]


33 Из 96


Epoch: 100%|██████████| 10/10 [00:51<00:00,  5.15s/it]


34 Из 96


Epoch: 100%|██████████| 10/10 [00:39<00:00,  3.91s/it]


35 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.81s/it]


36 Из 96


Epoch: 100%|██████████| 10/10 [00:37<00:00,  3.80s/it]


37 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.81s/it]


38 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.82s/it]


39 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.84s/it]


40 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.83s/it]


41 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.82s/it]


42 Из 96


Epoch: 100%|██████████| 10/10 [00:39<00:00,  3.96s/it]


43 Из 96


Epoch: 100%|██████████| 10/10 [00:37<00:00,  3.77s/it]


44 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.86s/it]


45 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.83s/it]


46 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.84s/it]


47 Из 96


Epoch: 100%|██████████| 10/10 [00:38<00:00,  3.84s/it]


48 Из 96


Epoch: 100%|██████████| 10/10 [00:46<00:00,  4.62s/it]


49 Из 96


Epoch: 100%|██████████| 10/10 [00:49<00:00,  4.91s/it]


50 Из 96


Epoch: 100%|██████████| 10/10 [00:48<00:00,  4.88s/it]


51 Из 96


Epoch: 100%|██████████| 10/10 [00:48<00:00,  4.85s/it]


52 Из 96


Epoch: 100%|██████████| 10/10 [00:47<00:00,  4.72s/it]


53 Из 96


Epoch: 100%|██████████| 10/10 [00:48<00:00,  4.89s/it]


54 Из 96


Epoch: 100%|██████████| 10/10 [00:47<00:00,  4.79s/it]


55 Из 96


Epoch: 100%|██████████| 10/10 [00:49<00:00,  4.94s/it]


56 Из 96


Epoch: 100%|██████████| 10/10 [00:49<00:00,  4.91s/it]


57 Из 96


Epoch: 100%|██████████| 10/10 [00:52<00:00,  5.23s/it]


58 Из 96


Epoch: 100%|██████████| 10/10 [00:48<00:00,  4.89s/it]


59 Из 96


Epoch: 100%|██████████| 10/10 [00:51<00:00,  5.13s/it]


60 Из 96


Epoch: 100%|██████████| 10/10 [00:45<00:00,  4.53s/it]


61 Из 96


Epoch: 100%|██████████| 10/10 [00:46<00:00,  4.66s/it]


62 Из 96


Epoch: 100%|██████████| 10/10 [00:49<00:00,  4.90s/it]


63 Из 96


Epoch: 100%|██████████| 10/10 [00:52<00:00,  5.29s/it]


64 Из 96


Epoch: 100%|██████████| 10/10 [01:39<00:00,  9.93s/it]


65 Из 96


Epoch: 100%|██████████| 10/10 [01:15<00:00,  7.59s/it]


66 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.29s/it]


67 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.27s/it]


68 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.30s/it]


69 Из 96


Epoch: 100%|██████████| 10/10 [01:11<00:00,  7.19s/it]


70 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.23s/it]


71 Из 96


Epoch: 100%|██████████| 10/10 [01:09<00:00,  6.91s/it]


72 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.27s/it]


73 Из 96


Epoch: 100%|██████████| 10/10 [01:11<00:00,  7.18s/it]


74 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.27s/it]


75 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.22s/it]


76 Из 96


Epoch: 100%|██████████| 10/10 [01:13<00:00,  7.40s/it]


77 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.21s/it]


78 Из 96


Epoch: 100%|██████████| 10/10 [01:13<00:00,  7.31s/it]


79 Из 96


Epoch: 100%|██████████| 10/10 [01:12<00:00,  7.20s/it]


80 Из 96


Epoch: 100%|██████████| 10/10 [01:29<00:00,  8.92s/it]


81 Из 96


Epoch: 100%|██████████| 10/10 [01:31<00:00,  9.19s/it]


82 Из 96


Epoch: 100%|██████████| 10/10 [01:29<00:00,  8.92s/it]


83 Из 96


Epoch: 100%|██████████| 10/10 [01:33<00:00,  9.34s/it]


84 Из 96


Epoch: 100%|██████████| 10/10 [01:29<00:00,  8.93s/it]


85 Из 96


Epoch: 100%|██████████| 10/10 [01:37<00:00,  9.72s/it]


86 Из 96


Epoch: 100%|██████████| 10/10 [01:26<00:00,  8.63s/it]


87 Из 96


Epoch: 100%|██████████| 10/10 [01:33<00:00,  9.31s/it]


88 Из 96


Epoch: 100%|██████████| 10/10 [01:31<00:00,  9.15s/it]


89 Из 96


Epoch: 100%|██████████| 10/10 [01:36<00:00,  9.67s/it]


90 Из 96


Epoch: 100%|██████████| 10/10 [01:33<00:00,  9.31s/it]


91 Из 96


Epoch: 100%|██████████| 10/10 [01:36<00:00,  9.61s/it]


92 Из 96


Epoch: 100%|██████████| 10/10 [01:24<00:00,  8.41s/it]


93 Из 96


Epoch: 100%|██████████| 10/10 [01:25<00:00,  8.59s/it]


94 Из 96


Epoch: 100%|██████████| 10/10 [01:33<00:00,  9.31s/it]


95 Из 96


Epoch: 100%|██████████| 10/10 [01:37<00:00,  9.74s/it]


96 Из 96


In [82]:
# Grid-search log as a table: one row per trained configuration.
df = pd.DataFrame(precisions)

In [83]:
# Best configurations first.
df = df.sort_values('precisions', ascending=False)

In [84]:
# Show the ranked grid-search results.
df

Unnamed: 0,n_comp,losses,learning_rates,item_alphas,user_alphas,max_samples,precisions
76,150,bpr,0.01,0.3,0.10,100,0.147028
77,150,bpr,0.01,0.3,0.10,150,0.145944
72,150,bpr,0.01,0.5,0.10,100,0.145255
89,150,warp,0.01,0.5,0.10,150,0.144171
45,80,bpr,0.01,0.3,0.10,150,0.143580
...,...,...,...,...,...,...,...
7,40,bpr,0.10,0.3,0.05,150,0.050969
5,40,bpr,0.10,0.3,0.10,150,0.050673
17,40,warp,0.10,0.5,0.10,150,0.049984
69,150,bpr,0.10,0.3,0.10,150,0.049787


In [92]:
# Final configuration: best grid-search setting with manually adjusted item_alpha.
# NOTE(review): per the LightFM documentation `k`/`n` are used only by 'warp-kos' and
# `max_sampled` only during WARP fitting, so with loss='bpr' they should have no effect.
model = LightFM(
    no_components=150,
    loss='bpr',
    learning_rate=0.01,
    item_alpha=0.1,
    user_alpha=0.1,
    random_state=42,
    k=1,
    n=15,
    max_sampled=150,
)

# One-hot user / item features as sparse CSR matrices, shared by fit and predict.
user_feat_csr = csr_matrix(user_feat_lightfm.values)
item_feat_csr = csr_matrix(item_feat_lightfm.values)

model.fit(
    (sparse_user_item > 0) * 1,  # binary user-item matrix
    sample_weight=coo_matrix(user_item_matrix),
    user_features=user_feat_csr,
    item_features=item_feat_csr,
    epochs=10,
    num_threads=20,
    verbose=True,
)

# Matrix indices of every (user, item) row of the filtered train frame.
users_ids_row = data_train_filtered['user_id'].map(lambda uid: userid_to_id[uid]).to_numpy().astype(int)
items_ids_row = data_train_filtered['item_id'].map(lambda iid: itemid_to_id[iid]).to_numpy().astype(int)

predictions = model.predict(
    user_ids=users_ids_row,
    item_ids=items_ids_row,
    user_features=user_feat_csr,
    item_features=item_feat_csr,
    num_threads=10,
)

data_train_filtered['score'] = predictions

# Per-user item lists ordered by descending score, excluding item id 999999.
is_real_item = data_train_filtered.item_id != 999999
scored_pairs = data_train_filtered.loc[is_real_item, ['user_id', 'item_id', 'score']].drop_duplicates()
ranked_pairs = scored_pairs.sort_values(by=['user_id', 'score'], ascending=False)
predict_result = ranked_pairs.groupby('user_id')['item_id'].unique().reset_index()

# precision@5 against the test ground truth, using the project's own metric.
df_result_for_metrics = result.merge(predict_result, on='user_id', how='inner')
per_user_precision = df_result_for_metrics.apply(
    lambda row: custom_precision(row['item_id'], row['actual'], k=5),
    axis=1,
)
precision = per_user_precision.mean()

print(f"Precision: {precision}")

Epoch: 100%|██████████| 10/10 [01:15<00:00,  7.58s/it]


Precision: 0.15008210180623777


- Перебрал сначала по сетке, потом руками чуток подкрутил. В итоге лучшая точность Precision: 0.15, параметры получились следующие:  

no_components=150

loss='bpr'

learning_rate=0.01

item_alpha=0.1 - снижение параметра дало ощутимый результат

user_alpha=0.1

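k=1 — в LightFM это параметр k-OS WARP (какой по счёту положительный пример выбирается из n сэмплированных), а не число соседей, как в KNN с предыдущего занятия; при loss='bpr' он не используется, поэтому на результат влиять не должен.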

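n=15 — тоже параметр k-OS WARP (максимум положительных примеров на одно обновление); при loss='bpr' не используется.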

max_sampled=150 — максимум негативных сэмплов при обучении с WARP; при loss='bpr' не используется.