In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Для работы с матрицами
from scipy.sparse import csr_matrix, coo_matrix

# Матричная факторизация
from implicit.als import AlternatingLeastSquares
from implicit.nearest_neighbours import bm25_weight, tfidf_weight

from lightfm import LightFM
from lightfm.evaluation import precision_at_k, recall_at_k

# Функции из 1-ого вебинара
import os, sys

module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from best_rec_lib.metrics import precision_at_k, recall_at_k, ap_k
from best_rec_lib.utils import prefilter
from best_rec_lib.recommenders import MainRecommender



### Данные.

In [2]:
data_train = pd.read_csv('retail_train.csv')
item_features = pd.read_csv('product.csv')
user_features = pd.read_csv('hh_demographic.csv')
data_test = pd. read_csv('retail_test.csv')

# column processing
item_features.columns = [col.lower() for col in item_features.columns]
user_features.columns = [col.lower() for col in user_features.columns]

item_features.rename(columns={'product_id': 'item_id'}, inplace=True)
user_features.rename(columns={'household_key': 'user_id'}, inplace=True)

In [3]:
data_train.head(2)

Unnamed: 0,user_id,basket_id,day,item_id,quantity,sales_value,store_id,retail_disc,trans_time,week_no,coupon_disc,coupon_match_disc
0,2375,26984851472,1,1004906,1,1.39,364,-0.6,1631,1,0.0,0.0
1,2375,26984851472,1,1033142,1,0.82,364,0.0,1631,1,0.0,0.0


In [4]:
item_features.head(2)

Unnamed: 0,item_id,manufacturer,department,brand,commodity_desc,sub_commodity_desc,curr_size_of_product
0,25671,2,GROCERY,National,FRZN ICE,ICE - CRUSHED/CUBED,22 LB
1,26081,2,MISC. TRANS.,National,NO COMMODITY DESCRIPTION,NO SUBCOMMODITY DESCRIPTION,


In [5]:
user_features.head(2)

Unnamed: 0,age_desc,marital_status_code,income_desc,homeowner_desc,hh_comp_desc,household_size_desc,kid_category_desc,user_id
0,65+,A,35-49K,Homeowner,2 Adults No Kids,2,None/Unknown,1
1,45-54,A,50-74K,Homeowner,2 Adults No Kids,2,None/Unknown,7


In [6]:
result = data_test.groupby('user_id')['item_id'].unique().reset_index()
result.columns=['user_id', 'actual']
result.head(2)

Unnamed: 0,user_id,actual
0,1,"[880007, 883616, 931136, 938004, 940947, 94726..."
1,2,"[820165, 820291, 826784, 826835, 829009, 85784..."


### Предобработка данных:
        отфильтруем товары:
            - оставим топ-5000 товаров, остаьные заменим на фейковый 999999
        

In [7]:
take_n_popular = 5000
n_items_before = data_train['item_id'].nunique()
data_train = prefilter(data_train, take_n_popular, item_features)
n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 5001


### Инициализируем MainRecommender

In [8]:
main_rec = MainRecommender(data_train)



  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

### Составим матрицу взаимодействия согласно количеству купленных товаром пользователем

In [9]:
user_item_matrix = main_rec._prepare_matrix(data_train)
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15926775,15926844,15926885,15926886,15926927,15927403,15927661,15927850,16809471,17105257
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

### Обучим модель ItemItemRecommender  и ALS

In [11]:
ItemItem = main_rec.fit_own_recommender(sparse_user_item)

  0%|          | 0/5001 [00:00<?, ?it/s]

In [12]:
ALS = main_rec.fit(sparse_user_item, n_factors=20, regularization=0.01, iterations=15, num_threads=4)

  0%|          | 0/15 [00:00<?, ?it/s]

### Получим предсказанния

In [13]:
result['ItemItem'] = result['user_id'].apply(lambda x: main_rec.get_own_recommendations(x, N=5))

In [14]:
result['ALS'] = result['user_id'].apply(lambda x: main_rec.get_als_recommendations(x, N=5))

In [15]:
result.head(3)

Unnamed: 0,user_id,actual,ItemItem,ALS
0,1,"[880007, 883616, 931136, 938004, 940947, 94726...","[856942, 940947, 5577022, 9297615, 9527290]","[965766, 995242, 1082185, 934369, 962568]"
1,2,"[820165, 820291, 826784, 826835, 829009, 85784...","[1075368, 8090521, 1070803, 1040807, 1133018]","[1133018, 5569230, 1082185, 1106523, 995242]"
2,3,"[827683, 908531, 989069, 1071377, 1080155, 109...","[1053690, 1070803, 1092026, 998206, 885697]","[1133018, 951590, 1106523, 1053690, 1092026]"


### Посчитаем метрики 

In [16]:
result.apply(lambda row: precision_at_k(row['ItemItem'], row['actual']), axis=1).mean()

0.31883289124668374

In [17]:
result.apply(lambda row: precision_at_k(row['ALS'], row['actual']), axis=1).mean()

0.20403183023872407

In [18]:
result.apply(lambda row: recall_at_k(row['ItemItem'], row['actual']), axis=1).mean()

0.05591649194368882

In [19]:
result.apply(lambda row: recall_at_k(row['ALS'], row['actual']), axis=1).mean()

0.03807517981603808

In [20]:
result.apply(lambda row: ap_k(row['ItemItem'], row['actual']), axis=1).mean()

0.24072855879752428

In [21]:
result.apply(lambda row: ap_k(row['ALS'], row['actual']), axis=1).mean()

0.1361061007957553

### Попробуем изменить параметры MainRecommendor
    отфильтруем товары:
            1. оставим топ-3000 товаров, остаьные заменим на фейковый 999999
            2. оставим топ-6000 товаров, остаьные заменим на фейковый 999999


### 1

In [22]:
data_train = pd.read_csv('retail_train.csv')

In [23]:
take_n_popular = 3000
n_items_before = data_train['item_id'].nunique()
data_train_3000 = prefilter(data_train, take_n_popular, item_features)
n_items_after = data_train_3000['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 3001


In [24]:
main_rec_3000 = MainRecommender(data_train_3000)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/3001 [00:00<?, ?it/s]

In [25]:
user_item_matrix = main_rec_3000._prepare_matrix(data_train_3000)
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819255,...,15506577,15511891,15596279,15926712,15926844,15926886,15927403,15927661,15927850,16809471
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
sparse_user_item = csr_matrix(user_item_matrix).tocsr()

In [27]:
main_rec_3000.fit_own_recommender(sparse_user_item)

  0%|          | 0/3001 [00:00<?, ?it/s]

<implicit.nearest_neighbours.ItemItemRecommender at 0x16906a028b0>

In [28]:
 main_rec_3000.fit(sparse_user_item, n_factors=20, regularization=0.01, iterations=15, num_threads=4)

  0%|          | 0/15 [00:00<?, ?it/s]

<implicit.cpu.als.AlternatingLeastSquares at 0x169059f76a0>

In [29]:
result['ItemItem_3000'] = result['user_id'].apply(lambda x: main_rec_3000.get_own_recommendations(x, N=5))

In [30]:
result['ALS_3000'] = result['user_id'].apply(lambda x: main_rec_3000.get_als_recommendations(x, N=5))

In [31]:
result.apply(lambda row: precision_at_k(row['ItemItem_3000'], row['actual']), axis=1).mean()

0.3540583554376658

In [32]:
result.apply(lambda row: precision_at_k(row['ALS_3000'], row['actual']), axis=1).mean()

0.19236074270556758

In [33]:
result.apply(lambda row: recall_at_k(row['ItemItem_3000'], row['actual']), axis=1).mean()

0.05987876731353509

In [34]:
result.apply(lambda row: recall_at_k(row['ALS_3000'], row['actual']), axis=1).mean()

0.03735925432825599

In [35]:
result.apply(lambda row: ap_k(row['ItemItem_3000'], row['actual']), axis=1).mean()

0.2812183908045979

In [36]:
result.apply(lambda row: ap_k(row['ALS_3000'], row['actual']), axis=1).mean()

0.12331564986737352

### 2

In [37]:
data_train = pd.read_csv('retail_train.csv')

In [38]:
take_n_popular = 6000
n_items_before = data_train['item_id'].nunique()
data_train_6000 = prefilter(data_train, take_n_popular, item_features)
n_items_after = data_train_6000['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 6001


In [39]:
main_rec_6000 = MainRecommender(data_train_6000)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/6001 [00:00<?, ?it/s]

In [40]:
user_item_matrix = main_rec_6000._prepare_matrix(data_train_6000)
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,818981,819063,...,15926886,15926887,15926927,15927033,15927403,15927661,15927850,15972298,16809471,17105257
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
main_rec_6000.fit_own_recommender(sparse_user_item)

  0%|          | 0/6001 [00:00<?, ?it/s]

<implicit.nearest_neighbours.ItemItemRecommender at 0x1690636b940>

In [42]:
main_rec_6000.fit(sparse_user_item, n_factors=20, regularization=0.01, iterations=15, num_threads=4)

  0%|          | 0/15 [00:00<?, ?it/s]

<implicit.cpu.als.AlternatingLeastSquares at 0x1690637b130>

In [43]:
result['ItemItem_6000'] = result['user_id'].apply(lambda x: main_rec_6000.get_own_recommendations(x, N=5))

In [44]:
result['ALS_6000'] = result['user_id'].apply(lambda x: main_rec_6000.get_als_recommendations(x, N=5))

In [45]:
result.apply(lambda row: precision_at_k(row['ItemItem_6000'], row['actual']), axis=1).mean()

0.30397877984084765

In [46]:
result.apply(lambda row: precision_at_k(row['ALS_6000'], row['actual']), axis=1).mean()

0.20625994694959912

In [47]:
result.apply(lambda row: recall_at_k(row['ItemItem_6000'], row['actual']), axis=1).mean()

0.05401002670641507

In [48]:
result.apply(lambda row: recall_at_k(row['ALS_6000'], row['actual']), axis=1).mean()

0.03862338169043611

In [49]:
result.apply(lambda row: ap_k(row['ItemItem_6000'], row['actual']), axis=1).mean()

0.22461892130857647

In [50]:
result.apply(lambda row: ap_k(row['ALS_6000'], row['actual']), axis=1).mean()

0.13764456233421668

#### Изменения количество топ товаров сильно не влиеет на качество предсказаний, попробуем добавить еще фильтацию по популярности, а также по цене товара

In [51]:
data = {'precision':[0.235, 0.1728, 0.2348, 0.1725, 0.2346, 0.1732, 0.0735, 0.082 ],
        'recall': [0.042, 0.033, 0.0423, 0.0327, 0.0422, 0.0333, 0.012, 0.014],
        'MAP': [0.161, 0.108, 0.1608, 0.1096, 0.1608, 0.1092, 0.03, 0.46]}
ind = ['It_It_5000', 'ALS_5000', 'It_It_3000', 'ALS_3000', 'It_It_6000', 'ALS_6000', 'It_It_pop', 'ALS_pop']
metrics = pd.DataFrame(data, index = ind)
metrics.T

Unnamed: 0,It_It_5000,ALS_5000,It_It_3000,ALS_3000,It_It_6000,ALS_6000,It_It_pop,ALS_pop
precision,0.235,0.1728,0.2348,0.1725,0.2346,0.1732,0.735,0.82
recall,0.042,0.033,0.0423,0.0327,0.0422,0.0333,0.012,0.014
MAP,0.161,0.108,0.1608,0.1096,0.1608,0.1092,0.03,0.46


Метрики сильно ухудшились, возвращаемся к первоначальному варианту

### Прологарифмируем количество покупок пользователем

In [52]:
data_train = pd.read_csv('retail_train.csv')

In [53]:
take_n_popular = 5000
n_items_before = data_train['item_id'].nunique()
data_train = prefilter(data_train, take_n_popular, item_features)
n_items_after = data_train['item_id'].nunique()
print('Decreased # items from {} to {}'.format(n_items_before, n_items_after))

Decreased # items from 89051 to 5001


In [54]:
main_rec = MainRecommender(data_train)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/5001 [00:00<?, ?it/s]

In [55]:
user_item_matrix = main_rec._prepare_matrix(data_train)
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15926775,15926844,15926885,15926886,15926927,15927403,15927661,15927850,16809471,17105257
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
user_item_matrix = np.log(user_item_matrix+1)

In [57]:
sparse_user_item = csr_matrix(user_item_matrix).tocsr()
user_item_matrix.head(2)

item_id,202291,397896,420647,480014,545926,707683,731106,818980,819063,819227,...,15926775,15926844,15926885,15926886,15926927,15927403,15927661,15927850,16809471,17105257
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.693147,0.0,0.0,1.098612,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [58]:
main_rec.fit_own_recommender(sparse_user_item)

  0%|          | 0/5001 [00:00<?, ?it/s]

<implicit.nearest_neighbours.ItemItemRecommender at 0x16906d526a0>

In [59]:
main_rec.fit(sparse_user_item, n_factors=20, regularization=0.01, iterations=15, num_threads=4)

  0%|          | 0/15 [00:00<?, ?it/s]

<implicit.cpu.als.AlternatingLeastSquares at 0x16906d5a220>

In [60]:
result['ItemItem_log'] = result['user_id'].apply(lambda x: main_rec.get_own_recommendations(x, N=5))

In [61]:
result['ALS_log'] = result['user_id'].apply(lambda x: main_rec.get_als_recommendations(x, N=5))

In [62]:
result.apply(lambda row: precision_at_k(row['ItemItem_log'], row['actual']), axis=1).mean()

0.31883289124668374

In [63]:
result.apply(lambda row: precision_at_k(row['ALS_log'], row['actual']), axis=1).mean()

0.20456233421750364

In [64]:
result.apply(lambda row: recall_at_k(row['ItemItem_log'], row['actual']), axis=1).mean()

0.05591649194368882

In [65]:
result.apply(lambda row: recall_at_k(row['ALS_log'], row['actual']), axis=1).mean()

0.03807255149508351

In [66]:
result.apply(lambda row: ap_k(row['ItemItem_log'], row['actual']), axis=1).mean()

0.24072855879752428

In [67]:
result.apply(lambda row: ap_k(row['ALS_log'], row['actual']), axis=1).mean()

0.13490716180371293

In [68]:
data = {'precision':[0.235, 0.1728, 0.2348, 0.1725, 0.2346, 0.1732, 0.0735, 0.082, 0.1936, 0.172],
        'recall': [0.042, 0.033, 0.0423, 0.0327, 0.0422, 0.0333, 0.012, 0.014, 0.0328, 0.03241],
        'MAP': [0.161, 0.108, 0.1608, 0.1096, 0.1608, 0.1092, 0.03, 0.046, 0.142, 0.1089]}
ind = ['It_It_5000', 'ALS_5000', 'It_It_3000', 'ALS_3000', 'It_It_6000', 'ALS_6000', 'It_It_pop', 'ALS_pop',
       'It_It_log', 'ALS_log']
metrics = pd.DataFrame(data, index = ind)
metrics.T

Unnamed: 0,It_It_5000,ALS_5000,It_It_3000,ALS_3000,It_It_6000,ALS_6000,It_It_pop,ALS_pop,It_It_log,ALS_log
precision,0.235,0.1728,0.2348,0.1725,0.2346,0.1732,0.0735,0.082,0.1936,0.172
recall,0.042,0.033,0.0423,0.0327,0.0422,0.0333,0.012,0.014,0.0328,0.03241
MAP,0.161,0.108,0.1608,0.1096,0.1608,0.1092,0.03,0.046,0.142,0.1089


Составим item-user матрицу на основе суммы покупок

In [69]:
data_sum = {'precision':[0.1935, 0.118, 0.1935, 0.117, 0.1935, 0.119, 0.1935, 0.115],
        'recall': [0.0349, 0.0264, 0.0348, 0.0271, 0.0348, 0.0263, 0.0349, 0.0265],
        'MAP': [0.1423, 0.0615, 0.1423, 0.0620, 0.1423, 0.0623, 0.1423, 0.062]}
ind = ['It_It_5000', 'ALS_5000', 'It_It_3000', 'ALS_3000', 'It_It_6000', 'ALS_6000', 'It_It_log', 'ALS_log']
metrics = pd.DataFrame(data_sum, index = ind)
metrics.T

Unnamed: 0,It_It_5000,ALS_5000,It_It_3000,ALS_3000,It_It_6000,ALS_6000,It_It_log,ALS_log
precision,0.1935,0.118,0.1935,0.117,0.1935,0.119,0.1935,0.115
recall,0.0349,0.0264,0.0348,0.0271,0.0348,0.0263,0.0349,0.0265
MAP,0.1423,0.0615,0.1423,0.062,0.1423,0.0623,0.1423,0.062


Добавим Взвешиванние "bm25"

In [70]:
data_BM25 = {'precision':[0.2372, 0.17411, 0.3166, 0.16722, 0.2251, 0.1765, 0.2372, 0.1709],
        'recall': [0.04239, 0.02814, 0.0541, 0.0276, 0.0402, 0.0278, 0.0423, 0.0275],
        'MAP': [0.1377, 0.12417, 0.2443, 0.1126, 0.1351, 0.1273, 0.1377, 0.1209]}
ind = ['It_It_5000', 'ALS_5000', 'It_It_3000', 'ALS_3000', 'It_It_6000', 'ALS_6000', 'It_It_log', 'ALS_log']
metrics = pd.DataFrame(data_BM25, index = ind)
metrics.T

Unnamed: 0,It_It_5000,ALS_5000,It_It_3000,ALS_3000,It_It_6000,ALS_6000,It_It_log,ALS_log
precision,0.2372,0.17411,0.3166,0.16722,0.2251,0.1765,0.2372,0.1709
recall,0.04239,0.02814,0.0541,0.0276,0.0402,0.0278,0.0423,0.0275
MAP,0.1377,0.12417,0.2443,0.1126,0.1351,0.1273,0.1377,0.1209


Изменим стандартные параметры bm25 на K1=150, B=0.5

In [76]:
data_BM25_150_05 = {'precision':[0.3188, 0.1863, 0.3541, 0.1857, 0.3039, 0.1818, 0.3188, 0.186],
                    'recall': [0.0559, 0.0305, 0.0599, 0.3156, 0.05401, 0.2918, 0.056, 0.0336],
                    'MAP': [0.2407, 0.1256, 0.2812, 0.1237, 0.2246, 0.12621, 0.2407, 0.1264]}
ind = ['It_It_5000', 'ALS_5000', 'It_It_3000', 'ALS_3000', 'It_It_6000', 'ALS_6000', 'It_It_log', 'ALS_log']
metrics = pd.DataFrame(data_BM25_150_05, index = ind)
metrics.T

Unnamed: 0,It_It_5000,ALS_5000,It_It_3000,ALS_3000,It_It_6000,ALS_6000,It_It_log,ALS_log
precision,0.3188,0.1863,0.3541,0.1857,0.3039,0.1818,0.3188,0.186
recall,0.0559,0.0305,0.0599,0.3156,0.05401,0.2918,0.056,0.0336
MAP,0.2407,0.1256,0.2812,0.1237,0.2246,0.12621,0.2407,0.1264


Попробуем взвешивание tfidf

In [72]:
data_tfidf = {'precision':[0.1936, 0.1869, 0.1936, 0.1840, 0.1936, 0.1849, 0.1936, 0.1850],
                    'recall': [0.03489, 0.03327, 0.03489, 0.0330, 0.0349, 0.033, 0.0349, 0.0336],
                    'MAP': [0.1424, 0.125, 0.1425, 0.122, 0.1425, 0.1230, 0.1425, 0.1234]}
ind = ['It_It_5000', 'ALS_5000', 'It_It_3000', 'ALS_3000', 'It_It_6000', 'ALS_6000', 'It_It_log', 'ALS_log']
metrics = pd.DataFrame(data_tfidf, index = ind)
metrics.T

Unnamed: 0,It_It_5000,ALS_5000,It_It_3000,ALS_3000,It_It_6000,ALS_6000,It_It_log,ALS_log
precision,0.1936,0.1869,0.1936,0.184,0.1936,0.1849,0.1936,0.185
recall,0.03489,0.03327,0.03489,0.033,0.0349,0.033,0.0349,0.0336
MAP,0.1424,0.125,0.1425,0.122,0.1425,0.123,0.1425,0.1234


Изменим стандартные параметры bm25 на K1=150, B=0.5 дает более высокое качество чем tfidf

### Поменяем параметры ALS

factors=100, regularization=0.004, iterations=50 - парамеры взяты из дз - 3 

In [73]:
data_ALS_upd = {'precision':[0.202, 0.19, 0.206, 0.206],
                    'recall': [0.037, 0.038, 0.038, 0.039],
                    'MAP': [0.136, 0.124, 0.14, 0.137]}
ind = [ 'ALS_5000', 'ALS_3000', 'ALS_6000', 'ALS_log']
metrics = pd.DataFrame(data_ALS_upd, index = ind)
metrics.T

Unnamed: 0,ALS_5000,ALS_3000,ALS_6000,ALS_log
precision,0.202,0.19,0.206,0.206
recall,0.037,0.038,0.038,0.039
MAP,0.136,0.124,0.14,0.137


### Реализуем ALS на Pyspark

In [None]:
https://colab.research.google.com/drive/13GMDAaz0k_6wXyDg37AL2yZ13dCyS5ea?usp=sharing

Получили более высокие показатели.

Возьмем его за основу и постоим 2х уровневую модель.

### Реализуем гибридную модель взяв за первый уровень модель ALS  c наибольшим recall

In [None]:
https://colab.research.google.com/drive/13GMDAaz0k_6wXyDg37AL2yZ13dCyS5ea?usp=sharing