In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install lightfm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lightfm
  Downloading lightfm-1.17.tar.gz (316 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.4/316.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lightfm
  Building wheel for lightfm (setup.py) ... [?25l[?25hdone
  Created wheel for lightfm: filename=lightfm-1.17-cp39-cp39-linux_x86_64.whl size=889512 sha256=650633dbf9178fd324703b959998bf956dec1e96b9d5451c4a092df8ef2f2a4d
  Stored in directory: /root/.cache/pip/wheels/d8/65/93/6ac8180274dc2e8f86ff326be62da1dfa55dc158fd45faba7d
Successfully built lightfm
Installing collected packages: lightfm
Successfully installed lightfm-1.17


In [75]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k
import pyarrow.parquet as pq
from itertools import chain
import random

**Обучение на разделенном train 80/20**  
_____

In [None]:
train_data = pd.read_csv('/content/drive/MyDrive/train_data.csv')

In [None]:
test_data = pd.read_csv('/content/drive/MyDrive/test_data.csv')

In [None]:
train_data.head()

Unnamed: 0,cookie_id,vacancy_id_,event_type
0,abaec81fdf5e41b98ba70562cf1ee12b,102993,0.5
1,87b9efa7623340c2bc1c30d93a4c5663,158179,0.5
2,eac6f3b6a4bb4c1396d127cb48202bea,103209,0.5
3,f4fd2cbde7bb47f3bf9156480b81f05a,112356,0.5
4,fe17b1e0df57474d91b363d609e46d9b,113707,0.5


In [None]:
test_data.head()

Unnamed: 0,cookie_id,vacancy_id_,event_type
0,69495070cb644e79b6fe2343c08f16d7,174682,0.5
1,cdbaf308bd234abfbe2e5ccd6daca8db,107882,0.5
2,ba3677792be945cd8ffa62569370c967,195702,0.5
3,43abb0d1606745348e99cf7f4d57bf17,258441,0.5
4,5fd84198b3374c9297bd8e314daae571,203404,0.5


In [None]:
pd.concat([train_data['vacancy_id_'], test_data['vacancy_id_']]).unique()

array([102993, 158179, 103209, ..., 208174, 242128, 113710])

In [None]:
# Подготовка данных для LightFM
dataset = Dataset()
dataset.fit(pd.concat([train_data['cookie_id'], test_data['cookie_id']]).unique(),\
            pd.concat([train_data['vacancy_id_'], test_data['vacancy_id_']]).unique())
num_users, num_items = dataset.interactions_shape()
num_users, num_items

(330180, 160167)

In [None]:
# Создание словаря вакансий из dataset
dict_vacancy = {v: k for k, v in dataset.mapping()[2].items()}

In [None]:
# Создание матрицы взаимодействий
train_interactions, train_weights = dataset.build_interactions([(x['cookie_id'], x['vacancy_id_'], x['event_type']) for idx, x in train_data.iterrows()])
test_interactions, test_weights = dataset.build_interactions([(x['cookie_id'], x['vacancy_id_'], x['event_type']) for idx, x in test_data.iterrows()])

In [None]:
train_interactions

<330180x160167 sparse matrix of type '<class 'numpy.int32'>'
	with 3780406 stored elements in COOrdinate format>

In [None]:
test_interactions

<330180x160167 sparse matrix of type '<class 'numpy.int32'>'
	with 898179 stored elements in COOrdinate format>

In [None]:
# Обучение модели LightFM
model = LightFM(loss='warp')
model.fit(train_interactions, epochs=15)

<lightfm.lightfm.LightFM at 0x7fab3efcebe0>

In [None]:
# Оценка модели с использованием метрики precision@5 на test
# check_intesection=True (train_interactions are supplied)
test_precision = precision_at_k(model, test_interactions, train_interactions = train_interactions, k=5).mean()
print('Test precision@5:', test_precision)
# w/o features: Test precision@5: 0.058370296

Test precision@5: 0.058370296


**Добавление признаков в item**  
___

In [89]:
item_features_df = pd.read_csv('/content/drive/MyDrive/item_feature_conv.csv')

In [None]:
#item_features_df_decay = pd.read_csv('/content/drive/MyDrive/item_feature_decay.csv')

In [90]:
item_features_df

Unnamed: 0,vacancy_id_,cat_conv
0,100001,c1
1,100002,c1
2,100003,c1
3,100004,c1
4,100005,c1
...,...,...
160162,260163,c1
160163,260164,c3
160164,260165,c1
160165,260166,c1


In [None]:
#item_features_df_decay

Unnamed: 0,vacancy_id_,time_decay
0,100001,1.353353e-01
1,100002,1.000000e+00
2,100003,5.749522e-19
3,100004,1.425164e-21
4,100005,3.354626e-04
...,...,...
160162,260163,9.118820e-04
160163,260164,8.756511e-27
160164,260165,9.118820e-04
160165,260166,2.115131e-19


In [None]:
#item_feature_data = pd.merge(item_features_df, item_features_df_decay, on = 'vacancy_id_')
#item_feature_data

Unnamed: 0,vacancy_id_,avg_day_conv,time_decay
0,100001,0.06,1.353353e-01
1,100002,0.13,1.000000e+00
2,100003,0.12,5.749522e-19
3,100004,0.00,1.425164e-21
4,100005,0.09,3.354626e-04
...,...,...,...
160162,260163,0.00,9.118820e-04
160163,260164,1.00,8.756511e-27
160164,260165,0.07,9.118820e-04
160165,260166,0.07,2.115131e-19


In [91]:
# Подготовка данных для LightFM via item_feature
dataset_item = Dataset()
dataset_item.fit(pd.concat([train_data['cookie_id'], test_data['cookie_id']]).unique(),\
            pd.concat([train_data['vacancy_id_'], test_data['vacancy_id_']]).unique(),\
            item_features=['c1', 'c2', 'c3', 'c4', 'c5'])

In [92]:
# Создание словаря вакансий из dataset_item
dict_vac_item = {v: k for k, v in dataset_item.mapping()[2].items()}

In [102]:
train_interactions_item, train_weights_item = dataset_item.build_interactions([(x['cookie_id'], x['vacancy_id_'], x['event_type']) for idx, x in train_data.iterrows()])

In [103]:
train_interactions_item

<330180x160167 sparse matrix of type '<class 'numpy.int32'>'
	with 3780406 stored elements in COOrdinate format>

In [106]:
test_interactions_item, test_weights_item = dataset_item.build_interactions([(x['cookie_id'], x['vacancy_id_'], x['event_type']) for idx, x in test_data.iterrows()])

In [94]:
# Создание матрицы item_feature
item_features = dataset_item.build_item_features([(x['vacancy_id_'], [x['cat_conv']]) for idx, x in item_features_df.iterrows()])

In [95]:
item_features

<160167x160172 sparse matrix of type '<class 'numpy.float32'>'
	with 320334 stored elements in Compressed Sparse Row format>

In [107]:
# Создаем модель
model_item = LightFM(loss='warp')

In [108]:
# Обучаем модель на train данных via item_feature
model_item.fit(train_interactions_item, item_features=item_features, epochs=15)

<lightfm.lightfm.LightFM at 0x7f0de88f6250>

In [110]:
# Оценка модели с использованием метрики precision@5 на test
# check_intesection=True (train_interactions are supplied)
test_precision_item = precision_at_k(model_item, test_interactions_item, train_interactions = train_interactions_item, item_features=item_features, k=5).mean()
print('Test precision@5:', test_precision_item)
# item_features(conversion category) Test precision@5: 0.055422664

Test precision@5: 0.055422664


___  
**Обучение на полном train**

In [None]:
data_full = pd.concat([train_data, test_data])
data_full

Unnamed: 0,cookie_id,vacancy_id_,event_type
0,abaec81fdf5e41b98ba70562cf1ee12b,102993,0.5
1,87b9efa7623340c2bc1c30d93a4c5663,158179,0.5
2,eac6f3b6a4bb4c1396d127cb48202bea,103209,0.5
3,f4fd2cbde7bb47f3bf9156480b81f05a,112356,0.5
4,fe17b1e0df57474d91b363d609e46d9b,113707,0.5
...,...,...,...
898174,9332e51a085349408a7b11232d46be53,191382,0.5
898175,b07eb084be574ba69461d98734261d71,163292,0.5
898176,4ceef9ac78884d99aa68894230050fd1,130608,0.5
898177,513c75bcbdd94bedbd65ec10152d35e4,105890,0.5


In [None]:
# Создание матрицы взаимодействий
train_interactions_full, train_weights_full = dataset.build_interactions([(x['cookie_id'], x['vacancy_id_'], x['event_type']) for idx, x in data_full.iterrows()])

In [None]:
train_interactions_full

<330180x160167 sparse matrix of type '<class 'numpy.int32'>'
	with 4678585 stored elements in COOrdinate format>

In [None]:
# Обучение модели LightFM на полном train w/o features
model_full = LightFM(loss='warp')
model_full.fit(train_interactions_full, epochs=15)

**Валидация на test_public**  
___

In [None]:
# Чтение файла
table_test = pq.read_table('/content/drive/MyDrive/test_public_mfti.parquet')

# Запись в датафрейм
df_test_public = table_test.to_pandas()
df_test_public

Unnamed: 0,cookie_id,vacancy_id_
0,000cd76cd33f43d4a1ac1d16d10f8bf7,"[222177, 222173, 222163, 238874, 238878, 22812..."
1,0034bc7f404341ba8412665453e7825a,"[102794, 137587, 257319, 237756, 240744, 11348..."
2,00a6c5a64a274c55a836402bdeb3b2c4,"[254292, 164602, 116438, 228634, 218819, 24065..."
3,015937a125b14e74bdff1cddc49f9172,"[246685, 138123, 115420, 210628, 212325, 235196]"
4,01de50c280794cec8804f16f45f847b7,"[219070, 251469, 166899, 212703, 214561]"
...,...,...
767,fdbcda17f22f406486837059e76c7fed,"[207851, 254989, 213344, 214180, 222146]"
768,fe6193ab26494ace9be5aae36e507618,"[115352, 230546, 225527, 120188, 109360, 23212..."
769,fe95b2826ee1452b81201ed3f4c3294d,"[240362, 114852, 253946, 251081, 127546, 244688]"
770,ff1aef256a49481698bb2e938510ff36,"[231194, 236363, 220747, 244688, 100094, 24052..."


In [100]:
# Функция для вычисления Precision@k для LightFM
def evaluate_precision_at_k(model_ev, test_data_df, dataset_ev, dict_ev, k=5):
    test_users = test_data_df['cookie_id'].unique()
    precision = 0
    
    for user_cookie_id in test_users:
        known_positives = data_full[data_full['cookie_id'] == user_cookie_id]['vacancy_id_'].unique()
        true_positives = chain.from_iterable(test_data_df[test_data_df['cookie_id'] == user_cookie_id]['vacancy_id_'])

        user_idx = dataset_ev.mapping()[0][user_cookie_id] # Доступ к словарю user-item
        scores = model_ev.predict(user_idx, np.arange(dataset_ev.item_features_shape()[0]))
        top_items = np.argsort(-scores)
        top_items_ = [dict_ev[idx] for idx in top_items]
        
        # Фильтруем уже просмотренные вакансии
        new_top_items = [x for x in top_items_ if x not in known_positives][:k]

        precision += len(set(new_top_items) & set(true_positives)) / k

    return precision / len(test_users)

In [None]:
lightfm_precision = evaluate_precision_at_k(model, df_test_public, dataset, dict_vacancy, k=5)
print(f"LightFM Precision@5: {lightfm_precision}")
# w/o features train 80/20 - LightFM Precision@5: 0.036010362694300455

LightFM Precision@5: 0.036010362694300455


In [None]:
lightfm_precision_full = evaluate_precision_at_k(model_full, df_test_public, dataset, dict_vacancy, k=5)
print(f"LightFM Precision@5: {lightfm_precision_full}")
# w/o features full train - LightFM Precision@5: 0.03419689119170978

LightFM Precision@5: 0.03419689119170978


In [109]:
lightfm_precision_item = evaluate_precision_at_k(model_item, df_test_public, dataset_item, dict_vac_item, k=5)
print(f"LightFM Precision@5: {lightfm_precision_item}")
# item_features(conversion category) train 80/20 - LightFM Precision@5: 0.03911917098445587

LightFM Precision@5: 0.03911917098445587


**Рекомендации**  
___

In [None]:
# df_test_private
# Чтение файла
table_test_pr = pq.read_table('/content/drive/MyDrive/test_private_users_mfti.parquet')

# Запись в датафрейм
df_test_private = table_test_pr.to_pandas()

In [None]:
def recommend_user(model_rec, test_data_proba, dataset_rec, dict_vacancy_rec, k=5):
    test_users = test_data_proba['cookie_id'].unique()
    df = pd.Dataframe(columns=['cookie_id', 'vacancy_id_'])
   
    for user_cookie_id in test_users:
        known_positives = data_full[data_full['cookie_id'] == user_cookie_id]['vacancy_id_'].unique()

        user_idx = dataset_rec.mapping()[0][user_cookie_id] # Доступ к словарю user-item
        scores = model_rec.predict(user_idx, np.arange(dataset_rec.item_features_shape()[0]))
        top_items = np.argsort(-scores)
        top_items_ = [dict_vacancy_rec[idx] for idx in top_items]
        
        # Фильтруем уже просмотренные вакансии
        new_top_items = [x for x in top_items_ if x not in known_positives][:k]
        
        df.loc[len(df)] = [user_cookie_id, new_top_items]
        
    return df

In [None]:
rec_df = recommend_user(model, df_test_private, dataset, dict_vacancy, k=5)

In [None]:
rec_df.to_csv('test_private_rec.csv', index=False)

**Baseline**  
____________________

In [None]:
# Топ-100 вакансий из train (80%)
top_100_vacancies_80 = train_data.groupby(['vacancy_id_']).agg({'event_type': 'sum'}).reset_index()\
.sort_values(by='event_type', ascending=False).head(100)['vacancy_id_'].values

In [None]:
top_100_vacancies_80

array([260154, 198114, 203404, 202608, 164602, 111505, 116823, 148714,
       207423, 258441, 108242, 242642, 158242, 182870, 111867, 110421,
       162187, 250327, 174953, 176141, 207108, 247535, 217683, 110792,
       113305, 240744, 237341, 149024, 113482, 244077, 114583, 105907,
       182100, 110793, 247276, 180382, 227708, 210628, 193331, 113707,
       246509, 190030, 169194, 153245, 164481, 164588, 115924, 126251,
       138634, 230707, 120252, 239021, 214513, 106944, 182084, 184440,
       154411, 206350, 136266, 127352, 181745, 112506, 111592, 249571,
       168935, 150283, 109079, 151616, 155539, 111837, 220718, 207156,
       111941, 143721, 257631, 243868, 248852, 140917, 111890, 182439,
       212325, 128183, 187360, 129787, 212141, 258378, 117532, 176131,
       239624, 106293, 209568, 117525, 176171, 205606, 229689, 171602,
       154423, 114328, 197930, 248720])

In [73]:
# Топ-100 вакансий из всего train
top_100_vacancies = data_full.groupby(['vacancy_id_']).agg({'event_type': 'sum'}).reset_index()\
.sort_values(by='event_type', ascending=False).head(100)['vacancy_id_'].values

In [74]:
top_100_vacancies

array([260154, 198114, 203404, 202608, 164602, 111505, 116823, 207423,
       148714, 108242, 158242, 258441, 182870, 242642, 111867, 110421,
       162187, 250327, 174953, 176141, 207108, 247535, 217683, 110792,
       113305, 149024, 240744, 237341, 113482, 114583, 244077, 105907,
       182100, 210628, 180382, 247276, 110793, 193331, 227708, 113707,
       169194, 164588, 190030, 246509, 153245, 115924, 138634, 126251,
       230707, 164481, 120252, 214513, 239021, 106944, 182084, 127352,
       184440, 154411, 136266, 206350, 249571, 112506, 111592, 181745,
       220718, 151616, 111837, 150283, 109079, 168935, 155539, 111941,
       143721, 248852, 212325, 257631, 243868, 207156, 182439, 140917,
       111890, 212141, 117532, 129787, 258378, 117525, 239624, 128183,
       106293, 176131, 187360, 209568, 176171, 205606, 171602, 154423,
       114328, 253678, 248720, 197930])

In [76]:
# baseline модель, которая возвращает топ-k вакансий для всех пользователей
def baseline_model(known_positives, top_100, k=5):
    top_items = random.sample([x for x in top_100 if x not in known_positives], k=k)
    return top_items

In [87]:
# Вычисление Precision@k для baseline модели
def evaluate_precision_at_k_baseline(data_df, test_df, top):
  baseline_precision = 0

  test_users = test_df['cookie_id'].unique()
  for user_cookie_id in test_users:
      known_positives = data_df[data_df['cookie_id'] == user_cookie_id]['vacancy_id_'].unique()
      true_positives = chain.from_iterable(test_df[test_df['cookie_id'] == user_cookie_id]['vacancy_id_'])

      top_items = baseline_model(known_positives, top, k=5)
      baseline_precision += len(set(top_items) & set(true_positives)) / 5

  baseline_precision /= len(test_users)
  print(f'Baseline Precision@5: {baseline_precision}')

In [None]:
evaluate_precision_at_k_baseline(train_data, test_data, top_100_vacancies_80)
# Local validate - Baseline Precision@5 для test - train (20%): 0.005814435554081406 ?

In [88]:
evaluate_precision_at_k_baseline(data_full, df_test_public, top_100_vacancies)
# Baseline Precision@5 для test public: 0.011139896373056997

Baseline Precision@5: 0.011139896373056997


_______________