# Чемпионат Тюменьской области

# Задача разработки рекомендательного алгоритма для читателей библиотеки

In [1]:
import pandas as pd
import numpy as np
# from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from more_itertools import pairwise
import re
# from sklearn.preprocessing import LabelEncoder, StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline

In [2]:
#считывание данных
users = pd.read_csv('train/users.csv', sep=';', index_col=None, dtype={'age': str, 'chb': str, 'chit_type': str, 'gender': str})
items = pd.read_csv('train/items.csv', sep=';', index_col=None, dtype={'author': str, 'bbk': str, 'izd': str, 'sys_numb': str, 'title': str, 'year_izd': str})
train_transactions = pd.read_csv('train/train_transactions_extended.csv', sep=';', index_col=None, dtype={'chb': str, 'date_1': str, 'is_printed': str, 'is_real': str, 'source': str, 'sys_numb': str, 'type': str})
submit = pd.read_csv('sample_solution.csv', sep=';', index_col=None, dtype={'chb': str, 'sys_numb': str})

Посмотрим, какие пользователи есть в наших таблицах, и для каких пользователей предстоит дать рекомендации.

In [3]:
print(f"Кол-во пользователей: {users['chb'].nunique()}")
print(f"Кол-во пользователей в истории: {train_transactions['chb'].nunique()}")

Кол-во пользователей: 16753
Кол-во пользователей в истории: 16753


In [4]:
users_uniq=set(users.chb) # множество уникальных пользователей в таблице users
users_in_transaction = set(train_transactions.chb) # множество уникальных пользователей в таблице train_transactions
submit_users = set(submit.chb) # множество уникальных пользователей для которых предстоит дать рекомендации

In [5]:
users_uniq - users_in_transaction

set()

Пустое множество при вычитании множеств означает, что пользователи в users и пользователи в train_transactions абсолютно одинаковые по составу.

In [6]:
submit_users - users_uniq

set()

Аналогично с submit и users.                                                                          

In [7]:
users.head(2)

Unnamed: 0,chb,age,gender,chit_type
0,300001020830,21,female,нет данных
1,300001113642,36,female,нет данных


In [8]:
users.drop(['age', 'gender', 'chit_type'], axis=1, inplace=True)
users['chb_cod'] = users.chb.apply(lambda x: x[0])
users.chb_cod = users.chb_cod.astype('category')
users['chb'] = users.chb.astype('category')

In [9]:
users.head(2)

Unnamed: 0,chb,chb_cod
0,300001020830,3
1,300001113642,3


In [10]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16753 entries, 0 to 16752
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   chb      16753 non-null  category
 1   chb_cod  16753 non-null  category
dtypes: category(2)
memory usage: 820.3 KB


In [11]:
train_transactions.head(2)

Unnamed: 0,chb,sys_numb,date_1,is_real,type,source,is_printed
0,100000641403,RSL01004206702,2021-02-21,yes,скачивание,dlib.rsl.ru,False
1,100000641403,RSL01000769304,2021-03-23,yes,скачивание,dlib.rsl.ru,False


In [12]:
train_transactions['date_1'] = pd.to_datetime(train_transactions['date_1'])
# сортируем по дате
train_transactions.sort_values(by='date_1', inplace=True)
# переписываем индексы
train_transactions.index = np.arange(0, len(train_transactions))
# удаляем столбцы
train_transactions.drop(['is_real', 'type', 'source', 'is_printed'], axis=1, inplace=True)

In [13]:
train_transactions['chb'] = train_transactions.chb.astype('category')
train_transactions['sys_numb'] = train_transactions.sys_numb.astype('category')
train_transactions.head(2)

Unnamed: 0,chb,sys_numb,date_1
0,100001057738,RSL01009819690,2021-01-01
1,100001057738,RSL01009824127,2021-01-01


In [14]:
items.head(2)

Unnamed: 0,sys_numb,title,author,izd,year_izd,bbk
0,RSL01008600016,Судебное следствие в уголовном процессе России...,"Машовец, Асия Океановна",Юрлитинформ,2016,"Х629.374,0"
1,RSL01004304880,Уральское казачество и его роль в системе росс...,"Дубовиков, Александр Маратович",none,2006,none


In [15]:
items['sys_numb'] = items.sys_numb.astype('category')

In [16]:
from pymorphy2 import MorphAnalyzer
morph = MorphAnalyzer()

In [17]:
import string
from nltk import word_tokenize
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')

print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nailya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nailya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
spec_chars = string.punctuation + '«»—…’‘”“©'
# print(spec_chars)

# Зададим функцию для удаления спец символов


def remove_chars(text, chars):
    return "".join([ch for ch in text if ch not in chars])

# stemmer = Mystem()
# Функция для обработки текста

def text_tokenizer(text):

    text = text.lower()
    
    # Удалим \n и \t в тексте
    text = text.replace('\n', ' ').replace('\t', ' ')
    
    # Удалим непечатные спецсимволы
    text = [w for w in text if w.isprintable()]
    
    # Удалим все спец символы
    text = remove_chars(text, spec_chars)

    # Удалим все цифры
    text = remove_chars(text, string.digits)

    # Токенизируем текст
    tokens = word_tokenize(text)
    
    # Удаляем слова длиной меньше 3 символов
    tokens = [w for w in tokens if len(w)>2] 

    # Удаляем стоп-слова
    filtered_token = [w for w in tokens if not w in stopwords_list]

    # лемматизируем
    for token in filtered_token:
        token = morph.normal_forms(token)[0]
#         token = stemmer.lemmatize(token)

    return filtered_token

In [19]:
stopwords_list = stopwords.words("russian")
# Дополним список слов-на-выброс
list_to_drop = ['отсутствует']

stopwords_list.extend(list_to_drop)
items['title'] = items['title'].apply(lambda x: text_tokenizer(x))
items['title'] = items['title'].apply(lambda x: ' '.join(x))

In [20]:
def year_detection(row):
    year_='0'
    if type(row)!=float:
        for i in row:
            if i[0]=='1' or (i[0]=='2' and i[1]=='0'):
                year_=i
            else:
                continue
    return year_

In [21]:
items['year_izd'] = items.year_izd.str.findall(r'\d\d\d\d')
items['year_izd'] = items.year_izd.apply(lambda x: year_detection(x))
items['year_izd'] = items.year_izd.astype('int')

In [22]:
def bbk_detection(item):
    cod = 0
    if type(item)!=float:
        if item[0].isupper():
            cod=item[0]
    return str(cod)

In [23]:
items.loc[items[items.bbk=='none'].index.values, 'bbk'] = 'отсутствует'
items['bbk'] = items.bbk.apply(lambda x: bbk_detection(x))

In [24]:
items.sample(3)

Unnamed: 0,sys_numb,title,author,izd,year_izd,bbk
85934,RSL01008814686,вожди германской промышленности крупп стиннес ...,"Пиннер, Феликс",Время,1925,0
148864,RSL01009580087,оценка влияния международной трудовой миграции...,"Алфёров, Александр Александрович",none,2017,У
7579,RSL01008547881,месса музыке века проблеме неоканона диссертац...,"Кальченко, Елена Викторовна",none,2016,0


In [25]:
item_titles = pd.Series(items['title'].values, index=items['sys_numb']).to_dict()

In [26]:
def compute_metrics(df_true, df_pred, top_N):
    result = {}
    test_recs = df_true.set_index(['chb', 'sys_numb']).join(df_pred.set_index(['chb', 'sys_numb']))
    test_recs = test_recs.sort_values(by=['chb', 'rank'])

    test_recs['users_item_count'] = test_recs.groupby(level='chb')['rank'].transform(np.size)
    test_recs['reciprocal_rank'] = (1 / test_recs['rank']).fillna(0)
    test_recs['cumulative_rank'] = test_recs.groupby(level='chb').cumcount() + 1
    test_recs['cumulative_rank'] = test_recs['cumulative_rank'] / test_recs['rank']
    
    users_count = test_recs.index.get_level_values('chb').nunique()
    for k in range(1, top_N + 1):
        hit_k = f'hit@{k}'
        test_recs[hit_k] = test_recs['rank'] <= k
        result[f'Precision@{k}'] = (test_recs[hit_k] / k).sum() / users_count
        result[f'Recall@{k}'] = (test_recs[hit_k] / test_recs['users_item_count']).sum() / users_count
        
    result[f'MAP@{top_N}'] = (test_recs["cumulative_rank"] / test_recs["users_item_count"]).sum() / users_count
    result['MRR'] = test_recs.groupby(level='chb')['reciprocal_rank'].max().mean()
    if (result[f'Precision@{top_N}'] + result[f'Recall@{top_N}'])!=0:
        result['F1'] = 2*result[f'Precision@{top_N}'] * result[f'Recall@{top_N}'] /(result[f'Precision@{top_N}'] + result[f'Recall@{top_N}'])
    else:
        result['F1'] = 0
    return pd.Series(result)

# Train & Test

Для наших данных выбрем 7 последних дней и будем тестировать на них последовательно (1 test fold - 1 день).

Но теперь нам нужно учитывать проблему холодного старта. Это основная проблем классических метод над матрицей взаимодействий. Поэтому напишем свой класс для разбиения исходного датафрейма на train/test

In [27]:
class TimeRangeSplit():
    """
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.date_range.html
    """
    def __init__(self, 
                 start_date, 
                 end_date=None, 
                 freq='D', 
                 periods=None, 
                 tz=None, 
                 normalize=False, 
                 closed=None, 
                 train_min_date=None,
                 filter_cold_users=True, 
                 filter_cold_items=True, 
                 filter_already_seen=True):
        
        self.start_date = start_date
        if end_date is None and periods is None:
            raise ValueError("Either 'end_date' or 'periods' must be non-zero, not both at the same time.")

        self.end_date = end_date
        self.freq = freq
        self.periods = periods
        self.tz = tz
        self.normalize = normalize
        self.closed = closed
        self.train_min_date = pd.to_datetime(train_min_date, errors='raise')
        self.filter_cold_users = filter_cold_users
        self.filter_cold_items = filter_cold_items
        self.filter_already_seen = filter_already_seen

        self.date_range = pd.date_range(
            start=start_date, 
            end=end_date, 
            freq=freq, 
            periods=periods, 
            tz=tz, 
            normalize=normalize, 
            closed=closed)

        self.max_n_splits = max(0, len(self.date_range) - 1)
        if self.max_n_splits == 0:
            raise ValueError("Provided parametrs set an empty date range.") 

    def split(self, 
              df, 
              user_column='chb', ###
              item_column='sys_numb',
              datetime_column='date_1', ###
              fold_stats=False):
        df_datetime = df[datetime_column]
        
        if self.train_min_date is not None:
            train_min_mask = df_datetime >= self.train_min_date
        else:
            train_min_mask = df_datetime.notnull()

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        for start, end in tqdm(pairwise(date_range)):
            fold_info = {
                'Start date': start,
                'End date': end
            }
            train_mask = train_min_mask & (df_datetime < start)
            train_idx = df.index[train_mask]
            
            if fold_stats:
                fold_info['Train'] = len(train_idx)
                
            test_mask = (df_datetime >= start) & (df_datetime < end)
            test_idx = df.index[test_mask]
            
            if self.filter_cold_users:
                new = set(df.loc[test_idx, user_column].unique()) - set(df.loc[train_idx, user_column].unique())
                new_idx = df.index[test_mask & df[user_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New users'] = len(new)
                    fold_info['New users interactions'] = len(new_idx)
                    
            if self.filter_cold_items:
                new = np.setdiff1d(
                    df.loc[test_idx, item_column].unique(), 
                    df.loc[train_idx, item_column].unique())
                new_idx = df.index[test_mask & df[item_column].isin(new)]
                test_idx = np.setdiff1d(test_idx, new_idx)
                test_mask = df.index.isin(test_idx)
                if fold_stats:
                    fold_info['New items'] = len(new)
                    fold_info['New items interactions'] = len(new_idx)
                    
            if self.filter_already_seen:
                user_item = [user_column, item_column]
                train_pairs = df.loc[train_idx, user_item].set_index(user_item).index
                test_pairs = df.loc[test_idx, user_item].set_index(user_item).index
                intersection = train_pairs.intersection(test_pairs)
                test_idx = test_idx[~test_pairs.isin(intersection)]
                # test_mask = rd.df.index.isin(test_idx)
                if fold_stats:
                    fold_info['Known interactions'] = len(intersection)
                    
            if fold_stats:
                fold_info['Test'] = len(test_idx)
                
            yield (train_idx, test_idx, fold_info)

    def get_n_splits(self, df, datetime_column='date'):
        df_datetime = df[datetime_column]
        if self.train_min_date is not None:
            df_datetime = df_datetime[df_datetime >= self.train_min_date]

        date_range = self.date_range[(self.date_range >= df_datetime.min()) & 
                                     (self.date_range <= df_datetime.max())]

        return max(0, len(date_range) - 1)

In [28]:
last_date = train_transactions['date_1'].max().normalize()
folds = 7
start_date = last_date - pd.Timedelta(days=folds)
start_date, last_date

(Timestamp('2022-03-24 00:00:00'), Timestamp('2022-03-31 00:00:00'))

In [29]:
cv = TimeRangeSplit(start_date=start_date, periods=folds+1)

cv.max_n_splits, cv.get_n_splits(train_transactions, datetime_column='date_1')

(7, 7)

In [30]:
cv.date_range

DatetimeIndex(['2022-03-24', '2022-03-25', '2022-03-26', '2022-03-27',
               '2022-03-28', '2022-03-29', '2022-03-30', '2022-03-31'],
              dtype='datetime64[ns]', freq='D')

In [31]:
folds_with_stats = list(cv.split(
    train_transactions, 
    user_column='chb',
    item_column='sys_numb',
    datetime_column='date_1',
    fold_stats=True
))

folds_info_with_stats = pd.DataFrame([info for _, _, info in folds_with_stats])

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




In [32]:
from lightfm import LightFM
from lightfm.data import Dataset

  "LightFM was compiled without OpenMP support. "


In [33]:
dataset = Dataset()

In [34]:
dataset.fit(train_transactions['chb'].unique(), train_transactions['sys_numb'].unique())

In [35]:
users['chb_cod'] = users['chb_cod'].cat.add_categories('age_unknown')
users['chb_cod'] = users['chb_cod'].fillna('age_unknown')
users_features = users['chb_cod'].unique()
users_features

['3', '2', '1', '4']
Categories (4, object): ['3', '2', '1', '4']

In [36]:
dataset.fit_partial(user_features=users_features)

In [37]:
items.title = items.title.astype('category')
items['title'] = items['title'].cat.add_categories('type_unknown')
items['title'] = items['title'].fillna('type_unknown')
item_title = list(items['title'].str.split(' ').explode().unique())
len(item_title)

211781

In [38]:
items.bbk = items.bbk.astype('category')
item_bbk = list(items['bbk'].unique())
len(item_bbk)

29

In [39]:
items.year_izd = items.year_izd.astype('category')
item_year_izd = list(items['year_izd'].unique())
len(item_year_izd)

380

In [40]:
i_features = np.append(item_title, item_bbk)
i_features = np.append(i_features, item_year_izd)
i_features

array(['судебное', 'следствие', 'уголовном', ..., '1671', '1612', '1693'],
      dtype='<U112')

In [41]:
i_features.shape

(212190,)

In [42]:
dataset.fit_partial(item_features=i_features)

In [43]:
num_users, num_items = dataset.interactions_shape()
num_users, num_items

(16753, 194666)

In [44]:
lightfm_mapping = dataset.mapping()
lightfm_mapping = {
    'users_mapping': lightfm_mapping[0],
    'user_features_mapping': lightfm_mapping[1],
    'items_mapping': lightfm_mapping[2],
    'item_features_mapping': lightfm_mapping[3],
}
print('users_mapping len - ', len(lightfm_mapping['users_mapping']))
print('user_features_mapping len - ', len(lightfm_mapping['user_features_mapping']))
print('items_mapping len - ', len(lightfm_mapping['items_mapping']))
print('Users item_features_mapping len - ', len(lightfm_mapping['item_features_mapping']))

users_mapping len -  16753
user_features_mapping len -  16757
items_mapping len -  194666
Users item_features_mapping len -  406855


In [45]:
lightfm_mapping['users_inv_mapping'] = {v: k for k, v in lightfm_mapping['users_mapping'].items()}
lightfm_mapping['items_inv_mapping'] = {v: k for k, v in lightfm_mapping['items_mapping'].items()}

In [46]:
num_user_features = dataset.user_features_shape()
num_show_features = dataset.item_features_shape()
print('Num user features: {} -> {}\nnum item features: {} -> {}.'.format(
    num_user_features[1] - num_users, num_user_features[1], 
    num_show_features[1] - num_items, num_show_features[1]))

Num user features: 4 -> 16757
num item features: 212189 -> 406855.


In [47]:
def df_to_tuple_iterator(df):
    return zip(*df.values.T)

def concat_last_to_list(t):
    return (t[0], list(t[1:])[0])

def df_to_tuple_list_iterator(df):
    return map(concat_last_to_list, zip(*df.values.T))

In [48]:
chb_cod_list = users['chb_cod'].values
new = [[i] for i in chb_cod_list]
users['feature'] = new
users

Unnamed: 0,chb,chb_cod,feature
0,300001020830,3,[3]
1,300001113642,3,[3]
2,300001148466,3,[3]
3,300001117011,3,[3]
4,200001038094,2,[2]
...,...,...,...
16748,300001154270,3,[3]
16749,200000851690,2,[2]
16750,200001154993,2,[2]
16751,200001155169,2,[2]


In [49]:
known_users_filter = users['chb'].isin(train_transactions['chb'].unique())
train_user_features = dataset.build_user_features(
    df_to_tuple_list_iterator(
        users.loc[known_users_filter, ['chb', 'feature']]
    )
)
train_user_features

<16753x16757 sparse matrix of type '<class 'numpy.float32'>'
	with 33506 stored elements in Compressed Sparse Row format>

In [50]:
items['features'] = items['title'].astype(str).apply(lambda x: x.split(' '))
items['bbk'] = items['bbk'].astype(str)
items['features'] = items[['features','bbk']].apply(lambda x: list(x), axis=1)
items['year_izd'] = items['year_izd'].astype(str)
items['features'] = items[['features','year_izd']].apply(lambda x: list(x), axis=1)
items['features'] = items['features'].astype(str).apply(lambda x: x.replace('[','').replace(']',''))
items['features'] = items['features'].apply(lambda x: x.replace("'', ",'').replace("'",'').replace("'",''))
# items['features'] = items['features'].astype('category')
items['features'] = items['features'].str.split(', ')
items.head(2)

Unnamed: 0,sys_numb,title,author,izd,year_izd,bbk,features
0,RSL01008600016,судебное следствие уголовном процессе россии м...,"Машовец, Асия Океановна",Юрлитинформ,2016,Х,"[судебное, следствие, уголовном, процессе, рос..."
1,RSL01004304880,уральское казачество роль системе российской г...,"Дубовиков, Александр Маратович",none,2006,0,"[уральское, казачество, роль, системе, российс..."


In [51]:
known_items_filter = items['sys_numb'].isin(train_transactions['sys_numb'].unique())
train_items_features = dataset.build_item_features(
    df_to_tuple_list_iterator(
        items.loc[known_items_filter, ['sys_numb', 'features']]
    )
)
train_items_features

<194666x406855 sparse matrix of type '<class 'numpy.float32'>'
	with 2204144 stored elements in Compressed Sparse Row format>

In [52]:
lfm_model = LightFM(no_components=64, learning_rate=0.05, loss='warp', max_sampled=5, random_state=23)

In [53]:
top_N = 20
all_cols = list(lightfm_mapping['items_mapping'].values())
len(all_cols)

194666

In [54]:
def generate_lightfm_recs_mapper(model, item_ids, known_items, user_features, item_features, N, user_mapping, item_inv_mapping, num_threads=4):
    def _recs_mapper(user):
        user_id = user_mapping[user]
        recs = model.predict(user_id, item_ids, user_features=user_features,
                             item_features=item_features, num_threads=num_threads)

        additional_N = len(known_items[user_id]
                           ) if user_id in known_items else 0
        total_N = N + additional_N
        top_cols = np.argpartition(recs, -np.arange(total_N))[-total_N:][::-1]

        final_recs = [item_inv_mapping[item] for item in top_cols]
        
        if additional_N > 0:
            filter_items = known_items[user_id]
            final_recs = [item for item in final_recs if item not in filter_items]
        return str(final_recs[:N])
    return _recs_mapper

In [55]:
metrics_in_time = pd.DataFrame(columns=['test_date', 'precision', 'recall', 'mapN', 'mrr', 'F1']) #

In [56]:
validation_results = pd.DataFrame()

for train_idx, test_idx, info in folds_with_stats:
    print(f"test range - from {info['Start date']} to {info['End date']}")
    train = train_transactions.loc[train_idx]
    test = train_transactions.loc[test_idx]
    print(f'train shape - {train.shape}, test shape - {test.shape}')
    
    train_mat, train_mat_weights = dataset.build_interactions(df_to_tuple_iterator(train[['chb', 'sys_numb']]))
    
    num_epochs = 15
    for _ in tqdm(range(num_epochs), total=num_epochs):
        
        lfm_model.fit_partial(
            train_mat, 
            user_features=train_user_features,
            item_features=train_items_features,
            num_threads=4
        )
    
    recs_user = pd.DataFrame({
        'chb': test['chb'].unique()
    })
    known_items = train.groupby('chb')['sys_numb'].apply(list).to_dict()
        
    mapper = generate_lightfm_recs_mapper(
        lfm_model, 
        item_ids=all_cols, 
        known_items=known_items,
        N=top_N,
        user_features=train_user_features, 
        item_features=train_items_features, 
        user_mapping=lightfm_mapping['users_mapping'],
        item_inv_mapping=lightfm_mapping['items_inv_mapping'],
        num_threads=4
    )
    
    recs_user['sys_numb'] = recs_user['chb'].map(mapper)
    recs_user['sys_numb']=recs_user['sys_numb'].astype('str').apply(lambda x: x.replace('[','').replace(']','').replace("'",'').replace("'",'').split(', '))
    # recs_user['sys_numb'] = recs_user.sys_numb.apply(lambda x: x.replace("'", '').replace("'", ''))
    recs_user = recs_user.explode('sys_numb')
    #recs_user.sys_numb=pd.to_numeric(recs_user.sys_numb, downcast="integer")
    recs_user['rank'] = recs_user.groupby('chb').cumcount() + 1
    metrics = compute_metrics(test, recs_user, top_N)
#     display(metrics)
    metric_dict = {
        'test_date': info['Start date'],
        'precision': metrics[f'Precision@{top_N}'],
        'recall': metrics[f'Recall@{top_N}'],
        'mapN': metrics[f'MAP@{top_N}'],
        'mrr': metrics['MRR'],
        'F1': metrics['F1']
        }

    metric_temp = pd.DataFrame(metric_dict, index=[1])

    metrics_in_time = metrics_in_time.append(metric_temp, ignore_index=True) 

test range - from 2022-03-24 00:00:00 to 2022-03-25 00:00:00
train shape - (253578, 3), test shape - (127, 3)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


test range - from 2022-03-25 00:00:00 to 2022-03-26 00:00:00
train shape - (254401, 3), test shape - (134, 3)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


test range - from 2022-03-26 00:00:00 to 2022-03-27 00:00:00
train shape - (255264, 3), test shape - (144, 3)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


test range - from 2022-03-27 00:00:00 to 2022-03-28 00:00:00
train shape - (256298, 3), test shape - (31, 3)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


test range - from 2022-03-28 00:00:00 to 2022-03-29 00:00:00
train shape - (256486, 3), test shape - (59, 3)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


test range - from 2022-03-29 00:00:00 to 2022-03-30 00:00:00
train shape - (256803, 3), test shape - (146, 3)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))


test range - from 2022-03-30 00:00:00 to 2022-03-31 00:00:00
train shape - (257737, 3), test shape - (170, 3)


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))




In [57]:
metrics_in_time

Unnamed: 0,test_date,precision,recall,mapN,mrr,F1
0,2022-03-24,0.0,0.0,0.0,0.0,0.0
1,2022-03-25,0.000575,0.011494,0.000575,0.000575,0.001095
2,2022-03-26,0.000532,0.005319,0.000296,0.000591,0.000967
3,2022-03-27,0.0,0.0,0.0,0.0,0.0
4,2022-03-28,0.0,0.0,0.0,0.0,0.0
5,2022-03-29,0.000962,0.007212,0.000847,0.002015,0.001697
6,2022-03-30,0.000962,0.019231,0.010181,0.010181,0.001832


# Submit prediction

In [58]:
predict_recs = pd.DataFrame({
        'chb': submit['chb'].unique(), 'sys_numb': '0'
    })

In [59]:
mapper = generate_lightfm_recs_mapper(
    lfm_model, 
    item_ids=all_cols, 
    known_items=known_items,
    N=top_N,
    user_features=train_user_features, 
    item_features=train_items_features, 
    user_mapping=lightfm_mapping['users_mapping'],
    item_inv_mapping=lightfm_mapping['items_inv_mapping'],
    num_threads=4
)

In [60]:
f_num = len(predict_recs)//1000
rest = len(predict_recs)%1000+1
for rec in tqdm(range(1, f_num+1)):
    predict_recs['sys_numb'].loc[(rec-1)*1000:rec*1000] = predict_recs['chb'].loc[(rec-1)*1000:rec*1000].map(mapper)

predict_recs['sys_numb'].loc[f_num*1000-1:f_num*1000+rest] = predict_recs['chb'].loc[f_num*1000-1:f_num*1000+rest].map(mapper)                      

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=16.0), HTML(value='')))




In [61]:
# predict_recs['sys_numb'] = predict_recs['chb'].map(mapper)
predict_recs['sys_numb'] = predict_recs['sys_numb'].astype('str').apply(
    lambda x: x.replace('[', '').replace(']', '').replace("'", '').replace("'", '').split(', '))
predict_recs = predict_recs.explode('sys_numb')

In [62]:
predict_recs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 335060 entries, 0 to 16752
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   chb       335060 non-null  object
 1   sys_numb  335060 non-null  object
dtypes: object(2)
memory usage: 7.7+ MB


In [63]:
predict_recs.head(20)

Unnamed: 0,chb,sys_numb
0,100001051682,RSL07000432334
0,100001051682,RSL02000006960
0,100001051682,RSL02000022351
0,100001051682,RSL07000455845
0,100001051682,RSL07000386069
0,100001051682,RSL02000000591
0,100001051682,RSL01008681216
0,100001051682,RSL07000375074
0,100001051682,RSL60000250152
0,100001051682,RSL02000000529


In [64]:
predict_recs.index = np.arange(len(predict_recs))

In [65]:
# Формирование csv файла для отправки на платформу
predict_recs.to_csv("solution.csv", index=False, sep=';')