In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')

In [2]:
# Load the data.
# The data directory can be overridden with the ZAKUPKI_DATA_DIR environment
# variable; when it is not set, the original hard-coded location is used.
DATA_DIR = os.environ.get('ZAKUPKI_DATA_DIR', r'C:\Users\Mr Alex\Downloads\Zakupki_data')

# The working directory is switched (instead of joining paths) because later
# cells read and write files by bare name, relative to this directory.
os.chdir(DATA_DIR)

train_data = pd.read_csv("train_data.csv", sep=';')
train_labels = pd.read_csv("train_labels.csv", sep=';')
test_data = pd.read_csv("test_data.csv", sep=';')

In [3]:
# Replace every missing value in both frames with the literal string 'None'
train_data, test_data = (frame.fillna('None') for frame in (train_data, test_data))

In [5]:
# Summary statistics of the training data (count, mean, std, min, quartiles, max)
train_data.describe()

Unnamed: 0,region_code,lot_price
count,778901.0,778901.0
mean,51.003673,3420793.0
std,24.031115,100893000.0
min,1.0,0.0
25%,29.0,99000.0
50%,54.0,305428.0
75%,74.0,991415.6
max,99.0,46780060000.0


<hr>

# Подготовка данных 

#### Обработка okpd2 и additional_code

In [None]:
def _okpd2_or_additional_code(frame):
    """Pick, per row, ``additional_code`` when it is set and ``okpd2_code`` otherwise.

    A code counts as "not set" when it equals the string 'None'.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 'okpd2_code' and 'additional_code'.

    Returns
    -------
    pandas.Series
        One value per row, aligned with ``frame.index``.
    """
    additional_code = frame['additional_code']
    # Series.where keeps the values for which the condition holds and takes the
    # replacement elsewhere, i.e. the column-wise form of
    # "okpd2_code if additional_code == 'None' else additional_code".
    return additional_code.where(additional_code != 'None', frame['okpd2_code'])


train_data['okpd2_or_additional_code'] = _okpd2_or_additional_code(train_data)
test_data['okpd2_or_additional_code'] = _okpd2_or_additional_code(test_data)

In [None]:
# Inspect how often each value of the combined code column occurs
train_data['okpd2_or_additional_code'].value_counts()

#### Формирование описания тендера

In [None]:
# Build one free-text description per tender by joining the text columns,
# left to right, with a single space between consecutive parts.
text_columns = ['purchase_name', 'lot_name', 'okpd2_names',
                'additional_code_names', 'item_descriptions']
tender_description = train_data[text_columns[0]]
for text_column in text_columns[1:]:
    tender_description = tender_description + " " + train_data[text_column]
train_data['text_description_tender'] = tender_description

In [None]:
# First three rows, transposed so that every column is displayed as a row
train_data.head(3).T

#### Работа с историей участий поставщика

In [None]:
# Attach each lot's region and combined code to the label rows sharing its pn_lot_anon
lot_attributes = train_data[['pn_lot_anon', 'region_code', 'okpd2_or_additional_code']]
inn_kpp_history = train_labels.merge(lot_attributes, on=['pn_lot_anon'])

In [None]:
# Preview the first rows of the merged participation history
inn_kpp_history.head()

In [None]:
# Group by supplier: one row per participant_inn_kpp_anon, where every cell
# holds the list of that supplier's values for one column, in row order.
# agg(list) collects the lists column by column, which avoids constructing a
# pandas Series for every supplier.
history_columns = ['pn_lot_anon', 'is_winner', 'fz', 'region_code',
                   'okpd2_or_additional_code']
inn_kpp_history = (
    inn_kpp_history
    .groupby('participant_inn_kpp_anon')[history_columns]
    .agg(list)
)
# Use positional column labels 0..4 (same order as history_columns); the
# columns receive their final names positionally in a later cell.
inn_kpp_history.columns = range(len(history_columns))

In [None]:
# Move the supplier id out of the index into a regular column, then name all
# six columns positionally: the id first, followed by the five list columns.
history_flat = inn_kpp_history.reset_index()
history_flat.columns = [
    'participant_inn_kpp_anon',
    'list_pn_lot_anon',
    'list_is_winner',
    'list_fz',
    'list_region_code',
    'list_okpd2_or_additional_code',
]
inn_kpp_history = history_flat

In [None]:
# Preview the first rows of the per-supplier history table
inn_kpp_history.head()

#### Рекомендательная система

Выбираются 35 случайных процедур из подвыборки актуальных процедур, у которых и регион, и код ОКПД2 встречаются в истории участия поставщика. Если в подвыборке меньше 35 процедур, рекомендации для этого поставщика не формируются.

In [None]:
# Number of current lots recommended to each supplier.
recommendations_per_supplier = 35
# Every recommendation carries the same constant score.
similarity_score = 1
# One seeded generator shared by all suppliers: re-running the cell reproduces
# the same recommendations, while successive draws still differ from each other.
sampling_rng = np.random.RandomState(42)

inn_kpp_recommendation = []
for inn_kpp in tqdm_notebook(inn_kpp_history.values):
    # Row layout follows the inn_kpp_history columns: the supplier id, then the
    # lists of lots, winner flags, fz values, regions and codes.
    (participant_inn_kpp_anon, list_pn_lot_anon, list_is_winner,
     list_fz, list_region_code, list_okpd2_or_additional_code) = inn_kpp

    # Current lots whose region AND code both occur in the supplier's history
    # (each matched independently against its own list).
    candidates = test_data[
        test_data['region_code'].isin(list_region_code)
        & test_data['okpd2_or_additional_code'].isin(list_okpd2_or_additional_code)
    ]

    # Suppliers with fewer matching lots than the quota get no recommendations.
    if candidates.shape[0] < recommendations_per_supplier:
        continue

    # Random choice of the quota of current lots from the matching subset.
    sampled_lots = candidates['pn_lot_anon'].sample(
        recommendations_per_supplier, random_state=sampling_rng
    ).values
    inn_kpp_recommendation.extend(
        [participant_inn_kpp_anon, actual_pn_lot, similarity_score]
        for actual_pn_lot in sampled_lots
    )

#### Пример формирования файла рекомендаций

In [None]:
# Turn the collected [supplier, lot, score] rows into a table and save it as a
# ';'-separated CSV file without the index column.
output_columns = ['inn_kpp', 'actual_recommended_pn_lot', 'similarity_score']
recommendation = pd.DataFrame(data=inn_kpp_recommendation, columns=output_columns)
recommendation.to_csv("team_name.csv", sep=';', index=False)

In [None]:
# Display the recommendation table as the cell output
recommendation

# Подсчёт метрик

In [None]:
# Read the labels used for scoring from a ';'-separated file.
# NOTE(review): the file name below looks like a placeholder ("your test
# sample") to be replaced with the path to a real labels file — confirm.
test_labels = pd.read_csv("ваша тестирующая выборка", sep=';')

In [None]:
# Labelled (lot, supplier) pairs, each encoded as the string "<lot>_<supplier>"
true_pair_keys = test_labels['pn_lot_anon'] + "_" + test_labels['participant_inn_kpp_anon']
true = set(true_pair_keys.values)

In [None]:
# Recommended (lot, supplier) pairs, each encoded as the string "<lot>_<supplier>"
pred_pair_keys = recommendation['actual_recommended_pn_lot'] + "_" + recommendation['inn_kpp']
pred = set(pred_pair_keys.values)

In [None]:
# Number of pairs present in both the labelled set and the recommended set
intersection = len(true & pred)

In [None]:
def _percentage(hits, total):
    """Return ``hits / total * 100``, or 0.0 when ``total`` is zero.

    The zero guard keeps the cell from raising ZeroDivisionError when one of
    the compared sets is empty.
    """
    return hits / total * 100 if total else 0.0


# First line: share of recommended pairs that are also labelled pairs (precision).
# Second line: share of labelled pairs that were recommended (recall).
print(f"Точность: {_percentage(intersection, len(pred))}")
print(f"Полнота: {_percentage(intersection, len(true))}")