# Импорт библиотек

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from catboost import CatBoostClassifier

%matplotlib inline

# Загрузка и обработка датасетов

## Обучающая выборка (train)

In [2]:
# Read the raw training table from its local CSV export.
train_csv_path = r"C:\Users\RedmiBook\Documents\hacks\Russian Post\train_dataset_train.csv"
df_raw = pd.read_csv(train_csv_path, low_memory=False)

In [3]:
def fill_mode(col):
    """Fill missing values in a column with that column's most frequent value.

    ``Series.mode()`` returns a Series (several values can tie), and handing a
    Series to ``fillna`` aligns it by index label, which leaves nearly every
    gap unfilled. A single scalar is taken instead so every NaN is replaced.

    Args:
        col: pandas Series (one DataFrame column when used via
            ``DataFrame.apply``).

    Returns:
        A new Series with NaNs replaced by the first mode, or ``col`` itself
        when it holds no non-null values and therefore has no mode.
    """
    modes = col.mode()
    if modes.empty:
        # Entirely-missing column: there is no value to fill with.
        return col
    return col.fillna(modes.iloc[0])
# Run fill_mode over every column of the training frame (apply works column-wise by default).
df_raw = df_raw.apply(fill_mode)

In [4]:
def change_types(df):
    """Return a copy of ``df`` with the listed categorical/count columns cast to int32.

    Args:
        df: DataFrame containing every column named below.

    Returns:
        A new DataFrame; columns not listed keep their dtype.
    """
    int32_columns = (
        'class', 'priority', 'mailtype', 'mailctg', 'mailrank', 'directctg',
        'postmark', 'dist_qty_oper_login_1', 'total_qty_oper_login_1',
        'total_qty_oper_login_0', 'total_qty_over_index_and_type',
        'total_qty_over_index', 'is_wrong_sndr_name', 'is_wrong_rcpn_name',
        'is_wrong_phone_number', 'is_wrong_address',
    )
    return df.astype(dict.fromkeys(int32_columns, 'int32'))

In [5]:
# Cast the integer-like columns, then discard the two columns not used as features.
df_raw = change_types(df_raw)
df = df_raw.drop(columns=['id', 'oper_type + oper_attr'])

# Rows that agree on every non-label column all receive the group's maximum label.
feature_keys = list(set(df.columns) - {'label'})
df['label'] = df.groupby(feature_keys)['label'].transform('max')

# num_oper_count = how many rows are identical across all columns (label included).
row_keys = list(df.columns)
df['num_oper_count'] = df.groupby(row_keys)['label'].transform('count')

# Keep a single representative of each repeated row.
df = df.drop_duplicates()

## Тестовая выборка (test)

In [6]:
# Load the test table, then mode-fill and cast it with the same helpers used for train.
test_csv_path = r"C:\Users\RedmiBook\Documents\hacks\Russian Post\test_dataset_test.csv"
df_test = change_types(
    pd.read_csv(test_csv_path, low_memory=False).apply(fill_mode, axis=0)
)

In [19]:
# Attach the train-side num_oper_count to test rows. No `on=` is given, so the
# left join matches on the columns the two frames have in common.
test_features = df_test.drop(columns=['id', 'oper_type + oper_attr'])
train_lookup = df.drop(columns='label')
df_test_new = test_features.merge(train_lookup, how='left')

# Test rows without a match in train get a count of 0.
df_test_new = df_test_new.assign(
    num_oper_count=df_test_new['num_oper_count'].fillna(0)
)

In [20]:
# Remove name_mfi from both frames; on the train side, drop the rows that become
# duplicates once that column is gone.
df_new = df.drop(columns=['name_mfi']).drop_duplicates()
df_test_new = df_test_new.drop(columns=['name_mfi'])

## CatBoost

In [9]:
# Columns handed to CatBoost as categorical features.
categorical_columns = [
    'index_oper', 'type', 'is_privatecategory', 'class',
    'is_in_yandex', 'is_return', 'mailctg', 'mailtype',
    'directctg', 'is_wrong_sndr_name', 'is_wrong_rcpn_name',
    'is_wrong_phone_number', 'is_wrong_address',
]

# 200-iteration classifier with balanced automatic class weights.
clf = CatBoostClassifier(
    auto_class_weights='Balanced',
    iterations=200,
    cat_features=categorical_columns,
)

In [21]:
# Split the prepared training frame into features (X) and a one-column target frame (y).
target_column = 'label'
X = df_new.drop(columns=[target_column])
y = df_new[[target_column]]

# Оценка качества (macro recall на отложенной выборке)

In [12]:
# Hold out 30% of the rows (fixed seed), fit on the remaining 70%.
holdout_split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = holdout_split
clf.fit(X_train, y_train)

# Despite the name, pred_train holds class probabilities for the held-out X_test.
pred_train = clf.predict_proba(X_test)

Learning rate set to 0.5
0:	learn: 0.3417792	total: 956ms	remaining: 3m 10s
1:	learn: 0.2880500	total: 1.75s	remaining: 2m 53s
2:	learn: 0.2747108	total: 2.57s	remaining: 2m 48s
3:	learn: 0.2670777	total: 3.24s	remaining: 2m 38s
4:	learn: 0.2624975	total: 3.86s	remaining: 2m 30s
5:	learn: 0.2594438	total: 4.57s	remaining: 2m 27s
6:	learn: 0.2570161	total: 5.24s	remaining: 2m 24s
7:	learn: 0.2540928	total: 5.89s	remaining: 2m 21s
8:	learn: 0.2531091	total: 6.57s	remaining: 2m 19s
9:	learn: 0.2525331	total: 7.27s	remaining: 2m 18s
10:	learn: 0.2511025	total: 7.97s	remaining: 2m 16s
11:	learn: 0.2499619	total: 8.69s	remaining: 2m 16s
12:	learn: 0.2487557	total: 9.43s	remaining: 2m 15s
13:	learn: 0.2472867	total: 10.2s	remaining: 2m 14s
14:	learn: 0.2462702	total: 10.8s	remaining: 2m 13s
15:	learn: 0.2456791	total: 11.5s	remaining: 2m 12s
16:	learn: 0.2453517	total: 12.4s	remaining: 2m 13s
17:	learn: 0.2449897	total: 13.5s	remaining: 2m 16s
18:	learn: 0.2445123	total: 14.5s	remaining: 2m 1

In [14]:
def convert_2_prob(border, pred):
    """Threshold class probabilities into hard 0/1 labels.

    Despite the name, this converts probabilities *to labels*: a row gets 0
    when its column-1 probability is strictly below ``border`` and 1 otherwise.

    Args:
        border: decision threshold applied to column 1 of ``pred``.
        pred: 2-D array-like of per-row class probabilities with at least two
            columns.

    Returns:
        1-D float64 numpy array of 0.0/1.0, one entry per row of ``pred``.
    """
    proba = np.asarray(pred, dtype=float)
    if len(proba) == 0:
        # Nothing to threshold; same result as np.zeros(len(pred)).
        return np.zeros(0)
    # Vectorised "0 if p < border else 1". Kept as `<` so a probability equal
    # to the border (or NaN) still maps to 1, as in the row-by-row version.
    return np.where(proba[:, 1] < border, 0.0, 1.0)

In [15]:
# Macro-averaged recall on the hold-out set, using a 0.5 probability cutoff.
holdout_labels = convert_2_prob(border=0.5, pred=pred_train)
score = recall_score(y_test, holdout_labels, average="macro")
print("Recall", score)

Recall 0.8977576491537025


# Предсказание

In [None]:
# Refit the classifier on the complete X / y (the frames the hold-out split was drawn from).
# NOTE(review): this cell's marker shows no execution count — confirm it was
# actually run before the prediction cell below.
clf.fit(X, y)

In [23]:
# Class probabilities for the prepared test frame.
# NOTE(review): assumes df_test_new carries the same feature columns as X — confirm.
pred = clf.predict_proba(df_test_new)

# Отправка на проверку

In [24]:
# Turn the test probabilities into 0/1 labels with a 0.17 cutoff on column 1.
# Note the hold-out recall was computed with a 0.5 cutoff, not 0.17.
pred_label = convert_2_prob(border=0.17, pred=pred)

In [25]:
def save_ans(name_file, pred=pred, id_label=df_test['id'], dir_file=r'C:\Users\RedmiBook\Documents\hacks\Russian Post\res\\'):
    """Write a two-column ``id,label`` CSV with the predicted labels.

    The defaults for ``pred`` and ``id_label`` are evaluated once, when this
    ``def`` statement runs, so they capture whatever ``pred`` and ``df_test``
    hold at that moment; pass them explicitly to avoid stale values.

    Args:
        name_file: output file name without the ``.csv`` extension.
        pred: array of labels; must support ``.astype`` (it is cast to 'byte').
        id_label: row ids, converted to an int32 numpy array.
        dir_file: target directory, with or without a trailing separator.
            It is created if it does not exist.
    """
    import os  # local import: the notebook's import cell does not bring in os

    arr, label = pred.astype('byte'), np.array(id_label).astype('int32')

    # Join instead of concatenating so a directory without a trailing
    # separator still yields "<dir>/<name>.csv" rather than "<dir><name>.csv".
    if dir_file:
        os.makedirs(dir_file, exist_ok=True)
    out_path = os.path.join(dir_file, name_file + '.csv')

    pd.DataFrame(data={'id': label, 'label': arr}, columns=['id', 'label']).to_csv(out_path, index=False, lineterminator='\n')

In [26]:
# Write the thresholded test labels to fin_cb.csv in save_ans's default directory;
# ids come from save_ans's default id_label.
save_ans(name_file='fin_cb', pred=pred_label)