In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB

from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
from category_encoders import BinaryEncoder

from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Fixed random seed passed as random_state to the estimators and CV splitters in this notebook.
SEED = 42

In [3]:
# Load the payments table and the per-INN reference table from the working directory.
pays, inn_info = (pd.read_csv(csv_name) for csv_name in ('pays.csv', 'inn_info_public.csv'))

In [4]:
# Null counts per column: inn_info first, then pays.
print(inn_info.isnull().sum(), pays.isnull().sum(), sep='\n')

# Share of rows with at least one missing value. Counting rows (not null cells)
# keeps this a true share even if a row has NaNs in several columns.
rows_with_nan_share = pays.isnull().any(axis=1).mean() * 100
print('"Pays" Доля объектов с пропусками: {0:.2f}%'.format(rows_with_nan_share))

# Share of rows whose amount is negative.
negative_sum_share = (pays['sum'] < 0).mean() * 100
print('"Pays" Доля объектов у которых sum < 0: {0:.2f}%'.format(negative_sum_share))

# Series.min() skips NaN; the builtin min() compares element by element and its
# result depends on where the NaNs happen to sit in the column.
print('"Pays" Минимальный sum: {0}'.format(pays['sum'].min()))

hash_inn     0
okved2       0
region       0
is_public    0
dtype: int64
hash_inn_kt       0
hash_inn_dt       0
week              0
count             0
sum            1125
dtype: int64
"Pays" Доля объектов с пропусками: 0.02%
"Pays" Доля объектов у которых sum < 0: 3.44%
"Pays" Минимальный sum: -0.4


In [5]:
# Order inn_info by hash_inn and make that column the index of the frame.
inn_info = inn_info.sort_values(by='hash_inn').set_index('hash_inn')

# Discard payment rows that have a missing value in any column.
pays = pays.dropna()

# Keep only strictly positive amounts, i.e. remove every row with sum <= 0.
# The smallest observed sum is -0.4, so the non-positive rows are presumably
# empty (zero-amount) transactions. The alternative that was considered is to
# clamp them to zero instead of dropping them:
#pays.loc[pays['sum'] <= 0, 'sum'] = 0
pays = pays[pays['sum'] > 0]

In [6]:
# Dummy-encode `week`: one indicator column per distinct value, named 'week_<value>',
# appended to pays so that weekly activity can be aggregated per INN later.
week_dummies = pd.get_dummies(pays['week'], prefix='week')
pays = pays.join(week_dummies)

In [7]:
# Feature engineering: per-INN aggregates of the payments table.
# Rows grouped by hash_inn_kt feed the *_send features and rows grouped by
# hash_inn_dt feed the *_get features.
by_sender = pays.groupby(by='hash_inn_kt')
by_receiver = pays.groupby(by='hash_inn_dt')

# Total transaction count and total amount per INN, in each direction.
# Each result is a one-column DataFrame indexed by the INN hash.
inn_count_send = by_sender['count'].sum().to_frame('count_send')
inn_count_get = by_receiver['count'].sum().to_frame('count_get')
inn_sum_send = by_sender['sum'].sum().to_frame('sum_send')
inn_sum_get = by_receiver['sum'].sum().to_frame('sum_get')

# Some INNs made payments to themselves; collect them so they can be flagged later.
# (Separate count/sum aggregates over those self-payments were tried and are not used.)
inn_kt_eq_dt = pays.loc[pays['hash_inn_kt'] == pays['hash_inn_dt'], 'hash_inn_kt'].unique()

# Weekly activity: for every week, the number of payment rows in which the INN is
# the sender / the receiver (sum of the 'week_<value>' indicator columns).
# Iterate over the week values actually present rather than range(n): the labels
# are not guaranteed to be exactly 0..n-1, especially after rows were filtered out.
week_values = sorted(pays['week'].unique())
week_columns = ['week_{0}'.format(week) for week in week_values]

# One groupby aggregation per direction instead of two per week.
send_by_week = by_sender[week_columns].sum()
get_by_week = by_receiver[week_columns].sum()

# Lists of one-column DataFrames, one per week, in ascending week order.
week_range_send = [
    send_by_week[[column]].rename(columns={column: 'week_send_{0}'.format(week)})
    for week, column in zip(week_values, week_columns)
]
week_range_get = [
    get_by_week[[column]].rename(columns={column: 'week_get_{0}'.format(week)})
    for week, column in zip(week_values, week_columns)
]

In [8]:
# Attach the engineered features to the per-INN table (DataFrame.join, default left join on the index).
inn_info = inn_info.join([inn_count_send, inn_count_get, inn_sum_send, inn_sum_get])

# Flag INNs that paid themselves. index.isin() simply ignores self-paying INNs that
# are missing from inn_info, whereas .loc[labels, ...] = 1 raises KeyError on them.
inn_info['kt_eq_dt'] = inn_info.index.isin(inn_kt_eq_dt).astype('int64')

# Weekly features: all "send" columns first, then all "get" columns.
for week_frame in week_range_send + week_range_get:
    inn_info = inn_info.join(week_frame)

# A NaN after the joins means the INN had no such payments, so 0 is the right value.
inn_info = inn_info.fillna(0)

In [9]:
# Log-transform the count/amount totals: log(1 + x), so that 0 stays 0.
# np.log1p is applied to whole columns at once (no per-element Python lambda)
# and is more accurate than np.log(x + 1) for small x.
log_columns = ['count_send', 'count_get', 'sum_get', 'sum_send']
inn_info[log_columns] = np.log1p(inn_info[log_columns])

In [10]:
# Split by label availability: okved2 == -1 marks rows without a known label (test),
# everything else is the training set evaluated with cross-validation.
# The is_public column is removed from both parts.
features = inn_info.drop(columns=['is_public'])
has_label = features['okved2'] != -1
train = features[has_label]
test = features[~has_label]

In [11]:
# One-hot encode `region`; this encoding gave the best results compared with
# some of the alternatives from category_encoders.
# Categories not seen during fit are encoded as all zeros (handle_unknown='ignore').
try:
    # scikit-learn >= 1.2 spelling; the old `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only accept `sparse`.
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

# Train: fit the encoder and append the region indicators to the numeric features.
region_train_ohe = ohe.fit_transform(train['region'].values.reshape(-1, 1))
X_train = train.drop(['okved2', 'region'], axis=1)
X_train_ohe = np.hstack([X_train.values, region_train_ohe])
y_train = train['okved2']

# Test: reuse the fitted encoder; stack plain arrays, same as on the train side.
region_test_ohe = ohe.transform(test['region'].values.reshape(-1, 1))
X_test = test.drop(['okved2', 'region'], axis=1)
X_test_ohe = np.hstack([X_test.values, region_test_ohe])

# Not really needed: the test labels are all -1 by construction of the split.
y_test = test['okved2']

### CV on part of data

In [12]:
# Work on the first `count_of_part` training rows only, to keep the
# cross-validation experiments below fast.
count_of_part = 12000
X_part = X_train_ohe[:count_of_part]
y_part = y_train.iloc[:count_of_part]

In [14]:
# Author's notes: the data are sparse, so linear classifiers are compared
# (MultinomialNB, LinearSVC and LogisticRegression); LogisticRegression did best
# and was kept. Three scalers were tried (StandardScaler, MinMaxScaler,
# RobustScaler) and RobustScaler gave the best quality.
# This cell: one-vs-one LinearSVC on RobustScaler-scaled features, 5-fold stratified CV.
start = datetime.datetime.now()

scaler = RobustScaler()
clf = OneVsOneClassifier(LinearSVC(C=3, random_state=SEED))
pipeline = Pipeline(steps=[('transformer', scaler), ('estimator', clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

scores = cross_val_score(pipeline, X_part, y_part, scoring='f1_weighted', cv=cv, n_jobs=-1)

elapsed = datetime.datetime.now() - start
print('Time taken: {0}'.format(elapsed))
print('f1_weighted score: {0}'.format(scores.mean()))

Time taken: 0:02:07.385493
f1_weighted score: 0.1795438818353138


In [15]:
# LinearSVC with its default one-vs-rest strategy on RobustScaler-scaled features,
# 5-fold stratified CV.
start = datetime.datetime.now()

scaler = RobustScaler()
clf = LinearSVC(C=3, random_state=SEED)
pipeline = Pipeline(steps=[('transformer', scaler), ('estimator', clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

scores = cross_val_score(pipeline, X_part, y_part, scoring='f1_weighted', cv=cv, n_jobs=-1)

elapsed = datetime.datetime.now() - start
print('Time taken: {0}'.format(elapsed))
print('f1_weighted score: {0}'.format(scores.mean()))

Time taken: 0:04:02.868633
f1_weighted score: 0.17609530694174674


In [16]:
# One-vs-one MultinomialNB on MinMaxScaler-scaled features, 5-fold stratified CV.
start = datetime.datetime.now()

scaler = MinMaxScaler()
clf = OneVsOneClassifier(MultinomialNB())
pipeline = Pipeline(steps=[('transformer', scaler), ('estimator', clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

scores = cross_val_score(pipeline, X_part, y_part, scoring='f1_weighted', cv=cv, n_jobs=-1)

elapsed = datetime.datetime.now() - start
print('Time taken: {0}'.format(elapsed))
print('f1_weighted score: {0}'.format(scores.mean()))

Time taken: 0:00:47.783284
f1_weighted score: 0.15112802060361874


In [17]:
# One-vs-rest MultinomialNB on MinMaxScaler-scaled features, 5-fold stratified CV.
start = datetime.datetime.now()

scaler = MinMaxScaler()
clf = OneVsRestClassifier(MultinomialNB())
pipeline = Pipeline(steps=[('transformer', scaler), ('estimator', clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

scores = cross_val_score(pipeline, X_part, y_part, scoring='f1_weighted', cv=cv, n_jobs=-1)

elapsed = datetime.datetime.now() - start
print('Time taken: {0}'.format(elapsed))
print('f1_weighted score: {0}'.format(scores.mean()))

Time taken: 0:00:02.725562
f1_weighted score: 0.1525948173821889


In [18]:
# LogisticRegression (C=1) on RobustScaler-scaled features, 5-fold stratified CV.
start = datetime.datetime.now()

scaler = RobustScaler()
clf = LogisticRegression(C=1, random_state=SEED)
pipeline = Pipeline(steps=[('transformer', scaler), ('estimator', clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

scores = cross_val_score(pipeline, X_part, y_part, scoring='f1_weighted', cv=cv, n_jobs=-1)

elapsed = datetime.datetime.now() - start
print('Time taken: {0}'.format(elapsed))
print('f1_weighted score: {0}'.format(scores.mean()))

Time taken: 0:01:23.466534
f1_weighted score: 0.18223923133943126


In [22]:
# LogisticRegression (C=1) on MinMaxScaler-scaled features, 5-fold stratified CV.
start = datetime.datetime.now()

scaler = MinMaxScaler()
clf = LogisticRegression(C=1, random_state=SEED)
pipeline = Pipeline(steps=[('transformer', scaler), ('estimator', clf)])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

scores = cross_val_score(pipeline, X_part, y_part, scoring='f1_weighted', cv=cv, n_jobs=-1)

elapsed = datetime.datetime.now() - start
print('Time taken: {0}'.format(elapsed))
print('f1_weighted score: {0}'.format(scores.mean()))

Time taken: 0:00:24.331267
f1_weighted score: 0.1742611638997646


### Fit model on all data

baseline f1_weighted = 0.12

In [23]:
# Fit the scaler on the full training matrix only, then apply that same fitted
# transform to both the training and the test matrices.
scaler = RobustScaler().fit(X_train_ohe)
X_train_scaled = scaler.transform(X_train_ohe)
X_test_scaled = scaler.transform(X_test_ohe)

In [None]:
# Train the final LogisticRegression (lbfgs, up to 500 iterations) on all labelled
# rows and report the wall-clock time of the fit.
start = datetime.datetime.now()
clf = LogisticRegression(solver='lbfgs', max_iter=500, n_jobs=-1).fit(X_train_scaled, y_train)
elapsed = datetime.datetime.now() - start
print('Time taken: {0}'.format(elapsed))

In [None]:
# Weighted F1 on the training rows themselves (the same data the model was fitted on).
train_pred = clf.predict(X_train_scaled)
f1_score(y_train, train_pred, average='weighted')
# recorded result: 0.17701037830683974

In [None]:
# Predict okved2 for the unlabelled rows and write the answers as (hash_inn, y) pairs.
y_pred = clf.predict(X_test_scaled)
submission = pd.DataFrame({'hash_inn': X_test.index, 'y': y_pred})
submission.to_csv('okved_data.csv', index=False)
# f1_weighted on the hidden test set = 0.18 (reported back after the answers were submitted)