### Creating features and preparing dataset for fitting

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler

In [3]:
pays = pd.read_csv('pays.csv')
pays = pays[pays['sum'].notna()]
pays = pays[pays['hash_inn_kt'] != pays['hash_inn_dt']]
pays = pays[pays['sum'] > 0]
# pays = pays[pays['sum'] < 100]

scaler = StandardScaler()
scaling_sum = scaler.fit_transform(pays['sum'].values.reshape(-1, 1))
scaling_sum = list(map(lambda x: x[0], scaling_sum))
pays['sum'] = scaling_sum
pays

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum
10,0,7212,3,1,-0.033793
11,0,7212,8,1,-0.054777
12,0,7212,11,1,-0.049934
13,0,7212,13,2,-0.049868
14,0,7212,14,1,0.231462
...,...,...,...,...,...
5430144,260511,77935,2,2,-0.047402
5430145,260511,77935,5,2,0.031852
5430146,260511,77935,12,2,0.041759
5430147,260511,77935,14,2,-0.065234


In [4]:
inn = pd.read_csv('inn_info_public.csv')
inn['hash_inn_kt'] = inn['hash_inn']
inn['hash_inn_dt'] = inn['hash_inn']
inn


Unnamed: 0,hash_inn,okved2,region,is_public,hash_inn_kt,hash_inn_dt
0,61058,34,86,True,61058,61058
1,8311,18,86,True,8311,8311
2,130273,-1,86,False,130273,130273
3,64081,43,86,True,64081,64081
4,218005,12,86,True,218005,218005
...,...,...,...,...,...,...
240064,139170,4,12,True,139170,139170
240065,13553,12,12,True,13553,13553
240066,230402,12,12,True,230402,230402
240067,170104,63,12,True,170104,170104


In [5]:
pays_merged_kt = pd.merge(pays, inn[['hash_inn_dt', 'region']], on='hash_inn_dt', how='inner')
pays_merged_kt

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum,region
0,0,7212,3,1,-0.033793,84
1,0,7212,8,1,-0.054777,84
2,0,7212,11,1,-0.049934,84
3,0,7212,13,2,-0.049868,84
4,0,7212,14,1,0.231462,84
...,...,...,...,...,...,...
5114061,260451,222207,21,1,-0.066412,82
5114062,260458,96970,11,1,-0.066975,17
5114063,260487,143637,11,3,3.710846,83
5114064,260503,30189,16,3,-0.063609,60


In [6]:
features_kt = pays_merged_kt.drop(['hash_inn_dt'], axis=1).pivot_table(
    index=['hash_inn_kt'], columns='region', values=['count', 'sum', 'week'], 
    aggfunc={'count': 'sum', 'sum': 'sum', 'week': np.std}).fillna(0)
features_kt.index.name = 'hash_inn'
features_kt

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count,...,week,week,week,week,week,week,week,week,week,week
region,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
hash_inn,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,5.886991,4.242641,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,3.505098,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,6.808807,0.0,0.0,0.0,0.0,0.0
260507,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,6.713171,0.0,0.0,0.0,0.0,0.0
260509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.852289,4.732864,0.0,0.0,0.0,0.0,0.0
260510,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.725816,0.000000,0.0,0.0,0.0,0.0,0.0


In [7]:
pays_merged_dt = pd.merge(pays, inn[['hash_inn_kt', 'region']], on='hash_inn_kt', how='inner')
pays_merged_dt

Unnamed: 0,hash_inn_kt,hash_inn_dt,week,count,sum,region
0,0,7212,3,1,-0.033793,84
1,0,7212,8,1,-0.054777,84
2,0,7212,11,1,-0.049934,84
3,0,7212,13,2,-0.049868,84
4,0,7212,14,1,0.231462,84
...,...,...,...,...,...,...
5114061,260511,77935,2,2,-0.047402,84
5114062,260511,77935,5,2,0.031852,84
5114063,260511,77935,12,2,0.041759,84
5114064,260511,77935,14,2,-0.065234,84


In [8]:
features_dt = pays_merged_dt.drop(['hash_inn_kt'], axis=1).pivot_table(
    index=['hash_inn_dt'], columns='region', values=['count', 'sum', 'week'], 
    aggfunc={'count': 'sum', 'sum': 'sum', 'week': np.std}).fillna(0)
features_dt.index.name = 'hash_inn'
features_dt

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count,...,week,week,week,week,week,week,week,week,week,week
region,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
hash_inn,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.0,1.0,0.0,0.0,0.0,5.0,3.0,9.0,0.0,0.0,...,0.0,0.0,0.0,5.772733,0.000000,5.322906,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,6.107103,0.000000,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.391959,0.000000,0.000000,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,6.934215,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.062019,0.000000,0.000000,0.0,0.0,0.0,0.0
260512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,3.976119,0.000000,0.0,0.0,0.0,0.0
260513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
260514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [9]:
features = pd.concat([features_kt, features_dt], axis=1, sort=False)
features

Unnamed: 0_level_0,count,count,count,count,count,count,count,count,count,count,...,week,week,week,week,week,week,week,week,week,week
region,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
hash_inn,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,5.772733,0.000000,5.322906,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.062019,0.000000,0.000000,0.0,0.0,0.0,0.0
260512,,,,,,,,,,,...,0.0,0.0,0.0,0.000000,3.976119,0.000000,0.0,0.0,0.0,0.0
260513,,,,,,,,,,,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0
260514,,,,,,,,,,,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0


In [10]:
data = features.merge(inn[['hash_inn', 'okved2']].set_index('hash_inn'), how='outer', left_index=True, right_index=True)
data = data.fillna(0)
data



Unnamed: 0_level_0,"(count, 0)","(count, 1)","(count, 2)","(count, 3)","(count, 4)","(count, 5)","(count, 6)","(count, 7)","(count, 8)","(count, 9)",...,"(week, 82)","(week, 83)","(week, 84)","(week, 85)","(week, 86)","(week, 87)","(week, 88)","(week, 89)","(week, 90)",okved2
hash_inn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,5.772733,0.000000,5.322906,0.0,0.0,0.0,0.0,-1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,62
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,-1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,29
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260511,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.062019,0.000000,0.000000,0.0,0.0,0.0,0.0,12
260512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,3.976119,0.000000,0.0,0.0,0.0,0.0,34
260513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,12
260514,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,-1


In [11]:
data.to_csv('new_data.csv')