In [1]:
import pandas as pd
import numpy as np
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows as df_to_row

In [2]:
from szp_funcs import get_fio, create_full_res, print_df, load_groups, gku_inn, path_to_docs, to_double, path_to_data, szp

In [3]:
def create_res(months):
    res = 0
    for i in range(len(months)):
        if i == 0:
            res = load_data(months[i])
        else:
            res = pd.merge(res, load_data(months[i]), how='outer', on=['inn', 'snils'])
    return res

In [4]:
def load_data(file):
    df = pd.read_excel(path_to_data + file + '.xlsx')
    df.stv = df.apply(lambda row: to_double(row, 'stv'), axis=1)
    sample = df[(df.type == 'Основное место работы') | (df.type == 'Внутреннее совместительство')]
    sums = sample.iloc[:, [0,1,2,3]].groupby(['inn', 'snils']).sum().reset_index().groupby('snils').max().reset_index()
    sums = pd.merge(sums, sample[sample.type == 'Основное место работы'], how='left', on=['inn', 'snils'])
    sums = sums[(sums.stv >= 1.0) & (sums.status == 'Работа') & (sums.day == 1) & (sums.status_pref == 'Работа')]
    sums = sums[['inn', 'snils', 'sum_x', 'stv', 'job']].drop_duplicates(['inn', 'snils', 'sum_x'])
    sums = sums.rename(columns = {'inn_y': 'inn', 'sum_x': 'sum_' + file, 'job':'job_' + file, 'stv': 'stv_' + file})
    return sums

In [18]:
inn_of_int = 7729775619#7725031305#9705101759#7731484984#7736668110#7729775619
months = ['jan', 'feb', 'mar']

In [6]:
res = create_res(months)

In [19]:
res[res.inn == inn_of_int]

Unnamed: 0,inn,snils,sum_jan,stv_jan,job_jan,sum_feb,stv_feb,job_feb,sum_mar,stv_mar,job_mar,szp
5,7729775619,001-020-711 41,65993.00,1.0,Воспитатель,60020.00,1.0,Воспитатель,68000.00,1.0,Воспитатель,64671.000000
171,7729775619,001-170-318 67,471700.00,1.0,Заместитель директора,,,,471700.00,1.0,Заместитель директора,471700.000000
728,7729775619,001-266-726 03,154880.40,1.0,Учитель,153002.26,1.0,Учитель,152352.36,1.0,Учитель,153411.673333
1295,7729775619,001-374-498 13,131319.25,1.0,Учитель,133769.25,1.0,Учитель,140306.38,1.0,Учитель,135131.626667
1682,7729775619,001-458-966 32,108000.00,1.0,Педагог-организатор,108000.00,1.0,Педагог-организатор,111200.00,1.0,Педагог-организатор,109066.666667
...,...,...,...,...,...,...,...,...,...,...,...,...
151240,7729775619,189-093-864 31,,,,,,,69900.00,1.0,Педагог-психолог,69900.000000
151317,7729775619,190-560-138 57,,,,,,,68000.00,1.0,Педагог-организатор,68000.000000
151555,7729775619,197-820-396 19,,,,,,,86605.74,1.0,Учитель,86605.740000
151557,7729775619,197-893-551 59,,,,,,,145940.02,1.0,Учитель,145940.020000


In [8]:
res['szp'] = res.apply(lambda row: szp(row, months), axis=1)

In [9]:
cut_off = res[['inn', 'snils', 'szp']].groupby('inn').count().rename(columns={'snils': 'cnt'})
cut_off.cnt = np.ceil(cut_off.cnt / 10)
cut_off = cut_off.to_dict()['cnt']
cut_off

{5003021368: 91.0,
 5003021495: 98.0,
 5003021640: 69.0,
 5003096290: 52.0,
 5030032182: 26.0,
 5030032376: 52.0,
 5051005503: 33.0,
 5051005623: 20.0,
 5051005670: 88.0,
 5074019220: 30.0,
 5074019252: 40.0,
 5074045703: 54.0,
 7701032028: 4.0,
 7701050524: 15.0,
 7701113894: 5.0,
 7701293580: 10.0,
 7701375995: 6.0,
 7701377657: 21.0,
 7701384943: 5.0,
 7701406844: 7.0,
 7701905508: 27.0,
 7702058364: 16.0,
 7702061938: 4.0,
 7702764296: 16.0,
 7702797527: 27.0,
 7702833359: 29.0,
 7703021039: 7.0,
 7703112310: 9.0,
 7703261626: 15.0,
 7703363890: 23.0,
 7703379868: 3.0,
 7703611221: 18.0,
 7703738796: 21.0,
 7703742922: 37.0,
 7703776689: 10.0,
 7703808740: 12.0,
 7704040242: 13.0,
 7704118121: 6.0,
 7704118139: 12.0,
 7704178709: 18.0,
 7704191153: 9.0,
 7704222919: 17.0,
 7704222926: 41.0,
 7704231631: 32.0,
 7704234760: 23.0,
 7704271031: 1.0,
 7704853583: 19.0,
 7705020295: 16.0,
 7705041520: 4.0,
 7705052410: 7.0,
 7705399348: 33.0,
 7705480398: 29.0,
 7705513678: 28.0,
 770551

In [10]:
res[res.inn == inn_of_int][['inn', 'snils', 'szp']].groupby('inn').count().rename(columns={'snils': 'cnt'})

Unnamed: 0_level_0,cnt,szp
inn,Unnamed: 1_level_1,Unnamed: 2_level_1
7725031305,184,184


In [20]:
cut_off[inn_of_int]

38.0

In [12]:
ppl = res[['inn', 'szp', 'snils']].dropna().sort_values('szp').groupby('inn')
keys = ppl.groups.keys()
val_1_7 = pd.DataFrame({'inn': [], 'val': []})

In [13]:
ppl.get_group(inn_of_int)

Unnamed: 0,inn,szp,snils
37130,7725031305,54910.666667,032-214-769 12
147375,7725031305,56056.000000,145-192-347 58
132455,7725031305,57344.000000,016-441-548 28
8330,7725031305,57344.000000,007-097-379 53
6965,7725031305,59010.666667,004-841-024 07
...,...,...,...
114070,7725031305,252344.000000,020-683-947 45
6480,7725031305,273021.333333,004-675-393 48
143587,7725031305,274688.000000,118-291-260 51
121198,7725031305,318129.710000,114-656-488 65


In [14]:
for key in keys:
    top = ppl.get_group(key).tail(int(cut_off[key]))['szp'].mean()
    bot = ppl.get_group(key).head(int(cut_off[key]))['szp'].mean()
    t_df = pd.DataFrame({'inn': [key], 'val': [top / bot]})
    val_1_7 = pd.concat([val_1_7, t_df])

In [21]:
ppl.get_group(inn_of_int).tail(int(cut_off[inn_of_int]))['szp'].mean()

229024.4645614035

In [22]:
ppl.get_group(inn_of_int).head(int(cut_off[inn_of_int]))['szp'].mean()

62004.51078947369

In [None]:
print_df(ppl.get_group(inn_of_int), 'redezil мтк')

In [23]:
val_1_7[val_1_7.inn == inn_of_int]

Unnamed: 0,inn,val
0,7729776000.0,3.693674


In [25]:
print_df(val_1_7, 'redezil all')