In [51]:
import os
import pickle
import pandas as pd
from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [86]:
DATA_PATH = "../data"
DICTS_PATH = "../dicts"
DATA_FILENAME = "data.csv"
data_file = os.path.join(DATA_PATH, DATA_FILENAME)
dict_60000_path = os.path.join(DICTS_PATH, "0_30000.pickle")
dict_30000_path = os.path.join(DICTS_PATH, "0_60000.pickle")
dict_90000_120000_path = os.path.join(DICTS_PATH, "0_90000_120000.pickle")
dict_120000_160000_path = os.path.join(DICTS_PATH, "0_120000_160000.pickle")

In [89]:
iter_csv = pd.read_csv(data_file, encoding="utf-16", sep='\t', iterator=True, chunksize=1000000)
df = pd.concat([chunk[(chunk['Закупки по'] == '44-ФЗ') 
                & (chunk['Этап закупки']=='Определение поставщика завершено')
                & (chunk["Дата обновления"].str.split('.', expand=True).iloc[:, 2].astype("int").isin([2019, 2020, 2021]))] 
                for chunk in tqdm(iter_csv)])

11it [04:53, 26.69s/it]


In [198]:
df.shape

(143123, 27)

In [92]:
with open(dict_90000_120000_path, 'rb') as handle:
    d_90_120 = pickle.load(handle)
with open(dict_30000_path, 'rb') as handle:
    d_30 = pickle.load(handle)
with open(dict_60000_path, 'rb') as handle:
    d_60 = pickle.load(handle)
with open(dict_120000_160000_path, 'rb') as handle:
    d_120_160 = pickle.load(handle)

In [93]:
def convert(p):
    if p in d_30:
        return d_30[p]
    else:
        if p in d_60:
            return d_60[p]
        else:
            if p in d_90_120:
                return d_90_120[p]
            else:
                if p in d_120_160:
                    return d_120_160[p]
                else: return 0

In [94]:
df['purch'] = df['Реестровый номер закупки'].apply(lambda x: x[1:])
df["winer"] = df["purch"].apply(lambda p: convert(p))
df = df[df.winer != 0]
inns = df.groupby("winer").winer.count()[df.groupby("winer").winer.count() > 15].index
df_val = df[df["winer"].isin(inns)]

In [129]:
df_val["Начальная (максимальная) цена контракта"]

20025        725000.00
20600         44521.74
21040       1087410.00
21042       1292250.00
21117       9163000.00
               ...    
10223102     590579.56
10223107     251387.50
10223110      80000.00
10223111     348400.00
10223137    2282100.00
Name: Начальная (максимальная) цена контракта, Length: 14619, dtype: float64

In [96]:
df_val["Дата обновления"] = pd.to_datetime(df_val["Дата обновления"], format="%d.%m.%Y")
df_val["Год обновления"] = df_val["Дата обновления"].apply(lambda date: date.year)

In [97]:
df_val.to_csv(os.path.join(DATA_PATH, "valid.csv"), encoding="utf-8", sep=',')

In [171]:
count_2019 = lambda x: x[x == 2019].count()
count_2019.__name__ = "Кол-во (2019)"
count_2020 = lambda x: x[x == 2020].count()
count_2020.__name__ = "Кол-во (2020)"
count_2021 = lambda x: x[x == 2021].count()
count_2021.__name__ = "Кол-во (2021)"

stat = df_val.groupby(["winer"]).agg({
    "Год обновления": lambda x: (x == 2019).sum(),
    "Год обновления": lambda x: (x == 2020).sum()})

stat = df_val.groupby(["winer"]).agg(
    count_2019=("Год обновления", lambda x: (x == 2019).sum()),
    count_2020=("Год обновления", lambda x: (x == 2020).sum()),
    count_2021=("Год обновления", lambda x: (x == 2021).sum()),
    amount_2019=("Начальная (максимальная) цена контракта", lambda x: x.sum())
)

stat

Unnamed: 0_level_0,count_2019,count_2020,count_2021,amount_2019
winer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0224013045,16,17,27,3.201123e+07
0253003637,5,5,8,6.514830e+06
0274110679,11,23,24,2.967451e+09
0276136619,4,6,6,9.918901e+06
0323037270,5,8,10,2.765276e+07
...,...,...,...,...
9701146325,4,17,38,1.190218e+07
9715322330,1,6,9,9.235434e+06
9715338098,1,2,13,3.279639e+07
9718043127,5,8,6,3.245186e+06


In [194]:
count_df = df_val.pivot_table("Закупки по", index="winer", columns=["Год обновления"], aggfunc='count')
count_df.rename(columns={2019: "2019 кол-во", 2020: "2020 кол-во", 2021: "2021 кол-во"}, inplace = True)
count_df

Год обновления,2019 кол-во,2020 кол-во,2021 кол-во
winer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0224013045,16,17,27
0253003637,5,5,8
0274110679,11,23,24
0276136619,4,6,6
0323037270,5,8,10
...,...,...,...
9701146325,4,17,38
9715322330,1,6,9
9715338098,1,2,13
9718043127,5,8,6


In [195]:
sum_df = df_val.pivot_table("Начальная (максимальная) цена контракта", index="winer", columns=["Год обновления"], aggfunc='sum')
sum_df.rename(columns={2019: "2019 сумма", 2020: "2020 сумма", 2021: "2021 сумма"}, inplace = True)
sum_df

Год обновления,2019 сумма,2020 сумма,2021 сумма
winer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0224013045,7.883548e+06,1.334229e+07,1.078540e+07
0253003637,1.229531e+06,1.683347e+06,3.601952e+06
0274110679,4.441360e+08,1.165196e+09,1.358119e+09
0276136619,3.006112e+06,3.015865e+06,3.896924e+06
0323037270,1.424084e+07,4.924691e+06,8.487231e+06
...,...,...,...
9701146325,2.861609e+06,2.511585e+06,6.528991e+06
9715322330,2.038473e+05,3.453423e+06,5.578163e+06
9715338098,2.313520e+06,1.530400e+07,1.517887e+07
9718043127,5.376726e+05,1.323274e+06,1.384239e+06


In [197]:
pd.concat((count_df, sum_df), axis=1).to_csv("info.csv")

In [161]:
df_val.groupby(["winer"]).agg({"Год обновления": lambda x: (x == 2019).sum()})

Unnamed: 0_level_0,Год обновления
winer,Unnamed: 1_level_1
0224013045,16
0253003637,5
0274110679,11
0276136619,4
0323037270,5
...,...
9701146325,4
9715322330,1
9715338098,1
9718043127,5


In [152]:
stat.to_csv("stat.csv")

In [142]:
df_val["Год обновления"]

20025       2019
20600       2019
21040       2019
21042       2019
21117       2019
            ... 
10223102    2021
10223107    2021
10223110    2021
10223111    2021
10223137    2021
Name: Год обновления, Length: 14619, dtype: int64

In [100]:
valid_inns = []
for inn in inns:
    inn_stat = stat[inn]
    if inn_stat.shape[0] != 3:
        continue
    f = True
    for year in inn_stat.index[1:]:
        if (inn_stat[year] / inn_stat[year - 1]) < 0.75:
            f = False
            break
    if f: valid_inns.append(inn)

In [119]:
df_val = df_val[df_val.winer.isin(valid_inns)]

In [101]:
l = {'0224013045',
 '0323037270',
 '0326040626',
 '1101107584',
 '1101145043',
 '1328013394',
 '1435090876',
 '1649036481',
 '1655303424',
 '1656033379',
 '1660240931',
 '1660328953',
 '1901119814',
 '1901120601',
 '2130209293',
 '2221005804',
 '2222798183',
 '2224121968',
 '2225181208',
 '230203093245',
 '2312271360',
 '2460002949',
 '2461019543',
 '2464019742',
 '2465059970',
 '2465091580',
 '2465320039',
 '2466055150',
 '2466169528',
 '2466189073',
 '2466272518',
 '2540055907',
 '2540087761',
 '2540221551',
 '2543079825',
 '2628058630',
 '2635207403',
 '2801252525',
 '2807032170',
 '3025003760',
 '3120099928',
 '3123321867',
 '3308003627',
 '3327110751',
 '3443923035',
 '352532976647',
 '3525357197',
 '3662195283',
 '3662212115',
 '3662992257',
 '366300529071',
 '3702028556',
 '3703022973',
 '3711026548',
 '3808084790',
 '391300326265',
 '420544385832',
 '4209005047',
 '4253003592',
 '434200002901',
 '4345468090',
 '4347029500',
 '4501204468',
 '4632197133',
 '5005066205',
 '5017083029',
 '5032294384',
 '5035031009',
 '5038033276',
 '5041016087',
 '5047166032',
 '5074039026',
 '5249154390',
 '5252042135',
 '5252045070',
 '5260200603',
 '5262258091',
 '5402546938',
 '5404041675',
 '5404250100',
 '5404405629',
 '5405161372',
 '5407211477',
 '5408152658',
 '5408291034',
 '5409241861',
 '5501072090',
 '5506226202',
 '5507204882',
 '5526006170',
 '5611053132',
 '5612066367',
 '561605887857',
 '5720022381',
 '575201465797',
 '5754004670',
 '5902050496',
 '590400018275',
 '5904231307',
 '5908051626',
 '5916020413',
 '6143083623',
 '6150067809',
 '615520705111',
 '6163109990',
 '6165121914',
 '6165181247',
 '6165196363',
 '616606649213',
 '6312125239',
 '6312171877',
 '6314018352',
 '6321070900',
 '6321414100',
 '6448012000',
 '6449078860',
 '6454016806',
 '6455018450',
 '645505818053',
 '6658344555',
 '6658448949',
 '6663071261',
 '6678028357',
 '6685071025',
 '6686059581',
 '6730018851',
 '6732027668',
 '6901046523',
 '6950223535',
 '7017341019',
 '7017350373',
 '7017398456',
 '7017456010',
 '7202251472',
 '7203241685',
 '7204032814',
 '7204112210',
 '7204205993',
 '7325140324',
 '7430018884',
 '7449063094',
 '7449088941',
 '7451344670',
 '7453271523',
 '753600159769',
 '7702019044',
 '7703406695',
 '7704413624',
 '7704543951',
 '7705042179',
 '7705952210',
 '7707049388',
 '7707086510',
 '7709487997',
 '7709753021',
 '7710316675',
 '7713447235',
 '7714072839',
 '7714939790',
 '7715768700',
 '7716017505',
 '7716644263',
 '7716814525',
 '7716874274',
 '7717764683',
 '7718599175',
 '7719481754',
 '7722715321',
 '7723322154',
 '7723378990',
 '7725285067',
 '7725375730',
 '7726362290',
 '7726414773',
 '7726448684',
 '7727173908',
 '7729079361',
 '7729418511',
 '7730702372',
 '7731329393',
 '7731416945',
 '7733865216',
 '7740000076',
 '7751154455',
 '7801557131',
 '7802411953',
 '7802744800',
 '7802829934',
 '7804372516',
 '7805074390',
 '7806005744',
 '781018594156',
 '7813257774',
 '7816530372',
 '7816598363',
 '7842032486',
 '8601023568',
 '8602212536',
 '860604211188',
 '8617035150',
 '9701120430',
 '9701146325',
 '9731038990'}

In [123]:
len(l)

203

In [124]:
set(valid_inns).difference(l).__len__()

157

In [125]:
len(valid_inns)

330