In [74]:
import pandas as pd
import pickle
import torch
import os

In [75]:
def _read_pickle(path):
    """Return the object deserialized from the pickle file at ``path``."""
    # NOTE(review): unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Hierarchy lookups between the three category levels. Later cells index them
# by a parent label and test child labels for membership.
LARGE_TO_MEDIUM = _read_pickle('./data/map/large_to_medium.pickle')
LARGE_TO_SMALL = _read_pickle('./data/map/large_to_small.pickle')
MEDIUM_TO_SMALL = _read_pickle('./data/map/medium_to_small.pickle')

# Lookups used to translate predicted class indices back into category labels.
large_num_to_label = _read_pickle('./data/labels/large_num_to_label.pickle')
medium_num_to_label = _read_pickle('./data/labels/medium_num_to_label.pickle')
small_num_to_label = _read_pickle('./data/labels/small_num_to_label.pickle')

In [76]:
# Identifier of the model run whose predictions are post-processed in this notebook.
NAME = 'ml_lstm_3e-5_wr0.1_kfold'
# Directory of that run: the logit pickles are read from it and the
# submission CSV is written back into it.
model_pickle_path = f'./output/{NAME}'

In [77]:
def _load_run_artifact(filename):
    """Unpickle ``filename`` from the ``model_pickle_path`` directory."""
    artifact_path = os.path.join(model_pickle_path, filename)
    with open(artifact_path, 'rb') as handle:
        return pickle.load(handle)


# Per-level model outputs; each is sorted along dim=1 in the next cell.
# Presumably shaped (n_samples, n_classes) - TODO confirm against the producer.
large = _load_run_artifact("soft_large_logit.pickle")
medium = _load_run_artifact("soft_medium_logit.pickle")
small = _load_run_artifact("soft_small_logit.pickle")

In [78]:
# Class indices per row ordered from highest to lowest score. torch.sort
# returns a (values, indices) pair; only the indices are needed downstream.
large_indices = torch.sort(large, dim=1, descending=True).indices
medium_indices = torch.sort(medium, dim=1, descending=True).indices
small_indices = torch.sort(small, dim=1, descending=True).indices


In [79]:
# One row per sample: the top-1 large and medium predictions plus the ten
# best-ranked small predictions (small_1 is the best, small_10 the tenth).
#
# The index tensors are converted to NumPy explicitly. Handing torch tensors
# straight to pandas relies on implicit conversion, which is version-dependent
# and fails for tensors that do not live on the CPU.
df = pd.DataFrame({
    'large': large_indices[:, 0].cpu().numpy(),
    'medium': medium_indices[:, 0].cpu().numpy(),
})
small_top = small_indices[:, :10].cpu().numpy()

# Translate class indices into their category labels.
df['large'] = df['large'].map(large_num_to_label)
df['medium'] = df['medium'].map(medium_num_to_label)
for rank in range(10):
    column = f'small_{rank + 1}'
    df[column] = small_top[:, rank]
    df[column] = df[column].map(small_num_to_label)

In [80]:
# Preview the first rows of the decoded predictions.
df.head()

Unnamed: 0,large,medium,small_1,small_2,small_3,small_4,small_5,small_6,small_7,small_8,small_9,small_10
0,I,56,561,562,472,107,463,471,479,101,106,551
1,G,46,466,465,475,467,259,464,422,461,424,291
2,S,94,949,941,872,969,856,681,478,702,759,901
3,S,95,952,302,452,303,761,424,340,451,729,759
4,I,56,562,561,472,471,463,912,111,902,107,112


In [81]:
from tqdm.auto import tqdm

# Row indices bucketed by how the top-1 predictions of the three levels agree.
# "small matches medium" means small_1 without its last character equals medium;
# "medium under large" means medium is listed in LARGE_TO_MEDIUM[large].
#   full - small matches medium, and medium is under large
#   ms   - small matches medium, but medium is not under large
#   lm   - small does not match medium, but medium is under large
#   ls   - neither of the above holds, but small_1 is in LARGE_TO_SMALL[large]
#   diff - none of the pairs agree
full, lm, ms, ls, diff = [], [], [], [], []

# Iterate over the three columns directly instead of three df.loc[i] row
# lookups per row. The loop variables deliberately avoid the names
# large/medium/small so the logit tensors loaded earlier are not overwritten.
rows = zip(df.index, df['large'], df['medium'], df['small_1'])
for idx, large_label, medium_label, small_label in tqdm(rows, total=len(df)):
    small_matches_medium = small_label[:-1] == medium_label
    medium_under_large = medium_label in LARGE_TO_MEDIUM[large_label]

    if small_matches_medium:
        if medium_under_large:
            full.append(idx)
        else:
            ms.append(idx)
    elif medium_under_large:
        lm.append(idx)
    elif small_label in LARGE_TO_SMALL[large_label]:
        ls.append(idx)
    else:
        diff.append(idx)

# Bucket sizes followed by their total, which should equal len(df).
len(full), len(lm), len(ls), len(ms), len(diff), len(full)+len(lm)+len(ls)+len(ms)+len(diff)
        

100%|██████████| 100000/100000 [00:09<00:00, 10688.63it/s]


(98389, 1108, 158, 320, 25, 100000)

In [82]:
# For every row in `lm` (top-1 small disagrees with medium, medium agrees with
# large), record the rank of the first small candidate whose prefix (label
# without its last character) equals the medium label. Rows with no such
# candidate among the ten are counted under 'etc'.
correct = {str(rank): 0 for rank in range(1, 11)}
correct['etc'] = 0

small_columns = [f'small_{rank}' for rank in range(1, 11)]
for idx in tqdm(lm):
    # Fetch the row once. The variable names avoid large/medium/small so the
    # logit tensors loaded earlier are not overwritten.
    row = df.loc[idx]
    for rank, column in enumerate(small_columns, start=1):
        if row[column][:-1] == row['medium']:
            correct[str(rank)] += 1
            break
    else:
        # The loop finished without a break: no candidate matched.
        correct['etc'] += 1
correct
    

100%|██████████| 1108/1108 [00:00<00:00, 10339.69it/s]


{'1': 0,
 '2': 965,
 '3': 113,
 '4': 18,
 '5': 9,
 '6': 3,
 '7': 0,
 '8': 0,
 '9': 0,
 '10': 0,
 'etc': 0}

In [83]:
# Repair rows whose three predicted levels are mutually inconsistent.
#
# Every write uses df.loc[row, column] = value. The chained form
# `df.loc[row].column = value` assigns into an intermediate row object that
# may be a copy, so the change can be silently lost.

# lm: large and medium agree, so replace small_1 with the best-ranked small
# candidate whose prefix equals medium. Stop at the first match; otherwise a
# lower-ranked match would overwrite the better one.
for idx in tqdm(lm):
    row = df.loc[idx]
    for rank in range(10):
        candidate = row[f'small_{rank + 1}']
        if candidate[:-1] == row['medium']:
            df.loc[idx, 'small_1'] = candidate
            break

# ls: large and small agree, so derive medium from small_1
# (the small label without its last character).
for idx in tqdm(ls):
    df.loc[idx, 'medium'] = df.loc[idx, 'small_1'][:-1]

# ms: medium and small agree, so set large to the large category that lists small_1.
for idx in tqdm(ms):
    small_label = df.loc[idx, 'small_1']
    for large_label, small_labels in LARGE_TO_SMALL.items():
        if small_label in small_labels:
            df.loc[idx, 'large'] = large_label
            # Assumes each small label belongs to a single large category - TODO confirm.
            break

100%|██████████| 1108/1108 [00:00<00:00, 2339.09it/s]
100%|██████████| 158/158 [00:00<00:00, 13082.88it/s]
100%|██████████| 320/320 [00:00<00:00, 12512.61it/s]


In [84]:
# Load the test set as the submission template. Its digit_1..digit_3 columns
# are empty here and are filled in below.
# index_col=False keeps pandas from using a CSV column as the row index.
submission = pd.read_csv('./input/test.csv', index_col=False)
submission

Unnamed: 0,AI_id,digit_1,digit_2,digit_3,text_obj,text_mthd,text_deal
0,id_000001,,,,치킨전문점에서,고객의주문에의해,치킨판매
1,id_000002,,,,산업공구,다른 소매업자에게,철물 수공구
2,id_000003,,,,절에서,신도을 대상으로,불교단체운영
3,id_000004,,,,영업장에서,고객요구로,자동차튜닝
4,id_000005,,,,실내포장마차에서,접객시설을 갖추고,"소주,맥주제공"
...,...,...,...,...,...,...,...
99995,id_099996,,,,사업장에서,일반인대상으로,버섯농장
99996,id_099997,,,,한의원에서,외래환자위주고,치료
99997,id_099998,,,,일반점포에서,소비자에게,그림판매
99998,id_099999,,,,사업장에서,일반인.학생대상으로,학습공간제공


In [85]:
# Copy the (repaired) predictions into the submission columns; pandas aligns
# the values on the row index of the two frames.
for target_column, source_column in (
    ('digit_1', 'large'),
    ('digit_2', 'medium'),
    ('digit_3', 'small_1'),
):
    submission[target_column] = df[source_column]
submission

Unnamed: 0,AI_id,digit_1,digit_2,digit_3,text_obj,text_mthd,text_deal
0,id_000001,I,56,561,치킨전문점에서,고객의주문에의해,치킨판매
1,id_000002,G,46,466,산업공구,다른 소매업자에게,철물 수공구
2,id_000003,S,94,949,절에서,신도을 대상으로,불교단체운영
3,id_000004,S,95,952,영업장에서,고객요구로,자동차튜닝
4,id_000005,I,56,562,실내포장마차에서,접객시설을 갖추고,"소주,맥주제공"
...,...,...,...,...,...,...,...
99995,id_099996,G,46,461,사업장에서,일반인대상으로,버섯농장
99996,id_099997,Q,86,862,한의원에서,외래환자위주고,치료
99997,id_099998,G,47,478,일반점포에서,소비자에게,그림판매
99998,id_099999,R,90,902,사업장에서,일반인.학생대상으로,학습공간제공


In [86]:
# Last path component of the run directory (the run name).
# NOTE(review): not used in the visible cells; kept in case later cells read it.
model_name = model_pickle_path.split('/')[-1]
# Write the submission next to the run's other outputs, without the row index.
# The file name is a plain literal: the original f-string had no placeholders.
submission.to_csv(os.path.join(model_pickle_path, 'soft.csv'), index=False)