In [17]:
import numpy as np
import pandas as pd
from collections import Counter
from pathlib import Path
import json
from datetime import datetime

# Timestamp taken once when this cell runs, rendered as twelve digits
# (two-digit year, month, day, hour, minute, second); the output CSV names
# built further down embed it.
update_date = f"{datetime.now():%y%m%d%H%M%S}"

In [18]:
# Folder holding every input and output file of this notebook, relative to
# the notebook's working directory.
dataset_dir = Path("../datasets/")

# Snapshot of the HACCP dataset that the exploration cells below read.
haccp_dataset_path = dataset_dir.joinpath("haccp_dataset_200619142700.csv")
df = pd.read_csv(filepath_or_buffer=haccp_dataset_path)

In [19]:
# How often each product kind occurs, plus the distinct kinds themselves.
prdkind_column = df['prdkind']
kind_counter = Counter(prdkind_column)
kind_set = set(prdkind_column)

# (kind, count) pairs, most frequent first.
sorted_kind = kind_counter.most_common()

# For every distinct kind, the set of 'allergy' values seen on its rows,
# stored as [kind, {allergy, ...}] pairs.
prd_all = [
    [kind_name, set(df.loc[df['prdkind'] == kind_name, 'allergy'])]
    for kind_name in kind_set
]

In [20]:
# Preview up to 20 rows whose 'prdkind' equals '분쇄 가공육', showing only the
# 'prdlstnm', 'rawmtrl' and 'allergy' columns.
df[df['prdkind'] == '분쇄 가공육'][['prdlstnm', 'rawmtrl', 'allergy']].head(20)

Unnamed: 0,prdlstnm,rawmtrl,allergy
1701,마이산정기담은흑돼지수제치즈돈까스,"돈육(국내산50%),치즈20%(수입산),생빵가루17.3%(밀가루-국내산,캐나다산/(...",알수없음
1702,마이산 정기담은 흑돼지 돈까스,"돈육(국내산70%)생빵가루17.3%(밀가루-미국산,캐나다산/팜유-말레이시아산)-옥수...","돼지고기,대두,밀,우유,알류"


In [23]:
def _load_json(json_path):
    """Open ``json_path`` as UTF-8 text and return the parsed JSON content."""
    with open(json_path, encoding='utf-8') as handle:
        return json.load(handle)


# Locations of the six JSON inputs, all inside ``dataset_dir``.
super_prd_path = dataset_dir / "SuperPrd.json"
super_prd_all_path = dataset_dir / "SuperPrdAll.json"
super_prd_veg_path = dataset_dir / "SuperPrdVeg.json"
super_raw_path = dataset_dir / "SuperRaw.json"
super_raw_all_path = dataset_dir / "SuperRawAll.json"
super_raw_veg_path = dataset_dir / "SuperRawVeg.json"

# Read the product-kind files first, then the raw-material files.
super_prd = _load_json(super_prd_path)
super_prd_veg = _load_json(super_prd_veg_path)
super_prd_all = _load_json(super_prd_all_path)
super_raw = _load_json(super_raw_path)
super_raw_veg = _load_json(super_raw_veg_path)
super_raw_all = _load_json(super_raw_all_path)

# Output CSV locations, suffixed with the run timestamp in ``update_date``.
# Despite the ``_dir`` suffix, each of these names a file, not a directory.
prd_all_dst_dir = dataset_dir / f"AllergyPrdKind_{update_date}.csv"
prd_veg_dst_dir = dataset_dir / f"VegPrdKind_{update_date}.csv"
raw_all_dst_dir = dataset_dir / f"AllergyRawMtrl_{update_date}.csv"
raw_veg_dst_dir = dataset_dir / f"VegRawMtrl_{update_date}.csv"

---

In [24]:
# Show the mapping loaded from SuperPrdVeg.json (diet name -> list of
# categories) for inspection.
super_prd_veg

{'vegan': ['돼지고기', '쇠고기', '가금류', '생선', '고기', '난류', '우유'],
 'lactoVeg': ['돼지고기', '쇠고기', '가금류', '생선', '고기', '난류'],
 'ovoVeg': ['돼지고기', '쇠고기', '가금류', '생선', '고기', '우유'],
 'lactoOvoVeg': ['돼지고기', '쇠고기', '가금류', '생선', '고기'],
 'pescoVeg': ['돼지고기', '쇠고기', '가금류'],
 'polloVeg': ['돼지고기', '쇠고기']}

In [25]:
# Show the mapping loaded from SuperRawVeg.json (diet name -> list of
# categories) for inspection.
# NOTE(review): in the recorded output, 'lactoVeg' lists '우유' and 'ovoVeg'
# lists '난류', the reverse of what super_prd_veg shows for the same two
# diets. This looks like a swap in SuperRawVeg.json; verify the file.
super_raw_veg

{'vegan': ['돼지', '쇠고기', '닭', '조개', '게', '새우', '고등어', '오징어', '난류', '우유'],
 'lactoVeg': ['돼지', '쇠고기', '닭', '조개', '게', '새우', '고등어', '오징어', '우유'],
 'ovoVeg': ['돼지', '쇠고기', '닭', '조개', '게', '새우', '고등어', '오징어', '난류'],
 'lactoOvoVeg': ['돼지', '쇠고기', '닭', '조개', '게', '새우', '고등어', '오징어'],
 'pescoVeg': ['돼지', '쇠고기', '닭'],
 'polloVeg': ['돼지', '쇠고기']}

In [26]:
# Hand-written flag rows, one per super category. Each row is spelled as a
# six-character bit string and expanded into a fresh list of ints.
# Positions follow the diet column order used by the veg CSV exports:
#   vegan, lactoVeg, ovoVeg, lactoOvoVeg, pescoVeg, polloVeg
# A 1 presumably means the diet excludes that category (for product kinds it
# lines up with the category lists shown for super_prd_veg) - TODO confirm.

# Product-kind super categories.
veg_prd_sup = {
    category: [int(bit) for bit in bits]
    for category, bits in (
        ('돼지고기', '111111'),
        ('쇠고기', '111111'),
        ('가금류', '111110'),
        ('생선', '111100'),
        ('고기', '111100'),
        ('난류', '110000'),
        ('우유', '101000'),
    )
}

# Raw-material super categories.
veg_raw_sup = {
    category: [int(bit) for bit in bits]
    for category, bits in (
        ('돼지', '111111'),
        ('쇠고기', '111111'),
        ('닭', '111110'),
        ('조개', '111100'),
        ('게', '111100'),
        ('새우', '111100'),
        ('고등어', '111100'),
        ('오징어', '111100'),
        ('고기', '111100'),
        ('난류', '110000'),
        ('우유', '101000'),
    )
}

In [27]:
# One row per product kind: the kind followed by the six diet flags of its
# super category. Super categories with no entry in veg_prd_sup get all zeros.
final_sup_prd_veg = [
    [kind] + veg_prd_sup.get(category, [0, 0, 0, 0, 0, 0])
    for category, kinds in super_prd.items()
    for kind in kinds
]

In [28]:
# Header of the product-kind vegetarian table: the kind name followed by one
# flag column per diet.
prd_veg_cols = [
    "prdKind",
    "vegan",
    "lactoVeg",
    "ovoVeg",
    "lactoOvoVeg",
    "pescoVeg",
    "polloVeg",
]
final_df = pd.DataFrame(data=final_sup_prd_veg, columns=prd_veg_cols)

# "utf-8-sig" prepends a UTF-8 BOM to the file; the row index is not written.
final_df.to_csv(path_or_buf=prd_veg_dst_dir, index=False, encoding="utf-8-sig")

In [29]:
# One row per raw material: the material followed by the six diet flags of
# its super category. Only element 0 of each super_raw value (the iterable of
# raw-material names) is used here. Super categories with no entry in
# veg_raw_sup get all zeros.
final_sup_raw_veg = [
    [material] + veg_raw_sup.get(category, [0, 0, 0, 0, 0, 0])
    for category, entry in super_raw.items()
    for material in entry[0]
]

In [30]:
# Header of the raw-material vegetarian table: the material name followed by
# one flag column per diet.
raw_veg_cols = [
    "RawKind",
    "vegan",
    "lactoVeg",
    "ovoVeg",
    "lactoOvoVeg",
    "pescoVeg",
    "polloVeg",
]
final_df = pd.DataFrame(data=final_sup_raw_veg, columns=raw_veg_cols)

# "utf-8-sig" prepends a UTF-8 BOM to the file; the row index is not written.
final_df.to_csv(path_or_buf=raw_veg_dst_dir, index=False, encoding="utf-8-sig")

---

In [31]:
# One row per product kind: [kind, allergy, 0, 'null', 'null'].
# The allergy field is the super category name when that name is a key of
# super_prd_all, otherwise the literal string 'null'. The last three fields
# are constant for product kinds.
final_sup_prd_all = [
    [kind, category if category in super_prd_all else 'null', 0, 'null', 'null']
    for category, kinds in super_prd.items()
    for kind in kinds
]

In [33]:
# Header of the product-kind allergy table.
# NOTE(review): "probablility" is misspelled, but it is the column name
# written to the CSV; renaming it would change the output schema, so confirm
# with the consumers of this file before fixing it.
prd_all_cols = [
    "prdKind",
    "allergy",
    "isCrossReact",
    "parentAllergy",
    "probablility",
]
final_df = pd.DataFrame(data=final_sup_prd_all, columns=prd_all_cols)

# "utf-8-sig" prepends a UTF-8 BOM to the file; the row index is not written.
final_df.to_csv(path_or_buf=prd_all_dst_dir, index=False, encoding="utf-8-sig")

In [34]:
# One row per raw material: [material, allergy, isCrossReact, parentAllergy,
# probability]. Each super_raw value is indexed positionally: [0] is the
# iterable of raw materials, [1] a flag compared against 1, and [2] / [3]
# feed the allergy / parent-allergy and probability fields.
# NOTE(review): the isCrossReact field is the int 0 in two branches but the
# string 'true' in the cross-reaction branch, so the CSV column mixes "0"
# and "true". Confirm which encoding the consumer expects.
final_sup_raw_all = []
for category, entry in super_raw.items():
    if category in super_raw_all:
        # The super category is itself a known allergy.
        trailing_fields = [category, 0, 'null', 'null']
    elif entry[1] == 1:
        # Flagged entry: element 2 fills both allergy and parentAllergy.
        trailing_fields = [entry[2], 'true', entry[2], entry[3]]
    else:
        trailing_fields = ['null', 0, 'null', 'null']
    final_sup_raw_all.extend([material] + trailing_fields for material in entry[0])

In [36]:
# Header of the raw-material allergy table.
# NOTE(review): "probablility" is misspelled, but it is the column name
# written to the CSV; renaming it would change the output schema, so confirm
# with the consumers of this file before fixing it.
raw_all_cols = [
    "rawMtrl",
    "allergy",
    "isCrossReact",
    "parentAllergy",
    "probablility",
]
final_df = pd.DataFrame(data=final_sup_raw_all, columns=raw_all_cols)

# "utf-8-sig" prepends a UTF-8 BOM to the file; the row index is not written.
final_df.to_csv(path_or_buf=raw_all_dst_dir, index=False, encoding="utf-8-sig")