In [204]:
from collections import Counter
from copy import deepcopy
from difflib import SequenceMatcher
from string import punctuation
import pandas as pd

# Load the raw records from raw.json; the cells below read its "komoditas" and "berat" columns.
df = pd.read_json("raw.json")
# Spot-check: display the rows whose "komoditas" text contains the substring "soto".
df[df["komoditas"].apply(lambda x: "soto" in x)]

Unnamed: 0,komoditas,berat
175,soto ayam babat kikil lele,7 kg
283,"nasi uduk pecel lele sotobayam babat , udang dll",lele 2 sampai 3 kg


In [205]:
def replace_punctuation(s: str):
    """Return ``s`` with every ASCII punctuation character turned into a space.

    Runs of spaces are then squeezed down to a single space and surrounding
    whitespace is trimmed, so the result can be split on ``" "`` directly.
    """
    # Swap all punctuation for spaces in one pass.
    spaced = s.translate(str.maketrans({mark: " " for mark in punctuation}))
    # Dropping the empty pieces between consecutive spaces squeezes each run.
    squeezed = " ".join(chunk for chunk in spaced.split(" ") if chunk)
    return squeezed.strip()

def replace_stopwords(s: str):
    """Remove stop words from ``s`` and return the remaining words.

    Words are compared as whole whitespace-separated tokens, so a word that
    merely starts or ends with a stop word (e.g. "pandan", "dandang") is left
    intact, and a string consisting of a single stop word becomes empty.

    Args:
        s: Text to filter; expected to be already free of punctuation.

    Returns:
        The surviving words joined by single spaces, without leading or
        trailing whitespace.
    """
    stop_words = {"dan", "ikan", "ikn"}
    # split() with no argument also discards empty pieces, so the output
    # never contains doubled spaces.
    return " ".join(word for word in s.split() if word not in stop_words)

def numeric_only(s: str):
    """Strip everything except digits and spaces from ``s``.

    The surviving digit groups are returned separated by single spaces, with
    no leading or trailing space.
    """
    kept = "".join(filter(lambda ch: ch == " " or ch.isdigit(), s))
    # Only digits and spaces remain, so dropping the empty pieces both
    # squeezes repeated spaces and trims the ends.
    return " ".join(group for group in kept.split(" ") if group)

# Normalise both free-text columns: punctuation is replaced first, then stop words are removed.
df["clean_komoditas"] = df["komoditas"].apply(replace_punctuation).apply(replace_stopwords)
df["clean_berat"] = df["berat"].apply(replace_punctuation).apply(replace_stopwords)

# Tokenise the commodity text. A bare split() is used rather than split(" ")
# because "".split(" ") returns [""]: a row whose text cleans down to nothing
# would otherwise carry a phantom empty token instead of an empty list.
df["vector_komoditas"] = df["clean_komoditas"].apply(lambda text: text.split())

# From the weight text keep only the tokens that start with a digit
# (e.g. "6kg" is kept, "rata2" is not) and reduce each one to its digits as an int.
df["vector_berat"] = df["clean_berat"].apply(
    lambda text: [int(numeric_only(token)) for token in text.split() if token[0].isdigit()]
)


In [206]:
# Frequency of every commodity token across all rows. The tokens are streamed
# through a generator instead of concatenating the per-row lists with
# Series.sum(), which copies the growing list on every row and returns 0
# (not a list) for an empty column.
dict_komoditas = dict(
    Counter(token for tokens in df["vector_komoditas"] for token in tokens)
)
komoditas_mapping = {}

series_komoditas = pd.Series(dict_komoditas)
# Only tokens at or above the 75th percentile of frequency may act as a canonical spelling.
quantile_komoditas = series_komoditas.quantile(0.75)

for canonical, canonical_count in dict_komoditas.items():
    # This filter depends only on the outer token, so it is evaluated once here
    # rather than after an expensive SequenceMatcher call for every pair.
    if canonical != "" and canonical_count >= quantile_komoditas:
        for token in dict_komoditas:
            # A token is attached to `canonical` when it contains it as a
            # substring or when the two spellings are at least 75% similar.
            if canonical in token or SequenceMatcher(None, canonical, token).ratio() >= 0.75:
                current = komoditas_mapping.get(token)
                # Replace an earlier candidate only if this one is strictly more frequent.
                if current is None or canonical_count > dict_komoditas[current]:
                    komoditas_mapping[token] = canonical

# Follow the mapping one more hop, so a token that points at an intermediate
# spelling ends up on that spelling's own target. Every target is itself a key
# because each canonical token matches itself in the loop above.
final_komoditas_mapping = {
    token: komoditas_mapping[target] for token, target in komoditas_mapping.items()
}
final_komoditas_mapping

{'lele': 'lele',
 'lelw': 'lele',
 'kele': 'lele',
 'ikanlele': 'lele',
 'bawal': 'bawal',
 'nila': 'nila',
 'nil': 'nila',
 'kakap': 'kakap',
 'kembung': 'kembung',
 'kembug': 'kembung',
 'gembung': 'kembung',
 'tongkol': 'tongkol',
 'tingkol': 'tongkol',
 'tngkol': 'tongkol',
 'salem': 'salem',
 'salam': 'salem',
 'kerapu': 'kerapu',
 'krapu': 'kerapu',
 'kerpu': 'kerapu',
 'mas': 'mas',
 'emas': 'mas',
 'gurame': 'gurame',
 'gurami': 'gurame',
 'patin': 'patin',
 'parin': 'patin',
 'bandeng': 'bandeng',
 'jaer': 'mujaer',
 'mujaer': 'mujaer',
 'mujair': 'mujaer',
 'majaer': 'mujaer',
 'mujir': 'mujaer',
 'muajir': 'mujaer',
 'cumi': 'cumi',
 'udang': 'udang',
 'laut': 'laut',
 'merah': 'merah'}

In [207]:
# Swap every token for its canonical spelling; tokens missing from the mapping become None.
df["vector_komoditas"] = df["vector_komoditas"].apply(
    lambda tokens: list(map(final_komoditas_mapping.get, tokens))
)

In [211]:
# Build one {commodity: weight} dict per row.
komoditas_berat = []
for names, weights in zip(df["vector_komoditas"], df["vector_berat"]):
    if len(names) == len(weights):
        # One weight per commodity: pair them by position.
        # NOTE(review): this assumes the weights are written in the same order
        # as the commodities; the berat text can name them in a different
        # order, so verify before relying on per-commodity totals.
        row_totals = {}
        for name, weight in zip(names, weights):
            # Accumulate instead of dict(zip(...)): when two tokens of a row
            # normalise to the same commodity, the later weight must be added
            # to the earlier one rather than overwrite it.
            row_totals[name] = row_totals.get(name, 0) + weight
        komoditas_berat.append(row_totals)
    elif len(weights) > 0:
        # Counts differ: apply the first listed weight to every commodity of the row.
        komoditas_berat.append({name: weights[0] for name in names})
    # Rows without any parsed weight contribute nothing.

# Sum each commodity over all rows, heaviest first.
agg = pd.DataFrame(komoditas_berat).sum().sort_values(ascending=False).to_dict()

# Tokens that had no canonical mapping are collected under the None key; leave them out of the ranking.
ranked = [(name, total) for name, total in agg.items() if name is not None]
for rank, (name, total) in enumerate(ranked, start=1):
    print(f"{rank}. {name}: {int(total)}kg")

1. lele: 2459kg
2. mas: 1541kg
3. nila: 1049kg
4. tongkol: 946kg
5. mujaer: 487kg
6. bawal: 412kg
7. kakap: 375kg
8. gurame: 353kg
9. kembung: 349kg
10. laut: 115kg
11. kerapu: 105kg
12. patin: 85kg
13. salem: 52kg
14. bandeng: 42kg
15. merah: 36kg
16. udang: 27kg
17. cumi: 20kg


In [209]:
# Sanity check: list the rows whose commodity vector holds no tokens at all.
df.loc[df["vector_komoditas"].apply(len).eq(0)]

Unnamed: 0,komoditas,berat,clean_komoditas,clean_berat,vector_komoditas,vector_berat


In [212]:
# Display the frame, including the derived clean_* and vector_* columns, for a visual check.
df

Unnamed: 0,komoditas,berat,clean_komoditas,clean_berat,vector_komoditas,vector_berat
0,"lele, bawal, nila","lele 6kg, bawal 1kg, nila 1kg",lele bawal nila,lele 6kg bawal 1kg nila 1kg,"[lele, bawal, nila]","[6, 1, 1]"
1,kakap lele kembung tongkol salem,kakap 2kg lele 1kg kembung 1kg tongkol 1kg sal...,kakap lele kembung tongkol salem,kakap 2kg lele 1kg kembung 1kg tongkol 1kg sal...,"[kakap, lele, kembung, tongkol, salem]","[2, 1, 1, 1, 1]"
2,"ikan nila, kakap, lele, bawal, kembung, salam",rata2 1kg kecuali kembung 2kg,nila kakap lele bawal kembung salam,rata2 1kg kecuali kembung 2kg,"[nila, kakap, lele, bawal, kembung, salem]","[1, 2]"
3,nila kembung tongkol lele bawal,nila 1kg bawal 2kg kembung 5kg lele 2kg tongko...,nila kembung tongkol lele bawal,nila 1kg bawal 2kg kembung 5kg lele 2kg tongko...,"[nila, kembung, tongkol, lele, bawal]","[1, 2, 5, 2, 3]"
4,"lele, kerapu, ikan mas, pindang tongkol, gurame",rata2 1kg,lele kerapu mas pindang tongkol gurame,rata2 1kg,"[lele, kerapu, mas, None, tongkol, gurame]",[1]
...,...,...,...,...,...,...
1049,ikan lele,3kg,lele,3kg,[lele],[3]
1050,"mas, mujaer",55,mas mujaer,5 5,"[mas, mujaer]","[5, 5]"
1051,"ikan mas, lele, bawal, mujair",5,mas lele bawal mujair,5,"[mas, lele, bawal, mujaer]",[5]
1052,"tongkol, nila",45,tongkol nila,4 5,"[tongkol, nila]","[4, 5]"
