# Init

In [1]:
!pip install -qq pysastrawi

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/210.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m204.8/210.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import random
import numpy as np

from collections import Counter
import nltk
from nltk.tokenize import word_tokenize 
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.probability import FreqDist
nltk.download('punkt')

import pandas as pd
import re
import seaborn as sns
import warnings
import string
warnings.filterwarnings("ignore", 'This pattern has match groups')
import matplotlib.pyplot as plt
import matplotlib

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# One shared seed for both random number generators seeded in this notebook.
seed_val = 1906350912
np.random.seed(seed_val)
random.seed(seed_val)

In [4]:
# All label columns; 'Umum' is the last entry.
categories = [
    'Anak', 'Bedah', 'Gigi', 'Gizi', 'Jantung', 'Jiwa', 'Kandungan',
    'Kulit dan Kelamin', 'Mata', 'Paru', 'Penyakit Dalam', 'Saraf', 'THT',
    'Tulang', 'Urologi', 'Umum',
]

# The same labels, in the same order, without 'Umum'.
categories_new = [label for label in categories if label != 'Umum']

In [5]:
# Load both annotated sets, indexed by their 'ID' column; missing cells
# become empty strings. fillna returns a new frame, so no in-place mutation.
df_human = pd.read_csv('Dataset/Human_Annotated.csv', index_col='ID').fillna("")
df_test = pd.read_csv("Dataset/Gold_Standard.csv", index_col="ID").fillna("")

# Keep only gold-standard rows whose non-text columns sum to at most 3, and
# keep the text columns followed by the label columns in `categories` order.
# The per-row sum lives in a local array instead of a temporary 'Count' column.
label_counts = df_test.drop(columns=['JUDUL', 'ISI']).values.sum(axis=1)
df_test = df_test.loc[label_counts <= 3, ["JUDUL", "ISI"] + categories]

# Combined frame used by the keyword analysis below.
df = pd.concat([df_human, df_test])

In [6]:
from sklearn.preprocessing import MultiLabelBinarizer
# Fit a multi-label binarizer on a single sample that contains every label in
# `categories`, so the fitted binarizer knows all 16 classes.
# NOTE(review): no explicit `classes=` is passed, so the class order is chosen
# by the binarizer, not by the order of `categories` — confirm downstream use.
mlb = MultiLabelBinarizer()
# Last expression of the cell: displays the encoded matrix for that one sample.
mlb.fit_transform([categories])

array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])

In [7]:
from nltk.parse.transitionparser import remove
from collections import defaultdict

def lower_text(texts):
    """Return a new list holding the lowercase form of every string in ``texts``."""
    lowered = []
    for text in texts:
        lowered.append(text.lower())
    return lowered

def remove_punc_text(texts):
    """Delete ASCII punctuation and the digits 0-9 from every string.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one string per input. Characters are deleted, not
        replaced, so text on both sides of a removed character is joined.
    """
    # Build the deletion table once; it is the same for every string.
    deletion_table = str.maketrans("", "", string.punctuation + "1234567890")
    return [s.translate(deletion_table) for s in texts]

def strip_text(texts):
    """Return a new list with leading and trailing whitespace removed from each string."""
    stripped = []
    for text in texts:
        stripped.append(text.strip())
    return stripped

def remove_stopword_text(texts):
    """Remove stop words from every string in ``texts``.

    The stop-word list is Sastrawi's default list with 'mata', 'ingat' and
    'orang' taken out (so those words stay in the text) and six extra words
    added ('dok', 'doc', 'dokter', 'terima', 'kasih', 'terimakasih').

    Returns:
        A list with one cleaned string per input string.
    """
    base_stopwords = StopWordRemoverFactory().get_stop_words()
    # list.remove raises ValueError if a word is not in the default list.
    for kept_word in ('mata', 'ingat', 'orang'):
        base_stopwords.remove(kept_word)
    extra_stopwords = ['dok', 'doc', 'dokter', 'terima', 'kasih', 'terimakasih']
    remover = StopWordRemover(ArrayDictionary(base_stopwords + extra_stopwords))
    cleaned = []
    for text in texts:
        cleaned.append(remover.remove(text))
    return cleaned

def stemming_text(texts):
    """Run the Sastrawi stemmer over every string and return the results as a list."""
    stemmer = StemmerFactory().create_stemmer()
    stemmed = []
    for text in texts:
        stemmed.append(stemmer.stem(text))
    return stemmed

def tokenize_text(texts):
    """Split every string with NLTK's ``word_tokenize``; returns one token list per string."""
    tokenized = []
    for text in texts:
        tokenized.append(word_tokenize(text))
    return tokenized

def cleaning_text(texts):
    """Apply the full cleaning pipeline to ``texts`` and return a list of strings.

    Steps, in order: lowercase, delete punctuation and digits, strip outer
    whitespace, remove stop words, stem.
    """
    pipeline = (
        lower_text,
        remove_punc_text,
        strip_text,
        remove_stopword_text,
        stemming_text,
    )
    for step in pipeline:
        texts = step(texts)
    return texts

def cleaning_text_tokenize(texts):
    """Clean ``texts`` with ``cleaning_text`` and split each result into tokens.

    Args:
        texts: iterable of strings.

    Returns:
        A list with one token list per input string.
    """
    # Reuse the shared pipeline instead of repeating its five steps here, so
    # the two entry points cannot drift apart.
    return tokenize_text(cleaning_text(texts))

In [8]:
# Build the cleaned text column 'ALL' from the JUDUL and ISI columns joined by
# a space, then drop the two raw text columns.
df = (
    df.assign(ALL=cleaning_text(df['JUDUL'] + ' ' + df['ISI']))
      .drop(columns=['JUDUL', 'ISI'])
)

In [9]:
def is_in(kw, text):
  """Return True when ``kw`` occurs in ``text`` as a run of whole words.

  Both strings are padded with one space, so the keyword has to start and
  end on a space boundary. This applies to multi-word keywords as well:
  previously only single-word keywords were padded, so a phrase such as
  "mata air" also matched inside "kacamata air".

  Args:
    kw: keyword or phrase, words separated by single spaces.
    text: text to search, words separated by single spaces.

  Returns:
    bool
  """
  return (" " + kw + " ") in (" " + text + " ")

def get_kw_weights(kw_dict):
  """Return ``{category: get_fp_weight(category, keywords)}`` for every entry of ``kw_dict``."""
  return {col: get_fp_weight(col, kws) for col, kws in kw_dict.items()}

def get_fp_weight(col, kws):
  """Count, per keyword, the rows labelled ``col`` whose text contains it.

  Reads the module-level frame ``df`` (columns 'ALL' and ``col``).

  Args:
    col: name of a label column in ``df``; a row counts when its value is truthy.
    kws: iterable of keywords tested with ``is_in`` against the 'ALL' column.

  Returns:
    dict mapping keyword -> number of matching labelled rows. Keywords with
    no matching row are left out of the dict.
  """
  # Collect the texts of the labelled rows once, instead of walking the whole
  # frame with iterrows() again for every keyword.
  labelled_texts = [text for text, flag in zip(df['ALL'], df[col]) if flag]
  res = {}
  for kw in kws:
    hits = sum(1 for text in labelled_texts if is_in(kw, text))
    if hits:
      # Accumulate so a keyword listed twice is counted twice, as before.
      res[kw] = res.get(kw, 0) + hits
  return res

def count_kw_fp(kw_dict, weights):
  """Build a column-oriented table of weighted keyword hits for every row of ``df``.

  The result has the keys 'Id' (row index) and 'Text' (the row's 'ALL' value),
  followed by two keys per category of ``kw_dict``: the category name (score
  from ``find_kw_fp``) and ``"KW <category>"`` (list of matched keywords).
  ``weights[category]`` is passed to ``find_kw_fp`` as the keyword weights.
  """
  table = {'Id': [], 'Text': []}
  for col in kw_dict:
    table.update({col: [], f"KW {col}": []})
  for row_id, row in df.iterrows():
    table['Id'].append(row_id)
    table['Text'].append(row['ALL'])
    for col in kw_dict:
      score, matched = find_kw_fp(row, col, kw_dict[col], weights[col])
      table[col].append(score)
      table[f"KW {col}"].append(matched)
  return table
    
def find_kw_fp(row, col, kws, weight):
  """Score one row against the keywords of a category it is labelled with.

  Returns ``(-1, [])`` when ``row[col]`` is falsy. Otherwise returns
  ``(total, matched)``: ``matched`` lists the keywords found in ``row['ALL']``
  (in ``kws`` order) and ``total`` is the sum of ``weight[kw]`` over them.
  """
  if not row[col]:
    return -1, []
  matched = [kw for kw in kws if is_in(kw, row['ALL'])]
  total = 0
  for kw in matched:
    total += weight[kw]
  return total, matched

def count_kw_fn(kw_dict):
  """Build a column-oriented table of keyword hits for every row of ``df``.

  The result has the keys 'Id' (row index) and 'Text' (the row's 'ALL' value),
  followed by two keys per category of ``kw_dict``: the category name (count
  from ``find_kw_fn``) and ``"KW <category>"`` (list of matched keywords).
  """
  table = {'Id': [], 'Text': []}
  for col in kw_dict:
    table.update({col: [], f"KW {col}": []})
  for row_id, row in df.iterrows():
    table['Id'].append(row_id)
    table['Text'].append(row['ALL'])
    for col in kw_dict:
      hit_count, matched = find_kw_fn(row, col, kw_dict[col])
      table[col].append(hit_count)
      table[f"KW {col}"].append(matched)
  return table
    
def find_kw_fn(row, col, kws):
  """Count category keywords in a row that is NOT labelled with the category.

  Returns ``(-1, [])`` when ``row[col]`` is truthy. Otherwise returns
  ``(count, matched)``: ``matched`` lists the keywords found in ``row['ALL']``
  (in ``kws`` order) and ``count`` is its length.
  """
  if row[col]:
    return -1, []
  matched = [kw for kw in kws if is_in(kw, row['ALL'])]
  return len(matched), matched

def print_dict(dictionary):
  """Print every entry of ``dictionary`` on its own line as ``key : value``."""
  for entry in dictionary.items():
    print("{} : {}".format(*entry))

def print_table(lst, cols):
    """Print the items of ``lst`` as a left-aligned grid with ``cols`` columns.

    Every cell is padded to the width of the longest item and cells are
    separated by two spaces. An empty ``lst`` prints nothing.

    Args:
        lst: sequence of items; each is converted with ``str``.
        cols: number of items per printed line.
    """
    if len(lst) == 0:
        # max() below would raise ValueError on an empty sequence.
        return
    cells = [str(word) for word in lst]
    col_width = max(len(cell) for cell in cells)
    for start in range(0, len(cells), cols):
        print("  ".join(cell.ljust(col_width) for cell in cells[start:start + cols]))

In [10]:
def count_kw_in_id(fn):
  """Count how often each item appears across the values of ``fn``.

  The first element of every value is skipped; each remaining element is
  tallied. Returns a dict mapping element -> number of occurrences.
  """
  tally = {}
  for entries in fn.values():
    for doc_id in entries[1:]:
      tally[doc_id] = tally.get(doc_id, 0) + 1
  return tally

def grouping_tuple(lst):
  """Count the pairs in ``lst`` by their second element.

  Every item must unpack into exactly two values; the first is ignored.
  Returns a dict mapping second element -> number of pairs carrying it.
  """
  tally = {}
  for pair in lst:
    _, group = pair
    tally[group] = tally.get(group, 0) + 1
  return tally

In [11]:
# Keyword lists per category: each key is one of the 15 labels in
# `categories_new` (there is no 'Umum' entry) and each value is a list of
# single words and short multi-word phrases that are matched with `is_in`
# against the cleaned 'ALL' text column.
# Some keywords appear under more than one category (e.g. 'sesak' under
# 'Jantung' and 'Paru', 'stroke' under 'Penyakit Dalam' and 'Saraf').
# NOTE(review): the phrases look like stemmed forms, i.e. what `cleaning_text`
# produces — confirm before adding new keywords.
kw_dict = {
  "Anak" : ['anak', 'bayi', 'asi', 'imunisasi', 'anak usia', 'anak anak', 'anak umur', 'bayi minum', 'minum asi', 'minum susu', 'susu formula', 'formula anak', 'bayi usia', 'anak alergi', 'susu sapi', 'asi susu', 'tahun anak', 'anak makan', 'susu anak', 'sapi anak', 'imunisasi anak', 'anak susu', 'usia anak', 'anak lakilaki', 'anak minum', 'formula asi', 'badan anak', 'susu soya', 'anak umur tahun', 'minum asi minum', 'asi minum susu', 'minum susu formula', 'susu formula anak', 'susu formula bayi', 'pilih susu formula', 'susu formula susu', 'bayi alergi susu', 'anak alergi susu', 'alergi susu sapi', 'asi susu formula', 'usia tahun anak', 'anak usia tahun', 'susu sapi anak', 'umur tahun anak', 'anak susu formula', 'anak anak umur', 'bayi minum susu', 'anak minum susu', 'susu formula asi', 'berat badan anak', 'konsumsi susu formula', 'anak konsumsi susu', 'minum susu soya'], 
  "Bedah" : ['wasir', 'perut kanan', 'usus buntu', 'sakit usus', 'gejala usus', 'pasca operasi', 'perut belah', 'operasi usus', 'obat wasir', 'buntu operasi', 'batu empedu', 'bab darah', 'bekas luka', 'luka operasi', 'bekas operasi', 'nyeri perut kanan', 'perut kanan sakit', 'sakit usus buntu', 'usus buntu siang', 'perut kanan nyeri', 'gejala usus buntu', 'sakit perut kanan', 'perut belah kanan', 'operasi usus buntu', 'usus buntu operasi', 'usus buntu sembuh', 'wasir obat wasir', 'luka bekas operasi', 'operasi batu empedu', 'pasca operasi usus'],
  "Gigi" : ['mulut', 'sariawan', 'lidah', 'gigi', 'lubang', 'mutih', 'geraham', 'gusi', 'ngilu', 'karang', 'cabut', 'gigi gigi', 'gigi lubang', 'gigi geraham', 'ngilu ganggu', 'gigi tambal', 'saraf gigi', 'alami sariawan', 'tambal gigi', 'gigi bersih', 'karang gigi', 'gigi ngilu', 'alami ngilu', 'gigi darah', 'gusi bengkak', 'obat gigi', 'pasta gigi', 'cabut gigi', 'sikat gigi', 'derita sariawan', 'sariawan minggu', 'sariawan sembuh', 'gigi sakit', 'gigi kuning', 'mutih gigi', 'gigi warna', 'gigi putih', 'putih gigi', 'kuning gigi', 'gusi darah', 'darah sariawan', 'gigi patah', 'warna gigi', 'bau mulut', 'gigi warna kuning', 'gusi darah sariawan', 'warna kuning gigi'],
  "Gizi" : ['berat', 'kurus', 'makan', 'diet', 'gemuk', 'berat badan', 'badan turun', 'tubuh kurus', 'makan hindar', 'susu kedelai', 'sayur buah', 'badan tinggi', 'porsi makan', 'tinggi badan', 'badan berat', 'badan kurus', 'makan berat', 'milik berat', 'badan ideal', 'kurus berat badan', 'berat badan turun', 'pantang makan derita', 'makan derita asam', 'derita asam lambung', 'tahun berat badan', 'air kelapa muda', 'tinggi berat badan', 'kg berat badan', 'berat badan makan', 'badan berat badan', 'turun berat badan', 'berat badan berat', 'minggu berat badan', 'makan berat badan', 'berat badan susah', 'milik berat badan', 'susah berat badan', 'cepat berat badan', 'berat badan ideal', 'keluh berat badan', 'berat badan milik', 'berat badan selamat', 'badan tinggi badan', 'gimana berat badan', 'normal berat badan', 'berat badan capai', 'badan capai kg'],
  "Jantung" : ['dada', 'sesak', 'jantung', 'denyut', 'ekg', 'detak', 'berdebarberdebar', 'hipertensi', 'bisoprolol', 'hipertiroid', 'aritmia', 'nyeri dada', 'sesak nafas', 'dada sakit', 'jantung debar', 'dada tengah', 'keringat dingin', 'sakit jantung', 'dada kiri', 'serang jantung', 'kiri nyeri', 'alami sesak', 'spesialis jantung', 'jantung normal', 'dada sesak', 'darah tinggi', 'sakit dada', 'dada belah', 'jantung berdebarberdebar', 'jantung jantung', 'tekan darah', 'darah normal', 'detak jantung', 'denyut jantung', 'riwayat sakit', 'debar jantung', 'akibat jantung', 'dosis mghari', 'obatobat an', 'tinggi keluh', 'rekam jantung', 'nyaman dada', 'jantung cepat', 'obat turun', 'turun tekan', 'dada kiri nyeri', 'dada belah kiri', 'tinggi keluh sakit', 'keluh sakit dada'],
  "Jiwa" : ['panik', 'depresi', 'sedih', 'bunuh', 'cemas', 'takut', 'anxiety', 'disorder', 'jiwa', 'psikiater', 'konsentrasi', 'benci', 'panic', 'alami depresi', 'anxiety disorder', 'sulit tidur', 'depresi berat', 'obat depresi', 'ganggu jiwa', 'social anxiety', 'tekan darah', 'detak jantung', 'panic attack', 'efek samping pusing', 'obat minum obat', 'suka banting barang', 'orang mudah kaget', 'disorder fobia sosial'],   
  "Kandungan" : ['hamil', 'haid', 'flek', 'kista', 'rahim', 'indung', 'caesar', 'menstruasi', 'mens', 'kb', 'janin', 'tanda hamil', 'indung telur', 'operasi caesar', 'haid atur', 'usia kandung', 'program hamil', 'hamil minggu', 'siklus haid', 'usia hamil', 'flek hamil', 'badan janin', 'berat janin', 'pasca operasi caesar', 'usia hamil minggu', 'hamil berat badan', 'kista indung telur', 'usia kandung minggu', 'berat badan janin'],
  "Kulit dan Kelamin" : ['jerawat', 'kelamin', 'gatal', 'bintikbintik', 'kulit', 'bintik', 'garuk', 'kutil', 'tahi lalat', 'bintikbintik merah', 'kulit kelupas', 'wajah jerawat', 'muncul bintikbintik', 'kutil kelamin', 'bekas jerawat', 'hilang tahi lalat', 'tahi lalat hilang'],
  "Mata" : ['mata', 'kacamata', 'belek', 'softlens', 'sakit mata', 'mata air', 'mata minus', 'mata belah', 'kiri minus', 'mata kiri', 'mata anak', 'ubah warna', 'minus silinder', 'silinder minus', 'pakai kacamata', 'minus mata', 'air belek', 'mata normal', 'tetes mata', 'merah air', 'obat mata', 'mata silinder', 'kanan silinder', 'sembuh mata', 'mata merah', 'silinder sembuh', 'mata kanan', 'silinder mata', 'mata ubah', 'ganggu mata', 'merah belek', 'buta warna', 'mata belek', 'belek selamat', 'belek mata', 'minus silindris', 'silinder pakai', 'tangan mata', 'periksa mata', 'pakai softlens', 'softlens minus', 'mata minus silinder', 'silinder minus silinder', 'minus silinder minus', 'obat tetes mata', 'obat mata silinder', 'mata silinder adik', 'mata silinder sembuh', 'minus silinder mata', 'silinder mata silinder', 'mata merah belek', 'mata ubah warna', 'mata anak belek', 'mata minus silindris', 'minus silinder pakai', 'bangun tidur mata', 'merah bangun tidur', 'pakai softlens minus', 'mata merah air', 'merah air belek', 'mata silinder mata', 'belek bangun tidur'],
  "Paru" : ['rokok', 'tenggorok', 'sesak', 'asap', 'batuk', 'dahak', 'bronkitis', 'flu', 'ingus', 'asma', 'debu', 'pilek', 'paru', 'tbc', 'fdc', 'tb', 'fibrosis', 'sesak nafas', 'flu batuk', 'batuk flu', 'asma kambuh', 'hidung sumbat', 'batuk darah', 'obat tbc'],
  "Penyakit Dalam" : ['diabetes', 'stroke', 'sakit ulu', 'ulu hati', 'hati sakit', 'nyeri ulu', 'sakit ulu hati', 'ulu hati sakit', 'nyeri ulu hati', 'sakit asam lambung'],
  "Saraf" : ['otak', 'kepala', 'leher', 'bentur', 'syaraf', 'stroke', 'sakit kepala', 'kepala sakit', 'alami sakit', 'sakit tulang', 'kepala pusing', 'ct scan', 'kepala belah', 'sakit pinggang', 'kepala bentur', 'sakit leher', 'tulang ekor', 'bentur kepala', 'cedera kepala', 'stroke ringan', 'nyeri tulang', 'alami sakit kepala', 'sakit tulang punggung'],
  "THT" : ['tenggorok', 'mimisan', 'flu', 'ingus', 'telinga', 'tht', 'pilek', 'alami mimisan', 'telinga belah', 'pusing mual', 'dengung telinga', 'telinga kiri', 'dengar telinga', 'telinga kanan', 'hidung sumbat', 'darah hidung', 'gendang telinga', 'telinga air', 'kotor telinga', 'telinga dengung', 'telinga sakit', 'tetes telinga', 'sakit telinga', 'telinga belah kanan', 'dengung telinga belah', 'hidung belah kanan', 'gendang telinga pecah', 'telinga belah kiri', 'mimisan bangun tidur', 'obat tetes telinga', 'hidung belah kiri', 'belah kiri dengung', 'telinga dengung telinga', 'dengung telinga dengung'],
  "Tulang" : ['bengkak', 'tulang', 'patah', 'urut', 'retak', 'sendi', 'ligamen', 'sakit tulang', 'sakit nyeri', 'patah tulang', 'tulang kering', 'telapak kaki', 'kaki kiri', 'tulang ekor', 'tulang patah', 'alami patah', 'jalan normal', 'lutut kanan', 'gelang kaki', 'kering kanan', 'nyeri tulang', 'tulang dada', 'cedera lutut', 'lutut kiri', 'tulang sendi', 'tulang nyeri', 'alami patah tulang', 'tulang kering kanan', 'kering kanan bentur'], 
  "Urologi" : ['rahim', 'ginjal', 'bak', 'kemih', 'urin', 'kencing', 'urologi', 'testis', 'prostat', 'kateter', 'beser', 'perut sakit', 'buang air', 'salur kemih', 'kencing darah', 'selang ginjal', 'usg ginjal', 'bedah urologi', 'pasang selang', 'batu ginjal', 'operasi batu', 'kandung kemih', 'retensi urin', 'ginjal operasi', 'perut belah kiri', 'operasi batu ginjal', 'batu ginjal operasi']
}

In [12]:
# Uncleaned copy of the combined data: 'ALL' holds the JUDUL and ISI columns
# joined with ' [SEP] '.
df2 = pd.concat([df_human, df_test]).assign(
    ALL=lambda frame: frame['JUDUL'] + ' [SEP] ' + frame['ISI']
)

In [13]:
# Keyword hits in rows that are NOT labelled with the category: per category
# the count column is -1 when the row carries the label, otherwise the number
# of that category's keywords found in the cleaned text (see find_kw_fn).
data = count_kw_fn(kw_dict)
dfn = pd.DataFrame(data).set_index('Id')
# Replace the cleaned text with the uncleaned "JUDUL [SEP] ISI" text from df2.
dfn["Text"] = df2.loc[list(dfn.index)]["ALL"]
# Export the rows of each source set separately; list(...) is kept so the
# exported index keeps dfn's own index name.
dfn.loc[list(df_human.index)].to_csv("false_neg_human.csv")
dfn.loc[list(df_test.index)].to_csv("false_neg_test.csv")
# Last expression of the cell: preview of the first three rows.
dfn.head(3)

Unnamed: 0_level_0,Text,Anak,KW Anak,Bedah,KW Bedah,Gigi,KW Gigi,Gizi,KW Gizi,Jantung,...,Penyakit Dalam,KW Penyakit Dalam,Saraf,KW Saraf,THT,KW THT,Tulang,KW Tulang,Urologi,KW Urologi
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DS-1,mengapa keringat badan sangat berlebihan ? [SE...,0,[],0,[],0,[],0,[],0,...,0,[],0,[],0,[],0,[],0,[]
DS-23,suka menghayal dan berhalusinasi [SEP] selamat...,0,[],0,[],0,[],0,[],0,...,0,[],0,[],0,[],0,[],0,[]
DS-87,feses warna kuning di sertai darah apakah ada ...,0,[],-1,[],0,[],0,[],0,...,-1,[],0,[],0,[],0,[],0,[]


In [14]:
# Per category, weight each keyword by the number of labelled rows containing
# it (see get_fp_weight).
weights = get_kw_weights(kw_dict)
# Keyword scores in rows that ARE labelled with the category: per category the
# score column is -1 when the row lacks the label, otherwise the sum of the
# weights of the keywords found in the cleaned text (see find_kw_fp).
data = count_kw_fp(kw_dict, weights)
dfp = pd.DataFrame(data).set_index('Id')
# Replace the cleaned text with the uncleaned "JUDUL [SEP] ISI" text from df2.
dfp["Text"] = df2.loc[list(dfp.index)]["ALL"]
# Export the rows of each source set separately; list(...) is kept so the
# exported index keeps dfp's own index name.
dfp.loc[list(df_human.index)].to_csv("false_pos_human.csv")
dfp.loc[list(df_test.index)].to_csv("false_pos_test.csv")
# Last expression of the cell: preview of the first three rows.
dfp.head(3)

Unnamed: 0_level_0,Text,Anak,KW Anak,Bedah,KW Bedah,Gigi,KW Gigi,Gizi,KW Gizi,Jantung,...,Penyakit Dalam,KW Penyakit Dalam,Saraf,KW Saraf,THT,KW THT,Tulang,KW Tulang,Urologi,KW Urologi
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
DS-1,mengapa keringat badan sangat berlebihan ? [SE...,-1,[],-1,[],-1,[],-1,[],-1,...,-1,[],-1,[],-1,[],-1,[],-1,[]
DS-23,suka menghayal dan berhalusinasi [SEP] selamat...,-1,[],-1,[],-1,[],-1,[],-1,...,-1,[],-1,[],-1,[],-1,[],-1,[]
DS-87,feses warna kuning di sertai darah apakah ada ...,-1,[],0,[],-1,[],-1,[],-1,...,0,[],-1,[],-1,[],-1,[],-1,[]
