# Initialization

## Import

In [1]:
!pip install -qq xgboost
!pip install -qq scikit-multilearn
!pip install -qq pysastrawi

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import re
import random
import seaborn as sns
import string
from collections import Counter
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize 
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.probability import FreqDist
nltk.download('punkt')

from xgboost import XGBClassifier

from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance
from skmultilearn.model_selection import IterativeStratification

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, hamming_loss

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load Data

In [3]:
# Seed Python's `random` module and NumPy's global generator with the same value.
seed_val = 1906350912
np.random.seed(seed_val)
random.seed(seed_val)

# All 16 label columns, including the catch-all 'Umum' class.
categories_umum = [
    'Gigi',
    'Mata',
    'Paru',
    'Jantung',
    'Urologi',
    'Kandungan',
    'Gizi',
    'Tulang',
    'Saraf',
    'Jiwa',
    'THT',
    'Kulit dan Kelamin',
    'Penyakit Dalam',
    'Bedah',
    'Anak',
    'Umum',
]

# The 15 labels without 'Umum'; this order is the column order used when the
# train/test frames are reduced to ['ALL'] + categories.
categories = [
    'Kulit dan Kelamin',
    'Mata',
    'Paru',
    'Gigi',
    'Urologi',
    'Kandungan',
    'Gizi',
    'Jantung',
    'Bedah',
    'Jiwa',
    'Penyakit Dalam',
    'THT',
    'Anak',
    'Tulang',
    'Saraf',
]

In [4]:
def _prepare_frame(df):
    """Reduce an annotated frame to the text column 'ALL' plus the `categories` labels.

    Steps (identical for the train and the test split):
      1. drop the 'Umum' column;
      2. keep only rows whose remaining non-text columns sum to more than 0,
         i.e. rows that still carry at least one label once 'Umum' is gone
         (assumes the label columns are numeric indicators -- TODO confirm);
      3. build 'ALL' as JUDUL + " " + ISI;
      4. return the columns ['ALL'] + categories, in that order.

    The input frame is not modified.
    """
    specific = df.drop(columns=["Umum"])
    label_count = specific.drop(columns=["JUDUL", "ISI"]).sum(axis=1)
    specific = specific[label_count > 0]
    specific = specific.assign(ALL=specific["JUDUL"] + " " + specific["ISI"])
    return specific[["ALL"] + categories]


# Training split: human-annotated data, missing cells replaced by "".
df_train = _prepare_frame(
    pd.read_csv("Dataset/Human_Annotated.csv", index_col="ID").fillna("")
)

# Test split: gold-standard data, missing cells replaced by "".
df_test = pd.read_csv("Dataset/Gold_Standard.csv", index_col="ID").fillna("")
# Discard rows whose non-text columns (including 'Umum') sum to more than 3.
label_total = df_test.drop(columns=["JUDUL", "ISI"]).values.sum(axis=1)
df_test = df_test[label_total <= 3]
# Restrict to the known text/label columns before applying the shared preparation.
df_test = _prepare_frame(df_test[["JUDUL", "ISI"] + categories_umum])

## Functions

In [5]:
from nltk.parse.transitionparser import remove
from collections import defaultdict

def lower_text(texts):
    """Return a new list holding the lower-cased version of every string in `texts`."""
    lowered = []
    for text in texts:
        lowered.append(text.lower())
    return lowered

def remove_punc_text(texts):
    """Delete ASCII punctuation and the digits 0-9 from every string in `texts`.

    Characters are removed, not replaced by a space, so "sakit-kepala"
    becomes "sakitkepala". Returns a new list; `texts` is not modified.
    """
    # Build the deletion table once instead of once per input string.
    table = str.maketrans("", "", string.punctuation + "1234567890")
    return [s.translate(table) for s in texts]

def strip_text(texts):
    """Return a new list with leading/trailing whitespace stripped from each string."""
    return list(text.strip() for text in texts)

def remove_stopword_text(texts):
    """Remove stopwords from every string in `texts` using Sastrawi's StopWordRemover.

    The stopword list is Sastrawi's default list minus 'mata', 'ingat' and
    'orang' (which must survive; 'Mata' is also one of the label names used in
    this notebook), plus a few extra words listed in `ext_stopwords`.

    Returns a new list of cleaned strings.
    """
    keep_words = {'mata', 'ingat', 'orang'}
    ext_stopwords = ['dok', 'doc', 'dokter', 'terima', 'kasih', 'terimakasih', 'sep']

    default_stopwords = StopWordRemoverFactory().get_stop_words()
    # Filter instead of list.remove(): remove() raises ValueError when the word
    # is absent and only drops the first occurrence, so a duplicated entry
    # would silently remain a stopword.
    all_stopwords = [w for w in default_stopwords if w not in keep_words] + ext_stopwords

    remover = StopWordRemover(ArrayDictionary(all_stopwords))
    return [remover.remove(s) for s in texts]

def stemming_text(texts):
    """Stem every string in `texts` with a stemmer created by Sastrawi's StemmerFactory."""
    stem = StemmerFactory().create_stemmer().stem
    return list(map(stem, texts))

def tokenize_text(texts):
    """Apply NLTK's `word_tokenize` to each string and return the list of token lists."""
    return list(map(word_tokenize, texts))

def cleaning_text_stemstop(texts):
    """Full cleaning: lower-case, drop punctuation/digits, strip, remove stopwords, stem."""
    steps = (
        lower_text,
        remove_punc_text,
        strip_text,
        remove_stopword_text,
        stemming_text,
    )
    # Each step consumes the previous step's output.
    for step in steps:
        texts = step(texts)
    return texts

def cleaning_text_stem(texts):
    """Cleaning without stopword removal: lower-case, drop punctuation/digits, strip, stem."""
    normalized = strip_text(remove_punc_text(lower_text(texts)))
    return stemming_text(normalized)

def cleaning_text_raw(texts):
    """Minimal cleaning: lower-case, drop punctuation/digits, strip whitespace."""
    lowered = lower_text(texts)
    without_punc = remove_punc_text(lowered)
    return strip_text(without_punc)

In [6]:
def mean(lst):
    """Return the arithmetic mean of `lst` (raises ZeroDivisionError when it is empty)."""
    count = len(lst)
    total = sum(lst)
    return total / count

def train_k_times(model, X_train, y_train, X_test, y_test, k, preprocess):
  """Fit and evaluate `model` k times, refitting `preprocess` on every run.

  On each run the preprocessing object is fitted on X_train and applied to
  X_test, the model is fitted on the transformed training data, and the
  predictions on the transformed test data are scored with `report`.
  The run index is printed (space-separated) as a progress indicator.

  Returns three lists of length k: accuracies, F1 scores and Hamming losses.
  """
  accs, f1s, hammings = [], [], []
  for run in range(k):
    print(run, end=" ")
    train_features = preprocess.fit_transform(X_train)
    test_features = preprocess.transform(X_test)
    model.fit(train_features, y_train)
    run_acc, run_f1, run_hamming = report(y_test, model.predict(test_features))
    accs.append(run_acc)
    f1s.append(run_f1)
    hammings.append(run_hamming)
  return accs, f1s, hammings

def report(y_test, y_pred):
  """Score predictions and return the tuple (accuracy, micro-averaged F1, Hamming loss).

  F1 is computed with zero_division=0.
  """
  return (
    accuracy_score(y_test, y_pred),
    f1_score(y_test, y_pred, average='micro', zero_division=0),
    hamming_loss(y_test, y_pred),
  )

In [7]:
import csv  
import os

# Location of the CSV file that collects experiment results.
path = 'Result/Conventional_ML_Best_Result.csv'

# Create the file with its header row on first use; an existing file is left untouched.
if not os.path.exists(path):
  print("gada")
  # Make sure the target directory exists so open() cannot fail on a fresh checkout.
  os.makedirs(os.path.dirname(path), exist_ok=True)
  # newline="" is what the csv module expects; `with` closes the handle even on error.
  with open(path, "w", newline="") as f:
    csv.writer(f).writerow(['Id', "Preprocess", "SVD", 'Model', 'Multilabel Approach', 'Accuracy', 'F1-Score', 'Hamming Loss'])
else:
  print('ada')
def record_result(row):
  """Append one result row to the CSV file at the module-level `path`.

  `row` is an iterable of cell values; it should follow the column order of
  the header written when the file is created (Id, Preprocess, SVD, Model,
  Multilabel Approach, Accuracy, F1-Score, Hamming Loss).
  """
  # `with` guarantees the handle is closed even if writerow() raises;
  # newline="" lets the csv module control line endings itself.
  with open(path, "a", newline="") as f:
    csv.writer(f).writerow(row)

ada


# Raw

In [8]:
preprocess = 'raw'

# Features: the merged text column, cleaned without stemming or stopword removal.
X_train = cleaning_text_raw(df_train['ALL'].values)
X_test = cleaning_text_raw(df_test['ALL'].values)

# Targets: every column except the text column.
y_train = df_train.drop(columns=['ALL']).values
y_test = df_test.drop(columns=['ALL']).values

## With SVD 100 components

In [9]:
svd = 'svd100'

# TF-IDF over 1- to 3-grams -> 100-component truncated SVD -> min-max scaling.
tfidf_step = TfidfVectorizer(ngram_range=(1, 3), min_df=2, sublinear_tf=True)
svd_step = TruncatedSVD(n_iter=10, n_components=100)
pipeline = make_pipeline(tfidf_step, svd_step, MinMaxScaler())
pipeline.fit(X_train)

### SVC

In [10]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'svc'

Classifier Chain

In [11]:
# Classifier chain of default SVCs, scored over 10 refits of the current pipeline.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8251599147121536, 0.8272921108742004, 0.835820895522388, 0.835820895522388, 0.8294243070362474, 0.8272921108742004, 0.8230277185501066, 0.837953091684435, 0.835820895522388, 0.8294243070362474]
[0.8562091503267973, 0.8624454148471615, 0.8633405639913233, 0.8661588683351469, 0.8633879781420765, 0.8593238822246456, 0.8558951965065501, 0.8721311475409835, 0.8699453551912568, 0.8615049073064341]
[0.018763326226012792, 0.01791044776119403, 0.01791044776119403, 0.017484008528784647, 0.017768301350390904, 0.01833688699360341, 0.018763326226012792, 0.016631130063965886, 0.01691542288557214, 0.018052594171997158]


# Stemming

In [12]:
preprocess = 'stem'

# Features: the merged text column, cleaned and stemmed (stopwords kept).
X_train = cleaning_text_stem(df_train['ALL'].values)
X_test = cleaning_text_stem(df_test['ALL'].values)

# Targets: every column except the text column.
y_train = df_train.drop(columns=['ALL']).values
y_test = df_test.drop(columns=['ALL']).values

## With SVD 100 components

In [13]:
svd = 'svd100'

# TF-IDF over 1- to 3-grams -> 100-component truncated SVD -> min-max scaling.
tfidf_step = TfidfVectorizer(ngram_range=(1, 3), min_df=2, sublinear_tf=True)
svd_step = TruncatedSVD(n_iter=10, n_components=100)
pipeline = make_pipeline(tfidf_step, svd_step, MinMaxScaler())
pipeline.fit(X_train)

### SVC

In [14]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'svc'

Binary Relevance

In [15]:
# Binary relevance with default SVCs, scored over 10 refits of the current pipeline.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.7761194029850746, 0.7739872068230277, 0.7718550106609808, 0.7718550106609808, 0.7718550106609808, 0.7739872068230277, 0.7825159914712153, 0.7846481876332623, 0.7761194029850746, 0.7782515991471215]
[0.8661233993015134, 0.8644859813084111, 0.8621495327102803, 0.8621495327102803, 0.8624708624708626, 0.8648018648018648, 0.8697674418604651, 0.869061413673233, 0.8641114982578397, 0.8664343786295007]
[0.016346837242359632, 0.01648898365316276, 0.016773276474769012, 0.016773276474769012, 0.016773276474769012, 0.01648898365316276, 0.015920398009950248, 0.016062544420753375, 0.016631130063965886, 0.016346837242359632]


Classifier Chain

In [16]:
# Classifier chain of default SVCs, scored over 10 refits of the current pipeline.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8294243070362474, 0.8315565031982942, 0.8315565031982942, 0.8272921108742004, 0.8400852878464818, 0.8336886993603412, 0.835820895522388, 0.8315565031982942, 0.8251599147121536, 0.8336886993603412]
[0.8687089715536105, 0.8639825897714907, 0.8649237472766885, 0.8605664488017428, 0.8707926167209555, 0.8661588683351469, 0.8636363636363635, 0.8630434782608696, 0.8640350877192983, 0.8642779587404995]
[0.017057569296375266, 0.017768301350390904, 0.017626154939587774, 0.018194740582800285, 0.01691542288557214, 0.017484008528784647, 0.01791044776119403, 0.01791044776119403, 0.017626154939587774, 0.017768301350390904]


# Stemming + Stopword Removal

In [17]:
preprocess = 'stemstop'

# Features: the merged text column, cleaned with stopword removal and stemming.
X_train = cleaning_text_stemstop(df_train['ALL'].values)
X_test = cleaning_text_stemstop(df_test['ALL'].values)

# Targets: every column except the text column.
y_train = df_train.drop(columns=['ALL']).values
y_test = df_test.drop(columns=['ALL']).values

## With SVD 100 components

In [18]:
svd = 'svd100'

# TF-IDF over 1- to 3-grams -> 100-component truncated SVD -> min-max scaling.
tfidf_step = TfidfVectorizer(ngram_range=(1, 3), min_df=2, sublinear_tf=True)
svd_step = TruncatedSVD(n_iter=10, n_components=100)
pipeline = make_pipeline(tfidf_step, svd_step, MinMaxScaler())
pipeline.fit(X_train)

### SVC

In [19]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'svc'

Binary Relevance

In [20]:
# Binary relevance with default SVCs, scored over 10 refits of the current pipeline.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8208955223880597, 0.814498933901919, 0.8187633262260128, 0.8251599147121536, 0.814498933901919, 0.8230277185501066, 0.8166311300639659, 0.8230277185501066, 0.8166311300639659, 0.8187633262260128]
[0.8871508379888269, 0.8831460674157304, 0.8878923766816144, 0.8891377379619261, 0.8811659192825112, 0.8921348314606741, 0.8853932584269663, 0.8903803131991052, 0.8846584546472565, 0.888638920134983]
[0.014356787491115849, 0.014783226723525231, 0.014214641080312722, 0.014072494669509595, 0.015067519545131485, 0.013646055437100213, 0.014498933901918977, 0.013930348258706468, 0.014641080312722104, 0.014072494669509595]


Classifier Chain

In [21]:
# Classifier chain of default SVCs, scored over 10 refits of the current pipeline.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8656716417910447, 0.8592750533049041, 0.8656716417910447, 0.8678038379530917, 0.8699360341151386, 0.8656716417910447, 0.8614072494669509, 0.8656716417910447, 0.8592750533049041, 0.8678038379530917]
[0.8858057630736393, 0.8815368196371398, 0.8869936034115138, 0.8888888888888888, 0.891025641025641, 0.8877005347593583, 0.8846153846153846, 0.8869936034115138, 0.8803418803418804, 0.887940234791889]
[0.015209665955934613, 0.01577825159914712, 0.015067519545131485, 0.014783226723525231, 0.014498933901918977, 0.014925373134328358, 0.01535181236673774, 0.015067519545131485, 0.015920398009950248, 0.014925373134328358]


### XGBoost

In [22]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'xgb'

Binary Relevance

In [23]:
# Binary relevance with default XGBClassifiers, scored over 10 refits of the current pipeline.
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.7590618336886994, 0.7334754797441365, 0.7526652452025586, 0.7526652452025586, 0.7569296375266524, 0.746268656716418, 0.7505330490405118, 0.7633262260127932, 0.7590618336886994, 0.7505330490405118]
[0.8512110726643598, 0.8378995433789954, 0.8440366972477065, 0.8519362186788154, 0.8509670079635949, 0.8460661345496009, 0.8455284552845529, 0.8509174311926604, 0.8515981735159818, 0.844141069397042]
[0.01833688699360341, 0.020184790334044064, 0.019331911869225303, 0.01847903340440654, 0.018621179815209665, 0.019189765458422176, 0.01890547263681592, 0.01847903340440654, 0.01847903340440654, 0.01947405828002843]


Classifier Chain

In [24]:
# Classifier chain of default XGBClassifiers, scored over 10 refits of the current pipeline.
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.7803837953091685, 0.7547974413646056, 0.7697228144989339, 0.7484008528784648, 0.7377398720682303, 0.744136460554371, 0.7547974413646056, 0.7569296375266524, 0.7420042643923241, 0.767590618336887]
[0.8590909090909091, 0.8435374149659863, 0.8558456299659478, 0.8454545454545455, 0.8316151202749141, 0.8384180790960453, 0.846242774566474, 0.8463302752293578, 0.8395904436860069, 0.8522727272727273]
[0.017626154939587774, 0.019616204690831557, 0.018052594171997158, 0.019331911869225303, 0.020895522388059702, 0.02032693674484719, 0.01890547263681592, 0.01904761904761905, 0.020042643923240937, 0.01847903340440654]


### SGD LogReg

In [25]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'sgd_logreg'

Classifier Chain

In [26]:
# Classifier chain of SGD classifiers with log loss, scored over 10 refits of the current pipeline.
clf = ClassifierChain(
    require_dense=[False, True],
    classifier=SGDClassifier(loss='log_loss'),
)
accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8166311300639659, 0.744136460554371, 0.8272921108742004, 0.7761194029850746, 0.767590618336887, 0.7782515991471215, 0.8251599147121536, 0.837953091684435, 0.8294243070362474, 0.7249466950959488]
[0.8683937823834197, 0.8145896656534954, 0.8729641693811075, 0.8362877997914495, 0.8211382113821137, 0.8471337579617834, 0.8613756613756612, 0.8767123287671232, 0.8742004264392325, 0.7958974358974359]
[0.018052594171997158, 0.02601279317697228, 0.016631130063965886, 0.022316986496090974, 0.025017768301350392, 0.02046908315565032, 0.018621179815209665, 0.016631130063965886, 0.016773276474769012, 0.028287135749822317]


## With SVD 250 components

In [27]:
svd = 'svd250'

# TF-IDF over 1- to 3-grams -> 250-component truncated SVD -> min-max scaling.
tfidf_step = TfidfVectorizer(ngram_range=(1, 3), min_df=2, sublinear_tf=True)
svd_step = TruncatedSVD(n_iter=10, n_components=250)
pipeline = make_pipeline(tfidf_step, svd_step, MinMaxScaler())
pipeline.fit(X_train)

### SVC

In [28]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'svc'

Binary Relevance

In [29]:
# Binary relevance with default SVCs, scored over 10 refits of the current pipeline.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8038379530916845, 0.7974413646055437, 0.7995735607675906, 0.8081023454157783, 0.7974413646055437, 0.7995735607675906, 0.7995735607675906, 0.7995735607675906, 0.8038379530916845, 0.8059701492537313]
[0.8858447488584476, 0.8820160366552119, 0.8825541619156215, 0.8904109589041096, 0.8843069873997708, 0.8853211009174312, 0.8853211009174312, 0.8817451205510907, 0.887115165336374, 0.8868571428571429]
[0.014214641080312722, 0.014641080312722104, 0.014641080312722104, 0.013646055437100213, 0.014356787491115849, 0.014214641080312722, 0.014214641080312722, 0.014641080312722104, 0.014072494669509595, 0.014072494669509595]


Classifier Chain

In [30]:
# Classifier chain of default SVCs, scored over 10 refits of the current pipeline.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])

accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8550106609808102, 0.8464818763326226, 0.8507462686567164, 0.8528784648187633, 0.8550106609808102, 0.8528784648187633, 0.8571428571428571, 0.8528784648187633, 0.8635394456289979, 0.8422174840085288]
[0.8869752421959096, 0.8872017353579177, 0.8807733619763695, 0.8826695371367062, 0.8850698174006445, 0.8886486486486488, 0.8879310344827586, 0.8857758620689655, 0.8927038626609441, 0.8759439050701185]
[0.014925373134328358, 0.014783226723525231, 0.01577825159914712, 0.015493958777540867, 0.015209665955934613, 0.014641080312722104, 0.014783226723525231, 0.015067519545131485, 0.014214641080312722, 0.016346837242359632]


### SGD LogReg

In [31]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'sgd_logreg'

Classifier Chain

In [32]:
# Classifier chain of SGD classifiers with log loss, scored over 10 refits of the current pipeline.
clf = ClassifierChain(
    require_dense=[False, True],
    classifier=SGDClassifier(loss='log_loss'),
)
accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.8336886993603412, 0.8230277185501066, 0.8038379530916845, 0.8443496801705757, 0.5415778251599147, 0.8208955223880597, 0.8294243070362474, 0.7313432835820896, 0.7803837953091685, 0.7761194029850746]
[0.8869565217391303, 0.8808290155440415, 0.8648068669527897, 0.8956521739130434, 0.7164461247637052, 0.8841025641025642, 0.8854166666666666, 0.8204081632653062, 0.8507614213197969, 0.8447580645161291]
[0.014783226723525231, 0.016346837242359632, 0.01791044776119403, 0.013646055437100213, 0.042643923240938165, 0.016062544420753375, 0.015636105188343994, 0.025017768301350392, 0.020895522388059702, 0.021890547263681594]


## With SVD 500 components

In [33]:
svd = 'svd500'

# TF-IDF over 1- to 3-grams -> 500-component truncated SVD -> min-max scaling.
tfidf_step = TfidfVectorizer(ngram_range=(1, 3), min_df=2, sublinear_tf=True)
svd_step = TruncatedSVD(n_iter=10, n_components=500)
pipeline = make_pipeline(tfidf_step, svd_step, MinMaxScaler())
pipeline.fit(X_train)

### SGD LogReg

In [34]:
# Label for the estimator family used in the following cells.
# NOTE(review): not read by any cell shown here -- presumably meant for record_result rows; confirm.
model = 'sgd_logreg'

Classifier Chain

In [35]:
# Classifier chain of SGD classifiers with log loss, scored over 10 refits of the current pipeline.
clf = ClassifierChain(
    require_dense=[False, True],
    classifier=SGDClassifier(loss='log_loss'),
)
accs, f1s, hammings = train_k_times(
    clf, X_train, y_train, X_test, y_test, k=10, preprocess=pipeline
)
print(accs, f1s, hammings, sep="\n")

0 1 2 3 4 5 6 7 8 9 [0.835820895522388, 0.7846481876332623, 0.8251599147121536, 0.7995735607675906, 0.8251599147121536, 0.7057569296375267, 0.5330490405117271, 0.7761194029850746, 0.7505330490405118, 0.8208955223880597]
[0.8940677966101694, 0.8659160696008189, 0.888421052631579, 0.8721804511278195, 0.8775292864749734, 0.8281853281853281, 0.7418772563176895, 0.8538622129436326, 0.8469184890656064, 0.882921589688507]
[0.014214641080312722, 0.018621179815209665, 0.015067519545131485, 0.01691542288557214, 0.016346837242359632, 0.025302061122956646, 0.04065387348969438, 0.01990049751243781, 0.021890547263681594, 0.015493958777540867]
