# Initialization

## Import

In [None]:
!pip install -qq xgboost
!pip install -qq scikit-multilearn
!pip install -qq pysastrawi

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import re
import random
import seaborn as sns
import string
from collections import Counter
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

from tqdm import tqdm

import nltk
from nltk.tokenize import word_tokenize 
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.probability import FreqDist
nltk.download('punkt')

from xgboost import XGBClassifier

from skmultilearn.problem_transform import ClassifierChain, BinaryRelevance
from skmultilearn.model_selection import IterativeStratification
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score, hamming_loss

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Load Data

In [None]:
# Seed both the NumPy and the stdlib global RNGs with the same fixed value.
seed_val = 1906350912
np.random.seed(seed_val)
random.seed(seed_val)

# Label columns including the generic 'Umum' class (used to select test columns).
categories_umum = [
    'Gigi', 'Mata', 'Paru', 'Jantung',
    'Urologi', 'Kandungan', 'Gizi', 'Tulang',
    'Saraf', 'Jiwa', 'THT', 'Kulit dan Kelamin',
    'Penyakit Dalam', 'Bedah', 'Anak', 'Umum',
]

# The same specialties without 'Umum'; this order is the label-column order
# of the final train/test frames.
categories = [
    'Kandungan',
    'Gizi',
    'Jantung',
    'Bedah',
    'Jiwa',
    'Penyakit Dalam',
    'Kulit dan Kelamin',
    'Mata',
    'Paru',
    'Gigi',
    'Urologi',
    'THT',
    'Anak',
    'Tulang',
    'Saraf',
]

In [None]:
def _drop_umum_and_join_text(frame, label_columns):
    """Drop the 'Umum' label, discard unlabeled rows and merge the text columns.

    Args:
        frame: DataFrame holding 'JUDUL', 'ISI', 'Umum' and the label columns.
        label_columns: label column names to keep, in output order.

    Returns:
        A new DataFrame with columns ``['ALL'] + label_columns`` where 'ALL'
        is ``JUDUL + " " + ISI``. Rows whose labels (everything except
        'JUDUL', 'ISI' and 'Umum') sum to zero are removed.
    """
    without_umum = frame.drop(columns=["Umum"])
    label_count = without_umum.drop(columns=["JUDUL", "ISI"]).sum(axis=1)
    labeled = without_umum[label_count > 0]
    labeled = labeled.assign(ALL=labeled["JUDUL"] + " " + labeled["ISI"])
    return labeled[["ALL"] + label_columns]


# Training data: human-annotated posts; missing cells become empty strings.
df_train = _drop_umum_and_join_text(
    pd.read_csv("Dataset/Human_Annotated.csv", index_col="ID").fillna(""),
    categories,
)

# Test data: gold standard. Keep only rows whose non-text columns sum to at
# most 3, then restrict to the text columns plus the known label columns.
df_test = pd.read_csv("Dataset/Gold_Standard.csv", index_col="ID").fillna("")
test_label_total = df_test.drop(columns=["JUDUL", "ISI"]).values.sum(axis=1)
df_test = df_test[test_label_total <= 3]
df_test = df_test[["JUDUL", "ISI"] + categories_umum]
df_test = _drop_umum_and_join_text(df_test, categories)

## Function

In [None]:
from nltk.parse.transitionparser import remove
from collections import defaultdict

def lower_text(texts):
    """Return a new list containing the lower-cased form of each string in ``texts``."""
    lowered = []
    for text in texts:
        lowered.append(text.lower())
    return lowered

def remove_punc_text(texts):
    """Delete ASCII punctuation and the digits 0-9 from every string.

    Args:
        texts: iterable of strings.

    Returns:
        A new list with the same number of strings; the characters are
        deleted, not replaced, so surrounding whitespace is kept as is.
    """
    # Build the deletion table once instead of once per string.
    deletion_table = str.maketrans("", "", string.punctuation + "1234567890")
    return [s.translate(deletion_table) for s in texts]

def strip_text(texts):
    """Return a new list with leading/trailing whitespace stripped from each string."""
    return list(map(lambda text: text.strip(), texts))

def remove_stopword_text(texts):
    """Remove Indonesian stop words from every string using Sastrawi.

    The stop-word list is Sastrawi's default list without 'mata', 'ingat'
    and 'orang', plus a few extra words defined below.

    Args:
        texts: iterable of strings.

    Returns:
        A new list with the stop words removed from each string.
    """
    # Words kept in the text even though Sastrawi lists them as stop words
    # ('Mata' is one of the label names; the reason for the other two is not
    # recorded in this notebook — TODO confirm).
    keep_words = {'mata', 'ingat', 'orang'}
    ext_stopwords = ['dok', 'doc', 'dokter', 'terima', 'kasih', 'terimakasih', 'sep']

    # Filtering (instead of list.remove) drops every occurrence and does not
    # raise ValueError when a word is missing from the installed word list.
    base_stopwords = [
        word for word in StopWordRemoverFactory().get_stop_words()
        if word not in keep_words
    ]
    remover = StopWordRemover(ArrayDictionary(base_stopwords + ext_stopwords))
    return [remover.remove(s) for s in texts]

def stemming_text(texts):
    """Stem every string in ``texts`` with a Sastrawi stemmer and return the results as a list."""
    sastrawi_stemmer = StemmerFactory().create_stemmer()
    stemmed = []
    for text in texts:
        stemmed.append(sastrawi_stemmer.stem(text))
    return stemmed

def tokenize_text(texts):
    """Tokenize every string with NLTK's ``word_tokenize``; returns one token list per string."""
    token_lists = []
    for text in texts:
        token_lists.append(word_tokenize(text))
    return token_lists

def cleaning_text_stemstop(texts):
    """Full cleaning: lower-case, drop punctuation/digits, strip, remove stop words, stem."""
    steps = (
        lower_text,
        remove_punc_text,
        strip_text,
        remove_stopword_text,
        stemming_text,
    )
    for step in steps:
        texts = step(texts)
    return texts

def cleaning_text_stem(texts):
    """Cleaning without stop-word removal: lower-case, drop punctuation/digits, strip, stem."""
    normalized = strip_text(remove_punc_text(lower_text(texts)))
    return stemming_text(normalized)

def cleaning_text_raw(texts):
    """Minimal cleaning: lower-case, drop punctuation/digits, then strip whitespace."""
    lowered = lower_text(texts)
    without_punctuation = remove_punc_text(lowered)
    return strip_text(without_punctuation)

In [None]:
def mean(lst):
    """Arithmetic mean of ``lst``; raises ZeroDivisionError when ``lst`` is empty."""
    total = sum(lst)
    count = len(lst)
    return total / count

def cross_val(model, X, y, k, preprocess, id):
  """Run k-fold iteratively-stratified cross-validation and log the mean metrics.

  For every fold, ``preprocess`` is re-fitted on the training split only,
  ``model`` is re-fitted on the transformed training split, and subset
  accuracy, micro-averaged F1 and Hamming loss are computed on the validation
  split. Running means are shown on the progress bar; the final means are
  appended to the results CSV through ``record_result`` and printed.

  Args:
    model: multi-label estimator exposing ``fit`` and ``predict``.
    X: sequence of documents, indexable by position.
    y: label matrix with one row per document.
    k: number of folds.
    preprocess: transformer exposing ``fit_transform`` and ``transform``.
    id: dict with the keys 'preprocess', 'svd', 'model' and 'multi' that
      identifies the run in the results CSV.

  Returns:
    None. Callers that assign the result rebind their variable to None.
  """
  # Convert once so each fold can be selected with plain fancy indexing.
  X = np.asarray(X)
  y = np.asarray(y)

  acc_lst = []
  f1_lst = []
  hamm_lst = []
  acc_mean = 0
  f1_mean = 0
  hamm_mean = 0

  cv = IterativeStratification(n_splits=k, order=1)
  with tqdm(total=k, desc="Fold") as pbar:
    for train_idx, val_idx in cv.split(X, y):
      X_train, y_train = X[train_idx], y[train_idx]
      X_val, y_val = X[val_idx], y[val_idx]

      # Fit the feature pipeline on the training split only, so nothing from
      # the validation split leaks into the learned vocabulary or scaling.
      X_train = preprocess.fit_transform(X_train)
      X_val = preprocess.transform(X_val)

      model.fit(X_train, y_train)
      y_pred = model.predict(X_val)
      # Densify sparse predictions once per fold. toarray() is the documented
      # conversion; the `.A` shortcut is not available on SciPy's newer
      # sparse array types.
      if hasattr(y_pred, "toarray"):
        y_pred = y_pred.toarray()

      acc_lst.append(accuracy_score(y_val, y_pred))
      f1_lst.append(f1_score(y_val, y_pred, average='micro', zero_division=0))
      hamm_lst.append(hamming_loss(y_val, y_pred))

      # Means over the folds completed so far.
      acc_mean = mean(acc_lst)
      f1_mean = mean(f1_lst)
      hamm_mean = mean(hamm_lst)

      pbar.set_postfix(accuracy = f'{acc_mean}', f1 = f'{f1_mean}',
                       hamming = f'{hamm_mean}')
      pbar.update()
  record_result([f"{id['preprocess']}_{id['svd']}_{id['model']}_{id['multi']}",
                 id['preprocess'], id['svd'], id['model'], id['multi'],
                 acc_mean, f1_mean, hamm_mean])
  print(f'Fold Accuracy: {acc_mean}')
  print(f'Fold F1 Score: {f1_mean}')
  print(f'Fold Hamming Score: {hamm_mean}')


In [None]:
import csv  
import os

# CSV file that collects one row of validation metrics per experiment.
path = 'Result/Conventional_ML_Validation.csv'

# On first use, create the file (and its folder) and write the header row.
if not os.path.exists(path):
  print("gada")
  # open() does not create missing parent directories.
  os.makedirs(os.path.dirname(path), exist_ok=True)
  # newline="" lets the csv module control line endings itself.
  with open(path, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(['Id', "Preprocess", "SVD", 'Model', 'Multilabel Approach', 'Accuracy', 'F1-Score', 'Hamming Loss'])
else:
  print('ada')
def record_result(row):
  """Append ``row`` (a sequence of cell values) as one line to the results CSV at ``path``."""
  # The context manager closes the file even if writing fails;
  # newline="" lets the csv module control line endings itself.
  with open(path, "a", newline="") as f:
    csv.writer(f).writerow(row)

ada


# Raw

In [None]:
# 'raw' variant: texts are only lower-cased, stripped of punctuation/digits and trimmed.
preprocess = 'raw'

X_train = cleaning_text_raw(df_train['ALL'].values)
X_test = cleaning_text_raw(df_test['ALL'].values)

# Every column except the merged text column is a label.
y_train = df_train.drop(columns=['ALL']).values
y_test = df_test.drop(columns=['ALL']).values

## With SVD 100 components

In [None]:
# 1- to 3-gram TF-IDF -> 100-component truncated SVD -> min-max scaling.
svd = 'svd100'
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, min_df=2),
    TruncatedSVD(n_iter=10, n_components=100),
    MinMaxScaler(),
).fit(X_train)  # fit() returns the pipeline itself
pipeline

### Logistic Regression

In [None]:
model = 'logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of Logistic Regression under Binary Relevance.
multi = 'br'
clf = BinaryRelevance(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of Logistic Regression under Classifier Chain.
multi = 'cc'
clf = ClassifierChain(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold Accuracy: 0.5300839002976687
Fold F1 Score: 0.6705989649984332
Fold Hamming Score: 0.03979946658181834


### SGD Logistic Regression

In [None]:
model = 'sgd_logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:40<00:00,  4.03s/it, accuracy=0.604859724131258, f1=0.7188649396460568, hamming=0.03890704305394756]

Fold Accuracy: 0.604859724131258
Fold F1 Score: 0.7188649396460568
Fold Hamming Score: 0.03890704305394756





### SVC

In [None]:
model = 'svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SVC under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SVC under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:48<00:00,  4.90s/it, accuracy=0.6541207882958145, f1=0.7401905140460898, hamming=0.03539795207225213]

Fold Accuracy: 0.6541207882958145
Fold F1 Score: 0.7401905140460898
Fold Hamming Score: 0.03539795207225213





### SGD SVC

In [None]:
model = 'sgd_svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:37<00:00,  3.73s/it, accuracy=0.6088006413522191, f1=0.7004825646542189, hamming=0.04362560574404211]

Fold Accuracy: 0.6088006413522191
Fold F1 Score: 0.7004825646542189
Fold Hamming Score: 0.04362560574404211





### XGBoost

In [None]:
model = 'xgb'

Binary Relevance

In [None]:
# 10-fold cross-validation of an XGBClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an XGBClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [06:30<00:00, 39.03s/it, accuracy=0.5774270400072588, f1=0.7063292168960642, hamming=0.037330379733533293]

Fold Accuracy: 0.5774270400072588
Fold F1 Score: 0.7063292168960642
Fold Hamming Score: 0.037330379733533293





### Decision Tree

In [None]:
model = 'tree'

Binary Relevance

In [None]:
# 10-fold cross-validation of a DecisionTreeClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a DecisionTreeClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:08<00:00,  6.89s/it, accuracy=0.4395088040863097, f1=0.6011394393212808, hamming=0.05189102542551325]

Fold Accuracy: 0.4395088040863097
Fold F1 Score: 0.6011394393212808
Fold Hamming Score: 0.05189102542551325





## With SVD 250 components

In [None]:
# 1- to 3-gram TF-IDF -> 250-component truncated SVD -> min-max scaling.
svd = 'svd250'
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, min_df=2),
    TruncatedSVD(n_iter=10, n_components=250),
    MinMaxScaler(),
).fit(X_train)  # fit() returns the pipeline itself
pipeline

### Logistic Regression

In [None]:
model = 'logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of Logistic Regression under Binary Relevance.
multi = 'br'
clf = BinaryRelevance(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of Logistic Regression under Classifier Chain.
multi = 'cc'
clf = ClassifierChain(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold Accuracy: 0.5315218242415887
Fold F1 Score: 0.67668828326006
Fold Hamming Score: 0.03906082044911559





### SGD Logistic Regression

In [None]:
model = 'sgd_logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:20<00:00,  8.06s/it, accuracy=0.5987596613988604, f1=0.7157043191553067, hamming=0.03955702332830889]

Fold Accuracy: 0.5987596613988604
Fold F1 Score: 0.7157043191553067
Fold Hamming Score: 0.03955702332830889





### SVC

In [None]:
model = 'svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SVC under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SVC under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [02:28<00:00, 14.89s/it, accuracy=0.618229830005804, f1=0.6772559059434565, hamming=0.04513782274549632]

Fold Accuracy: 0.618229830005804
Fold F1 Score: 0.6772559059434565
Fold Hamming Score: 0.04513782274549632





### SGD SVC

In [None]:
model = 'sgd_svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:20<00:00,  8.00s/it, accuracy=0.5645614329956115, f1=0.6659055457282377, hamming=0.04616893556980929]

Fold Accuracy: 0.5645614329956115
Fold F1 Score: 0.6659055457282377
Fold Hamming Score: 0.04616893556980929





### XGBoost

In [None]:
model = 'xgb'

Binary Relevance

In [None]:
# 10-fold cross-validation of an XGBClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an XGBClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [15:04<00:00, 90.42s/it, accuracy=0.5467936571516245, f1=0.6830712141221427, hamming=0.03900422343946435]

Fold Accuracy: 0.5467936571516245
Fold F1 Score: 0.6830712141221427
Fold Hamming Score: 0.03900422343946435





### Decision Tree

In [None]:
model = 'tree'

Binary Relevance

In [None]:
# 10-fold cross-validation of a DecisionTreeClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a DecisionTreeClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [03:05<00:00, 18.60s/it, accuracy=0.43467817491936717, f1=0.5959948795676405, hamming=0.05132725110324956]

Fold Accuracy: 0.43467817491936717
Fold F1 Score: 0.5959948795676405
Fold Hamming Score: 0.05132725110324956





## With SVD 500 components

In [None]:
# 1- to 3-gram TF-IDF -> 500-component truncated SVD -> min-max scaling.
svd = 'svd500'
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, min_df=2),
    TruncatedSVD(n_iter=10, n_components=500),
    MinMaxScaler(),
).fit(X_train)  # fit() returns the pipeline itself
pipeline

### Logistic Regression

In [None]:
model = 'logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of Logistic Regression under Binary Relevance.
multi = 'br'
clf = BinaryRelevance(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of Logistic Regression under Classifier Chain.
multi = 'cc'
clf = ClassifierChain(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold Accuracy: 0.5285552016072568
Fold F1 Score: 0.6692201688974402
Fold Hamming Score: 0.03955475825166999





### SGD Logistic Regression

In [None]:
model = 'sgd_logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [03:00<00:00, 18.07s/it, accuracy=0.6358780506953825, f1=0.7483111210945528, hamming=0.034113618926663265]

Fold Accuracy: 0.6358780506953825
Fold F1 Score: 0.7483111210945528
Fold Hamming Score: 0.034113618926663265





### SVC

In [None]:
model = 'svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SVC under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SVC under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [06:57<00:00, 41.77s/it, accuracy=0.5102936586931065, f1=0.5656543739257287, hamming=0.05955913952696942]

Fold Accuracy: 0.5102936586931065
Fold F1 Score: 0.5656543739257287
Fold Hamming Score: 0.05955913952696942





### SGD SVC

In [None]:
model = 'sgd_svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [02:52<00:00, 17.21s/it, accuracy=0.521169440618878, f1=0.6453557521571798, hamming=0.04861605630864436]

Fold Accuracy: 0.521169440618878
Fold F1 Score: 0.6453557521571798
Fold Hamming Score: 0.04861605630864436





### XGBoost

In [None]:
model = 'xgb'

Binary Relevance

In [None]:
# 10-fold cross-validation of an XGBClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an XGBClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [28:15<00:00, 169.57s/it, accuracy=0.5264174499823092, f1=0.6661844679484868, hamming=0.0402662499975788]

Fold Accuracy: 0.5264174499823092
Fold F1 Score: 0.6661844679484868
Fold Hamming Score: 0.0402662499975788





### Decision Tree

In [None]:
model = 'tree'

Binary Relevance

In [None]:
# 10-fold cross-validation of a DecisionTreeClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a DecisionTreeClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [06:34<00:00, 39.43s/it, accuracy=0.4229387035838836, f1=0.5883377539640751, hamming=0.05073842417072548]

Fold Accuracy: 0.4229387035838836
Fold F1 Score: 0.5883377539640751
Fold Hamming Score: 0.05073842417072548





## Without SVD

In [None]:
# 1- to 3-gram TF-IDF, densified with toarray() and min-max scaled; no SVD step.
svd = 'non'
pipeline = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 3), sublinear_tf=True, min_df=2),
    FunctionTransformer(lambda tfidf_matrix: tfidf_matrix.toarray(), accept_sparse=True),
    MinMaxScaler(),
)
# Last expression of the cell: the transformed training matrix is displayed.
pipeline.fit_transform(X_train)

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Logistic Regression

In [None]:
model = 'logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of Logistic Regression under Binary Relevance.
multi = 'br'
clf = BinaryRelevance(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of Logistic Regression under Classifier Chain.
multi = 'cc'
clf = ClassifierChain(
    # The default max_iter=100 can stop lbfgs before it converges
    # (ConvergenceWarning); give the solver more iterations.
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:05<00:00,  6.56s/it, accuracy=0.4660126031882001, f1=0.6190237115053135, hamming=0.04400354517993972]

Fold Accuracy: 0.4660126031882001
Fold F1 Score: 0.6190237115053135
Fold Hamming Score: 0.04400354517993972





### SGD Logistic Regression

In [None]:
model = 'sgd_logreg'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SGDClassifier with log loss under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:17<00:00,  1.70s/it, accuracy=0.539948131916901, f1=0.6710853181792779, hamming=0.042971400269559486]

Fold Accuracy: 0.539948131916901
Fold F1 Score: 0.6710853181792779
Fold Hamming Score: 0.042971400269559486





### SVC

In [None]:
model = 'svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of an SVC under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an SVC under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [04:28<00:00, 26.85s/it, accuracy=0.16154164102037138, f1=0.259492182835496, hamming=0.06578612671152402]

Fold Accuracy: 0.16154164102037138
Fold F1 Score: 0.259492182835496
Fold Hamming Score: 0.06578612671152402





### SGD SVC

In [None]:
model = 'sgd_svc'

Binary Relevance

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a default SGDClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:17<00:00,  1.72s/it, accuracy=0.5514807683664468, f1=0.6824916587876452, hamming=0.04342504869016712]

Fold Accuracy: 0.5514807683664468
Fold F1 Score: 0.6824916587876452
Fold Hamming Score: 0.04342504869016712





### XGBoost

In [None]:
model = 'xgb'

Binary Relevance

In [None]:
# 10-fold cross-validation of an XGBClassifier under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of an XGBClassifier under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [07:55<00:00, 47.51s/it, accuracy=0.5697651981088774, f1=0.724096549660268, hamming=0.03767793333609286]

Fold Accuracy: 0.5697651981088774
Fold F1 Score: 0.724096549660268
Fold Hamming Score: 0.03767793333609286





### Naive Bayes

In [None]:
model = 'nb'

Binary Relevance

In [None]:
# 10-fold cross-validation of a MultinomialNB under Binary Relevance.
multi = 'br'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = BinaryRelevance(classifier=MultinomialNB(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
# 10-fold cross-validation of a MultinomialNB under Classifier Chain.
multi = 'cc'
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = ClassifierChain(classifier=MultinomialNB(), require_dense=[False, True])
# cross_val has no return value, so this leaves clf bound to None.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:14<00:00,  1.50s/it, accuracy=0.481201956994039, f1=0.6294942748542487, hamming=0.04447659209166682]

Fold Accuracy: 0.481201956994039
Fold F1 Score: 0.6294942748542487
Fold Hamming Score: 0.04447659209166682





### Decision Tree

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "tree"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around DecisionTreeClassifier with its default settings.
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around DecisionTreeClassifier with its default settings.
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:02<00:00,  6.24s/it, accuracy=0.5007693626967045, f1=0.6875914766298943, hamming=0.048879567536167165]

Fold Accuracy: 0.5007693626967045
Fold F1 Score: 0.6875914766298943
Fold Hamming Score: 0.048879567536167165





# Stemming

In [None]:
# Preprocessing tag recorded in every run-id dict of this section.
preprocess = 'stem'

# Labels: every column of the frame except the text column 'ALL'.
y_train = df_train.drop(columns=['ALL']).values
y_test = df_test.drop(columns=['ALL']).values

# Features: the 'ALL' text column passed through cleaning_text_stem.
X_train = cleaning_text_stem(df_train['ALL'].values)
X_test = cleaning_text_stem(df_test['ALL'].values)

## With SVD 100 components

In [None]:
# Feature-space tag recorded in every run-id dict of this section.
svd = 'svd100'
# TF-IDF (1- to 3-grams, terms present in at least 2 documents, sublinear tf)
# -> truncated SVD down to 100 components -> min-max scaling of each component.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # random_state pins the randomized SVD solver to the notebook seed so the
    # projection no longer depends on how much of the global NumPy RNG stream
    # earlier cells have consumed (i.e. on cell execution order).
    TruncatedSVD(n_components=100, n_iter=10, random_state=seed_val),
    MinMaxScaler()
)
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around logistic regression.
# max_iter is raised from the default of 100: with this model on these features
# lbfgs stopped at its iteration cap in the Classifier Chain run
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so default fits are not converged.
clf = BinaryRelevance(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around logistic regression.
# max_iter is raised from the default of 100: with that cap lbfgs reported
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on these features, i.e. the scores
# were computed from models that had not converged.
clf = ClassifierChain(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold Accuracy: 0.5229076965706814
Fold F1 Score: 0.6626689845265188
Fold Hamming Score: 0.0408969991429118





### SGD Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier trained with the logistic loss.
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier trained with the logistic loss.
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:35<00:00,  3.57s/it, accuracy=0.6062545362115465, f1=0.7087098638262384, hamming=0.04097810529962847]

Fold Accuracy: 0.6062545362115465
Fold F1 Score: 0.7087098638262384
Fold Hamming Score: 0.04097810529962847





### SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SVC with its default settings.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SVC with its default settings.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:49<00:00,  4.94s/it, accuracy=0.6572938275042521, f1=0.7444181406839255, hamming=0.03511367798600064]

Fold Accuracy: 0.6572938275042521
Fold F1 Score: 0.7444181406839255
Fold Hamming Score: 0.03511367798600064





### SGD SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier with its default settings.
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier with its default settings.
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:33<00:00,  3.31s/it, accuracy=0.6136032505859375, f1=0.7000012557203358, hamming=0.04393785821386812]

Fold Accuracy: 0.6136032505859375
Fold F1 Score: 0.7000012557203358
Fold Hamming Score: 0.04393785821386812





### XGBoost

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "xgb"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around XGBClassifier with its default settings.
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around XGBClassifier with its default settings.
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [06:23<00:00, 38.31s/it, accuracy=0.5734144408528611, f1=0.7072037276213322, hamming=0.0374175469808523]

Fold Accuracy: 0.5734144408528611
Fold F1 Score: 0.7072037276213322
Fold Hamming Score: 0.0374175469808523





### Decision Tree

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "tree"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around DecisionTreeClassifier with its default settings.
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around DecisionTreeClassifier with its default settings.
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:05<00:00,  6.54s/it, accuracy=0.4367824586658184, f1=0.5927601571933386, hamming=0.053714407469598414]

Fold Accuracy: 0.4367824586658184
Fold F1 Score: 0.5927601571933386
Fold Hamming Score: 0.053714407469598414





## With SVD 250 components

In [None]:
# Feature-space tag recorded in every run-id dict of this section.
svd = 'svd250'
# TF-IDF (1- to 3-grams, terms present in at least 2 documents, sublinear tf)
# -> truncated SVD down to 250 components -> min-max scaling of each component.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # random_state pins the randomized SVD solver to the notebook seed so the
    # projection no longer depends on how much of the global NumPy RNG stream
    # earlier cells have consumed (i.e. on cell execution order).
    TruncatedSVD(n_components=250, n_iter=10, random_state=seed_val),
    MinMaxScaler()
)
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around logistic regression.
# max_iter is raised from the default of 100: with this model on these features
# lbfgs stopped at its iteration cap in the Classifier Chain run
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so default fits are not converged.
clf = BinaryRelevance(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around logistic regression.
# max_iter is raised from the default of 100: with that cap lbfgs reported
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on these features, i.e. the scores
# were computed from models that had not converged.
clf = ClassifierChain(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold Accuracy: 0.5357191583143102
Fold F1 Score: 0.6765940259165834
Fold Hamming Score: 0.03915594206806923





### SGD Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier trained with the logistic loss.
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier trained with the logistic loss.
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:24<00:00,  8.49s/it, accuracy=0.61798102212676, f1=0.7330012857993762, hamming=0.03747541971037364]

Fold Accuracy: 0.61798102212676
Fold F1 Score: 0.7330012857993762
Fold Hamming Score: 0.03747541971037364





### SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SVC with its default settings.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SVC with its default settings.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [02:22<00:00, 14.23s/it, accuracy=0.6206541271176412, f1=0.6886244828324478, hamming=0.043526396443868284]

Fold Accuracy: 0.6206541271176412
Fold F1 Score: 0.6886244828324478
Fold Hamming Score: 0.043526396443868284





### SGD SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier with its default settings.
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier with its default settings.
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:20<00:00,  8.00s/it, accuracy=0.5795640361209041, f1=0.6947349433386939, hamming=0.04223094078910257]

Fold Accuracy: 0.5795640361209041
Fold F1 Score: 0.6947349433386939
Fold Hamming Score: 0.04223094078910257





### XGBoost

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "xgb"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around XGBClassifier with its default settings.
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around XGBClassifier with its default settings.
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [14:37<00:00, 87.72s/it, accuracy=0.5348723930939314, f1=0.6773054005965482, hamming=0.03971550484900987]

Fold Accuracy: 0.5348723930939314
Fold F1 Score: 0.6773054005965482
Fold Hamming Score: 0.03971550484900987





### Decision Tree

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "tree"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around DecisionTreeClassifier with its default settings.
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around DecisionTreeClassifier with its default settings.
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [03:08<00:00, 18.82s/it, accuracy=0.4501708233594706, f1=0.5987736413256909, hamming=0.05065574734845572]

Fold Accuracy: 0.4501708233594706
Fold F1 Score: 0.5987736413256909
Fold Hamming Score: 0.05065574734845572





## With SVD 500 components

In [None]:
# Feature-space tag recorded in every run-id dict of this section.
svd = 'svd500'
# TF-IDF (1- to 3-grams, terms present in at least 2 documents, sublinear tf)
# -> truncated SVD down to 500 components -> min-max scaling of each component.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # random_state pins the randomized SVD solver to the notebook seed so the
    # projection no longer depends on how much of the global NumPy RNG stream
    # earlier cells have consumed (i.e. on cell execution order).
    TruncatedSVD(n_components=500, n_iter=10, random_state=seed_val),
    MinMaxScaler()
)
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around logistic regression.
# max_iter is raised from the default of 100: with this model on these features
# lbfgs stopped at its iteration cap in the Classifier Chain run
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so default fits are not converged.
clf = BinaryRelevance(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around logistic regression.
# max_iter is raised from the default of 100: with that cap lbfgs reported
# "TOTAL NO. of ITERATIONS REACHED LIMIT" on these features, i.e. the scores
# were computed from models that had not converged.
clf = ClassifierChain(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Fold Accuracy: 0.5180857599445756
Fold F1 Score: 0.6637003514546478
Fold Hamming Score: 0.04043312244636607





### SGD Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier trained with the logistic loss.
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier trained with the logistic loss.
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [02:51<00:00, 17.17s/it, accuracy=0.6363933265974772, f1=0.7501041342127144, hamming=0.03399963369456799]

Fold Accuracy: 0.6363933265974772
Fold F1 Score: 0.7501041342127144
Fold Hamming Score: 0.03399963369456799





### SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SVC with its default settings.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SVC with its default settings.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [06:38<00:00, 39.83s/it, accuracy=0.5355334737872435, f1=0.591012816516003, hamming=0.055693609601193114]

Fold Accuracy: 0.5355334737872435
Fold F1 Score: 0.591012816516003
Fold Hamming Score: 0.055693609601193114





### SGD SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier with its default settings.
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier with its default settings.
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [02:43<00:00, 16.30s/it, accuracy=0.5346850296053153, f1=0.674702995394093, hamming=0.04696068490301898]

Fold Accuracy: 0.5346850296053153
Fold F1 Score: 0.674702995394093
Fold Hamming Score: 0.04696068490301898





### XGBoost

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "xgb"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around XGBClassifier with its default settings.
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around XGBClassifier with its default settings.
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [27:10<00:00, 163.08s/it, accuracy=0.53460733001118, f1=0.6686723432899111, hamming=0.04041508984526872]

Fold Accuracy: 0.53460733001118
Fold F1 Score: 0.6686723432899111
Fold Hamming Score: 0.04041508984526872





### Decision Tree

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "tree"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around DecisionTreeClassifier with its default settings.
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around DecisionTreeClassifier with its default settings.
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [06:57<00:00, 41.72s/it, accuracy=0.43195361316856307, f1=0.5922931722714405, hamming=0.050732146713095685]

Fold Accuracy: 0.43195361316856307
Fold F1 Score: 0.5922931722714405
Fold Hamming Score: 0.050732146713095685





## Without SVD

In [None]:
# Feature-space tag recorded in every run-id dict of this section ('non' = no SVD step).
svd = 'non'
# TF-IDF (1- to 3-grams, terms present in at least 2 documents, sublinear tf)
# -> densify -> min-max scaling of each term column.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # The TF-IDF matrix is sparse; it is converted to a dense array before scaling.
    FunctionTransformer(lambda x: x.toarray(), accept_sparse=True),
    MinMaxScaler()
)
# Only fit here, as the SVD pipeline cells do: fit_transform would additionally
# build the full dense scaled matrix and echo it as the cell output, only for
# it to be thrown away.
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around LogisticRegression with its default settings.
clf = BinaryRelevance(classifier=LogisticRegression(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around LogisticRegression with its default settings.
clf = ClassifierChain(classifier=LogisticRegression(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:08<00:00,  6.88s/it, accuracy=0.4807405144200567, f1=0.6282350396507763, hamming=0.04339464644084685]

Fold Accuracy: 0.4807405144200567
Fold F1 Score: 0.6282350396507763
Fold Hamming Score: 0.04339464644084685





### SGD Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier trained with the logistic loss.
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier trained with the logistic loss.
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SVC with its default settings.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SVC with its default settings.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD SVC


In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier with its default settings.
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier with its default settings.
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:16<00:00,  1.65s/it, accuracy=0.5603292985960485, f1=0.6891410658346853, hamming=0.042584645091355657]

Fold Accuracy: 0.5603292985960485
Fold F1 Score: 0.6891410658346853
Fold Hamming Score: 0.042584645091355657





### XGBoost

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "xgb"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around XGBClassifier with its default settings.
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around XGBClassifier with its default settings.
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### Naive Bayes

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "nb"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around MultinomialNB with its default settings.
clf = BinaryRelevance(classifier=MultinomialNB(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around MultinomialNB with its default settings.
clf = ClassifierChain(classifier=MultinomialNB(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### Decision Tree

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "tree"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around DecisionTreeClassifier with its default settings.
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around DecisionTreeClassifier with its default settings.
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

# Stemming + Stopword

In [None]:
# Preprocessing tag recorded in every run-id dict of this section.
preprocess = 'stemstop'

# Labels: every column of the frame except the text column 'ALL'.
y_train = df_train.drop(columns=['ALL']).values
y_test = df_test.drop(columns=['ALL']).values

# Features: the 'ALL' text column passed through cleaning_text_stemstop.
X_train = cleaning_text_stemstop(df_train['ALL'].values)
X_test = cleaning_text_stemstop(df_test['ALL'].values)

## With SVD 100 components

In [None]:
# Feature-space tag recorded in every run-id dict of this section.
svd = 'svd100'
# TF-IDF (1- to 3-grams, terms present in at least 2 documents, sublinear tf)
# -> truncated SVD down to 100 components -> min-max scaling of each component.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # random_state pins the randomized SVD solver to the notebook seed so the
    # projection no longer depends on how much of the global NumPy RNG stream
    # earlier cells have consumed (i.e. on cell execution order).
    TruncatedSVD(n_components=100, n_iter=10, random_state=seed_val),
    MinMaxScaler()
)
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around logistic regression.
# max_iter is raised from the default of 100: on the stemmed runs built with
# this same pipeline configuration lbfgs stopped at that cap
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the solver gets more room here.
clf = BinaryRelevance(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Bukan CC


Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around logistic regression.
# max_iter is raised from the default of 100: on the stemmed runs built with
# this same pipeline configuration lbfgs stopped at that cap
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the solver gets more room here.
clf = ClassifierChain(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier trained with the logistic loss.
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier trained with the logistic loss.
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SVC with its default settings.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SVC with its default settings.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier with its default settings.
clf = BinaryRelevance(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier with its default settings.
clf = ClassifierChain(classifier=SGDClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### XGBoost

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "xgb"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around XGBClassifier with its default settings.
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around XGBClassifier with its default settings.
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### Decision Tree

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "tree"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around DecisionTreeClassifier with its default settings.
clf = BinaryRelevance(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around DecisionTreeClassifier with its default settings.
clf = ClassifierChain(classifier=DecisionTreeClassifier(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

## With SVD 250 components

In [None]:
# Feature-space tag recorded in every run-id dict of this section.
svd = 'svd250'
# TF-IDF (1- to 3-grams, terms present in at least 2 documents, sublinear tf)
# -> truncated SVD down to 250 components -> min-max scaling of each component.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # random_state pins the randomized SVD solver to the notebook seed so the
    # projection no longer depends on how much of the global NumPy RNG stream
    # earlier cells have consumed (i.e. on cell execution order).
    TruncatedSVD(n_components=250, n_iter=10, random_state=seed_val),
    MinMaxScaler()
)
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around logistic regression.
# max_iter is raised from the default of 100: on the stemmed runs built with
# this same pipeline configuration lbfgs stopped at that cap
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the solver gets more room here.
clf = BinaryRelevance(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around logistic regression.
# max_iter is raised from the default of 100: on the stemmed runs built with
# this same pipeline configuration lbfgs stopped at that cap
# ("TOTAL NO. of ITERATIONS REACHED LIMIT"), so the solver gets more room here.
clf = ClassifierChain(
    classifier=LogisticRegression(max_iter=1000),
    require_dense=[False, True]
)
# Tags naming this configuration; cross_val receives them as its final argument.
id = {
    'preprocess':preprocess,
    'svd':svd,
    'model':model,
    'multi':multi
    }
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD Logistic Regression

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "sgd_logreg"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SGDClassifier trained with the logistic loss.
clf = BinaryRelevance(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SGDClassifier trained with the logistic loss.
clf = ClassifierChain(classifier=SGDClassifier(loss='log_loss'), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SVC

In [None]:
# Model tag; the experiment cells below store it under the 'model' key of their run-id dict.
model = "svc"

Binary Relevance

In [None]:
multi = 'br'
# Binary Relevance wrapper around SVC with its default settings.
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
multi = 'cc'
# Classifier Chain wrapper around SVC with its default settings.
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Tags naming this configuration; cross_val receives them as its final argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD SVC

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "sgd_svc"

Binary Relevance

In [None]:
# Binary Relevance over SGDClassifier with its default (hinge) loss.
multi = 'br'
clf = BinaryRelevance(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over SGDClassifier with its default (hinge) loss.
multi = 'cc'
clf = ClassifierChain(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### XGBoost

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "xgb"

Binary Relevance

In [None]:
# Binary Relevance wrapper around a default XGBClassifier, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around a default XGBClassifier, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### Decision Tree

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "tree"

Binary Relevance

In [None]:
# Binary Relevance over a decision tree base classifier.
multi = 'br'
clf = BinaryRelevance(
    # Tree construction uses random numbers (feature permutation / tie-breaking);
    # seed it explicitly so the scores do not depend on the global NumPy stream.
    classifier=DecisionTreeClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over a decision tree base classifier.
multi = 'cc'
clf = ClassifierChain(
    # Tree construction uses random numbers (feature permutation / tie-breaking);
    # seed it explicitly so the scores do not depend on the global NumPy stream.
    classifier=DecisionTreeClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [02:06<00:00, 12.65s/it, accuracy=0.5107673720070891, f1=0.6616556137143047, hamming=0.04647983068526441]

Fold Accuracy: 0.5107673720070891
Fold F1 Score: 0.6616556137143047
Fold Hamming Score: 0.04647983068526441





## With SVD 500 components

In [None]:
# Feature pipeline for this section: word 1-3-gram TF-IDF -> 500-component
# truncated SVD -> min-max scaling. `svd` is the tag stored in each run descriptor.
svd = 'svd500'
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # The default randomized solver draws random numbers; seed it explicitly so the
    # projection does not depend on how much of the global NumPy stream was consumed
    # by previously executed cells.
    TruncatedSVD(n_components=500, n_iter=10, random_state=seed_val),
    MinMaxScaler()
)
# NOTE(review): the pipeline is fitted on all of X_train before cross-validation.
# If cross_val only calls pipeline.transform per fold, validation folds leak into
# the IDF weights / SVD basis / scaling range -- confirm against cross_val.
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "logreg"

Binary Relevance

In [None]:
# Binary Relevance wrapper around LogisticRegression, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=LogisticRegression(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around LogisticRegression, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=LogisticRegression(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD Logistic Regression

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "sgd_logreg"

Binary Relevance

In [None]:
# Binary Relevance over an SGD-trained classifier with logistic loss.
multi = 'br'
clf = BinaryRelevance(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(loss='log_loss', random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over an SGD-trained classifier with logistic loss.
multi = 'cc'
clf = ClassifierChain(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(loss='log_loss', random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SVC

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "svc"

Binary Relevance

In [None]:
# Binary Relevance wrapper around a default SVC, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around a default SVC, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD SVC

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "sgd_svc"

Binary Relevance

In [None]:
# Binary Relevance over SGDClassifier with its default (hinge) loss.
multi = 'br'
clf = BinaryRelevance(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over SGDClassifier with its default (hinge) loss.
multi = 'cc'
clf = ClassifierChain(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [01:06<00:00,  6.65s/it, accuracy=0.529406856315211, f1=0.6838507612732523, hamming=0.04764283560629035]

Fold Accuracy: 0.529406856315211
Fold F1 Score: 0.6838507612732523
Fold Hamming Score: 0.04764283560629035





### XGBoost

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "xgb"

Binary Relevance

In [None]:
# Binary Relevance wrapper around a default XGBClassifier, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around a default XGBClassifier, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### Decision Tree

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "tree"

Binary Relevance

In [None]:
# Binary Relevance over a decision tree base classifier.
multi = 'br'
clf = BinaryRelevance(
    # Tree construction uses random numbers (feature permutation / tie-breaking);
    # seed it explicitly so the scores do not depend on the global NumPy stream.
    classifier=DecisionTreeClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over a decision tree base classifier.
multi = 'cc'
clf = ClassifierChain(
    # Tree construction uses random numbers (feature permutation / tie-breaking);
    # seed it explicitly so the scores do not depend on the global NumPy stream.
    classifier=DecisionTreeClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

## Without SVD

In [None]:
# Feature pipeline for this section: word 1-3-gram TF-IDF, densified, then min-max
# scaled (no dimensionality reduction). `svd` is the tag stored in each run descriptor.
svd = 'non'
pipeline = make_pipeline(
    TfidfVectorizer(min_df=2, sublinear_tf=True, ngram_range=(1, 3)),
    # MinMaxScaler does not accept sparse input, so convert the TF-IDF matrix to dense.
    FunctionTransformer(lambda x: x.toarray(), accept_sparse=True),
    MinMaxScaler()
)
# Only the fitted state is needed here. Use fit() like the SVD sections do, instead
# of fit_transform(), which additionally builds (and displays) a full dense scaled
# copy of the training matrix that is immediately thrown away.
# NOTE(review): the pipeline is fitted on all of X_train before cross-validation.
# If cross_val only calls pipeline.transform per fold, validation folds leak into
# the IDF weights / scaling range -- confirm against cross_val.
pipeline.fit(X_train)

### Logistic Regression

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "logreg"

Binary Relevance

In [None]:
# Binary Relevance wrapper around LogisticRegression, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=LogisticRegression(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around LogisticRegression, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=LogisticRegression(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD Logistic Regression

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "sgd_logreg"

Binary Relevance

In [None]:
# Binary Relevance over an SGD-trained classifier with logistic loss.
multi = 'br'
clf = BinaryRelevance(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(loss='log_loss', random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over an SGD-trained classifier with logistic loss.
multi = 'cc'
clf = ClassifierChain(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(loss='log_loss', random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SVC

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "svc"

Binary Relevance

In [None]:
# Binary Relevance wrapper around a default SVC, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=SVC(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around a default SVC, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=SVC(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### SGD SVC

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "sgd_svc"

Binary Relevance

In [None]:
# Binary Relevance over SGDClassifier with its default (hinge) loss.
multi = 'br'
clf = BinaryRelevance(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over SGDClassifier with its default (hinge) loss.
multi = 'cc'
clf = ClassifierChain(
    # SGD shuffles the training data each epoch; seed it explicitly so the scores
    # do not depend on the state of the global NumPy random stream.
    classifier=SGDClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### XGBoost

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "xgb"

Binary Relevance

In [None]:
# Binary Relevance wrapper around a default XGBClassifier, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=XGBClassifier(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around a default XGBClassifier, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

### Naive Bayes

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "nb"

Binary Relevance

In [None]:
# Binary Relevance wrapper around MultinomialNB, evaluated through cross_val.
multi = "br"
clf = BinaryRelevance(classifier=MultinomialNB(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain wrapper around MultinomialNB, evaluated through cross_val.
multi = "cc"
clf = ClassifierChain(classifier=MultinomialNB(), require_dense=[False, True])
# Run descriptor handed to cross_val as its final positional argument.
id = dict(preprocess=preprocess, svd=svd, model=model, multi=multi)
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s, accuracy=0.6000966665717827, f1=0.7256583320149329, hamming=0.03725844807020641]

Fold Accuracy: 0.6000966665717827
Fold F1 Score: 0.7256583320149329
Fold Hamming Score: 0.03725844807020641





### Decision Tree

In [None]:
# Model tag stored under the 'model' key of the run descriptors built in the cells below.
model = "tree"

Binary Relevance

In [None]:
# Binary Relevance over a decision tree base classifier.
multi = 'br'
clf = BinaryRelevance(
    # Tree construction uses random numbers (feature permutation / tie-breaking);
    # seed it explicitly so the scores do not depend on the global NumPy stream.
    classifier=DecisionTreeClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Classifier Chain

In [None]:
# Classifier Chain over a decision tree base classifier.
multi = 'cc'
clf = ClassifierChain(
    # Tree construction uses random numbers (feature permutation / tie-breaking);
    # seed it explicitly so the scores do not depend on the global NumPy stream.
    classifier=DecisionTreeClassifier(random_state=seed_val),
    require_dense=[False, True]
)
# Tags identifying this configuration; passed to cross_val as its last argument.
id = {
    'preprocess': preprocess,
    'svd': svd,
    'model': model,
    'multi': multi
}
# The positional 10 is presumably the fold count -- confirm against cross_val.
clf = cross_val(clf, X_train, y_train, 10, pipeline, id)

Fold: 100%|██████████| 10/10 [00:22<00:00,  2.20s/it, accuracy=0.5256517145768251, f1=0.6930786821353986, hamming=0.04705635711598437]

Fold Accuracy: 0.5256517145768251
Fold F1 Score: 0.6930786821353986
Fold Hamming Score: 0.04705635711598437



