In [None]:
import os
import re
import string
import subprocess
import sys
import unicodedata

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import *
from indoNLP.preprocessing import *
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from onnx.checker import check_model
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType

In [None]:
# Use the local checkout when a sibling "../data/" directory exists;
# otherwise read the files straight from the GitHub raw URL.
main_dir = (
    "../"
    if os.path.isdir("../data/")
    else "https://raw.githubusercontent.com/Hyuto/skripsi/master/"
)

# Seed passed as `random_state` to the models further down.
SEED = 2022

In [None]:
# `main_dir` may be a local path or a URL prefix, so the location is built by
# plain string formatting rather than with path-joining helpers.
data = pd.read_csv(f"{main_dir}data/sample-data.csv")
data.head()

In [None]:
# Drop rows with any missing value, then normalise the column types:
#   * "date"  -> datetime with the timezone information stripped
#   * "label" -> plain integers
data = data.dropna().assign(
    date=lambda frame: pd.to_datetime(frame["date"]).dt.tz_localize(None),
    label=lambda frame: frame["label"].astype(int),
)
# Optional filter, currently disabled: keep only rows with a positive label.
# data = data[data["label"] > 0]
data.info()

In [None]:
# Sastrawi stemmer, built once and shared by every `preprocessing` call.
STEMMER = StemmerFactory().create_stemmer()

# Translation table that turns every ASCII digit and punctuation character
# into a single space.
_STRIP_CHARS = string.digits + string.punctuation
_TO_SPACE = str.maketrans(_STRIP_CHARS, " " * len(_STRIP_CHARS))


def preprocessing(text):
    """Turn one raw text into a cleaned, stemmed, space-separated string.

    The steps run in this order:

    1. lowercase the text and collapse every whitespace run into one space;
    2. pass it through ``emoji_to_words`` (going by its name, this converts
       emoji into words rather than deleting them);
    3. NFD-decompose the text and drop whatever cannot be encoded as ASCII;
    4. apply ``remove_html`` and ``remove_url``;
    5. apply ``replace_word_elongation`` and ``replace_slang``;
    6. replace digits and ASCII punctuation with spaces;
    7. stem the result with ``STEMMER`` and collapse whitespace once more.

    The helper functions in steps 2, 4 and 5 come from the notebook's star
    imports (presumably ``indoNLP.preprocessing``).

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string, with words separated by single spaces.
    """
    cleaned = re.sub(r"\s+", " ", text.lower(), flags=re.UNICODE)
    cleaned = emoji_to_words(cleaned)

    # Decomposing first lets accented letters keep their base letter when the
    # non-ASCII bytes are discarded.
    decomposed = unicodedata.normalize("NFD", cleaned)
    cleaned = decomposed.encode("ascii", "ignore").decode("ascii")

    # @mentions are not stripped explicitly; only the "@" itself disappears
    # later, because it is part of `string.punctuation`.
    cleaned = remove_url(remove_html(cleaned))
    cleaned = replace_slang(replace_word_elongation(cleaned))

    words = cleaned.translate(_TO_SPACE).split()
    stemmed = STEMMER.stem(" ".join(words))
    return " ".join(stemmed.split())


# Clean every entry of the "content" column (with a progress bar) and store
# the results in a new "cleaned" column.
data["cleaned"] = list(map(preprocessing, tqdm(data["content"].values)))

In [None]:
# Hold-out split of the cleaned texts and their labels.
# The x_train / x_test / y_train / y_test names used below were not defined in
# any earlier cell, so this cell failed with NameError on a fresh kernel.
# NOTE(review): the 80/20 stratified split is an assumption — confirm that it
# matches the split used for the reported results.
x_train, x_test, y_train, y_test = train_test_split(
    data["cleaned"],
    data["label"],
    test_size=0.2,
    stratify=data["label"],
    random_state=SEED,
)

# TF-IDF features (vocabulary capped at 5000 terms) feeding a linear-kernel SVM.
# `probability=True` makes the fitted SVM expose class probabilities, and
# `class_weight="balanced"` reweights classes by their frequency.
pipe_linear = Pipeline(
    [
        ("tf-idf", TfidfVectorizer(max_features=5000)),
        (
            "svm",
            SVC(
                C=1.3, kernel="linear", probability=True, class_weight="balanced", random_state=SEED
            ),
        ),
    ]
)

pipe_linear.fit(x_train, y_train)
# Per-class precision / recall / F1 on the held-out set, one row per class.
pd.DataFrame(classification_report(y_test, pipe_linear.predict(x_test), output_dict=True)).T

In [None]:
# Same TF-IDF + linear SVM setup, but with the vocabulary capped at 1000 terms.
# `Pipeline.fit` returns the pipeline itself, so building and fitting are chained.
pipe_linear_small = Pipeline(
    steps=[
        ("tf-idf", TfidfVectorizer(max_features=1000)),
        (
            "svm",
            SVC(
                kernel="linear",
                C=1.3,
                class_weight="balanced",
                probability=True,
                random_state=SEED,
            ),
        ),
    ]
).fit(x_train, y_train)

# Classification report on the held-out set, shown with one row per class.
pd.DataFrame(
    classification_report(
        y_true=y_test,
        y_pred=pipe_linear_small.predict(x_test),
        output_dict=True,
    )
).transpose()

In [None]:
def convert2onnx(model, output_name):
    """Export a fitted pipeline to ONNX, then convert that file to ORT format.

    The ONNX graph is validated with ``check_model`` *before* it is written, so
    an invalid model never ends up on disk. The ORT conversion runs
    ``onnxruntime.tools.convert_onnx_models_to_ort`` with the same Python
    interpreter as the kernel (``sys.executable``), and its output is echoed
    into the notebook.

    Args:
        model: fitted scikit-learn pipeline containing a step named ``"svm"``.
        output_name: suffix for the file name; the model is written to
            ``output/model-{output_name}.onnx``.

    Raises:
        subprocess.CalledProcessError: if the ORT conversion exits non-zero.
    """
    os.makedirs("output", exist_ok=True)
    onnx_path = f"output/model-{output_name}.onnx"

    # One string input named "words" with shape (batch, 1).
    initial_type = [("words", StringTensorType([None, 1]))]
    # Turn off the ZipMap output option for the "svm" step.
    options = {"svm": {"zipmap": False}}
    onnx_model = convert_sklearn(model, initial_types=initial_type, options=options)

    check_model(onnx_model)
    with open(onnx_path, "wb") as writer:
        writer.write(onnx_model.SerializeToString())

    # Output is captured and printed because a child process's stdout/stderr
    # are not shown in the notebook cell otherwise.
    result = subprocess.run(
        [sys.executable, "-m", "onnxruntime.tools.convert_onnx_models_to_ort", onnx_path],
        capture_output=True,
        text=True,
    )
    print(result.stdout, result.stderr, sep="")
    result.check_returncode()
    
# Only the small pipeline is exported; the export of the larger one is disabled.
# convert2onnx(model=pipe_linear, output_name="svm-linear")
convert2onnx(model=pipe_linear_small, output_name="svm-linear-small")