In [75]:
import pandas as pd
import nltk
import pickle
import datetime
import os

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

In [73]:
# Input table to check; the loading cell reads it as CSV or Excel depending on its extension.
INPUT = 'sqllab_kbli_kbji_desc_20240822T013131.xlsx'
# Excel workbook read into `spv`; supplies the `pml` column and is joined to the input on `idbs`.
SPV = 'wilayah_tugas_sak_82024.xlsx'
# Directory that receives the dated result workbooks (created when it does not exist).
OUTPUT = 'hasil_pemeriksaan_KBLI_KBJI'

In [65]:
# Load the input table, keeping every column as text (dtype='str').
# The extension is compared case-insensitively so that a file such as
# 'DATA.CSV' is still parsed as CSV instead of being handed to the Excel reader.
if os.path.splitext(INPUT)[1].lower() == '.csv':
    df = pd.read_csv(INPUT, dtype='str')
else:
    df = pd.read_excel(INPUT, dtype='str')

In [17]:
# Shared resources for preprocess_text(): a Sastrawi stemmer and the NLTK
# Indonesian stop-word list (held as a set for fast membership tests).
# One-time setup if the NLTK data is not installed yet:
#   nltk.download('punkt')
#   nltk.download('stopwords')
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stopwords = set(nltk.corpus.stopwords.words('indonesian'))

def preprocess_text(text):
    """Tokenize ``text``, drop stop words and stem the remaining tokens.

    Parameters
    ----------
    text : str or missing value
        Raw text of one cell. ``None``/NaN/``pd.NA`` are treated as empty
        text; any other non-string value is converted with ``str()``.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (``""`` for missing input).
    """
    if not isinstance(text, str):
        # Blank spreadsheet cells stay NaN even when the file is read with
        # dtype='str'; the tokenizer would raise on them, so map them to "".
        if pd.isna(text):
            return ""
        text = str(text)

    tokens = nltk.word_tokenize(text)
    # Stop-word matching is case-insensitive; the stemmer receives the token
    # exactly as tokenized.
    cleaned_tokens = [stemmer.stem(token) for token in tokens if token.lower() not in stopwords]
    return " ".join(cleaned_tokens)

In [66]:
# Clean each free-text column separately, then glue them into one string per
# row, prefixing every part with the name of the column it came from.
usaha_clean = df['usaha'].apply(preprocess_text)
produk_clean = df['produk'].apply(preprocess_text)
bidang_clean = df['bidang'].apply(preprocess_text)

df['combined_features'] = (
    'usaha ' + usaha_clean
    + ' produk ' + produk_clean
    + ' bidang ' + bidang_clean
)

# X is the text handed to the vectorizers; y is the recorded KBLI code.
X = df['combined_features']
y = df['kbli']

In [19]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    Unpickling can execute arbitrary code, so this must only be pointed at
    trusted files.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Stored vectorizer, tf-idf transformer and classifier for KBLI.
kbli_count_vect = _load_pickle('parameters/svm_kbli_count_vectorizer.pkl')
kbli_tfidf_transformer = _load_pickle('parameters/svm_kbli_tfidf_transformer.pkl')
kbli_model = _load_pickle('parameters/svm_kbli_model.pkl')

# Stored vectorizer, tf-idf transformer and classifier for KBJI.
kbji_count_vect = _load_pickle('parameters/svm_kbji_count_vectorizer.pkl')
kbji_tfidf_transformer = _load_pickle('parameters/svm_kbji_tfidf_transformer.pkl')
kbji_model = _load_pickle('parameters/svm_kbji_model.pkl')

In [67]:
# KBLI pipeline: token counts -> tf-idf weights -> predicted code.
X_kbli_counts = kbli_count_vect.transform(X)
X_kbli_tfidf = kbli_tfidf_transformer.transform(X_kbli_counts)
kbli_predictions = kbli_model.predict(X_kbli_tfidf)

# KBJI pipeline: the same input text, run through the KBJI artefacts.
X_kbji_counts = kbji_count_vect.transform(X)
X_kbji_tfidf = kbji_tfidf_transformer.transform(X_kbji_counts)
kbji_predictions = kbji_model.predict(X_kbji_tfidf)

In [68]:
# Store both model outputs next to the recorded codes, row for row.
df['kbli_predicted'] = kbli_predictions
df['kbji_predicted'] = kbji_predictions

In [69]:
# Compare recorded and predicted codes once, then label every row with one of
# four outcomes. Rows where neither code agrees keep the default label.
kbli_same = df['kbli'] == df['kbli_predicted']
kbji_same = df['kbji'] == df['kbji_predicted']

df['match'] = 'KBLI dan KBJI berbeda'
df.loc[kbli_same & kbji_same, 'match'] = 'KBLI dan KBJI sama'
df.loc[kbli_same & ~kbji_same, 'match'] = 'KBJI berbeda'
df.loc[~kbli_same & kbji_same, 'match'] = 'KBLI berbeda'

In [70]:
# Attach the supervisor table to the checked records.
# `idbs` is read as text because the records were loaded with dtype='str';
# a numeric `idbs` here would make the merge fail on mismatched key types.
# `pml` is read as text as well so that the .str accessor below always applies.
spv = pd.read_excel(SPV, dtype={'idbs': 'str', 'pml': 'str'})
# Keep only the part of `pml` before the first comma.
spv['pml'] = spv['pml'].str.split(',').str[0]
# Inner join on `idbs`: rows present in only one of the two tables are dropped.
# The helper text column is removed so it does not end up in the exported files.
df = pd.merge(spv, df.drop('combined_features', axis=1), on='idbs')

In [77]:
# Write the results: one workbook with every row plus one workbook per `pml`.
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(OUTPUT, exist_ok=True)

# Today's date prefixes every file name written by this run.
date = datetime.date.today().strftime("%Y-%m-%d")

df.to_excel(f'{OUTPUT}/{date}_complete.xlsx', index=False)
# groupby splits the frame in a single pass and leaves out rows whose `pml` is
# missing, so no empty "<date>_nan.xlsx" is written. sort=False keeps the
# groups in order of first appearance.
for pml, pml_rows in df.groupby('pml', sort=False):
    pml_rows.to_excel(f'{OUTPUT}/{date}_{pml}.xlsx', index=False)