## Training Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
import pickle
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [None]:
# Load the labelled training set.
# NOTE(review): a later cell writes the preprocessed frame back to this same
# path, so re-running the notebook from the top would clean and stem text that
# has already been processed — confirm that is intended.
data = pd.read_csv('./spam_training_data.csv')

In [4]:
# Fetch the NLTK stop-word corpus read by stopwords.words() in the next cell.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Indonesian stop words, kept in a set so the membership test in
# remove_stopwords() is a hash lookup rather than a list scan.
stop_words = set(stopwords.words("indonesian"))

In [6]:
# Build the Sastrawi stemmer used by remove_stopwords().
# NOTE: despite the variable name, this object is a stemmer (it comes from
# create_stemmer() and is called through .stem()), not a lemmatizer.
factory = StemmerFactory()
lemmatizer = factory.create_stemmer()

In [7]:
def clean_complaint(complaint):
    """Normalize a raw complaint string before stop-word removal.

    Steps, in order: drop @mentions, drop URLs, drop invisible filler
    characters, drop punctuation/symbols, then collapse whitespace.

    Args:
        complaint: Raw complaint text. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) are treated as empty.

    Returns:
        The cleaned text with single spaces between words and no leading or
        trailing whitespace; '' when the input is not a string.
    """
    # re.sub raises TypeError on NaN/None, so treat missing text as empty.
    if not isinstance(complaint, str):
        return ''
    # Remove usernames after '@'
    complaint = re.sub(r'@\w+', '', complaint)
    # Remove URLs
    complaint = re.sub(r'http\S+', '', complaint)
    # Remove zero-width characters and the Hangul filler (U+3164). This must run
    # before whitespace is collapsed: U+3164 is a Unicode letter, so the
    # punctuation pattern below does not touch it, and deleting it afterwards
    # would leave double or leading/trailing spaces behind.
    complaint = re.sub(r'[\u200B-\u200D\uFEFF\u3164]+', '', complaint)
    # Remove punctuation and special symbols (commas, periods, etc.)
    complaint = re.sub(r'[^\w\s]', '', complaint)
    # Collapse runs of whitespace last so every removal above is accounted for
    complaint = re.sub(r'\s+', ' ', complaint).strip()
    return complaint

In [8]:
def remove_stopwords(complaint):
    """Lower-case a complaint, drop stop words, and stem the remaining words.

    Args:
        complaint: Text to process; it is split on whitespace after being
            lower-cased.

    Returns:
        The surviving words, each passed through ``lemmatizer.stem``, joined
        by single spaces.
    """
    stemmed_tokens = []
    for token in complaint.lower().split():
        # Words found in the module-level stop_words set are discarded
        if token in stop_words:
            continue
        # Reduce the kept word to its stem
        stemmed_tokens.append(lemmatizer.stem(token))
    return ' '.join(stemmed_tokens)

In [9]:
# Apply clean_complaint to every row; the 'complaint' column is overwritten,
# so the raw text is no longer available in `data` after this cell.
data['complaint'] = data['complaint'].apply(clean_complaint)

In [10]:
# Lower-case, drop stop words and stem each cleaned complaint (again in place).
data['complaint'] = data['complaint'].apply(remove_stopwords)

In [11]:
# Fetch the 'punkt' tokenizer data — presumably what word_tokenize in the next
# cell relies on; confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thori\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Split each preprocessed complaint into word tokens with NLTK's word_tokenize
# and keep the result in a separate column; 'complaint' itself stays a string.
data['tokenized_complaint'] = data['complaint'].apply(word_tokenize)

In [13]:
# Persist the preprocessed frame without the index.
# NOTE(review): this is the same path the notebook reads its input from, so the
# original raw text is replaced on disk by the cleaned/stemmed version.
data.to_csv('./spam_training_data.csv', index=False)

## Logistic Regression Model

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

In [18]:
# Copy of the preprocessed text column; this is the input handed to the
# TF-IDF vectorizer in the next cell.
data['vectorized'] = data['complaint']

In [19]:
# Learn the TF-IDF vocabulary and IDF weights and build the document-term
# matrix in a single call.
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/validation split further down, so validation text contributes to the
# learned vocabulary and IDF statistics.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['vectorized'])

In [20]:
# Target labels; the classification reports below show two classes, 0 and 1.
y = data['is_complaint']

In [21]:
# Hold out 20% of the rows for validation with a fixed seed for repeatability.
# NOTE(review): the split is not stratified even though the classes are
# imbalanced (151 vs 40 rows in the validation report) — consider stratify=y.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Fit a logistic-regression classifier on the TF-IDF training features;
# max_iter is raised to 1000 iterations for the solver.
# NOTE(review): the validation report below shows recall of 0.03 for class 1,
# so this model almost never predicts the positive class — worth revisiting
# (e.g. class weighting) before relying on it.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [43]:
# Serialize the fitted classifier first and then the TF-IDF vectorizer, so the
# pair that was trained together can be reloaded together for inference.
for artifact_path, artifact in (
    ('logistic_regression_model.pkl', clf),
    ('tfidf_vectorizer.pkl', vectorizer),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [24]:
# Predict labels for the held-out validation features with the logistic model.
y_pred = clf.predict(X_val)

In [25]:
# Per-class precision/recall/F1 for the logistic-regression predictions.
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.89       151
           1       1.00      0.03      0.05        40

    accuracy                           0.80       191
   macro avg       0.90      0.51      0.47       191
weighted avg       0.84      0.80      0.71       191



## SVM Model

In [36]:
from sklearn.svm import LinearSVC

In [37]:
# Train a linear SVM with default settings on the same TF-IDF training split
# used for the logistic-regression model.
model_SVM = LinearSVC()
model_SVM.fit(X_train, y_train)

In [41]:
# Predict labels for the held-out validation features with the SVM.
y_pred_SVM = model_SVM.predict(X_val)

In [42]:
# Per-class precision/recall/F1 for the SVM predictions.
print(classification_report(y_val, y_pred_SVM))

              precision    recall  f1-score   support

           0       0.84      0.94      0.89       151
           1       0.59      0.33      0.42        40

    accuracy                           0.81       191
   macro avg       0.72      0.63      0.65       191
weighted avg       0.79      0.81      0.79       191



In [44]:
# Persist the fitted linear SVM. The object written here must be model_SVM:
# dumping `clf` would store the logistic-regression model under the SVM file name.
with open('svm_model.pkl', 'wb') as model_file:
    pickle.dump(model_SVM, model_file)

## Random Forest Model

## Naive Bayes Model

## XGBoost Model

## Simple Neural Network

## Bidirectional LSTM

## Test

In [26]:
# Load the scraped complaints to be labelled by the saved model. The path is
# relative to the notebook's working directory.
df = pd.read_csv('../../scraper/data_x/merged_data.csv')

In [27]:
# Same cleaning step that was applied to the training text (overwrites the column).
df['complaint'] = df['complaint'].apply(clean_complaint)

In [28]:
# Same lower-casing, stop-word removal and stemming that was applied to the training text.
df['complaint'] = df['complaint'].apply(remove_stopwords)

In [29]:
# Split each preprocessed complaint into word tokens; kept as a separate
# column and not used by the vectorizer below, which reads the string column.
df['tokenized_complaint'] = df['complaint'].apply(word_tokenize)

In [30]:
# Copy of the preprocessed text column, mirroring the column name used when
# the vectorizer was fitted on the training data.
df['vectorized'] = df['complaint']

In [31]:
# Reload the logistic-regression classifier and the TF-IDF vectorizer that an
# earlier cell pickled to these same file names.
# NOTE: pickle.load executes code embedded in the file, so only load pickles
# produced by this notebook or another trusted source.
with open('logistic_regression_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

In [32]:
# Transform only (no refit), so the new text is projected onto the vocabulary
# stored in the reloaded vectorizer.
x = loaded_vectorizer.transform(df['vectorized'])

In [33]:
# Predict a label for every scraped complaint.
# NOTE(review): this rebinds `y`, which earlier in the notebook holds the
# training labels; re-running the train/validation split after this cell would
# pick up these predictions instead.
y = loaded_model.predict(x)

In [34]:
# Attach the predicted labels to the scraped data as a new column.
df['is_complaint']=y

In [35]:
# Show only the rows the model predicted as class 1.
df[df['is_complaint']==1]

Unnamed: 0,complaint,category,tokenized_complaint,vectorized,is_complaint
1111,macet fasilitas bersih program andra soni bara...,fasilitas umum,"[macet, fasilitas, bersih, program, andra, son...",macet fasilitas bersih program andra soni bara...,1
1131,denda ngerusak ngehancurin fasilitas,fasilitas umum,"[denda, ngerusak, ngehancurin, fasilitas]",denda ngerusak ngehancurin fasilitas,1
1217,fasilitas bagus ga rawat perintah masyarakat r...,fasilitas umum,"[fasilitas, bagus, ga, rawat, perintah, masyar...",fasilitas bagus ga rawat perintah masyarakat r...,1
1296,ngamuk rusak fasilitas ya,fasilitas umum,"[ngamuk, rusak, fasilitas, ya]",ngamuk rusak fasilitas ya,1
1431,polusi suara ngerusak fasilitas pribadi orang ...,fasilitas umum,"[polusi, suara, ngerusak, fasilitas, pribadi, ...",polusi suara ngerusak fasilitas pribadi orang ...,1
1499,sih ngerusak fasilitas ya,fasilitas umum,"[sih, ngerusak, fasilitas, ya]",sih ngerusak fasilitas ya,1
1581,dilaporinkan kalo ngerusak fasilitas,fasilitas umum,"[dilaporinkan, kalo, ngerusak, fasilitas]",dilaporinkan kalo ngerusak fasilitas,1
1740,gak fasilitas pakai goblok emang fasilitas ana...,fasilitas umum,"[gak, fasilitas, pakai, goblok, emang, fasilit...",gak fasilitas pakai goblok emang fasilitas ana...,1
1771,hobi orang tolol suka ngerusak fasilitas gak m...,fasilitas umum,"[hobi, orang, tolol, suka, ngerusak, fasilitas...",hobi orang tolol suka ngerusak fasilitas gak m...,1
1815,bangun kota nusantara akses didik sehat fasili...,fasilitas umum,"[bangun, kota, nusantara, akses, didik, sehat,...",bangun kota nusantara akses didik sehat fasili...,1
