In [115]:
import pickle
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer

In [116]:
# Labelled training set, read relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='./spam_training_data.csv')

In [117]:
# Fetch NLTK's stop-word corpus; it backs the stopwords.words("indonesian") lookup.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thori\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [118]:
# Indonesian stop words as a set, so membership checks are O(1) per token.
stop_words = {*stopwords.words("indonesian")}

In [119]:
# Build the Sastrawi stemmer once and reuse it for every row.
# NOTE: despite its name, `lemmatizer` is a stemmer (created by create_stemmer
# and used through its .stem() method).
factory = StemmerFactory()
lemmatizer = factory.create_stemmer()

In [120]:
def clean_complaint(complaint):
    """Normalize a raw complaint string before stop-word removal.

    Steps, in order: drop ``@mentions``, drop ``http...`` URLs, drop every
    character that is neither a word character nor whitespace, drop invisible
    filler characters, then collapse whitespace runs to one space and trim.

    Args:
        complaint: Raw text. Non-string values (e.g. the ``NaN`` pandas
            produces for empty CSV cells) are treated as empty text.

    Returns:
        The cleaned text, or ``''`` if nothing is left or the input was not
        a string.
    """
    # re.sub raises TypeError on NaN/None; treat missing text as empty instead.
    if not isinstance(complaint, str):
        return ''

    # Remove usernames after '@'
    complaint = re.sub(r'@\w+', '', complaint)
    # Remove URLs
    complaint = re.sub(r'http\S+', '', complaint)
    # Remove punctuation and special symbols (commas, periods, etc.)
    complaint = re.sub(r'[^\w\s]', '', complaint)
    # Remove zero-width / filler characters like 'ㅤ' (U+3164 counts as a
    # letter for \w, so the punctuation step above does not catch it).
    # This must run BEFORE whitespace normalization: deleting a filler that
    # sits between two spaces would otherwise leave a double or leading space.
    complaint = re.sub(r'[\u200B-\u200D\uFEFF\u3164]+', '', complaint)
    # Collapse whitespace runs and trim the ends (last, so the result is final)
    complaint = re.sub(r'\s+', ' ', complaint).strip()
    return complaint

In [121]:
def remove_stopwords(complaint):
    """Lower-case the text, drop stop words, and stem the remaining words.

    Uses the module-level ``stop_words`` collection for filtering and the
    module-level ``lemmatizer`` object (its ``stem`` method) on each
    surviving word. Returns the stems joined by single spaces.
    """
    stems = []
    for token in complaint.lower().split():
        # Skip stop words before paying for the stemmer call.
        if token in stop_words:
            continue
        stems.append(lemmatizer.stem(token))
    return ' '.join(stems)

In [122]:
# Strip mentions, URLs, punctuation and stray whitespace from every complaint.
data['complaint'] = data['complaint'].map(clean_complaint)

In [123]:
# Lower-case, drop stop words and stem each cleaned complaint.
data['complaint'] = data['complaint'].map(remove_stopwords)

In [124]:
# word_tokenize needs the Punkt tokenizer data. NLTK 3.8.2+ looks it up under
# the 'punkt_tab' resource instead of the pickled 'punkt' package, so fetch
# both to keep the tokenization cells working across NLTK versions.
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thori\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [125]:
# Split every preprocessed complaint into a list of tokens.
data['tokenized_complaint'] = data['complaint'].map(word_tokenize)

In [126]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [127]:
# Copy the preprocessed text into the 'vectorized' column that is passed to the
# vectorizer. No numeric vectorization happens here; the column still holds strings.
data['vectorized'] = data['complaint']

In [194]:
# Learn the TF-IDF vocabulary/weights and build the feature matrix in one pass
# (fit_transform is equivalent to fit followed by transform on the same data).
# NOTE(review): the vectorizer is fitted on the full dataset before the
# train/validation split, so validation text influences the vocabulary and IDF
# weights — consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['vectorized'])

In [195]:
# Target labels: the 'is_complaint' column of the training frame.
y = data['is_complaint']

In [196]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [197]:
# Fit a logistic-regression classifier on the training split, allowing the
# solver up to 1000 iterations.
# NOTE(review): the recorded validation report shows recall of 0.03 for class 1;
# an option such as class_weight='balanced' may be worth evaluating.
clf = LogisticRegression(max_iter=1000)
clf.fit(X=X_train, y=y_train)

In [198]:
# Persist the fitted classifier and vectorizer next to the notebook so they can
# be reloaded for inference without retraining.
for artifact, artifact_path in (
    (clf, 'complaint_detection_model.pkl'),
    (vectorizer, 'tfidf_vectorizer.pkl'),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

In [199]:
# Predict labels for the held-out validation split.
y_pred = clf.predict(X_val)

In [200]:
# Per-class precision/recall/F1 on the validation split.
print(classification_report(y_true=y_val, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.79      1.00      0.89       151
           1       1.00      0.03      0.05        40

    accuracy                           0.80       191
   macro avg       0.90      0.51      0.47       191
weighted avg       0.84      0.80      0.71       191



In [218]:
# Unlabelled data to classify, read from the scraper's merged CSV.
df = pd.read_csv(filepath_or_buffer='../../scraper/data_x/merged_data.csv')

In [219]:
# Apply the same text cleaning that was used on the training data.
df['complaint'] = df['complaint'].map(clean_complaint)

In [None]:
# Apply the same stop-word removal and stemming that was used on the training data.
df['complaint'] = df['complaint'].map(remove_stopwords)

In [None]:
# Split every preprocessed complaint into a list of tokens.
df['tokenized_complaint'] = df['complaint'].map(word_tokenize)

In [None]:
# Copy the preprocessed text into the 'vectorized' column that is passed to the
# loaded vectorizer. The column still holds strings at this point.
df['vectorized'] = df['complaint']

In [None]:
# Load the classifier and TF-IDF vectorizer written by the training cells.
# The classifier is saved as 'complaint_detection_model.pkl'; the name used
# here before ('logistic_regression_model.pkl') is never produced by this
# notebook, so a fresh run would fail or pick up a stale, unrelated file.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# artifacts you created yourself.
with open('complaint_detection_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    loaded_vectorizer = pickle.load(vectorizer_file)

In [None]:
# Vectorize the new text with the already-fitted vectorizer (transform only,
# no refit), so features line up with what the model was trained on.
x = loaded_vectorizer.transform(df['vectorized'])

In [None]:
# Predict one label per row of df.
# NOTE(review): this rebinds `y`, which earlier held the training labels
# (data['is_complaint']); re-running the training cells afterwards needs `y` reset.
y = loaded_model.predict(x)

In [None]:
# Display the predicted labels.
y