In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from jcopml.tuning import grid_search_params as gsp
from jcopml.feature_importance import mean_score_decrease
from jcopml.pipeline import num_pipe, cat_pipe

from luwiji.text_proc import illustration

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

sw_indo = stopwords.words("indonesian") + list(punctuation)

# Import data


In [3]:
df = pd.read_csv("data/spam.csv")
df.head()

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,1


# Dataset splitting

In [4]:
X = df.Teks
y = df.label

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((914,), (229,), (914,), (229,))

# Training


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from jcopml.tuning import random_search_params as rsp


In [6]:
pipeline = Pipeline([
    ("prep", TfidfVectorizer(tokenizer=word_tokenize, stop_words=sw_indo)),
    ("algo", LogisticRegression(solver="lbfgs", n_jobs=-1, random_state=42))
])

model = RandomizedSearchCV(pipeline, rsp.logreg_params, cv=3, n_iter=50, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

print(model.best_params_)
print(model.score(X_train, y_train))
print(model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.8min finished
  'stop_words.' % sorted(inconsistent))


{'algo__C': 3.54605033321065, 'algo__fit_intercept': True}
0.9978118161925602
0.982532751091703


# Sanity check

In [7]:
text = ["sayang belikan aku pulsa dinomor ini 089654857415, nanti aq bayar yachh dirumah"]
model.predict(text), model.predict_proba(text)

(array([1], dtype=int64), array([[0.20838641, 0.79161359]]))

In [8]:
text = ["Di eskalator busway carolus ya pak.Terima kasih"]
model.predict(text)

array([0], dtype=int64)