# **Import package**

In [None]:
! pip install jcopml

In [None]:
import nltk
import sklearn

# Fetch the NLTK resources needed later: the stop-word lists and the
# 'punkt' tokenizer data.
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from jcopml.pipeline import num_pipe,cat_pipe
from jcopml.feature_importance import mean_score_decrease

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from string import punctuation

# Tokens to drop during vectorisation: NLTK's Indonesian stop words followed
# by every single punctuation character from string.punctuation.
swIndo = [*stopwords.words('indonesian'), *punctuation]

In [None]:
# Load the dataset from the CSV file into a DataFrame.
# NOTE(review): this is an absolute path; confirm the file lives there in the
# target environment.
df = pd.read_csv(filepath_or_buffer='/content/spam.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,1


# **Dataset splitting**

In [None]:
# Features are the message text column; the target is the label column.
X, y = df['Teks'], df['label']

# Hold out 20% of the rows for testing, stratified on the label and seeded
# so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the shape of each of the four splits.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((914,), (229,), (914,), (229,))

# **Training pake Xgboost**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgbm
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from jcopml.tuning import random_search_params as rsp
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Show the XGBoost search space that jcopml's random_search_params provides.
rsp.xgb_params

{'algo__max_depth': Integer(low=1, high=10),
 'algo__learning_rate': Real(low=-2, high=0, prior='log-uniform'),
 'algo__n_estimators': Integer(low=100, high=200),
 'algo__subsample': Real(low=0.3, high=0.8, prior='uniform'),
 'algo__gamma': Integer(low=1, high=10),
 'algo__colsample_bytree': Real(low=0.1, high=1, prior='uniform'),
 'algo__reg_alpha': Real(low=-3, high=1, prior='log-uniform'),
 'algo__reg_lambda': Real(low=-3, high=1, prior='log-uniform')}

In [None]:
# Text pipeline: TF-IDF features (NLTK word tokenizer, Indonesian stop words
# plus punctuation removed) feeding an XGBoost classifier.
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=swIndo)),
    ('algo', XGBClassifier(n_jobs=-1, random_state=42))
])

# Randomised hyper-parameter search: 50 sampled candidates, 3-fold CV each.
# random_state fixes which candidates are sampled so that re-running the cell
# reproduces the same best parameters and scores.
model = RandomizedSearchCV(
    pipeline,
    rsp.xgb_params,
    cv=3,
    n_iter=50,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
model.fit(X_train, y_train)

print(model.best_params_)
# Train score, best mean cross-validated score, test score.
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  % sorted(inconsistent)


{'algo__colsample_bytree': 0.6580905305683858, 'algo__gamma': 2, 'algo__learning_rate': 0.0186550629964385, 'algo__max_depth': 10, 'algo__n_estimators': 168, 'algo__reg_alpha': 0.07564864570436132, 'algo__reg_lambda': 0.002195628859106995, 'algo__subsample': 0.3445329308165615}
0.9343544857768052 0.8982707794075352 0.9519650655021834


# **Training pake LogReg**

In [None]:
# Same TF-IDF preprocessing as the XGBoost pipeline, but with a logistic
# regression classifier as the final step.
pipeline = Pipeline([
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, stop_words=swIndo)),
    ('algo', LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42))
])

# Randomised hyper-parameter search: 50 sampled candidates, 3-fold CV each.
# random_state fixes which candidates are sampled so that re-running the cell
# reproduces the same best parameters and scores.
modelBaru = RandomizedSearchCV(
    pipeline,
    rsp.logreg_params,
    cv=3,
    n_iter=50,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
modelBaru.fit(X_train, y_train)

print(modelBaru.best_params_)
# Train score, best mean cross-validated score, test score.
print(modelBaru.score(X_train, y_train), modelBaru.best_score_, modelBaru.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits


  % sorted(inconsistent)


{'algo__C': 14.608535648003944, 'algo__fit_intercept': True}
1.0 0.96388769053782 0.9781659388646288


# **Sanity check**

In [None]:
# Probe the tuned XGBoost search (`model`) with one hand-typed message.
teks = [
    'sqyqng beliin qku pulsq di nomor ni yqqqcc 089746528177, mqqcih bgetttlohhhhzz',
]
model.predict(teks)

array([0])

### <font color='red'>Dari sini kita lihat model kita lebih bagus yang pakai LogReg karena lebih mengenal mana yang cenderung SPAM</font>

In [None]:
# Run the same hand-typed message through the tuned logistic-regression
# search (`modelBaru`).
teks = [
    'sqyqng beliin qku pulsq di nomor ni yqqqcc 089746528177, mqqcih bgetttlohhhhzz',
]
modelBaru.predict(teks)

array([1])

In [None]:
# Probe `modelBaru` with a second hand-typed message and show both the
# predicted label and the per-class probabilities.
teks = [
    'Ini aku ada di perpus, nanti kita ketemu di taman aja gimana?',
]
(modelBaru.predict(teks), modelBaru.predict_proba(teks))

(array([0]), array([[0.95984116, 0.04015884]]))

In [None]:
# Probe `modelBaru` with a third message and show both the predicted label
# and the per-class probabilities.
teks = ["Pulsa habis? Ada Paket Darurat  (750 MB, 75 Menit Tsel, 30 SMS Tsel, 2 Hari). BAYAR NANTI Rp7000 sesuai S&K. Balas ketik PD7 jk setuju /hub *505# klik tsel.me/505"]
(modelBaru.predict(teks), modelBaru.predict_proba(teks))

(array([1]), array([[0.06315904, 0.93684096]]))