### NLTK (Natural Language Toolkit)

In [1]:
# One-time setup: uncomment to install NLTK. Inside a notebook, `%pip install nltk`
# is preferable because it targets the running kernel's environment.
# ! pip install nltk

In [2]:
# One-time setup: uncomment to fetch the NLTK data this notebook uses --
# the "stopwords" corpus (read through stopwords.words) and the "punkt"
# tokenizer models (presumably what word_tokenize / sent_tokenize rely on;
# newer NLTK releases may ask for "punkt_tab" instead -- verify locally).
# import nltk
# nltk.download("stopwords")
# nltk.download("punkt")

In [3]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

In [4]:
# Sample Indonesian text used to demonstrate each preprocessing step;
# written one sentence per line, joined into a single string.
text = (
    "Halo nama saya Tamimi. "
    "saya mahasiswa UNTIRTA. "
    "kamu tahu UNTIRTA?"
)

## Normalization (lowercase)

In [5]:
# Lowercase the whole string so differently-cased spellings (e.g. "UNTIRTA")
# collapse into a single token form. Note that this rebinds `text`.
text = text.lower()

## Tokenization

In [6]:
# Split the lowercased text into sentences; the bare expression displays the list.
sent_tokenize(text)

['halo nama saya tamimi.', 'saya mahasiswa untirta.', 'kamu tahu untirta?']

In [7]:
# Split the text into word-level tokens. Punctuation marks come out as their
# own tokens (see the '.' and '?' entries in the displayed list).
tokens = word_tokenize(text)
tokens

['halo',
 'nama',
 'saya',
 'tamimi',
 '.',
 'saya',
 'mahasiswa',
 'untirta',
 '.',
 'kamu',
 'tahu',
 'untirta',
 '?']

## Punctuation Removal / Alphanumeric Cleansing

In [8]:
# Inspect string.punctuation: a single str holding the ASCII punctuation characters.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
# Drop tokens made up entirely of punctuation characters.
# `token not in punctuation` is a substring test against one long string, so it
# misses multi-character punctuation tokens such as '...', '--' or the
# ``/'' quote tokens that word_tokenize can emit. Checking every character
# against a set catches those too, and still drops everything the substring
# test dropped (including an empty token).
punctuation_chars = set(punctuation)
tokens = [
    token
    for token in tokens
    if not all(char in punctuation_chars for char in token)
]
tokens

['halo',
 'nama',
 'saya',
 'tamimi',
 'saya',
 'mahasiswa',
 'untirta',
 'kamu',
 'tahu',
 'untirta']

In [10]:
# Stop-word removal: keep only tokens that are not Indonesian stop words.
# The stop-word list is fetched once and stored as a set; calling
# stopwords.words("indonesian") inside the comprehension condition would
# rebuild the list for every token and scan it linearly each time.
indonesian_stopwords = set(stopwords.words("indonesian"))
tokens = [token for token in tokens if token not in indonesian_stopwords]
tokens

['halo', 'nama', 'tamimi', 'mahasiswa', 'untirta', 'untirta']

## Why preprocess? The cleaned vocabulary becomes the feature set

# Import Packages

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle, os

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation

from jcopml.pipeline import num_pipe, cat_pipe
from jcopml.utils import save_model, load_model
from jcopml.plot import plot_missing_value
from jcopml.feature_importance import mean_score_decrease

  from pandas import MultiIndex, Int64Index


In [16]:
# Tokens to ignore when vectorizing: the Indonesian stop words plus every
# punctuation character as its own list entry.
sw_indo = [*stopwords.words("indonesian"), *punctuation]

# Load the spam data set and preview it; in the `label` column, 1 marks spam.
df = pd.read_csv("data/data/spam.csv")
df.head()

Unnamed: 0,Teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,1
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,1
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",1
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",1
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,1


In [17]:
# Features are the raw message text; the target is the label column.
X, y = df["Teks"], df["label"]

# Hold out 20% of the rows for testing, stratified on the label; the fixed
# seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((914,), (229,), (914,), (229,))

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from jcopml.tuning import random_search_params as rsp

In [19]:
# Text-classification pipeline: TF-IDF features followed by logistic regression.
pipeline = Pipeline([
    # word_tokenize replaces the vectorizer's built-in regex tokenizer, so
    # token_pattern is set to None: left at its default it is unused and
    # scikit-learn warns about that on every fit.
    ('prep', TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None, stop_words=sw_indo)),
    ('algo', LogisticRegression(solver='lbfgs', n_jobs=-1, random_state=42))
])

# Randomized hyper-parameter search: 50 sampled candidates, 3-fold CV each,
# drawn from the logistic-regression search space provided by jcopml.
model = RandomizedSearchCV(pipeline, rsp.logreg_params, cv=3, n_iter=50, n_jobs=-1, verbose=1, random_state=42)
model.fit(X_train, y_train)

# Best parameters, then: train score, best mean CV score, test score.
print(model.best_params_)
print(model.score(X_train, y_train), model.best_score_, model.score(X_test, y_test))

Fitting 3 folds for each of 50 candidates, totalling 150 fits




{'algo__C': 3.907967156822884, 'algo__fit_intercept': True}
0.9978118161925602 0.9638912855910267 0.982532751091703


In [17]:
# Show the search space passed to RandomizedSearchCV: the intercept toggle
# and a log-uniform range for C (see the displayed dict).
rsp.logreg_params

{'algo__fit_intercept': [True, False],
 'algo__C': Real(low=-3, high=3, prior='log-uniform')}