In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.base import BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import precision_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Data Loading

In [2]:
df = pd.read_csv("dataset/preprocessed_text.csv")
df.head()

Unnamed: 0,preprocessed_text,is_gambling_site
0,vegas situs judi slot online slot gacor resmi ...,1
1,sip situs judi slot online maxwin gacor slot p...,1
2,vegas situs judi slot online slot gacor resmi ...,1
3,slot link situs slot gacor akurat gampang mena...,1
4,slot online gacor pg soft demo slot pragmatic ...,1


In [3]:
df.isna().sum()

preprocessed_text    0
is_gambling_site     0
dtype: int64

In [4]:
# drop rows with null value
df = df.dropna()

In [5]:
df.groupby("is_gambling_site")["preprocessed_text"].apply(lambda text: text.str.split().apply(len).mean())

is_gambling_site
0    818.666667
1     45.235294
Name: preprocessed_text, dtype: float64

# Data Splitting

In [6]:
X = df["preprocessed_text"]
y = df["is_gambling_site"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Modeling

In [8]:
count_vec_pipe = Pipeline([
	("count_vec", CountVectorizer()),
	("model", MultinomialNB())
])

count_vec_pipe

In [9]:
tfidf_pipe = Pipeline([
	("tfidf", TfidfVectorizer()),
	("pipe", MultinomialNB())
])

tfidf_pipe

In [10]:
print("Precision Scores")
for model_name, pipeline in zip(["count_vec_model", "tfidf_model"], [count_vec_pipe, tfidf_pipe]):
	print(model_name)
	
	kf = StratifiedKFold(shuffle=True, random_state=42, n_splits=10)
	cv_res = cross_val_score(pipeline, cv=kf, X=X_train, y=y_train, scoring="precision")
	
	print(f"\t{cv_res}")
	print(f"\tavg: {cv_res.mean():,.2f}")
	print(f"\tstd: {cv_res.std():,.2f}", end="\n\n")

Precision Scores
count_vec_model
	[1.  1.  1.  1.  1.  0.5 1.  1.  1.  1. ]
	avg: 0.95
	std: 0.15

tfidf_model
	[1.  1.  1.  1.  1.  0.5 1.  1.  1.  1. ]
	avg: 0.95
	std: 0.15





- Tfidf process outperforms count vectorizer with higher and more steady precision (low std)

In [11]:
param_grid = {
    "tfidf__ngram_range": [(1, 1), (1, 2), (2, 2), (2, 3), (3, 3)],
    "tfidf__max_df": [0.8, 0.9, 1],
    "tfidf__min_df": np.linspace(0, 0.3, 3),
}

param_grid

{'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (2, 3), (3, 3)],
 'tfidf__max_df': [0.8, 0.9, 1],
 'tfidf__min_df': array([0.  , 0.15, 0.3 ])}

In [12]:
grid_tfidf = GridSearchCV(tfidf_pipe, param_grid, scoring="precision", verbose=4)
grid_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV 1/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 1);, score=0.750 total time=   0.0s
[CV 2/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 1);, score=1.000 total time=   0.0s
[CV 3/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 1);, score=1.000 total time=   0.0s
[CV 4/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 1);, score=1.000 total time=   0.0s
[CV 5/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 1);, score=1.000 total time=   0.0s
[CV 1/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 2);, score=0.750 total time=   0.0s
[CV 2/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 2);, score=1.000 total time=   0.0s
[CV 3/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(1, 2);, score=1.000 total time=   0.0s
[CV 4/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__



[CV 4/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(2, 3);, score=1.000 total time=   0.0s
[CV 5/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(2, 3);, score=1.000 total time=   0.0s
[CV 1/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(3, 3);, score=0.750 total time=   0.0s
[CV 2/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(3, 3);, score=1.000 total time=   0.0s
[CV 3/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(3, 3);, score=1.000 total time=   0.0s
[CV 4/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(3, 3);, score=1.000 total time=   0.0s
[CV 5/5] END tfidf__max_df=0.8, tfidf__min_df=0.0, tfidf__ngram_range=(3, 3);, score=1.000 total time=   0.0s
[CV 1/5] END tfidf__max_df=0.8, tfidf__min_df=0.15, tfidf__ngram_range=(1, 1);, score=0.750 total time=   0.0s
[CV 2/5] END tfidf__max_df=0.8, tfidf__min_df=0.15, tfidf__ngram_range=(1, 1);, score=1.000 total time=   0.0s
[CV 3/5]

50 fits failed out of a total of 225.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/liarta/miniconda3/envs/main/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/liarta/miniconda3/envs/main/lib/python3.9/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/liarta/miniconda3/envs/main/lib/python3.9/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/Users/liarta/miniconda3/envs/main/lib/python3.9/site-packages/sklearn/pipel

In [13]:
grid_tfidf.best_params_

{'tfidf__max_df': 0.8, 'tfidf__min_df': 0.0, 'tfidf__ngram_range': (1, 1)}

In [14]:
def get_pred_res(estimator: BaseEstimator, X_train: pd.DataFrame) -> pd.Series:
    """
    Get prediction results of an estimator as Series
    
		params:
			estimator (BaseEstimator): Sklearn Estimator
			X_train (DataFrame): features to predict
        
        return:
			prediction results as Series
    """
    
    return estimator.predict(X_train)

In [15]:
y_train_pred = get_pred_res(grid_tfidf, X_train)
y_test_pred = get_pred_res(grid_tfidf, X_test)

In [16]:
print(classification_report(y_train, y_train_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.94      1.00      0.97        15

    accuracy                           0.94        16
   macro avg       0.47      0.50      0.48        16
weighted avg       0.88      0.94      0.91        16



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.50      1.00      0.67         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
print("precision train:", precision_score(y_train, y_train_pred))
print("precision test:", precision_score(y_test, y_test_pred))

precision train: 0.9375
precision test: 0.5


In [19]:
with open("deploy/model/tfidf_model.pkl", "wb") as f:
    pkl.dump(grid_tfidf.best_estimator_, f)