## Menentukan Library yang dipakai

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [4]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Meload Dataset

In [5]:
data = pd.read_csv('dataset_wa_spam_v1.csv', sep=';')
data.head()

Unnamed: 0,teks,label
0,J0K3RNET88 situs cardgames hkbgaming SB0 live ...,1
1,Situs betting On'line JOK3RB3T888 terbaik yang...,1
2,Situs betting On'line JOK3RB3T888 terbaik yang...,1
3,Menang jutaan di Sakon6 On'Line Main juga D1nG...,1
4,Mainkan To~gel cambodiaSlot'games On-line live...,1


## Melakukan Text Preprocessing

## Case Folding

In [6]:
import re

# membuat fungsi untuk case folding
def casefolding(text):
    text = text.lower()                                         # merubah kalimat menjadi huruf kecil
    text = re.sub(r'https?://\S+|www\.\S+',  '', text)          # menghapus url dari kalimat
    text = re.sub(r'[-+]?[0-9]+', '', text)                      # menghapus angka dari kalimat
    text = re.sub(r'[^\w\s]', '', text)                         # menghapus tanda baca
    text = text.strip()
    return text


In [7]:
# membandingkan before dan after case folding
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)

print('Raw data\t :', raw_sample)
print('Case Folding\t :', case_folding)


Raw data	 : Situs betting On'line JOK3RB3T888 terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bit.ly/J0K3RBET888
Case Folding	 : situs betting online jokrbt terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bitlyjkrbet


## Melakukan Word Normalization

In [8]:
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
    if(key_norm['singkat'] == word). any()
    else word for word in text.split()                 
    ])

    text = str.lower(text)
    return text


In [9]:
# membandingkan before dan after word normalization

raw_data = data['teks'].iloc[2]
word_normal = text_normalize(case_folding)

print('Raw Data\t :', raw_data)
print('Word Normalize\t :', word_normal)


Raw Data	 : Situs betting On'line JOK3RB3T888 terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bit.ly/J0K3RBET888
Word Normalize	 : situs betting online jokrbt terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bitlyjkrbet


## Melakukan Stopword Removal

In [10]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('indonesian')


In [11]:
len(stopwords_ind)

758

In [12]:
# melihat daftar stopword dari nltk
stopwords_ind


['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [13]:
# membuat fungsi stopword removal

# menambahkan kata dalam stopword
more_stopword = ['gacor', 'juta', 'rb', 'situs']
stopwords_ind = stopwords_ind + more_stopword

def remove_stop_word(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clean_words.append(word)
    return " ".join(clean_words)


In [14]:
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)

print('Raw Data \t\t :', raw_data)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t\t', stopword_removal)


Raw Data 		 : Situs betting On'line JOK3RB3T888 terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bit.ly/J0K3RBET888
Case Folding 		 : situs betting online jokrbt terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bitlyjkrbet
Stopword Removal 		 betting online jokrbt terbaik menyediakan jackpot terbesar ratusan rupiah harinya daftar bitlyjkrbet


## Melakukan Stemming

In [15]:
!pip -q install sastrawi

In [16]:
# merubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# membuat fungsi untuk stemming bahasa indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text


In [17]:
raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopword_removal)

print('Raw Data \t\t :', raw_sample)
print('case_folding \t\t :', case_folding)
print('stopword_removal \t\t :', stopword_removal)
print('stemming \t\t :', text_stemming)


Raw Data 		 : Situs betting On'line JOK3RB3T888 terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bit.ly/J0K3RBET888
case_folding 		 : situs betting online jokrbt terbaik yang menyediakan jackpot terbesar hingga ratusan juta rupiah setiap harinya segera daftar di bitlyjkrbet
stopword_removal 		 : betting online jokrbt terbaik menyediakan jackpot terbesar ratusan rupiah harinya daftar bitlyjkrbet
stemming 		 : betting online jokrbt baik sedia jackpot besar ratus rupiah hari daftar bitlyjkrbet


## Melakukan Teks Preprocessing Pipeline

In [18]:
# membuat fungsi untuk menggabungkan seluruh langkah text preproceessing
def text_preprocessing_process(text):
    text = casefolding(text)
    text = text_normalize(text)
    text = remove_stop_word(text)
    text = stemming(text)
    return text


In [19]:
%%time
data['clean_text']= data['teks'].apply(text_preprocessing_process)


CPU times: total: 1min 10s
Wall time: 1min 20s


In [20]:
data

Unnamed: 0,teks,label,clean_text
0,J0K3RNET88 situs cardgames hkbgaming SB0 live ...,1,jkrnet cardgames hkbgaming sb live dindong d t...
1,Situs betting On'line JOK3RB3T888 terbaik yang...,1,betting online jokrbt baik sedia jackpot besar...
2,Situs betting On'line JOK3RB3T888 terbaik yang...,1,betting online jokrbt baik sedia jackpot besar...
3,Menang jutaan di Sakon6 On'Line Main juga D1nG...,1,menang juta sakon online main dngdng live sbo ...
4,Mainkan To~gel cambodiaSlot'games On-line live...,1,main togel cambodiaslotgames online live dndn ...
...,...,...,...
145,Aku mah pertama ke draw nah itu ada kaya berbe...,0,mah draw kaya bentuk spidol gitu
146,Atau cara manual bang ttd dulu di HVS kosong n...,0,manual bang tanda tangan hvs kosong tempeling ...
147,Kalo abstrak mending nanti setelah semua seles...,0,kalo abstrak mending selesai mah antar gim
148,Tuliskan rasa terimakasih kepada siapa saja ya...,0,tulis terimakasih kait msib


In [22]:
# menyimpan data yang sudah di preprocessing ke dalam file csv
data.to_csv('clean_data.csv')

## Proses Feature Engineering

In [21]:
# pisahkan kolom feature daan target
x = data['clean_text']
y = data['label']


In [23]:
x

0      jkrnet cardgames hkbgaming sb live dindong d t...
1      betting online jokrbt baik sedia jackpot besar...
2      betting online jokrbt baik sedia jackpot besar...
3      menang juta sakon online main dngdng live sbo ...
4      main togel cambodiaslotgames online live dndn ...
                             ...                        
145                     mah draw kaya bentuk spidol gitu
146    manual bang tanda tangan hvs kosong tempeling ...
147           kalo abstrak mending selesai mah antar gim
148                          tulis terimakasih kait msib
149                             mad dapet refrensi lapor
Name: clean_text, Length: 150, dtype: object

In [24]:
y

0      1
1      1
2      1
3      1
4      1
      ..
145    0
146    0
147    0
148    0
149    0
Name: label, Length: 150, dtype: int64

## Mengekstrasi Feature

In [25]:
# save model
import pickle

#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF. transform(x)

pickle.dump(vec_TF_IDF.vocabulary_,open("feature_tf-idf.sav", "wb"))


In [26]:
# menampilkan vocabulary dari tf-idf
vec_TF_IDF.vocabulary_


{'jkrnet': 379,
 'cardgames': 158,
 'hkbgaming': 316,
 'sb': 649,
 'live': 463,
 'dindong': 218,
 'tanggal': 715,
 'sydney': 705,
 'hongkong': 324,
 'slotgames': 685,
 'lengkap': 457,
 'gabung': 276,
 'bitlyjkrnt': 112,
 'betting': 87,
 'online': 554,
 'jokrbt': 382,
 'baik': 55,
 'sedia': 654,
 'jackpot': 362,
 'besar': 83,
 'ratus': 622,
 'rupiah': 640,
 'hari': 306,
 'daftar': 198,
 'bitlyjkrbet': 111,
 'menang': 506,
 'juta': 390,
 'sakon': 643,
 'main': 483,
 'dngdng': 233,
 'sbo': 650,
 'tgel': 733,
 'japan': 365,
 'slotgams': 686,
 'ceme': 165,
 'cuttlyspr': 196,
 'dapat': 200,
 'togel': 749,
 'cambodiaslotgames': 155,
 'dndn': 229,
 'sakong': 644,
 'hkb': 311,
 'adu': 8,
 'sportsbook': 697,
 'csnbola': 191,
 'tangkas': 716,
 'tlmandiri': 745,
 'sidtlmndri': 674,
 'jayapoker': 367,
 'bitlyjypkr': 113,
 'bndr': 130,
 'nline': 545,
 'cambodiasydney': 156,
 'china': 169,
 'singapore': 680,
 'diskon': 223,
 'jg': 374,
 'domino': 236,
 'dngdong': 234,
 'promo': 606,
 'sltgams': 688,


In [27]:
# melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names_out()))


805


In [28]:
# melihat fitur apa saja yang ada di dalam corpus
print(vec_TF_IDF.get_feature_names_out())


['aad' 'aaq' 'abbas' 'abstrak' 'adela' 'adik' 'adm' 'admin' 'adu' 'aduq'
 'afapker' 'ag' 'agus' 'agustus' 'ahabet' 'ai' 'ain' 'air' 'ajak'
 'akademik' 'akhir' 'akses' 'akulink' 'akuma' 'akun' 'alhamdulillah' 'all'
 'allah' 'amak' 'ambil' 'an' 'aneh' 'angella' 'answer' 'antar' 'anter'
 'anti' 'anyway' 'aplikasi' 'apusin' 'ass' 'assalamualaikum' 'atas' 'atuh'
 'away' 'ayena' 'ayo' 'ayok' 'bab' 'baca' 'bad' 'bade' 'bagi' 'bahas'
 'bahela' 'baik' 'baim' 'bakar' 'ball' 'balltl' 'bang' 'banget' 'bank'
 'bantu' 'bantuin' 'bapau' 'barang' 'bareng' 'baru' 'basa' 'batikpker'
 'bawa' 'bayar' 'bb' 'bca' 'bdinfo' 'bebas' 'bekal' 'belah' 'belegug'
 'bentuk' 'berani' 'berkah' 'besar' 'besok' 'beta' 'bete' 'betting'
 'bettmu' 'biar' 'big' 'bigtwo' 'bikin' 'bilang' 'bimbing' 'bingung'
 'bitdodmbt' 'bitdotexapokr' 'bitlyahabet' 'bitlybatikpker'
 'bitlybrovegas' 'bitlybrves' 'bitlyeraswasembadapt' 'bitlyftjoly'
 'bitlygebyarshopee' 'bitlyhadiahptshopee' 'bitlyhadiahshopee'
 'bitlyhoraspkr' 'bitlyindotgl'

In [29]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf



Unnamed: 0,aad,aaq,abbas,abstrak,adela,adik,adm,admin,adu,aduq,...,wr,wshp,ya,yaa,yaaa,yhu,yuk,zainul,zeus,zoom
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.296246,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,0.0,0.0,0.0,0.410941,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
data_tabular_tf_idf.iloc[1:10,80:90]

Unnamed: 0,bentuk,berani,berkah,besar,besok,beta,bete,betting,bettmu,biar
1,0.0,0.0,0.0,0.269547,0.0,0.0,0.0,0.325643,0.0,0.0
2,0.0,0.0,0.0,0.269547,0.0,0.0,0.0,0.325643,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.212626,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Proses Feature Selection

In [32]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)


In [34]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=800)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# untuk reduced features 
print('Original Feature Number', x_train.shape[1])
print('Reduced feature Number', x_kbest_features.shape[1])


Original Feature Number 805
Reduced feature Number 800


In [35]:
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data


Unnamed: 0,Nilai
0,0.712181
1,0.794340
2,0.913650
3,0.821882
4,0.522480
...,...
800,0.748048
801,1.310053
802,0.552224
803,1.496309


In [36]:
# menampilkan feature beserta nilainya

feature = vec_TF_IDF.get_feature_names_out()
feature

Data['Fitur'] = feature
Data


Unnamed: 0,Nilai,Fitur
0,0.712181,aad
1,0.794340,aaq
2,0.913650,abbas
3,0.821882,abstrak
4,0.522480,adela
...,...,...
800,0.748048,yhu
801,1.310053,yuk
802,0.552224,zainul
803,1.496309,zeus


In [37]:
# mengurutkan nilai feature terbaik
Data.sort_values(by='Nilai', ascending=False)


Unnamed: 0,Nilai,Fitur
615,11.263136,pulsa
483,7.938684,main
581,7.490355,pin
426,7.053414,kode
29,6.987961,ambil
...,...,...
331,0.275844,hp
358,0.258021,investas
270,0.258021,forex
753,0.258021,trader


In [38]:
mask = chi2_features.get_support()
mask


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [39]:
# menampilkan fitur yang terpilih berdasarkan nilai mask atau nilai tertinggi yang sudah ditetapkan pada chi square

new_feature=[]
for bool, f in zip(mask, feature):
    if bool :
        new_feature.append(f)
    selected_feature=new_feature
selected_feature


['aad',
 'aaq',
 'abbas',
 'abstrak',
 'adela',
 'adik',
 'adm',
 'admin',
 'adu',
 'aduq',
 'afapker',
 'ag',
 'agus',
 'agustus',
 'ahabet',
 'ai',
 'ain',
 'air',
 'ajak',
 'akademik',
 'akhir',
 'akses',
 'akulink',
 'akuma',
 'akun',
 'alhamdulillah',
 'all',
 'allah',
 'amak',
 'ambil',
 'an',
 'aneh',
 'angella',
 'answer',
 'antar',
 'anter',
 'anti',
 'anyway',
 'aplikasi',
 'apusin',
 'ass',
 'assalamualaikum',
 'atas',
 'atuh',
 'away',
 'ayena',
 'ayo',
 'ayok',
 'bab',
 'baca',
 'bad',
 'bade',
 'bagi',
 'bahas',
 'bahela',
 'baik',
 'baim',
 'bakar',
 'ball',
 'balltl',
 'bang',
 'banget',
 'bank',
 'bantu',
 'bantuin',
 'bapau',
 'barang',
 'bareng',
 'baru',
 'basa',
 'batikpker',
 'bawa',
 'bayar',
 'bb',
 'bca',
 'bdinfo',
 'bebas',
 'bekal',
 'belah',
 'belegug',
 'bentuk',
 'berani',
 'berkah',
 'besar',
 'besok',
 'beta',
 'bete',
 'betting',
 'bettmu',
 'biar',
 'big',
 'bigtwo',
 'bikin',
 'bilang',
 'bimbing',
 'bingung',
 'bitdodmbt',
 'bitdotexapokr',
 'bitlya

In [40]:
# membuat vocabulary baru berdasarkan fitur yang terseleksi

new_selected_feature = {}

for (k,v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k]=v

new_selected_feature


{'jkrnet': 379,
 'cardgames': 158,
 'hkbgaming': 316,
 'sb': 649,
 'live': 463,
 'dindong': 218,
 'tanggal': 715,
 'sydney': 705,
 'hongkong': 324,
 'slotgames': 685,
 'lengkap': 457,
 'gabung': 276,
 'bitlyjkrnt': 112,
 'betting': 87,
 'online': 554,
 'jokrbt': 382,
 'baik': 55,
 'sedia': 654,
 'jackpot': 362,
 'besar': 83,
 'ratus': 622,
 'rupiah': 640,
 'hari': 306,
 'daftar': 198,
 'bitlyjkrbet': 111,
 'menang': 506,
 'juta': 390,
 'sakon': 643,
 'main': 483,
 'dngdng': 233,
 'sbo': 650,
 'tgel': 733,
 'japan': 365,
 'slotgams': 686,
 'ceme': 165,
 'cuttlyspr': 196,
 'dapat': 200,
 'togel': 749,
 'cambodiaslotgames': 155,
 'dndn': 229,
 'sakong': 644,
 'hkb': 311,
 'adu': 8,
 'sportsbook': 697,
 'csnbola': 191,
 'tangkas': 716,
 'tlmandiri': 745,
 'sidtlmndri': 674,
 'jayapoker': 367,
 'bitlyjypkr': 113,
 'bndr': 130,
 'nline': 545,
 'cambodiasydney': 156,
 'china': 169,
 'singapore': 680,
 'diskon': 223,
 'jg': 374,
 'domino': 236,
 'dngdong': 234,
 'promo': 606,
 'sltgams': 688,


In [41]:
len(new_selected_feature)

800

In [42]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf-idf.sav","wb"))

# menampilkan fitur-fitur yang sudah diseleksi

data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature


Unnamed: 0,aad,aaq,abbas,abstrak,adela,adik,adm,admin,adu,aduq,...,wr,wshp,ya,yaa,yaaa,yhu,yuk,zainul,zeus,zoom
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.296246,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,0.0,0.0,0.0,0.410941,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Proses Pemodelan

In [43]:
selected_x = x_kbest_features
selected_x


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [44]:
# import library
import random
from sklearn.model_selection import train_test_split

# import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB


In [45]:
x = selected_x
y = data.label

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)


In [46]:
# menampilkan jumlah data training dan data testing
print('Banyaknya X_train :', len(x_train))
print('Banyaknya X_test :', len(x_test))
print('Banyaknya Y_train :', len(y_train))
print('Banyaknya Y_test :', len(y_test))


Banyaknya X_train : 120
Banyaknya X_test : 30
Banyaknya Y_train : 120
Banyaknya Y_test : 30


In [47]:
# proses training menggunakan naive bayes
text_algorithm = MultinomialNB()


In [48]:
model = text_algorithm.fit(x_train, y_train)

In [53]:
# membuat model prediksi

data_input = ("assalamualaikum kak sorry ganggu malem gin penasaran sih gabut coba baca ulang buku rak")
data_input = text_preprocessing_process(data_input)

#load
tfidf = TfidfVectorizer

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil==0):
    s = "Pesan Normal"
elif(hasil==1):
    s = "Pesan Judi Online"
else:
    s= "Pesan Penipuan"

print("Hasil Prediksi :\n", s)


Hasil Prediksi :
 Pesan Normal


## Mengevaluasi Model yang ada

In [54]:
# masukan library yang dibutuhkan
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(x_test)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        10

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [None]:
# menyimpan model
pickle.dump(model,open("model_fraud.sav","wb"))
