## Menentukan Library yang dipakai

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [51]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Meload Dataset

In [52]:
data = pd.read_csv('dataset_wa_spam_v1.csv', sep=';')
data.head()

Unnamed: 0,teks,label
0,J0K3RNET88 situs cardgames hkbgaming SB0 live ...,1
1,Situs betting On'line JOK3RB3T888 terbaik yang...,1
2,Situs betting On'line JOK3RB3T888 terbaik yang...,1
3,Menang jutaan di Sakon6 On'Line Main juga D1nG...,1
4,Mainkan To~gel cambodiaSlot'games On-line live...,1


## Melakukan Text Preprocessing

## Case Folding

In [53]:
import re

# membuat fungsi untuk case folding
def casefolding(text):
    text = text.lower()                                         # merubah kalimat menjadi huruf kecil
    text = re.sub(r'https?://\S+|www\.\S+',  '', text)          # menghapus url dari kalimat
    text = re.sub(r'[-+]?[0-9]+', '', text)                      # menghapus angka dari kalimat
    text = re.sub(r'[^\w\s]', '', text)                         # menghapus tanda baca
    text = text.strip()
    return text


In [59]:
# membandingkan before dan after case folding
raw_sample = data['teks'].iloc[151]
case_folding = casefolding(raw_sample)

print('Raw data\t :', raw_sample)
print('Case Folding\t :', case_folding)


Raw data	 : Hallo dengan nama Ahmad? Sekarang bapa anda sedang kami tahan di kantor polisi Bandung dengan kasus tabrak lari. Demi membebaskannya kami meminta tebusan agar beliau bisa bebas sebesar 2jt.
Case Folding	 : hallo dengan nama ahmad sekarang bapa anda sedang kami tahan di kantor polisi bandung dengan kasus tabrak lari demi membebaskannya kami meminta tebusan agar beliau bisa bebas sebesar jt


## Melakukan Word Normalization

In [60]:
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
    if(key_norm['singkat'] == word). any()
    else word for word in text.split()                 
    ])

    text = str.lower(text)
    return text


In [61]:
# membandingkan before dan after word normalization

raw_data = data['teks'].iloc[32]
word_normal = text_normalize(case_folding)

print('Raw Data\t :', raw_data)
print('Word Normalize\t :', word_normal)


Raw Data	 : Berani depo minimal 20rb aja ak bantu kknya maxwin 1juta skrg jga di game yang kk kuasai ak setting maxwin di bet 200 lngsung di buy freespin pertama ak berani jaminkan kemenangan 100% akun kamu di situs akuLINK : https://bit.ly/Linkgacorgressya
Word Normalize	 : hallo dengan nama ahmad sekarang bapa anda sedang kami tahan di kantor polisi bandung dengan kasus tabrak lari demi membebaskannya kami meminta tebusan agar beliau bisa bebas sebesar jt


## Melakukan Stopword Removal

In [62]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('indonesian')


In [63]:
len(stopwords_ind)

758

In [64]:
# melihat daftar stopword dari nltk
stopwords_ind


['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [65]:
# membuat fungsi stopword removal

# menambahkan kata dalam stopword
more_stopword = ['gacor', 'juta', 'rb', 'situs']
stopwords_ind = stopwords_ind + more_stopword

def remove_stop_word(text):
    clean_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clean_words.append(word)
    return " ".join(clean_words)


In [66]:
raw_sample = data['teks'].iloc[151]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)

print('Raw Data \t\t :', raw_data)
print('Case Folding \t\t :', case_folding)
print('Stopword Removal \t\t', stopword_removal)


Raw Data 		 : Berani depo minimal 20rb aja ak bantu kknya maxwin 1juta skrg jga di game yang kk kuasai ak setting maxwin di bet 200 lngsung di buy freespin pertama ak berani jaminkan kemenangan 100% akun kamu di situs akuLINK : https://bit.ly/Linkgacorgressya
Case Folding 		 : hallo dengan nama ahmad sekarang bapa anda sedang kami tahan di kantor polisi bandung dengan kasus tabrak lari demi membebaskannya kami meminta tebusan agar beliau bisa bebas sebesar jt
Stopword Removal 		 hallo nama ahmad bapa tahan kantor polisi bandung tabrak lari membebaskannya tebusan beliau bebas jt


## Melakukan Stemming

In [13]:
!pip -q install sastrawi

In [67]:
# merubah kata menjadi kata dasar
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

# membuat fungsi untuk stemming bahasa indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text


In [69]:
raw_sample = data['teks'].iloc[151]
case_folding = casefolding(raw_sample)
stopword_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopword_removal)

print('Raw Data \t\t :', raw_sample)
print('case_folding \t\t :', case_folding)
print('stopword_removal \t\t :', stopword_removal)
print('stemming \t\t :', text_stemming)


Raw Data 		 : Hallo dengan nama Ahmad? Sekarang bapa anda sedang kami tahan di kantor polisi Bandung dengan kasus tabrak lari. Demi membebaskannya kami meminta tebusan agar beliau bisa bebas sebesar 2jt.
case_folding 		 : hallo dengan nama ahmad sekarang bapa anda sedang kami tahan di kantor polisi bandung dengan kasus tabrak lari demi membebaskannya kami meminta tebusan agar beliau bisa bebas sebesar jt
stopword_removal 		 : hallo nama ahmad bapa tahan kantor polisi bandung tabrak lari membebaskannya tebusan beliau bebas jt
stemming 		 : hallo nama ahmad bapa tahan kantor polisi bandung tabrak lari bebas tebus beliau bebas jt


## Melakukan Teks Preprocessing Pipeline

In [70]:
# membuat fungsi untuk menggabungkan seluruh langkah text preproceessing
def text_preprocessing_process(text):
    text = casefolding(text)
    text = text_normalize(text)
    text = remove_stop_word(text)
    text = stemming(text)
    return text


In [71]:
%%time
data['clean_text']= data['teks'].apply(text_preprocessing_process)


CPU times: total: 1min 14s
Wall time: 1min 36s


In [72]:
data

Unnamed: 0,teks,label,clean_text
0,J0K3RNET88 situs cardgames hkbgaming SB0 live ...,1,jkrnet cardgames hkbgaming sb live dindong d t...
1,Situs betting On'line JOK3RB3T888 terbaik yang...,1,betting online jokrbt baik sedia jackpot besar...
2,Situs betting On'line JOK3RB3T888 terbaik yang...,1,betting online jokrbt baik sedia jackpot besar...
3,Menang jutaan di Sakon6 On'Line Main juga D1nG...,1,menang juta sakon online main dngdng live sbo ...
4,Mainkan To~gel cambodiaSlot'games On-line live...,1,main togel cambodiaslotgames online live dndn ...
...,...,...,...
147,Kalo abstrak mending nanti setelah semua seles...,0,kalo abstrak mending selesai mah antar gim
148,Tuliskan rasa terimakasih kepada siapa saja ya...,0,tulis terimakasih kait msib
149,Mad ane dapet refrensi untuk laporan akhir yan...,0,mad dapet refrensi lapor
150,Silahkan download aplikasi yang sudah diberika...,2,silah download aplikasi tunggu hadir


In [73]:
# menyimpan data yang sudah di preprocessing ke dalam file csv
data.to_csv('clean_data.csv')

## Proses Feature Engineering

In [74]:
# pisahkan kolom feature daan target
x = data['clean_text']
y = data['label']


In [75]:
x

0      jkrnet cardgames hkbgaming sb live dindong d t...
1      betting online jokrbt baik sedia jackpot besar...
2      betting online jokrbt baik sedia jackpot besar...
3      menang juta sakon online main dngdng live sbo ...
4      main togel cambodiaslotgames online live dndn ...
                             ...                        
147           kalo abstrak mending selesai mah antar gim
148                          tulis terimakasih kait msib
149                             mad dapet refrensi lapor
150                 silah download aplikasi tunggu hadir
151    hallo nama ahmad bapa tahan kantor polisi band...
Name: clean_text, Length: 152, dtype: object

In [76]:
y

0      1
1      1
2      1
3      1
4      1
      ..
147    0
148    0
149    0
150    2
151    2
Name: label, Length: 152, dtype: int64

## Mengekstrasi Feature

In [77]:
# save model
import pickle

#TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

#Unigram
vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x)

x_tf_idf = vec_TF_IDF. transform(x)

pickle.dump(vec_TF_IDF.vocabulary_,open("feature_tf-idf.sav", "wb"))


In [78]:
# menampilkan vocabulary dari tf-idf
vec_TF_IDF.vocabulary_


{'jkrnet': 385,
 'cardgames': 162,
 'hkbgaming': 322,
 'sb': 657,
 'live': 470,
 'dindong': 222,
 'tanggal': 725,
 'sydney': 713,
 'hongkong': 330,
 'slotgames': 693,
 'lengkap': 464,
 'gabung': 281,
 'bitlyjkrnt': 116,
 'betting': 91,
 'online': 561,
 'jokrbt': 388,
 'baik': 56,
 'sedia': 662,
 'jackpot': 368,
 'besar': 87,
 'ratus': 630,
 'rupiah': 648,
 'hari': 312,
 'daftar': 202,
 'bitlyjkrbet': 115,
 'menang': 513,
 'juta': 396,
 'sakon': 651,
 'main': 490,
 'dngdng': 237,
 'sbo': 658,
 'tgel': 743,
 'japan': 371,
 'slotgams': 694,
 'ceme': 169,
 'cuttlyspr': 200,
 'dapat': 204,
 'togel': 759,
 'cambodiaslotgames': 159,
 'dndn': 233,
 'sakong': 652,
 'hkb': 317,
 'adu': 8,
 'sportsbook': 705,
 'csnbola': 195,
 'tangkas': 726,
 'tlmandiri': 755,
 'sidtlmndri': 682,
 'jayapoker': 373,
 'bitlyjypkr': 117,
 'bndr': 134,
 'nline': 552,
 'cambodiasydney': 160,
 'china': 173,
 'singapore': 688,
 'diskon': 227,
 'jg': 380,
 'domino': 240,
 'dngdong': 238,
 'promo': 614,
 'sltgams': 696,


In [79]:
# melihat jumlah feature
print(len(vec_TF_IDF.get_feature_names_out()))


815


In [80]:
# melihat fitur apa saja yang ada di dalam corpus
print(vec_TF_IDF.get_feature_names_out())


['aad' 'aaq' 'abbas' 'abstrak' 'adela' 'adik' 'adm' 'admin' 'adu' 'aduq'
 'afapker' 'ag' 'agus' 'agustus' 'ahabet' 'ahmad' 'ai' 'ain' 'air' 'ajak'
 'akademik' 'akhir' 'akses' 'akulink' 'akuma' 'akun' 'alhamdulillah' 'all'
 'allah' 'amak' 'ambil' 'an' 'aneh' 'angella' 'answer' 'antar' 'anter'
 'anti' 'anyway' 'aplikasi' 'apusin' 'ass' 'assalamualaikum' 'atas' 'atuh'
 'away' 'ayena' 'ayo' 'ayok' 'bab' 'baca' 'bad' 'bade' 'bagi' 'bahas'
 'bahela' 'baik' 'baim' 'bakar' 'ball' 'balltl' 'bandung' 'bang' 'banget'
 'bank' 'bantu' 'bantuin' 'bapa' 'bapau' 'barang' 'bareng' 'baru' 'basa'
 'batikpker' 'bawa' 'bayar' 'bb' 'bca' 'bdinfo' 'bebas' 'bekal' 'belah'
 'belegug' 'beliau' 'bentuk' 'berani' 'berkah' 'besar' 'besok' 'beta'
 'bete' 'betting' 'bettmu' 'biar' 'big' 'bigtwo' 'bikin' 'bilang'
 'bimbing' 'bingung' 'bitdodmbt' 'bitdotexapokr' 'bitlyahabet'
 'bitlybatikpker' 'bitlybrovegas' 'bitlybrves' 'bitlyeraswasembadapt'
 'bitlyftjoly' 'bitlygebyarshopee' 'bitlyhadiahptshopee'
 'bitlyhadiahshop

In [81]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf



Unnamed: 0,aad,aaq,abbas,abstrak,adela,adik,adm,admin,adu,aduq,...,wr,wshp,ya,yaa,yaaa,yhu,yuk,zainul,zeus,zoom
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.296178,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,0.0,0.0,0.0,0.410858,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
data_tabular_tf_idf.iloc[1:10,80:90]

Unnamed: 0,bekal,belah,belegug,beliau,bentuk,berani,berkah,besar,besok,beta
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269612,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.269612,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.212716,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Proses Feature Selection

In [83]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)


In [84]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=800)
x_kbest_features = chi2_features.fit_transform(x_train, y_train)

# untuk reduced features 
print('Original Feature Number', x_train.shape[1])
print('Reduced feature Number', x_kbest_features.shape[1])


Original Feature Number 815
Reduced feature Number 800


In [85]:
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data


Unnamed: 0,Nilai
0,0.681606
1,0.760171
2,0.874568
3,0.838150
4,0.535606
...,...
810,0.716934
811,1.337717
812,0.528720
813,1.527462


In [86]:
# menampilkan feature beserta nilainya

feature = vec_TF_IDF.get_feature_names_out()
feature

Data['Fitur'] = feature
Data


Unnamed: 0,Nilai,Fitur
0,0.681606,aad
1,0.760171,aaq
2,0.874568,abbas
3,0.838150,abstrak
4,0.535606,adela
...,...,...
810,0.716934,yhu
811,1.337717,yuk
812,0.528720,zainul
813,1.527462,zeus


In [87]:
# mengurutkan nilai feature terbaik
Data.sort_values(by='Nilai', ascending=False)


Unnamed: 0,Nilai,Fitur
623,10.785867,pulsa
490,8.122421,main
588,7.184902,pin
432,6.766343,kode
30,6.692790,ambil
...,...,...
337,0.267060,hp
275,0.251212,forex
340,0.251212,httpbitlytraderoctafx
364,0.251212,investas


In [88]:
mask = chi2_features.get_support()
mask


array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,

In [89]:
# menampilkan fitur yang terpilih berdasarkan nilai mask atau nilai tertinggi yang sudah ditetapkan pada chi square

new_feature=[]
for bool, f in zip(mask, feature):
    if bool :
        new_feature.append(f)
    selected_feature=new_feature
selected_feature


['aad',
 'aaq',
 'abbas',
 'abstrak',
 'adela',
 'adik',
 'adm',
 'admin',
 'adu',
 'aduq',
 'afapker',
 'ag',
 'agus',
 'agustus',
 'ahabet',
 'ahmad',
 'ai',
 'ain',
 'air',
 'ajak',
 'akademik',
 'akses',
 'akulink',
 'akuma',
 'akun',
 'alhamdulillah',
 'all',
 'allah',
 'amak',
 'ambil',
 'an',
 'aneh',
 'angella',
 'answer',
 'antar',
 'anter',
 'anti',
 'anyway',
 'aplikasi',
 'apusin',
 'ass',
 'assalamualaikum',
 'atas',
 'atuh',
 'away',
 'ayena',
 'ayo',
 'ayok',
 'bab',
 'baca',
 'bad',
 'bade',
 'bagi',
 'bahas',
 'bahela',
 'baik',
 'baim',
 'bakar',
 'ball',
 'balltl',
 'bandung',
 'bang',
 'banget',
 'bank',
 'bantu',
 'bantuin',
 'bapa',
 'bapau',
 'barang',
 'bareng',
 'baru',
 'basa',
 'batikpker',
 'bawa',
 'bayar',
 'bb',
 'bca',
 'bdinfo',
 'bebas',
 'bekal',
 'belah',
 'belegug',
 'beliau',
 'bentuk',
 'berani',
 'berkah',
 'besar',
 'besok',
 'beta',
 'bete',
 'betting',
 'bettmu',
 'biar',
 'big',
 'bigtwo',
 'bikin',
 'bilang',
 'bimbing',
 'bingung',
 'bitdod

In [90]:
# membuat vocabulary baru berdasarkan fitur yang terseleksi

new_selected_feature = {}

for (k,v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k]=v

new_selected_feature


{'jkrnet': 385,
 'cardgames': 162,
 'hkbgaming': 322,
 'sb': 657,
 'live': 470,
 'dindong': 222,
 'tanggal': 725,
 'sydney': 713,
 'hongkong': 330,
 'slotgames': 693,
 'lengkap': 464,
 'gabung': 281,
 'bitlyjkrnt': 116,
 'betting': 91,
 'online': 561,
 'jokrbt': 388,
 'baik': 56,
 'sedia': 662,
 'jackpot': 368,
 'besar': 87,
 'ratus': 630,
 'rupiah': 648,
 'hari': 312,
 'daftar': 202,
 'bitlyjkrbet': 115,
 'menang': 513,
 'juta': 396,
 'sakon': 651,
 'main': 490,
 'dngdng': 237,
 'sbo': 658,
 'tgel': 743,
 'japan': 371,
 'slotgams': 694,
 'ceme': 169,
 'cuttlyspr': 200,
 'dapat': 204,
 'togel': 759,
 'cambodiaslotgames': 159,
 'dndn': 233,
 'sakong': 652,
 'hkb': 317,
 'adu': 8,
 'sportsbook': 705,
 'csnbola': 195,
 'tangkas': 726,
 'tlmandiri': 755,
 'sidtlmndri': 682,
 'jayapoker': 373,
 'bitlyjypkr': 117,
 'bndr': 134,
 'nline': 552,
 'cambodiasydney': 160,
 'china': 173,
 'singapore': 688,
 'diskon': 227,
 'jg': 380,
 'domino': 240,
 'dngdong': 238,
 'promo': 614,
 'sltgams': 696,


In [91]:
len(new_selected_feature)

800

In [92]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf-idf.sav","wb"))

# menampilkan fitur-fitur yang sudah diseleksi

data_selected_feature = pd.DataFrame(x_kbest_features, columns=selected_feature)
data_selected_feature


Unnamed: 0,aad,aaq,abbas,abstrak,adela,adik,adm,admin,adu,aduq,...,wr,wshp,ya,yaa,yaaa,yhu,yuk,zainul,zeus,zoom
0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.296178,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
147,0.0,0.0,0.0,0.410858,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
148,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
150,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Proses Pemodelan

In [93]:
selected_x = x_kbest_features
selected_x


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [94]:
# import library
import random
from sklearn.model_selection import train_test_split

# import algoritma naive bayes
from sklearn.naive_bayes import MultinomialNB


In [95]:
x = selected_x
y = data.label

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=0)


In [96]:
# menampilkan jumlah data training dan data testing
print('Banyaknya X_train :', len(x_train))
print('Banyaknya X_test :', len(x_test))
print('Banyaknya Y_train :', len(y_train))
print('Banyaknya Y_test :', len(y_test))


Banyaknya X_train : 121
Banyaknya X_test : 31
Banyaknya Y_train : 121
Banyaknya Y_test : 31


In [97]:
# proses training menggunakan naive bayes
text_algorithm = MultinomialNB()


In [98]:
model = text_algorithm.fit(x_train, y_train)

In [99]:
# membuat model prediksi

data_input = ("assalamualaikum kak sorry ganggu malem gin penasaran sih gabut coba baca ulang buku rak")
data_input = text_preprocessing_process(data_input)

#load
tfidf = TfidfVectorizer

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

if(hasil==0):
    s = "Pesan Normal"
elif(hasil==1):
    s = "Pesan Judi Online"
else:
    s= "Pesan Penipuan"

print("Hasil Prediksi :\n", s)


Hasil Prediksi :
 Pesan Normal


## Mengevaluasi Model yang ada

In [100]:
# masukan library yang dibutuhkan
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(x_test)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))


              precision    recall  f1-score   support

           0       1.00      0.69      0.82        13
           1       0.90      1.00      0.95         9
           2       0.75      1.00      0.86         9

    accuracy                           0.87        31
   macro avg       0.88      0.90      0.87        31
weighted avg       0.90      0.87      0.87        31



In [None]:
# menyimpan model
pickle.dump(model,open("model_fraud.sav","wb"))
