## Masukan Library Yang Dibutuhkan

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [2]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Masukan DataSet

In [3]:
data = pd.read_csv('dataset_sms_spam_v1.csv')

In [4]:
data.head(10)

Unnamed: 0,teks,label
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2
5,5 HARI LAGI ! EKSTRA Pulsa 50rb dg beli paket ...,2
6,"Ada iRing dgn tarif Rp. 0,1/7hr (perpanjangan ...",2
7,Akhir bulan harus tetap eksis loh! Internetan ...,2
8,Aktifkan iRing Coboy Jr - Terhebat. Tekan *808...,2
9,Ambil bonus harianmu di *600# (Bebas Pulsa). D...,2


## Text Preprocessing

## Case Folding untuk normalisasi huruf besar-kecil dan proses mengubah semua karakter dalam teks 


In [5]:
import re

# Membuat fungsi untuk case folding

def casefolding(text):
    text = text.lower()                                 # Mengubah menjadi huruf kecil
    text = re.sub(r'https?://\S+|www\.\S', '', text)    # Menghapus url dari dataset
    text = re.sub(r'[-+]?[0-9]+', '', text)             # Menghapus angka dari dataset
    text = re.sub(r'[^\w\s]', '', text)                 # Menghapus tanda baca
    text = text.strip()
    return text

In [6]:
# Membandingkan sebelum dan sesudah case folding

raw_sample = data['teks'].iloc[2]
case_folding = casefolding(raw_sample)


print('Raw Data\t :', raw_sample)
print('Case Folding\t :', case_folding)

Raw Data	 : 2016-07-08 11:47:11.Plg Yth, sisa kuota Flash Anda 478KB. Download MyTelkomsel apps di http://tsel.me/tsel utk cek kuota&beli paket Flash atau hub *363#
Case Folding	 : plg yth sisa kuota flash anda kb download mytelkomsel apps di  utk cek kuotabeli paket flash atau hub


## Normalisasi Kata (Word Normalization)

In [7]:
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text):
    text = ' '.join([key_norm[key_norm['singkat'] == word ]['hasil'].values[0]
                     if (key_norm['singkat'] == word).any()
                     else word for word in text.split()
                     ])
    
    text = str.lower(text)

    return text

In [8]:
# Membandingkan sebelum dan sesudah Word Normalization

raw_data = data['teks'].iloc[696]
word_normal = text_normalize(case_folding)

print('Raw Data\t : ', raw_data)
print('Word Normalize\t :', word_normal)

Raw Data	 :  Btw magicomnya yg sedang Gais, gaada yg gede
Word Normalize	 : pulang yang terhormat sisa kuota flash anda kb download mytelkomsel apps di untuk cek kuotabeli paket flash atau hubungi


## Filtering (Stopword Removal)

In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('indonesian')

In [10]:
len(stopwords_ind)

758

In [11]:
# Melihat Daftar StopWord Dari NLTK Bahasa Indonesia

stopwords_ind

['ada',
 'adalah',
 'adanya',
 'adapun',
 'agak',
 'agaknya',
 'agar',
 'akan',
 'akankah',
 'akhir',
 'akhiri',
 'akhirnya',
 'aku',
 'akulah',
 'amat',
 'amatlah',
 'anda',
 'andalah',
 'antar',
 'antara',
 'antaranya',
 'apa',
 'apaan',
 'apabila',
 'apakah',
 'apalagi',
 'apatah',
 'artinya',
 'asal',
 'asalkan',
 'atas',
 'atau',
 'ataukah',
 'ataupun',
 'awal',
 'awalnya',
 'bagai',
 'bagaikan',
 'bagaimana',
 'bagaimanakah',
 'bagaimanapun',
 'bagi',
 'bagian',
 'bahkan',
 'bahwa',
 'bahwasanya',
 'baik',
 'bakal',
 'bakalan',
 'balik',
 'banyak',
 'bapak',
 'baru',
 'bawah',
 'beberapa',
 'begini',
 'beginian',
 'beginikah',
 'beginilah',
 'begitu',
 'begitukah',
 'begitulah',
 'begitupun',
 'bekerja',
 'belakang',
 'belakangan',
 'belum',
 'belumlah',
 'benar',
 'benarkah',
 'benarlah',
 'berada',
 'berakhir',
 'berakhirlah',
 'berakhirnya',
 'berapa',
 'berapakah',
 'berapalah',
 'berapapun',
 'berarti',
 'berawal',
 'berbagai',
 'berdatangan',
 'beri',
 'berikan',
 'berikut'

In [12]:
# Membuat Fungsi StopWord Removal

# Menambahkan Kata Ke Dalam StopWord

more_stopword = ['tsel', 'gb', 'rb', 'btw']
stopwords_ind = stopwords_ind + more_stopword

def remove_stop_word(text):
    clear_words = []
    text = text.split()
    for word in text:
        if word not in stopwords_ind:
            clear_words.append(word)
    
    return " ".join(clear_words)

In [13]:
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopwords_removal = remove_stop_word(case_folding)

print('Raw Data \t\t :', raw_data)
print('Case Folding \t\t :', case_folding)
print('StopWord Removal \t\t :', stopwords_removal)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
StopWord Removal 		 : magicomnya yg gais gaada yg gede


## Stemming untuk menghilangkan imbuhan pada suatu kata

In [14]:
!pip -q install sastrawi

In [15]:
# Membuat Kata Menjadi Kata Dasar

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Membuat objek StemmerFactory
factory = StemmerFactory()

# Membuat stemmer
stemmer = factory.create_stemmer()

# Membuat Fungsi Untuk Stemming Bahasa Indonesia
def stemming(text):
    text = stemmer.stem(text)
    return text

In [16]:
raw_sample = data['teks'].iloc[696]
case_folding = casefolding(raw_sample)
stopwords_removal = remove_stop_word(case_folding)
text_stemming = stemming(stopwords_removal)


print('Raw Data \t\t :', raw_sample)
print('Case Folding \t\t :', case_folding)
print('StopWord Removal \t\t :', stopwords_removal)
print('Stemming \t\t :', text_stemming)

Raw Data 		 : Btw magicomnya yg sedang Gais, gaada yg gede
Case Folding 		 : btw magicomnya yg sedang gais gaada yg gede
StopWord Removal 		 : magicomnya yg gais gaada yg gede
Stemming 		 : magicomnya yg gais gaada yg gede


## Text Preprocessing Pipiline

In [17]:
def text_preprocessing_process(text):
    text = casefolding(text)
    text = text_normalize(text)
    text = remove_stop_word(text)
    text = stemming(text)
    return text

In [18]:

%%time 
data['clean_text'] = data['teks'].apply(text_preprocessing_process)

CPU times: total: 3min 36s
Wall time: 6min 40s


In [19]:
data

Unnamed: 0,teks,label,clean_text
0,[PROMO] Beli paket Flash mulai 1GB di MY TELKO...,2,promo beli paket flash my telkomsel app extra ...
1,2.5 GB/30 hari hanya Rp 35 Ribu Spesial buat A...,2,rupiah ribu spesial pilih aktif promo sd novem...
2,"2016-07-08 11:47:11.Plg Yth, sisa kuota Flash ...",2,pulang hormat sisa kuota flash kb download myt...
3,"2016-08-07 11:29:47.Plg Yth, sisa kuota Flash ...",2,pulang hormat sisa kuota flash kb download myt...
4,4.5GB/30 hari hanya Rp 55 Ribu Spesial buat an...,2,rupiah ribu spesial pilih aktif buru skb
...,...,...,...
1138,"Yooo sama2, oke nanti aku umumin di grup kelas",0,yooo oke umumin grup kelas
1139,😁 sebelumnya ga ad nulis kerudung. Kirain warn...,0,nulis kerudung kirain warna jins
1140,Mba mau kirim 300 ya,0,mbak kirim ya
1141,nama1 beaok bwrangkat pagi...mau cas atay tra...,0,nama beaok bwrangkat pagimau cas atay tranfer


In [20]:
# Simpan Data Preprocessing Ke CSV

data.to_csv('clean_data.csv')

## Feature Engineeering

In [21]:
# Memisahkan Kolo Feature Dan Target

x = data['clean_text']
y = data['label']

In [22]:
x

0       promo beli paket flash my telkomsel app extra ...
1       rupiah ribu spesial pilih aktif promo sd novem...
2       pulang hormat sisa kuota flash kb download myt...
3       pulang hormat sisa kuota flash kb download myt...
4                rupiah ribu spesial pilih aktif buru skb
                              ...                        
1138                           yooo oke umumin grup kelas
1139                     nulis kerudung kirain warna jins
1140                                        mbak kirim ya
1141        nama beaok bwrangkat pagimau cas atay tranfer
1142                                       nomor bri nama
Name: clean_text, Length: 1143, dtype: object

In [23]:
y

0       2
1       2
2       2
3       2
4       2
       ..
1138    0
1139    0
1140    0
1141    0
1142    0
Name: label, Length: 1143, dtype: int64

## Feature Extraction (TF-IDF Dan N-Gram)

In [24]:
# Membuat Model

import pickle

# Membuat TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# UniGram

vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(x) 

x_tf_idf = vec_TF_IDF.transform(x)

pickle.dump(vec_TF_IDF.vocabulary_, open("feature_tf-idf.sav", "wb"))

In [25]:
# Menampilkan Vocabulary Dari TF-IDF

vec_TF_IDF.vocabulary_

{'promo': 2431,
 'beli': 355,
 'paket': 2224,
 'flash': 936,
 'my': 1975,
 'telkomsel': 3028,
 'app': 186,
 'extra': 907,
 'kuota': 1643,
 'lte': 1747,
 'telpon': 3031,
 'mnthr': 1926,
 'buru': 512,
 'cek': 554,
 'tselmemytsel': 3168,
 'sk': 2844,
 'rupiah': 2652,
 'ribu': 2589,
 'spesial': 2903,
 'pilih': 2311,
 'aktif': 86,
 'sd': 2706,
 'november': 2117,
 'pulang': 2466,
 'hormat': 1184,
 'sisa': 2836,
 'kb': 1491,
 'download': 786,
 'mytelkomsel': 1977,
 'apps': 191,
 'kuotabeli': 1644,
 'hubung': 1208,
 'skb': 2845,
 'ekstra': 857,
 'pulsa': 2468,
 'internet': 1295,
 'bulan': 498,
 'sjk': 2843,
 'augsept': 243,
 'detail': 698,
 'iring': 1318,
 'tarif': 2994,
 'panjang': 2237,
 'hits': 1172,
 'armada': 206,
 'curi': 633,
 'hati': 1135,
 'tekan': 3022,
 'okcall': 2171,
 'informasi': 1264,
 'eks': 854,
 'loh': 1734,
 'internetan': 1296,
 'pakai': 2222,
 'volume': 3290,
 'ultima': 3224,
 'mbhr': 1833,
 'harga': 1123,
 'tariflokasi': 2996,
 'tselmefl': 3166,
 'coboy': 601,
 'jr': 1407,

In [26]:
# Melihat Jumlah Feature

print(len(vec_TF_IDF.get_feature_names_out()))

3415


In [27]:
# Melihat Fitur Apa Saja Yang Ada Dalam Corpus

print(vec_TF_IDF.get_feature_names_out())

['aa' 'aamiiiin' 'aamiin' ... 'zjt' 'zona' 'ztkm']


In [28]:
x1 = vec_TF_IDF.transform(x).toarray()
data_tabular_tf_idf = pd.DataFrame(x1,columns=vec_TF_IDF.get_feature_names_out())
data_tabular_tf_idf

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acara,acaratks,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
data_tabular_tf_idf.iloc[10:20, 60:70]

Unnamed: 0,agus,agust,agustus,agustuskunjungi,ah,ahaha,ahayatourtravelcom,ahub,aidzin,aigoo
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Feature Selection

In [32]:
x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

In [33]:
from sklearn.feature_selection import SelectKBest, chi2

# Inisialisasi SelectKBest dengan chi2 untuk memilih 3000 fitur terbaik
chi2_features = SelectKBest(chi2, k=3000)

# Melakukan seleksi fitur dan transformasi pada x_train dan y_train
x_kbest_feature = chi2_features.fit_transform(x_train, y_train)

# Menampilkan jumlah fitur sebelum dan setelah seleksi
print('Original Feature Number:', x_train.shape[1])
print('Reduced Feature Number:', x_kbest_feature.shape[1])

Original Feature Number: 3415
Reduced Feature Number: 3000


In [34]:
Data = pd.DataFrame(chi2_features.scores_, columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.843018
1,0.419698
2,1.558607
3,0.686416
4,0.759870
...,...
3410,1.126664
3411,0.503012
3412,0.686416
3413,2.917329


In [35]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.feature_extraction.text import TfidfVectorizer


x_train = np.array(data_tabular_tf_idf)
y_train = np.array(y)

# Inisialisasi SelectKBest dengan chi2 untuk memilih 3000 fitur terbaik
chi2_features = SelectKBest(chi2, k=3000)

# Melakukan seleksi fitur dan transformasi pada x_train dan y_train
x_kbest_feature = chi2_features.fit_transform(x_train, y_train)

# Menampilkan jumlah fitur sebelum dan setelah seleksi
print('Original Feature Number:', x_train.shape[1])
print('Reduced Feature Number:', x_kbest_feature.shape[1])

# Mendapatkan skor chi2 untuk setiap fitur
scores = chi2_features.scores_

# Mendapatkan nama fitur dari vec_TF_IDF
feature_names = vec_TF_IDF.get_feature_names_out()

# Membuat DataFrame dari skor chi2
Data = pd.DataFrame({'Nilai': scores})

# Memastikan panjang feature_names sesuai dengan panjang Data
if len(feature_names) == len(Data):
    Data['Fitur'] = feature_names
else:
    print("Panjang feature_names tidak cocok dengan panjang Data.")

print(Data)

Original Feature Number: 3415
Reduced Feature Number: 3000
         Nilai     Fitur
0     0.843018        aa
1     0.419698  aamiiiin
2     1.558607    aamiin
3     0.686416        ab
4     0.759870     abadi
...        ...       ...
3410  1.126664    zalora
3411  0.503012   zarkasi
3412  0.686416       zjt
3413  2.917329      zona
3414  0.835998      ztkm

[3415 rows x 2 columns]


In [36]:
# Mengurutkan Nilai Fitur Terbaik

Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Fitur
2224,48.922113,paket
1643,45.004572,kuota
1096,43.663894,hadiah
2314,36.955677,pin
355,33.962373,beli
...,...,...
1612,0.044910,kopi
339,0.044468,bca
1789,0.031575,maksimal
3282,0.012716,via


In [37]:
mask = chi2_features.get_support()
mask

array([ True,  True,  True, ...,  True,  True,  True])

In [39]:
# Menampilkan Fitur Yang Terpilih Berdasarkan Nilai Musk Dan Chi Tertinggi

new_feature =[]
for bool, f in zip(mask, feature_names):
    if bool :
        new_feature.append(f)
    selected_feature=new_feature
selected_feature

['aa',
 'aamiiiin',
 'aamiin',
 'ab',
 'abadi',
 'abai',
 'abbee',
 'abdul',
 'acaratks',
 'account',
 'ada',
 'adapromo',
 'adi',
 'adiahbankbriblogspotcoid',
 'adiahbriblogspotcom',
 'adiahcareblogspotcom',
 'adiahindosatblogspotcom',
 'adiahkejutangbriblogspotcom',
 'adiahlptcareblogspotcom',
 'adiahmkiosblogspotcom',
 'adiahmtroniktk',
 'adiahpoinblogspotcom',
 'adiahptaxisblogspotcoid',
 'adiahptcareblogspotcom',
 'adiahptmkiosblogspotcom',
 'adiahramadhanblogspotcom',
 'adiahresmimkiosblogspotcom',
 'adiahtcareblogspotcom',
 'adiahtricareblogspotcoid',
 'adiahtricareblogspotcom',
 'adiahtricarecom',
 'adik',
 'adison',
 'admin',
 'administrasi',
 'adminlte',
 'ado',
 'adrian',
 'adu',
 'aduh',
 'advertising',
 'aea',
 'aesthetic',
 'afbe',
 'affc',
 'afr',
 'afrika',
 'agam',
 'agamascellcom',
 'agen',
 'agendain',
 'agenpulsa',
 'ags',
 'agst',
 'agsts',
 'agt',
 'agtskinfodlj',
 'agua',
 'agun',
 'agus',
 'agust',
 'agustuskunjungi',
 'ahaha',
 'ahayatourtravelcom',
 'ahub',
 '

In [40]:
# Membuat Vocabulary Baru Berdasarkan Fitur Yang Telah Di Seleksi

new_selected_feature = {}

for (k,v) in vec_TF_IDF.vocabulary_.items():
    if k in selected_feature:
        new_selected_feature[k]=v
new_selected_feature

{'promo': 2431,
 'beli': 355,
 'paket': 2224,
 'flash': 936,
 'my': 1975,
 'telkomsel': 3028,
 'app': 186,
 'extra': 907,
 'kuota': 1643,
 'lte': 1747,
 'telpon': 3031,
 'mnthr': 1926,
 'buru': 512,
 'cek': 554,
 'tselmemytsel': 3168,
 'sk': 2844,
 'rupiah': 2652,
 'ribu': 2589,
 'spesial': 2903,
 'pilih': 2311,
 'aktif': 86,
 'sd': 2706,
 'november': 2117,
 'pulang': 2466,
 'hormat': 1184,
 'sisa': 2836,
 'kb': 1491,
 'download': 786,
 'mytelkomsel': 1977,
 'apps': 191,
 'kuotabeli': 1644,
 'hubung': 1208,
 'skb': 2845,
 'ekstra': 857,
 'pulsa': 2468,
 'internet': 1295,
 'bulan': 498,
 'sjk': 2843,
 'augsept': 243,
 'detail': 698,
 'iring': 1318,
 'tarif': 2994,
 'panjang': 2237,
 'hits': 1172,
 'armada': 206,
 'curi': 633,
 'hati': 1135,
 'tekan': 3022,
 'okcall': 2171,
 'informasi': 1264,
 'eks': 854,
 'loh': 1734,
 'internetan': 1296,
 'pakai': 2222,
 'volume': 3290,
 'ultima': 3224,
 'mbhr': 1833,
 'harga': 1123,
 'tariflokasi': 2996,
 'tselmefl': 3166,
 'coboy': 601,
 'jr': 1407,

In [46]:
# Menampilkan Berapa Banyak Fitur Yang Terpilih

len(new_selected_feature)

3000

In [47]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf-idf.sav", "wb"))

In [48]:
# Menampilkan Fitur-Fitur Yang Sudah Di Seleksi


data_selected_feature = pd.DataFrame(x_kbest_feature, columns=selected_feature)
data_selected_feature

Unnamed: 0,aa,aamiiiin,aamiin,ab,abadi,abai,abbee,abdul,acaratks,account,...,yudisium,yuk,yuks,yuni,yunit,zalora,zarkasi,zjt,zona,ztkm
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1141,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Modeling (Pemodelan) Naive Bayes

In [49]:
selected_x = x_kbest_feature
selected_x

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [50]:
# Imporr Library Yang Di Pake

import random
from sklearn.model_selection import train_test_split

# Import Algoritma Naive Bayes

from sklearn.naive_bayes import MultinomialNB

In [52]:
x = selected_x
y = data.label

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [53]:
# Menampilkan Jumlah Data Training Dan Testing

print('Banyaknya X_train :', len(x_train))
print('Banyaknya X_test :', len(x_test))
print('Banyaknya Y_train :', len(y_train))
print('Banyaknya Y_test :', len(y_train))

Banyaknya X_train : 914
Banyaknya X_test : 229
Banyaknya Y_train : 914
Banyaknya Y_test : 914


In [56]:
# Proses Traning Menggunakan Algoritma Naive Bayes

text_algorithm = MultinomialNB()

In [57]:
model = text_algorithm.fit(x_train, y_train)

In [58]:
# Membuat Model Prediksi

data_input = ("promo beli paket flash my telkomsel app extra kuota lte extra telpon mnthr buru cek tselmemytsel sk")
data_input = text_preprocessing_process(data_input)

# Load TF-IDF

tfidf = TfidfVectorizer

loader_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loader_vec.fit_transform([data_input]))

if(hasil == 0):
    s = "SMS Normal"
elif(hasil == 1):
    s = "SMS Penipuan"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)


Hasil Prediksi : 
 SMS Promo


In [60]:
# Membuat Model Prediksi

data_input = ("transfer uang rekening bni an ismawati rek sms kirim")
data_input = text_preprocessing_process(data_input)

# Load TF-IDF

tfidf = TfidfVectorizer

loader_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loader_vec.fit_transform([data_input]))

if(hasil == 0):
    s = "SMS Normal"
elif(hasil == 1):
    s = "SMS Penipuan"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)


Hasil Prediksi : 
 SMS Penipuan


In [61]:
# Membuat Model Prediksi

data_input = ("nama note nama dinamic emil kembaliin nama gmana")
data_input = text_preprocessing_process(data_input)

# Load TF-IDF

tfidf = TfidfVectorizer

loader_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loader_vec.fit_transform([data_input]))

if(hasil == 0):
    s = "SMS Normal"
elif(hasil == 1):
    s = "SMS Penipuan"
else:
    s = "SMS Promo"

print("Hasil Prediksi : \n", s)


Hasil Prediksi : 
 SMS Normal


## Evaluasi Model

In [62]:
# Masukan Library Yang Di Butuhkan

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(x_test)

CM = confusion_matrix(y_test, predicted)

print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.95      0.96      0.95       126
           1       0.92      0.89      0.91        66
           2       0.84      0.84      0.84        37

    accuracy                           0.92       229
   macro avg       0.90      0.90      0.90       229
weighted avg       0.92      0.92      0.92       229



KETERANGAN UNTUK PREDIKSI SMS PENIPUAN
- Label 0 = SMS Normal
- Label 1 = SMS Penipuan
- Label 2 = SMS Promo 

Accurasi Precision  Model Yang Di Bangun  0.90   
- SMS Normal (0) Mempunyai Nilai precision 0.95
- SMS Penipuan (1) Mempunyai Nilai precision 0.92
- SMS Promo (2) Mempunyai Nilai precision 0.84

Accurasi Recall Model Yang Di Bangun  0.90 
- SMS Normal (0) Mempunyai Nilai recall 0.96
- SMS Penipuan (1) Mempunyai Nilai recall 0.89
- SMS Promo (2) Mempunyai Nilai recall 0.84

Accurasi F1-score Model Yang Di Bangun  0.92 
- SMS Normal (0) Mempunyai Nilai f1-score 0.95
- SMS Penipuan (1) Mempunyai Nilai f1-score 0.91
- SMS Promo (2) Mempunyai Nilai f1-score 0.84


In [63]:
# Menyimpan Model KE .SAV Untuk Di Masukan Ke Streamlit

pickle.dump(model,open("model_fraud.sav", "wb"))