# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nuryadi01\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# DATASET

In [2]:
data = pd.read_csv('reviews.csv')
data.head()

Unnamed: 0,reviews,label
0,kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisss...,1.0
1,"Jahitannya sih rapi,cuman ada benang yang ikut...",0.0
2,Sesuai harga. Agak tipis tapi masih oke kok. W...,0.0
3,"Wah gila sihhh sebagus itu, se worth it, se l...",1.0
4,Kain nya bagus halus \nTapi kok di bukak koto...,0.0


# TEXT PRE-PROCESSING

### CASE FOLDING

In [3]:
import re

def casefolding(text):
  text = text.lower()
  text = re.sub(r'[-+]?[0-9]+', '', text)
  text = re.sub(r'[^\w\s]', '', text)
  text = text.strip()
  return  text

In [4]:
raw_sample = data['reviews'].iloc[0]
case_folding = casefolding(raw_sample)

print('raw data\t : ', raw_sample)
print('Case Folding\t :', case_folding)

raw data	 :  kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisssss😩😩😩knpa ga dri dlu beli kemeja ditoko ini😜, ini kemejanya asli emg bagus, bahannya jga adem ga gerah,and ga nerawang jga... itu krna camera nya jelek jdi ga trlalu jelas kemejanya.. asliny baguss bgttt ga bhong sumpah
Case Folding	 : kemeja nya bagusss bgttttaaaa mauuu nngisssssknpa ga dri dlu beli kemeja ditoko ini ini kemejanya asli emg bagus bahannya jga adem ga gerahand ga nerawang jga itu krna camera nya jelek jdi ga trlalu jelas kemejanya asliny baguss bgttt ga bhong sumpah


### WORD NORMALIZATION

In [5]:
key_norm = pd.read_csv('key_norm.csv')

def text_normalize(text):
  text = ' '.join([key_norm[key_norm['singkat'] == word]['hasil'].values[0]
  if (key_norm['singkat'] == word).any()
    else
  word for word in text.split() 
  ])

  text = str.lower(text)
  return text

In [6]:
raw_data = data['reviews'].iloc[0]
word_normal = text_normalize(raw_data)

print('raw data\t : ', raw_data)
print('WORD NORMALIZE\t :', word_normal)

raw data	 :  kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisssss😩😩😩knpa ga dri dlu beli kemeja ditoko ini😜, ini kemejanya asli emg bagus, bahannya jga adem ga gerah,and ga nerawang jga... itu krna camera nya jelek jdi ga trlalu jelas kemejanya.. asliny baguss bgttt ga bhong sumpah
WORD NORMALIZE	 : kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisssss😩😩😩knpa tidak dari dulu beli kemeja di toko ini😜, ini kemejanya asli emang bagus, bahannya juga adem tidak gerah,and tidak nerawang jga... itu karena camera nya jelek jadi tidak terlalu jelas kemejanya.. asliny baguss bgttt tidak bhong sumpah


###  FILTERING (STOPWORD REMOVAL)

In [7]:
from  nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

stopwords_ind = stopwords.words('indonesian')

len(stopwords_ind)

758

### STEMMING

In [8]:
!pip -q install sastrawi

In [9]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
  text = stemmer.stem(text)
  return text

In [10]:
raw_sample = data['reviews'].iloc[0]
case_folding = casefolding(raw_sample)
text_stemming = stemming(case_folding)

print('Raw Data\t :', raw_sample)
print('Case Folding\t :', case_folding)
print('Stemming\t :', text_stemming)

Raw Data	 : kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisssss😩😩😩knpa ga dri dlu beli kemeja ditoko ini😜, ini kemejanya asli emg bagus, bahannya jga adem ga gerah,and ga nerawang jga... itu krna camera nya jelek jdi ga trlalu jelas kemejanya.. asliny baguss bgttt ga bhong sumpah
Case Folding	 : kemeja nya bagusss bgttttaaaa mauuu nngisssssknpa ga dri dlu beli kemeja ditoko ini ini kemejanya asli emg bagus bahannya jga adem ga gerahand ga nerawang jga itu krna camera nya jelek jdi ga trlalu jelas kemejanya asliny baguss bgttt ga bhong sumpah
Stemming	 : kemeja nya bagusss bgttttaaaa mauuu nngisssssknpa ga dri dlu beli kemeja toko ini ini kemeja asli emg bagus bahan jga adem ga gerahand ga nerawang jga itu krna camera nya jelek jdi ga trlalu jelas kemeja asliny baguss bgttt ga bhong sumpah


### PREPROCESSING PIPELINE

In [11]:
def text_preprocessing_process(text):
  text = casefolding(text)
  text = text_normalize(text)
  text = stemming(text)
  return text

In [12]:
%%time
data['clean_reviews'] = data['reviews'].apply(text_preprocessing_process)

Wall time: 4min 26s


### SIMPAN

In [13]:
data.to_csv('clean_reviews.csv')

In [14]:
data

Unnamed: 0,reviews,label,clean_reviews
0,kemeja nya bagusss bgtttt😍😍😍aaaa mauuu nngisss...,1.0,kemeja nya bagusss bgttttaaaa mauuu nngisssssk...
1,"Jahitannya sih rapi,cuman ada benang yang ikut...",0.0,jahit sih rapicuman ada benang yang ikut ke ja...
2,Sesuai harga. Agak tipis tapi masih oke kok. W...,0.0,sesuai harga agak tipis tapi masih oke kok war...
3,"Wah gila sihhh sebagus itu, se worth it, se l...",1.0,wah gila sihhh bagus itu se worth it se lembut...
4,Kain nya bagus halus \nTapi kok di bukak koto...,0.0,kain nya bagus halus tapi kok di bukak kotor y...
...,...,...,...
826,Terima kasih barang sudah sampai sesuai ukuran...,1.0,terima kasih barang sudah sampai sesuai ukur d...
827,Mantapp realpicttt bangttt tapi pengemasan nya...,1.0,mantapp realpicttt bangttt tapi emas nya cuma ...
828,"Suka bgt sama tasnya, ga kayak tas local. Kere...",1.0,suka banget sama tas tidak seperti tas lokal k...
829,kualitas produk sangat baik. produk original. ...,1.0,kualitas produk sangat baik produk original ha...


# FEATURE EXTRACTION

In [15]:
X = data['clean_reviews']
Y = data['label']

In [16]:
X

0      kemeja nya bagusss bgttttaaaa mauuu nngisssssk...
1      jahit sih rapicuman ada benang yang ikut ke ja...
2      sesuai harga agak tipis tapi masih oke kok war...
3      wah gila sihhh bagus itu se worth it se lembut...
4      kain nya bagus halus tapi kok di bukak kotor y...
                             ...                        
826    terima kasih barang sudah sampai sesuai ukur d...
827    mantapp realpicttt bangttt tapi emas nya cuma ...
828    suka banget sama tas tidak seperti tas lokal k...
829    kualitas produk sangat baik produk original ha...
830    barang sudah sampai dengan selamat mantul bang...
Name: clean_reviews, Length: 831, dtype: object

In [17]:
Y

0      1.0
1      0.0
2      0.0
3      1.0
4      0.0
      ... 
826    1.0
827    1.0
828    1.0
829    1.0
830    1.0
Name: label, Length: 831, dtype: float64

In [18]:
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

vec_TF_IDF = TfidfVectorizer(ngram_range=(1,1))
vec_TF_IDF.fit(X)

X_tf_idf = vec_TF_IDF.transform(X)

pickle.dump(vec_TF_IDF.vocabulary_,open("feature_tf-idf.sav", "wb"))

In [19]:
print(len(vec_TF_IDF.get_feature_names()))

2318




In [20]:
X1 = vec_TF_IDF.transform(X).toarray()
data_tabular_tf_idf = pd.DataFrame(X1,columns=vec_TF_IDF.get_feature_names())
data_tabular_tf_idf

Unnamed: 0,aaa,aaaa,aaaaaa,aamiin,abal,abang,abu,acara,ada,adaa,...,ydh,yeaaayyyyy,yen,yh,yha,ynag,youtube,youu,yung,zonk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.164018,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.266096,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.18655,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
827,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
828,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0
829,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0


# FEATURE SELECTION

In [21]:
X_train = np.array(data_tabular_tf_idf)
Y_train = np.array(Y)

In [22]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

chi2_features = SelectKBest(chi2, k=1000)
X_kbest_features = chi2_features.fit_transform(X_train, Y_train)

print('Original Feature Number :', X_train.shape[1])
print('Reduced Feature Number :', X_kbest_features.shape[1])

Original Feature Number : 2318
Reduced Feature Number : 1000


In [23]:
Data = pd.DataFrame(chi2_features.scores_,columns=['Nilai'])
Data

Unnamed: 0,Nilai
0,0.323091
1,0.416289
2,0.287972
3,0.736947
4,0.280057
...,...
2313,0.153840
2314,0.239721
2315,0.280518
2316,0.241403


In [24]:
feature = vec_TF_IDF.get_feature_names()
feature

Data['Kata'] = feature
Data



Unnamed: 0,Nilai,Kata
0,0.323091,aaa
1,0.416289,aaaa
2,0.287972,aaaaaa
3,0.736947,aamiin
4,0.280057,abal
...,...,...
2313,0.153840,ynag
2314,0.239721,youtube
2315,0.280518,youu
2316,0.241403,yung


In [25]:
Data.sort_values(by='Nilai', ascending=False)

Unnamed: 0,Nilai,Kata
145,17.384219,bagus
454,11.537386,cepat
15,9.515406,adem
1804,8.777461,rusak
1666,8.517626,pokok
...,...,...
1727,0.000044,ramah
1476,0.000015,nyaaa
336,0.000015,bikin
1745,0.000005,rban


In [26]:
mask = chi2_features.get_support()
mask

array([False,  True, False, ..., False, False, False])

In [27]:
new_feature=[]
for bool, f in zip(mask, feature):
  if bool :
    new_feature.append(f)
  selected_feature=new_feature
selected_feature

['aaaa',
 'aamiin',
 'abang',
 'abu',
 'ada',
 'adaa',
 'adapadahal',
 'adasaya',
 'adem',
 'ademmmmnnnn',
 'adik',
 'adkinnya',
 'adminnya',
 'agak',
 'ah',
 'ahahaha',
 'ahhh',
 'ahshshahhhsha',
 'ajar',
 'ajiibbb',
 'aju',
 'akhrinya',
 'akudongggg',
 'ala',
 'alas',
 'alda',
 'alhaldulillah',
 'alhamdulillah',
 'alhamdupillah',
 'alhasil',
 'allahmdullilah',
 'amaattt',
 'aman',
 'amat',
 'anak',
 'aneh',
 'anjir',
 'anti',
 'ap',
 'apa',
 'apayang',
 'army',
 'aslikknyeselll',
 'astaga',
 'atur',
 'auto',
 'awal',
 'awet',
 'baaaaaguuuuussss',
 'baaaaannnggggeeeeetttt',
 'baaangeettt',
 'babang',
 'babget',
 'baca',
 'badai',
 'badaiii',
 'badan',
 'bagai',
 'bagaimana',
 'baget',
 'bagi',
 'bagooos',
 'bagus',
 'bagusnyaa',
 'baguss',
 'bagusss',
 'bagussss',
 'bagussssss',
 'bagusssssss',
 'bagusssssssss',
 'bagussssssssss',
 'bagusssssssssssssssssssssssss',
 'bagusssssssssssssssssssssssssssss',
 'baguus',
 'baguuusnya',
 'baguuuss',
 'baguuusss',
 'baguuuus',
 'baguzzz',
 'baha

In [28]:
new_selected_feature={}
for (k,v) in vec_TF_IDF.vocabulary_.items():
  if k in selected_feature:
    new_selected_feature[k]=v

new_selected_feature

{'kemeja': 1026,
 'nya': 1474,
 'bagusss': 149,
 'dari': 521,
 'dulu': 613,
 'beli': 264,
 'di': 544,
 'emang': 636,
 'bagus': 145,
 'bahan': 167,
 'juga': 909,
 'adem': 15,
 'karena': 974,
 'jelek': 890,
 'jadi': 864,
 'terlalu': 2121,
 'jelas': 889,
 'baguss': 147,
 'bgttt': 318,
 'jahit': 866,
 'ada': 8,
 'benang': 272,
 'yang': 2305,
 'agak': 28,
 'harga': 776,
 'tipis': 2152,
 'tapi': 2073,
 'oke': 1508,
 'kok': 1079,
 'warna': 2251,
 'abu': 6,
 'kalau': 951,
 'foto': 664,
 'seperti': 1921,
 'biru': 343,
 'sedikit': 1866,
 'terimakasih': 2117,
 'gila': 721,
 'se': 1854,
 'worth': 2271,
 'it': 854,
 'lembut': 1191,
 'bakal': 185,
 'lama': 1151,
 'kaos': 968,
 'dengan': 536,
 'segitu': 1874,
 'thankyou': 2133,
 'next': 1430,
 'order': 1525,
 'halus': 768,
 'bukak': 403,
 'kotor': 1104,
 'ya': 2289,
 'putih': 1717,
 'banget': 200,
 'tebal': 2091,
 'harus': 785,
 'dalem': 514,
 'biar': 332,
 'bagussss': 150,
 'kirim': 1066,
 'mahal': 1240,
 'realpict': 1751,
 'cuci': 494,
 'puas': 170

In [29]:
len(new_selected_feature)

1000

In [30]:
pickle.dump(new_selected_feature,open("new_selected_feature_tf-idf.sav","wb"))

In [31]:
data_selected_feature = pd.DataFrame(X_kbest_features, columns=selected_feature)
data_selected_feature

Unnamed: 0,aaaa,aamiin,abang,abu,ada,adaa,adapadahal,adasaya,adem,ademmmmnnnn,...,ya,yaaa,yaaaa,yaaaaa,yaaaaagak,yaampun,yah,yajgn,yang,yen
0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.107531,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.0,0.000000,0.164018,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115726,0.0
2,0.0,0.0,0.0,0.266096,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060801,0.0
4,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.234425,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
826,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.200485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
827,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
828,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
829,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0


# MODELING

In [32]:
selected_x = X_kbest_features
selected_x

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.11572596,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.17685977,
        0.        ]])

In [33]:
import random
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [34]:
X = selected_x
Y = data.label

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [35]:
print('Banyaknya X_train : ', len(X_train))
print('Banyaknya X_test : ', len(X_test))
print('Banyaknya Y_train : ', len(Y_train))
print('Banyaknya Y_test : ', len(Y_test))

Banyaknya X_train :  664
Banyaknya X_test :  167
Banyaknya Y_train :  664
Banyaknya Y_test :  167


In [36]:
text_algorithm = MultinomialNB()

model = text_algorithm.fit(X_train, Y_train)

In [37]:
data_input = ("tidak real pick kecewa di tapi ya emang harga sesuai minta slamet")
data_input = text_preprocessing_process(data_input)

tfidf = TfidfVectorizer
loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=set(pickle.load(open("new_selected_feature_tf-idf.sav", "rb"))))

hasil = model.predict(loaded_vec.fit_transform([data_input]))

if (hasil==0):
  s = "Negatif"
else:
  s = "Positif"

print("Hasil Prediksi : \n", s)

Hasil Prediksi : 
 Negatif


# EVALUASI

In [38]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

predicted = model.predict(X_test)

CM = confusion_matrix(Y_test, predicted)

print(classification_report(Y_test, predicted))

              precision    recall  f1-score   support

         0.0       0.95      0.97      0.96        91
         1.0       0.96      0.93      0.95        76

    accuracy                           0.95       167
   macro avg       0.95      0.95      0.95       167
weighted avg       0.95      0.95      0.95       167



In [39]:
pickle.dump(model,open("model_fraud.sav", "wb"))