# **SVM dengan K-Fold Cross Validation**

## **Library Sastrawi**

In [2]:
!pip install Sastrawi



## **Library Preprocessing**

In [3]:
import string
import re
import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

## **Library feature Extraction/Classifier/Evaluation**

In [4]:
from sklearn import preprocessing, model_selection, svm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

## **Library Visualization**

In [5]:
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [6]:
# word_tokenize & FreqDist from nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# stopwords
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /home/pythonku/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pythonku/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(123)

## **Baca Dataset**

> Sebelum Case Folding

In [8]:
# Load the raw tweets from a ';'-separated, latin1-encoded CSV.
data = pd.read_csv("Dataset.csv", sep=";", encoding='latin1')
# Drop non-ASCII characters from the tweets. The result has to be assigned back
# (and decoded from bytes to str); otherwise the expression is discarded and has no effect.
data['tweet'] = data['tweet'].str.encode('ascii','ignore').str.decode('ascii')
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Dataset.csv'

In [None]:
# Rebind `data` to the already-preprocessed dataset stored in data_after_preprocessing.csv.
# NOTE(review): the export of that file further down keeps the DataFrame index, so this read
# presumably picks up an extra unnamed index column — confirm.
data = pd.read_csv("data_after_preprocessing.csv",sep=';', encoding='utf-8')

In [None]:
# Preview the first rows of the loaded dataset.
data.head()

> Sesudah Case Folding

## Text Cleaning

>Karakter Spesial

In [None]:
def remove_tweet_special(text):
  """Clean one raw tweet and return it as lowercase, single-spaced text.

  Steps, in order: drop literal escape sequences, replace non-ASCII characters,
  strip mentions/hashtags/URLs, strip digits and punctuation, drop single-letter
  words, normalise whitespace, lowercase.

  Args:
    text: the raw tweet as a ``str``.

  Returns:
    The cleaned tweet; words are separated by exactly one space and there is
    no leading or trailing whitespace.
  """
  # Remove literal (escaped) tab, newline and unicode markers, then any leftover backslash
  text = text.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
  # Replace non-ASCII characters (emoticons, CJK, etc.) with '?', which is stripped with the punctuation below
  text = text.encode('ascii', 'replace').decode('ascii')
  # Remove mentions, hashtags and complete links (raw string avoids invalid-escape warnings)
  text = re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text)
  # Remove incomplete URLs (a bare scheme with nothing after it)
  text = text.replace("http://", " ").replace("https://", " ")
  # Remove numbers
  text = re.sub(r"\d+", "", text)
  # Remove punctuation
  text = text.translate(str.maketrans("","",string.punctuation))
  # Remove single-letter words
  text = re.sub(r"\b[a-zA-Z]\b", "", text)
  # Normalise whitespace LAST: the removals above leave gaps behind, so collapsing
  # earlier would still produce double or leading spaces in the result
  text = re.sub(r"\s+", " ", text).strip()
  # Lowercase the text
  return text.lower()

# Store the cleaned version of every tweet in a new 'cleaning' column.
data['cleaning'] = data['tweet'].apply(remove_tweet_special)

In [None]:
# Preview the frame with the new 'cleaning' column.
data.head()

In [None]:
# Checkpoint the frame after cleaning as a ';'-separated CSV (the index is written as well, since index=False is not passed).
data.to_csv("dataset_text_after_cleaning.csv",sep=';')

## **Tokenizing**

In [None]:
# Thin wrapper around NLTK's word tokenizer
def word_tokenize_wrapper(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Tokenize every cleaned tweet; each cell of 'tweet_tokens' holds the tokenizer's output for one tweet.
data['tweet_tokens'] = data['cleaning'].apply(word_tokenize_wrapper)

In [None]:
# Show the tokens of the first few tweets.
print('Tokenizing Result : \n') 
print(data['tweet_tokens'].head())

Compute the frequency distribution of the tokens with NLTK

In [None]:
# Checkpoint the frame after tokenizing as a ';'-separated CSV.
data.to_csv("dataset_text_after_tokenizing.csv",sep=';')

In [None]:
def freqDist_wrapper(text):
  """Build an NLTK ``FreqDist`` from the given sequence of tokens."""
  distribution = FreqDist(text)
  return distribution

# Per-tweet token frequency distribution, stored as one FreqDist object per row.
data['tweet_tokens_fdist'] = data['tweet_tokens'].apply(freqDist_wrapper)

In [None]:
# Show, for the first few tweets, the tokens ordered from most to least frequent.
print('Frequency Tokens : \n') 
print(data['tweet_tokens_fdist'].head().apply(lambda x : x.most_common()))

## Stopwords

>Stopwords Indonesia

In [None]:
# Start from NLTK's Indonesian stopword list.
list_stopwords = stopwords.words('indonesian')
#print(list_stopwords)

>Stopwords Additional

In [None]:
# Add hand-picked slang / abbreviation stopwords to the list (mutates list_stopwords in place).
# 'rt' is listed twice; the duplicate disappears once the list is converted to a set further down.
list_stopwords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
#print(list_stopwords)

>Stopwords dari File

In [None]:
# Read stopwords.txt into a one-column frame named "stopwords" (no header row).
txt_stopwords = pd.read_csv("stopwords.txt",names=["stopwords"], header=None)
# Only the first row is used, split on single spaces.
# NOTE(review): presumably the file is a single space-separated line — confirm; any further rows are ignored.
list_stopwords.extend(txt_stopwords["stopwords"][0].split(' '))

In [None]:
# Print the complete combined stopword list.
print(list_stopwords)

>Convert List to Set

In [None]:
# Rebind list_stopwords to a set: duplicates are dropped and the membership test in stopwords_removal becomes a hash lookup.
# After this line the name no longer refers to a list, so list-only methods such as .extend are unavailable on it.
list_stopwords = set(list_stopwords)

> Menghapus Stopwords pada list token

In [None]:
def stopwords_removal(words):
  """Return the tokens of ``words`` that are not in ``list_stopwords``, in their original order."""
  kept = []
  for token in words:
    if token in list_stopwords:
      continue
    kept.append(token)
  return kept

# Token lists with the stopwords filtered out.
data['tweet_stopwords'] = data['tweet_tokens'].apply(stopwords_removal)

In [None]:
# Show the filtered tokens of the first five tweets.
print(data['tweet_stopwords'].head(5))

In [None]:
# Checkpoint the frame after stopword removal as a ';'-separated CSV.
data.to_csv('data_text_after_stopwords.csv',sep=';')

## Normalization

In [None]:
# Build a lookup from the first lexicon column (word to replace) to the second (its replacement).
normalization_word = pd.read_csv("colloquial-indonesian-lexicon.csv")
normalization_word_dict = {}

# Address the columns by position with .iloc: plain row[0] / row[1] on a row whose
# index is the column names is deprecated positional access in recent pandas.
# Zipping the two columns also avoids building a Series per row as iterrows() does.
for slang, formal in zip(normalization_word.iloc[:, 0], normalization_word.iloc[:, 1]):
  # setdefault keeps the first mapping seen for a word, like the original "not in" guard
  normalization_word_dict.setdefault(slang, formal)

def normalization(document):
  """Replace every token found in ``normalization_word_dict`` by its mapped value; other tokens pass through unchanged."""
  normalized = []
  for term in document:
    normalized.append(normalization_word_dict.get(term, term))
  return normalized

# Normalize the stopword-filtered tokens; the column holds token lists at this point.
data['normalization'] = data['tweet_stopwords'].apply(normalization)


In [None]:
from nltk.tokenize.treebank import TreebankWordDetokenizer

# One detokenizer instance serves every row; there is no need to build one per tweet.
detokenizer = TreebankWordDetokenizer()
# Rebuild the column from 'tweet_stopwords' instead of rewriting 'normalization' in place:
# rerunning the cell would otherwise detokenize an already-joined string character by character.
data['normalization'] = (
    data['tweet_stopwords']
    .apply(normalization)
    .apply(lambda tokens: detokenizer.detokenize(tokens))
)
data.head()

In [None]:
# Checkpoint the frame after normalization as a ';'-separated CSV.
data.to_csv('data_text_after_normalization.csv',sep=';')

## Stemming

In [None]:
def stemming(text):
  """Stem ``text`` with the Sastrawi stemmer and return the stemmed string.

  The stemmer is created on the first call and cached on the function object,
  so applying this function to a whole column builds the factory and stemmer
  only once instead of once per row.

  Args:
    text: the text to stem.

  Returns:
    Whatever the Sastrawi stemmer's ``stem`` method returns for ``text``.
  """
  stemmer = getattr(stemming, "_stemmer", None)
  if stemmer is None:
    # First use: build the stemmer and keep it for all later calls
    stemmer = StemmerFactory().create_stemmer()
    stemming._stemmer = stemmer
  return stemmer.stem(text)

# Stem the normalized text of every tweet; the function is passed directly, no wrapper lambda needed.
data['text_stemming'] = data['normalization'].apply(stemming)

In [None]:
# Preview the frame with the new 'text_stemming' column.
data.head()

In [None]:
# Checkpoint the frame after stemming as a ';'-separated CSV.
data.to_csv("data_text_after_stemming.csv", sep=';')

In [None]:
# Write the fully preprocessed frame as UTF-8; this is the file the reload cell near the top reads back.
# The index is written too (index=False is not passed).
data.to_csv("data_after_preprocessing.csv", sep=';',encoding='utf-8')

## Memberi Label

In [None]:
def labelToNumeric(category):
  """Map a sentiment label to its numeric class: 'Positif' -> 1, anything else -> 0."""
  # 'Negatif' and every unrecognised label (including missing values) share class 0.
  return 1 if category == 'Positif' else 0
# Numeric class label (0/1) derived from the 'Value' column.
data['category'] = data['Value'].apply(labelToNumeric)

In [None]:
# value_counts() is ordered by frequency, not by label, so unpacking it positionally
# would hand the larger count to `positif` whichever class it belongs to (and fail
# when only one class is present). Look the counts up by label instead.
category_counts = data['category'].value_counts()
positif = category_counts.get(1, 0)
negatif = category_counts.get(0, 0)

In [None]:
# Display the numeric label column.
data['category']

In [None]:
# Rank the vocabulary of the raw (unstemmed) tweets and plot the 30 highest-weighted terms.
tfid_vectorized = TfidfVectorizer()
tweet_tfidf = tfid_vectorized.fit_transform(data['tweet'])
# Total TF-IDF weight of each term over all tweets, as a flat array indexed by column.
term_weights = np.asarray(tweet_tfidf.sum(axis=0)).ravel()
# vocabulary_ maps term -> column index (NOT a frequency), so the index is only
# used to look up the term's weight; ranking by the index itself is meaningless.
dictionary = tfid_vectorized.vocabulary_.items()
vocab = []
count = []
for key, value in dictionary:
  vocab.append(key)
  count.append(term_weights[value])
vocab_bef_stem = pd.Series(count, index=vocab)
# Highest weight first, so head(30) really is the top 30
vocab_bef_stem = vocab_bef_stem.sort_values(ascending=False)
top_vocab = vocab_bef_stem.head(30)
# barh draws from the bottom up: sort ascending so the largest bar ends up on top
top_vocab.sort_values(ascending=True).plot(
    kind = 'barh', figsize=(5,10),
    title='Top 30 terms by summed TF-IDF weight (before stemming)')

In [None]:
# Hold out 20% of the stemmed texts (cast to unicode strings) with a fixed random_state.
# Note: the same four names are rebound by the later train_test_split on X and Y, and no cell in between reads them.
X_train, X_test, y_train, y_test = model_selection.train_test_split( data['text_stemming'].values.astype('U'), data['category'], test_size=0.2, random_state=45)

## Feature Extraction/Pembobotan TF-IDF

>Bag of Words Model

In [None]:
# Bag-of-words counts for every document, converted to a dense array.
# The vectorizer is fitted on the full dataset, i.e. before any train/test split or cross-validation.
cv = CountVectorizer()
X = cv.fit_transform(data['text_stemming'].values.astype('U')).toarray()
# Numeric class labels (the 'category' column).
Y = data['category']

In [None]:
import pickle

# Persist the fitted CountVectorizer. pickle needs a file opened for binary writing:
# a bare open('cv.model') opens it for reading in text mode and the dump fails.
# The with-block also makes sure the file is flushed and closed.
with open('cv.model', 'wb') as model_file:
  pickle.dump(cv, model_file)

>Creating TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidfconverter = TfidfTransformer()
# Recompute the raw counts from the fitted vectorizer instead of reading X:
# with X = f(X), rerunning this cell would apply TF-IDF to values that are already TF-IDF.
term_counts = cv.transform(data['text_stemming'].values.astype('U'))
# Dense TF-IDF feature matrix used by everything below.
X = tfidfconverter.fit_transform(term_counts).toarray()

In [None]:
# Print the TF-IDF feature matrix.
print(X)

## Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
# SVM with a sigmoid kernel. Per scikit-learn's SVC documentation `degree` only affects
# the 'poly' kernel, so degree=2 has no effect with kernel='sigmoid'.
clf = svm.SVC(C=0.5, degree=2, gamma='scale', kernel='sigmoid')
# 10-fold cross-validation on the TF-IDF features; yields one score per fold.
scores = cross_val_score(clf,X,Y, cv=10)

In [None]:
# Per-fold scores of the SVM, followed by their mean as the cell's output.
print(scores)
scores.mean()

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes with fit_prior=False.
clf2 = MultinomialNB(fit_prior=False)
# 10-fold cross-validation on the same features and labels as the SVM.
scores2 = cross_val_score(clf2,X,Y, cv=10)

In [None]:
# Per-fold scores of Naive Bayes, followed by their mean as the cell's output.
print(scores2)
scores2.mean()

## Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm

# Keep the estimator under its own name. Rebinding `svm` to an SVC instance would shadow
# the sklearn.svm module, so any later (or rerun) `svm.SVC(...)` call would fail.
svc_estimator = svm.SVC()
# Hyper-parameter grid: every combination of these values is evaluated.
param = {'C':(0.5,1),
         'kernel':('rbf','linear','sigmoid','poly'),
         'gamma':('scale','auto'),
         'degree':(2,3,4)
         }
grid = GridSearchCV(svc_estimator, param)
grid.fit(X,Y)
# Best parameter combination found by the search.
grid.best_params_

In [None]:
# Mean cross-validated score of the best parameter combination.
grid.best_score_

## Split Dataset

>Splitting Dataset into training and testing

In [None]:
from sklearn.model_selection import train_test_split
# Split the TF-IDF matrix X and the labels Y 80/20 with a fixed random_state; these are the splits used for training and evaluation below.
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=45)

In [None]:
# Print the full feature matrix (not just the training split).
print(X)

## SVM

In [None]:
from sklearn import svm
import time

# Final SVM classifier with a sigmoid kernel. Per scikit-learn's SVC documentation
# `degree` is only used by the 'poly' kernel, so degree=2 has no effect here.
classifierSVM = svm.SVC(C=0.5,degree=2, gamma='scale', kernel='sigmoid')

>Proses Pelatihan

In [None]:
# Fit the SVM on the training split; t0 and t1 bracket the fit so the training time is t1 - t0.
t0 = time.time()
classifierSVM.fit(X_train, y_train)
t1 = time.time()

>Prediksi Dataset

In [None]:
# Start a fresh timer here. t1 was taken in the training cell, so t2 - t1 would also
# count the time that passed between running the two cells.
t_predict_start = time.time()
y_pred_SVM = classifierSVM.predict(X_test)
t2 = time.time()
time_train=t1-t0
time_predict=t2-t_predict_start

In [None]:
# Report the SVM training and prediction times in seconds.
print("Training Time: %fs, Prediction Time: %fs"%(time_train, time_predict))

In [None]:
import pickle

In [None]:
# Persist the trained SVM; the with-block guarantees the file is flushed and closed.
with open('classifierSVM.model','wb') as model_file:
  pickle.dump(classifierSVM, model_file)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix of the SVM predictions on the held-out split.
cm_SVM = confusion_matrix(y_test, y_pred_SVM)
print('confusin matrix :')
print(cm_SVM)

# Scalar metrics, each computed and reported in turn.
acc = accuracy_score(y_test, y_pred_SVM)
print("Accuracy : ", acc)

pcc = precision_score(y_test, y_pred_SVM)
print("Preccision : ", pcc)

rec = recall_score(y_test, y_pred_SVM)
print("Recall : ", rec)

f1 = f1_score(y_test, y_pred_SVM)
print("Skor F1: ", f1)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 report for the SVM predictions.
print(classification_report(y_test, y_pred_SVM))

## Prediksi SVM

In [None]:
def prediksi_SVM(text):
  """Classify one raw text with the trained SVM.

  Runs the text through the same steps as the training data (cleaning,
  tokenizing, stopword removal, normalization, detokenizing, stemming,
  count vectorizing, TF-IDF), prints the predicted sentiment and returns
  the classifier's prediction array.
  """
  cleaned = str.lower(remove_tweet_special(text))
  tokens = normalization(stopwords_removal(word_tokenize_wrapper(cleaned)))
  stemmed = stemming(TreebankWordDetokenizer().detokenize(tokens))
  counts = cv.transform([stemmed]).toarray()
  features = tfidfconverter.transform(counts).toarray()
  prediction = classifierSVM.predict(features)
  # Class 0 is negative, anything else is reported as positive
  label = 'Sentimen Negatif' if prediction[0] == 0 else 'Sentimen Positif'
  print(label)
  return prediction

In [None]:
# Try the SVM on a sample sentence; the returned prediction array is kept in `SVM`.
SVM = prediksi_SVM("Indonesia Vaksin gratis")

## Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
import time

# Multinomial Naive Bayes with fit_prior=False.
classifierNB = MultinomialNB(fit_prior=False)
# training: t0 and t1 bracket the fit
t0 = time.time()
classifierNB.fit(X_train, y_train)
t1= time.time()
# prediction: runs straight after training in the same cell, so t2 - t1 covers only predict()
y_pred_NB= classifierNB.predict(X_test)
t2=time.time()
time_train=t1-t0
time_predict=t2-t1

In [None]:
# Report the Naive Bayes training and prediction times in seconds.
print("Training Time: %fs, Prediction Time: %fs"%(time_train, time_predict))

In [None]:
# Persist the trained Naive Bayes model; the with-block guarantees the file is flushed and closed.
with open('classifierNB.model','wb') as model_file:
  pickle.dump(classifierNB, model_file)

In [None]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Confusion matrix of the Naive Bayes predictions on the held-out split.
cm_NB = confusion_matrix(y_test, y_pred_NB)
print("confusion matrix :")
print(cm_NB)

# Scalar metrics, each computed and reported in turn.
acc = accuracy_score(y_test, y_pred_NB)
print("Accuracy : ", acc)

pre = precision_score(y_test, y_pred_NB)
print("Precision : ", pre)

rec = recall_score(y_test, y_pred_NB)
print("Recall : ", rec)

f1 = f1_score(y_test, y_pred_NB)
print("Skor F1: ", f1)

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 report for the Naive Bayes predictions.
print(classification_report(y_test, y_pred_NB))

## Prediksi Naive Bayes

In [None]:
def prediksi_NB(text):
  """Classify one raw text with the trained Naive Bayes model.

  Runs the text through the same steps as the training data (cleaning,
  tokenizing, stopword removal, normalization, detokenizing, stemming,
  count vectorizing, TF-IDF), prints the predicted sentiment and returns
  the classifier's prediction array.
  """
  cleaned = str.lower(remove_tweet_special(text))
  tokens = normalization(stopwords_removal(word_tokenize_wrapper(cleaned)))
  stemmed = stemming(TreebankWordDetokenizer().detokenize(tokens))
  counts = cv.transform([stemmed]).toarray()
  features = tfidfconverter.transform(counts).toarray()
  prediction = classifierNB.predict(features)
  # Class 0 is negative, anything else is reported as positive
  label = 'Sentimen Negatif' if prediction[0] == 0 else 'Sentimen Positif'
  print(label)
  return prediction

In [None]:
# Try Naive Bayes on the same sample sentence; the returned prediction array is kept in `Naive_Bayes`.
Naive_Bayes = prediksi_NB("Indonesia Vaksin gratis")

In [None]:
# Print the class of the trained Naive Bayes object.
print(type(classifierNB))