### Algoritma shallow machine learning (SVM, XGBoost, dll.) dengan vector space model atau teknik dimensionality reduction seperti LSI

In [None]:
!pip install --upgrade git+https://github.com/ariaghora/mpstemmer.git
!pip install python-Levenshtein

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/ariaghora/mpstemmer.git
  Cloning https://github.com/ariaghora/mpstemmer.git to /tmp/pip-req-build-fogkroce
  Running command git clone -q https://github.com/ariaghora/mpstemmer.git /tmp/pip-req-build-fogkroce
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
import nltk
from mpstemmer import MPStemmer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, classification_report

# Fetch NLTK's "popular" resource collection.
# NOTE(review): presumably this supplies the tokenizer model and the stop-word
# lists used during preprocessing -- confirm against the installed NLTK version.
nltk.download("popular")

# Seed NumPy's global random number generator with a named constant.
SEED = 500
np.random.seed(SEED)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to-date!
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Package names is already up-to-date!
[nltk_data]    | Do

In [None]:
# Load the training and test splits; both files are decoded as latin-1.
CorpusTrain, CorpusTest = (
    pd.read_csv(csv_path, encoding='latin-1')
    for csv_path in ("train.csv", "test.csv")
)

In [None]:
def preprocessing(Corpus):
    """Clean, tokenize, stop-word-filter and stem the ``text_a`` column in place.

    The frame is modified in place and nothing is returned:

    * rows whose ``text_a`` is missing are dropped from ``Corpus``;
    * ``text_original`` receives the untouched ``text_a`` values;
    * ``text_final`` receives, per row, ``str()`` of the list of kept tokens
      (lower-cased, alphabetic only, not an Indonesian stop word, stemmed
      with ``MPStemmer``);
    * the ``text_a`` column is removed.

    Parameters
    ----------
    Corpus : pandas.DataFrame
        Frame containing a ``text_a`` column of strings.

    Returns
    -------
    None
    """
    # 1. Drop rows with a missing text. This must be done on the frame:
    #    calling dropna on the column alone does not remove the rows from
    #    the frame, and the NaN values would later fail on `.lower()`.
    Corpus.dropna(subset=['text_a'], inplace=True)

    # 2. Keep a backup of the raw text in "text_original".
    Corpus['text_original'] = Corpus['text_a']

    # Lemmatization is skipped: the authors found no well-performing
    # lemmatizer for Indonesian, but did find a reasonably good stemmer.
    stemmer = MPStemmer()
    # Load the stop words once; a set gives constant-time membership tests
    # instead of re-reading the list for every single token.
    stop_words = set(stopwords.words('indonesian'))

    final_texts = []
    for entry in Corpus['text_a']:
        # 3. Lower-case the text, 4. split it into tokens.
        tokens = word_tokenize(entry.lower())
        # 5. Keep alphabetic, non-stop-word tokens and stem them.
        final_words = [
            stemmer.stem(word)
            for word in tokens
            if word not in stop_words and word.isalpha()
        ]
        final_texts.append(str(final_words))

    # Assign the whole column at once (positional), so the result does not
    # depend on the frame having a 0..n-1 index.
    Corpus['text_final'] = final_texts

    # Drop the working column 'text_a'.
    Corpus.drop(['text_a'], axis=1, inplace=True)


# Preprocess both splits (each frame is modified in place), train first.
for corpus in (CorpusTrain, CorpusTest):
    preprocessing(corpus)

# Preview the first rows of the processed training frame.
CorpusTrain.head()

Unnamed: 0.1,Unnamed: 0,label,text_original,text_final
0,0,no,betewe buka twitter cuman ngetweet liat home b...,"['betewe', 'buka', 'twitter', 'cuman', 'ngetwe..."
1,1,no,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,"['mas', 'piyuuu', 'corona', 'itu', 'mulut', 's..."
2,2,yes,e100ss gini buka informasi sejelas nya identit...,"['begini', 'buka', 'informasi', 'jelas', 'nya'..."
3,3,no,neng solo wes ono terduga corona cobo neng ati...,"['neng', 'solo', 'wes', 'ono', 'duga', 'corona..."
4,4,no,midiahn nii akun gak takut takut nya isu coron...,"['midiahn', 'nii', 'akun', 'tidak', 'takut', '..."


In [None]:
# Separate the features (preprocessed text) from the labels for each split.
Train_X = CorpusTrain['text_final']
Train_Y = CorpusTrain['label']
Test_X = CorpusTest['text_final']
Test_Y = CorpusTest['label']

In [None]:
# Encode the labels as integers.
# The encoder is fitted on the training labels only and then reused for the
# test labels, so both splits share the same class -> integer mapping.
# Refitting on the test labels could silently produce a different mapping
# whenever the two splits do not contain exactly the same set of classes.
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
Test_Y = Encoder.transform(Test_Y)

In [None]:
# TF-IDF vectorization: the vectorizer (capped at 5000 features) is fitted on
# the training text only, then applied to both splits.
Tfidf_vect = TfidfVectorizer(max_features=5000).fit(CorpusTrain['text_final'])
Train_X_Tfidf, Test_X_Tfidf = (
    Tfidf_vect.transform(texts) for texts in (Train_X, Test_X)
)

# Show the learned term -> column-index mapping.
print(Tfidf_vect.vocabulary_)

{'buka': 669, 'twitter': 4704, 'cuman': 877, 'liat': 2511, 'home': 1617, 'berita': 498, 'corona': 843, 'panik': 3194, 'pikir': 3378, 'yang': 4956, 'aware': 300, 'aja': 70, 'stay': 4239, 'at': 278, 'nda': 2986, 'rumah': 3793, 'kalau': 2034, 'banget': 372, 'mas': 2644, 'piyuuu': 3398, 'itu': 1816, 'mulut': 2938, 'ma': 2580, 'begini': 459, 'informasi': 1754, 'jelas': 1908, 'nya': 3049, 'identitas': 1666, 'daerah': 894, 'derita': 995, 'jangkit': 1870, 'info': 1747, 'masyarakat': 2655, 'isolasi': 1796, 'kontak': 2302, 'langsung': 2441, 'positif': 3465, 'tutup': 4696, 'neng': 2998, 'solo': 4187, 'wes': 4887, 'ono': 3104, 'duga': 1123, 'ati': 283, 'mu': 2928, 'conora': 837, 'akun': 99, 'tidak': 4540, 'takut': 4370, 'isu': 1806, 'wkwkwkw': 4921, 'hey': 1573, 'gara': 1343, 'masuk': 2654, 'tempat': 4462, 'mesti': 2870, 'gue': 1450, 'kek': 2133, 'jajan': 1840, 'indomaret': 1733, 'sila': 4092, 'tes': 4498, 'cegah': 743, 'other': 3134, 'minum': 2894, 'my': 2960, 'mom': 2913, 'rebus': 3667, 'cb': 73

In [None]:
# Classifier: support vector machine with a linear kernel, fitted on the
# TF-IDF representation of the training text.
# NOTE(review): `degree` and `gamma` are passed through unchanged; per the
# scikit-learn docs they only matter for non-linear kernels -- confirm.
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto').fit(
    Train_X_Tfidf, Train_Y
)

# Predict the labels of the held-out test split.
predictions_SVM = SVM.predict(Test_X_Tfidf)

In [None]:
# Evaluation metrics on the test split, each reported as a percentage.

# Macro-averaged F1 across the classes.
f1_res = f1_score(Test_Y, predictions_SVM, average='macro')*100
print("F1 Score -> ", f1_res)

# ROC AUC. The value printed here must be `roc_auc_res`; printing `f1_res`
# made this line repeat the F1 score.
# NOTE(review): the score is computed from hard class predictions rather than
# continuous decision values -- consider SVM.decision_function(Test_X_Tfidf).
roc_auc_res = roc_auc_score(Test_Y, predictions_SVM)*100
print("ROC AUC Score -> ", roc_auc_res)

# Accuracy, with arguments in the same (y_true, y_pred) order as above.
accuracy_res = accuracy_score(Test_Y, predictions_SVM)*100
print("SVM Accuracy Score -> ", accuracy_res)

F1 Score ->  79.93736136841318
ROC AUC Score ->  79.93736136841318
SVM Accuracy Score ->  85.21428571428571


In [None]:
# Per-class precision / recall / F1 summary for the SVM predictions.
svm_report = classification_report(y_true=Test_Y, y_pred=predictions_SVM)
print(svm_report)

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      2093
           1       0.72      0.67      0.70       707

    accuracy                           0.85      2800
   macro avg       0.81      0.79      0.80      2800
weighted avg       0.85      0.85      0.85      2800

