In [72]:
import logging
import pandas as pd
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from nltk.corpus import stopwords
from time import time 
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

In [73]:
file = pd.read_csv("svmtest.csv")

file_split = file.copy()

In [74]:
x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in file_split["tweet"]]
x_tokenized[1]

['cewek',
 'lho',
 'rasain',
 'sibuk',
 'jaga',
 'rasain',
 'sakit',
 'haid',
 'panik',
 'pulang',
 'malam',
 'gimana',
 'orang',
 'asing',
 'wajar',
 'korban',
 'takut',
 'curhat',
 'bela',
 'hujat']

In [75]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(300,))
        
        return np.asarray(vec).flatten()

In [76]:
#load the model
word_embeddings = Word2Vec.load("svmtest.model").wv

sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1200,
              seq_len = 15,
              embedding_matrix = word_embeddings
             )

INFO - 07:02:39: loading Word2Vec object from svmtest.model
INFO - 07:02:39: loading wv recursively from svmtest.model.wv.* with mmap=None
INFO - 07:02:39: setting ignored attribute cum_table to None
INFO - 07:02:39: Word2Vec lifecycle event {'fname': 'svmtest.model', 'datetime': '2023-06-24T07:02:39.288547', 'gensim': '4.3.1', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun  7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'loaded'}


In [77]:
# But before creating a PCA model using scikit-learn let's create
# vectors for our each vector
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])
print(x_vecs.shape)

(4398, 4500)


In [78]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=1000)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9717495765777076


In [79]:
x_comps = pca_model.transform(x_vecs)
x_comps.shape

(4398, 1000)

In [80]:
x_train,x_test,y_train,y_test = train_test_split(x_comps,file_split['label'],test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(3518, 1000)
(880, 1000)
(3518,)
(880,)


In [81]:
import time
start = time.time() 

svm_classifier = SVC()
svm_classifier.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("Support Vector Machine Classifier has fitted, this process took {} seconds".format(process))

Support Vector Machine Classifier has fitted, this process took 2.62 seconds


In [82]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

predicted_classes = svm_classifier.predict(x_test)
conf_matrix = pd.DataFrame(confusion_matrix(y_test, predicted_classes))
print('Confusion Matrix of SVM Classifier')
display(conf_matrix)

test_scores = accuracy_score(y_test,predicted_classes), precision_score(y_test, predicted_classes), recall_score(y_test, predicted_classes), f1_score(y_test, predicted_classes)

print('\n \n Scores of SVM Classifier')
scores = pd.DataFrame(data=[test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)

Confusion Matrix of SVM Classifier


Unnamed: 0,0,1
0,490,63
1,125,202



 
 Scores of SVM Classifier


Unnamed: 0,scores
accuracy,0.786364
precision,0.762264
recall,0.617737
f1,0.682432


In [83]:
from sklearn.naive_bayes import GaussianNB,BernoulliNB

start = time.time()

gnb = GaussianNB()
gnb.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("Gaussian Naive Bayes has fitted, this process took {} seconds".format(process))

Gaussian Naive Bayes has fitted, this process took 0.04 seconds


In [84]:
start = time.time()

bnb = BernoulliNB()
bnb.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("Bernoulli Naive Bayes has fitted, this process took {} seconds".format(process))

Bernoulli Naive Bayes has fitted, this process took 0.06 seconds


In [85]:
gnb_prediction = gnb.predict(x_test)
conf_matrix = pd.DataFrame(confusion_matrix(y_test, gnb_prediction))
print('Confusion Matrix of Gaussian Naive Bayes')
display(conf_matrix)

gnb_test_scores = accuracy_score(y_test,gnb_prediction), precision_score(y_test, gnb_prediction), recall_score(y_test, gnb_prediction), f1_score(y_test, gnb_prediction)

print('\n \n Scores of Gaussian Naive Bayes')
scores = pd.DataFrame(data=[gnb_test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)


bnb_prediction = bnb.predict(x_test)
conf_matrix = pd.DataFrame(confusion_matrix(y_test, bnb_prediction))
print('Confusion Matrix of Bernoulli Naive Bayes')
display(conf_matrix)

bnb_test_scores = accuracy_score(y_test,bnb_prediction), precision_score(y_test, bnb_prediction), recall_score(y_test, bnb_prediction), f1_score(y_test, bnb_prediction)

print('\n \n Scores of Bernoulli Naive Bayes')
scores = pd.DataFrame(data=[bnb_test_scores])
scores.columns = ['accuracy', 'precision', 'recall', 'f1']
scores = scores.T
scores.columns = ['scores']
display(scores)

Confusion Matrix of Gaussian Naive Bayes


Unnamed: 0,0,1
0,304,249
1,159,168



 
 Scores of Gaussian Naive Bayes


Unnamed: 0,scores
accuracy,0.536364
precision,0.402878
recall,0.513761
f1,0.451613


Confusion Matrix of Bernoulli Naive Bayes


Unnamed: 0,0,1
0,447,106
1,174,153



 
 Scores of Bernoulli Naive Bayes


Unnamed: 0,scores
accuracy,0.681818
precision,0.590734
recall,0.46789
f1,0.522184
