## CountVectorizer untuk Model Bag of Words dengan Dataset Wayang

In [1]:
import json
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# Memuat Dataset

In [2]:
# Read the dataset from disk, decoding the file explicitly as UTF-8.
# `data` is iterated by the next cell as a collection of records, each of
# which is indexed with an "answer" key.
with open("output/pandawa_dataset_v2.json", "r", encoding="utf-8") as f:
    data = json.load(f)


Mengambil isi field `answer` dari setiap entri pada data JSON sebagai korpus

In [3]:
# Build the corpus: one document per record, taken from its "answer" field.
korpus = pd.Series([item["answer"] for item in data])

In [4]:
def text_clean(corpus, keep_list):
    """Normalise every document of a corpus token by token.

    Each document is split on whitespace. A token found in ``keep_list`` is
    passed through untouched (original case and punctuation preserved). Any
    other token has every character outside ``a-z``, ``A-Z`` and ``0-9``
    replaced by a single space and is then lower-cased. The tokens are
    re-joined with single spaces, so punctuation that was attached to a token
    leaves extra spaces behind in the result.

    Args:
        corpus: Iterable of strings, one per document.
        keep_list: Collection of exact tokens that must not be cleaned.

    Returns:
        pandas.Series with dtype ``"string"`` holding one cleaned document
        per input document, in the same order.
    """
    def _normalise(token):
        # Exact, case-sensitive match: kept tokens bypass all cleaning.
        if token in keep_list:
            return token
        return re.sub('[^a-zA-Z0-9]', ' ', token).lower()

    cleaned_rows = [
        ' '.join(_normalise(token) for token in document.split())
        for document in corpus
    ]
    return pd.Series(cleaned_rows, dtype="string")


In [5]:
def lemmatize(corpus):
    """Lemmatize every token of a tokenized corpus, treating it as a verb.

    Args:
        corpus: Iterable of documents, each an iterable of tokens
            (``preprocess`` passes lists of word strings).

    Returns:
        A list with one list of lemmatized tokens per input document,
        in the original order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_docs = []
    for tokens in corpus:
        # pos='v' makes WordNet look each token up as a verb.
        lemmatized_docs.append(
            [lemmatizer.lemmatize(token, pos='v') for token in tokens]
        )
    return lemmatized_docs


In [6]:
def stem(corpus, stem_type = None):
    """Stem every token of a tokenized corpus.

    Args:
        corpus: Iterable of documents, each an iterable of tokens.
        stem_type: ``'snowball'`` selects the English Snowball stemmer;
            any other value (including the default ``None``) selects the
            Porter stemmer.

    Returns:
        A list with one list of stemmed tokens per input document.
    """
    # Choose the stemmer once; both variants expose the same ``stem`` method.
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language='english')
    else:
        stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]

In [7]:
def stopwords_removal(corpus):
    """Tokenize documents on whitespace and drop English stop words.

    The question words who/what/when/why/how/which/where/whom are removed
    from the stop-word set first, so they are kept in the output even though
    NLTK's English list may contain them. Matching is exact and
    case-sensitive.

    Args:
        corpus: Iterable of strings, one per document.

    Returns:
        A list with one list of surviving tokens per input document.
    """
    wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}
    stop = set(stopwords.words('english')) - wh_words
    filtered_docs = []
    for document in corpus:
        filtered_docs.append(
            [token for token in document.split() if token not in stop]
        )
    return filtered_docs

In [8]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None,
               lemmatization = False, remove_stopwords = True):
    """Run the text preprocessing pipeline over a corpus.

    The stages run in a fixed order: optional cleaning, tokenization (with or
    without stop-word removal), optional lemmatization, optional stemming,
    and finally re-joining each document's tokens with single spaces.

    Args:
        corpus: Iterable of strings, one per document.
        keep_list: Tokens exempt from cleaning; forwarded to ``text_clean``.
        cleaning: When true, apply ``text_clean`` first.
        stemming: When true, apply ``stem`` after lemmatization.
        stem_type: Forwarded to ``stem``; only used when ``stemming`` is true.
        lemmatization: When true, apply ``lemmatize`` to the tokens.
        remove_stopwords: When true, tokenize via ``stopwords_removal``;
            otherwise just split each document on whitespace.

    Returns:
        A list of strings, one processed document per input document.
    """
    documents = text_clean(corpus, keep_list) if cleaning else corpus

    # Both branches turn each document into a list of tokens.
    if remove_stopwords:
        token_lists = stopwords_removal(documents)
    else:
        token_lists = [document.split() for document in documents]

    if lemmatization:
        token_lists = lemmatize(token_lists)
    if stemming:
        token_lists = stem(token_lists, stem_type)

    return [' '.join(tokens) for tokens in token_lists]


In [9]:
# Run the preprocessing pipeline on the raw corpus: cleaning (enabled by
# default), English stop-word removal and verb lemmatization. The empty
# keep_list exempts no token from cleaning. Stemming is switched off, so
# stem_type has no effect here.
pra_korpus = preprocess(
    korpus,
    keep_list=[], 
    stemming=False,
    stem_type=None,
    lemmatization=True,   
    remove_stopwords=True
)

In [10]:
# Learn the vocabulary from the preprocessed corpus and encode every document
# as a row of token counts, using CountVectorizer's default settings.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(pra_korpus)

## Output CountVectorizer

In [11]:
# Preview the learned vocabulary: the first 20 feature names.
print("\nKata-kata unik (fitur):")
print(vectorizer.get_feature_names_out()[:20])

# Report the matrix dimensions, then show the first three documents.
# The sparse matrix is sliced before being densified, so only those three
# rows are materialised instead of the whole document-term matrix.
print("\nMatriks BoW (shape):", bow_matrix.shape)
print(bow_matrix[:3].toarray())


Kata-kata unik (fitur):
['01' '04' '05' '08' '09' '10' '12' '13' '19' '20' '2017' '2018' '2019'
 '21' '22' '23' '24' '26' '27' '30']

Matriks BoW (shape): (33, 5256)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
