# One Hot Vectors dengan dataset Wayang

In [1]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\IMAM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\IMAM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
sentence = ["Patih Sangkuni curiga, Arya Gatutkaca memiliki niat buruk, mengingat Gunung Argakelasa terletak di perbatasan antara Kerajaan Hastina dan Kerajaan Amarta",
            " Ia yakin, Arya Gatutkaca pasti sedang mengumpulkan kekuatan untuk memberontak kepada dirinya"]

In [3]:
corpus = pd.Series(sentence)
corpus

0    Patih Sangkuni curiga, Arya Gatutkaca memiliki...
1     Ia yakin, Arya Gatutkaca pasti sedang mengump...
dtype: object

In [4]:
def text_clean(corpus, keep_list):
    cleaned_list = []
    for row in corpus:
        qs = []
        for word in row.split():
            if word not in keep_list:
                p1 = re.sub(pattern='[^a-zA-Z0-9]',repl=' ',string=word)
                p1 = p1.lower()
                qs.append(p1)
            else : qs.append(word)
        cleaned_list.append(' '.join(qs))
    return pd.Series(cleaned_list, dtype="string")

In [5]:
def lemmatize(corpus):
    lem = WordNetLemmatizer()
    corpus = [[lem.lemmatize(x, pos = 'v') for x in x] for x in corpus]
    return corpus

In [6]:
def stem(corpus, stem_type = None):
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    else :
        stemmer = PorterStemmer()
        corpus = [[stemmer.stem(x) for x in x] for x in corpus]
    return corpus

In [7]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):


    if cleaning == True:
        corpus = text_clean(corpus, keep_list)

    else :
        corpus = [[x for x in x.split()] for x in corpus]

    if lemmatization == True:
        corpus = lemmatize(corpus)


    if stemming == True:
        corpus = stem(corpus, stem_type)

    corpus = [' '.join(x) for x in corpus]

    return corpus

In [8]:

preprocessed_corpus = preprocess(corpus, keep_list = [], stemming = False, stem_type = None,
                                lemmatization = True)
preprocessed_corpus

['p a t i h   s a n g k u n i   c u r i g a     a r y a   g a t u t k a c a   m e m i l i k i   n i a t   b u r u k     m e n g i n g a t   g u n u n g   a r g a k e l a s a   t e r l e t a k   d i   p e r b a t a s a n   a n t a r a   k e r a j a a n   h a s t i n a   d a n   k e r a j a a n   a m a r t a',
 'i a   y a k i n     a r y a   g a t u t k a c a   p a s t i   s e d a n g   m e n g u m p u l k a n   k e k u a t a n   u n t u k   m e m b e r o n t a k   k e p a d a   d i r i n y a']

In [9]:
set_of_words = set()
for word in preprocessed_corpus[0].split():
    set_of_words.add(word)
vocab = list(set_of_words)
print(vocab)

['b', 't', 'i', 'c', 'h', 'm', 'k', 'l', 'e', 'a', 'j', 'r', 'g', 'n', 'd', 'p', 's', 'u', 'y']


In [10]:
position = {}
for i, token in enumerate(vocab):
    position[token] = i
print(position)

{'b': 0, 't': 1, 'i': 2, 'c': 3, 'h': 4, 'm': 5, 'k': 6, 'l': 7, 'e': 8, 'a': 9, 'j': 10, 'r': 11, 'g': 12, 'n': 13, 'd': 14, 'p': 15, 's': 16, 'u': 17, 'y': 18}


In [11]:
one_hot_matrix = np.zeros((len(preprocessed_corpus[0].split()), len(vocab)))
one_hot_matrix.shape

(132, 19)

In [12]:
for i, token in enumerate(preprocessed_corpus[0].split()):
    one_hot_matrix[i][position[token]] = 1

In [13]:
one_hot_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(132, 19))