# Measuring Cosine Similarity between Document Vectors Dengan Dataset Wayang

***Kali ini kelompok kami mencoba menggunakan data yang dimasukkan secara manual, tetapi disesuaikan dengan data dari cerita wayang.***

In [18]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\IMAM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\IMAM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Contoh Dataset Wayang
pandawa003
1. **Kalimat 1:**  
   "Kisah ini menceritakan tentang perkawinan antara Raden Gatutkaca putra Arya Wrekodara dengan Endang Pregiwa putri Raden Arjuna"

2. **Kalimat 2:**  
   "Kisah ini saya olah dari dongeng yang disampaikan orang tua saya, dengan sedikit pengembangan seperlunya"

3. **Kalimat 3:**  
   "Kediri, 31 Agustus 2017 Heri Purwanto Untuk daftar judul lakon wayang lainnya"


In [19]:
# Three sample sentences from the wayang story text; each one is treated as a
# separate document in the similarity comparison further down.
sentences = [
    "Kisah ini menceritakan tentang perkawinan antara Raden Gatutkaca putra Arya Wrekodara dengan Endang Pregiwa putri Raden Arjuna",
    "Kisah ini saya olah dari dongeng yang disampaikan orang tua saya, dengan sedikit pengembangan seperlunya",
    "Kediri, 31 Agustus 2017 Heri Purwanto Untuk daftar judul lakon wayang lainnya",
]

In [20]:
# Wrap the raw sentences in a pandas Series (one entry per document) and display it.
corpus = pd.Series(data=sentences)
corpus

0    Kisah ini menceritakan tentang perkawinan anta...
1    Kisah ini saya olah dari dongeng yang disampai...
2    Kediri, 31 Agustus 2017 Heri Purwanto Untuk da...
dtype: object

In [21]:
def text_clean(corpus, keep_list):
    """Lower-case every document and strip non-alphanumeric characters.

    Each document is split on whitespace. A word that appears in
    ``keep_list`` is kept exactly as written; in every other word each
    character outside ``[a-zA-Z0-9]`` is replaced by a space and the result
    is lower-cased.

    Parameters
    ----------
    corpus : iterable of str
        The documents to clean.
    keep_list : container of str
        Words that must be preserved verbatim (case and punctuation intact).

    Returns
    -------
    pandas.Series
        One cleaned document per input document, with dtype ``"string"``.
        Tokens are separated by exactly one space and the text carries no
        leading or trailing whitespace.
    """
    cleaned_list = []
    for row in corpus:
        tokens = []
        for word in row.split():
            if word in keep_list:
                tokens.append(word)
            else:
                cleaned = re.sub(pattern='[^a-zA-Z0-9]', repl=' ', string=word).lower()
                # Re-split so the spaces that replaced punctuation neither leak
                # into the output as double/trailing blanks nor survive as
                # empty tokens (e.g. "saya," -> "saya", "--" -> nothing).
                tokens.extend(cleaned.split())
        cleaned_list.append(' '.join(tokens))
    return pd.Series(cleaned_list, dtype="string")

In [22]:
def lemmatize(corpus):
    """Lemmatize every token of every document with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    corpus : iterable of iterable of str
        Tokenized documents (one sequence of tokens per document).

    Returns
    -------
    list of list of str
        The same documents with each token lemmatized as a verb (``pos='v'``).
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_docs = []
    for tokens in corpus:
        lemmatized_docs.append([lemmatizer.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized_docs

In [23]:
def stem(corpus, stem_type = None):
    """Stem every token of every document.

    Parameters
    ----------
    corpus : iterable of iterable of str
        Tokenized documents (one sequence of tokens per document).
    stem_type : str or None
        ``'snowball'`` selects the English Snowball stemmer; any other value
        (including the default ``None``) selects the Porter stemmer.

    Returns
    -------
    list of list of str
        The stemmed tokens, grouped per document.
    """
    # Only the stemmer differs between the two modes; the traversal is shared.
    stemmer = SnowballStemmer(language = 'english') if stem_type == 'snowball' else PorterStemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]

In [29]:
def stopwords_removal(corpus, language = 'english'):
    """Tokenize each document on whitespace and drop stop words.

    The wh-words (who, what, when, why, how, which, where, whom) are always
    kept, even when the stop-word list contains them.

    Parameters
    ----------
    corpus : iterable of str
        The documents to filter.
    language : str, default 'english'
        Name of the NLTK stop-word list, passed straight to
        ``stopwords.words``. The default reproduces the previous hard-coded
        behaviour; pass another installed list (for instance ``'indonesian'``,
        if available in the local NLTK data) for non-English text.

    Returns
    -------
    list of list of str
        For every document, its tokens with the stop words removed.
    """
    wh_words = {'who', 'what', 'when', 'why', 'how', 'which', 'where', 'whom'}
    # Set difference instead of repeated set.remove(): it cannot raise KeyError
    # when a wh-word is absent from the chosen stop-word list.
    stop = set(stopwords.words(language)) - wh_words
    return [[token for token in document.split() if token not in stop] for document in corpus]

In [30]:
def preprocess(corpus, keep_list, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    """Run the text-preprocessing pipeline over a collection of documents.

    Steps, in order: optional cleaning (``text_clean``), tokenization with or
    without stop-word removal (``stopwords_removal``), optional lemmatization
    (``lemmatize``) and optional stemming (``stem``). The tokens of every
    document are finally joined back with single spaces.

    Parameters
    ----------
    corpus : iterable of str
        The documents to process.
    keep_list : container of str
        Words that ``text_clean`` must leave untouched.
    cleaning, stemming, lemmatization, remove_stopwords : bool
        Switches for the individual steps; evaluated by truthiness.
    stem_type : str or None
        Forwarded to ``stem`` when stemming is enabled.

    Returns
    -------
    list of str
        One preprocessed string per input document.
    """
    documents = text_clean(corpus, keep_list) if cleaning else corpus

    # Either branch turns every document into a list of tokens.
    if remove_stopwords:
        tokenized = stopwords_removal(documents)
    else:
        tokenized = [document.split() for document in documents]

    if lemmatization:
        tokenized = lemmatize(tokenized)

    if stemming:
        tokenized = stem(tokenized, stem_type)

    return [' '.join(tokens) for tokens in tokenized]

In [31]:
# Clean, remove stop words and lemmatize; stemming stays off.
# NOTE: stopwords_removal uses the English stop-word list, so Indonesian
# function words such as "ini", "dengan" and "yang" are still present in the
# result shown below.
preprocessed_corpus = preprocess(
    corpus,
    keep_list = [],
    stemming = False,
    stem_type = None,
    lemmatization = True,
    remove_stopwords = True,
)
preprocessed_corpus

['kisah ini menceritakan tentang perkawinan antara raden gatutkaca putra arya wrekodara dengan endang pregiwa putri raden arjuna',
 'kisah ini saya olah dari dongeng yang disampaikan orang tua saya dengan sedikit pengembangan seperlunya',
 'kediri 31 agustus 2017 heri purwanto untuk daftar judul lakon wayang lainnya']

In [32]:
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two vectors.

    Parameters
    ----------
    vector1, vector2 : array-like of numbers
        Vectors of equal length.

    Returns
    -------
    float
        ``dot(vector1, vector2) / (||vector1|| * ||vector2||)``. If either
        vector has zero norm the similarity is undefined; ``0.0`` is returned
        in that case instead of NaN.
    """
    # Work in floating point so squaring integer counts cannot overflow.
    vector1 = np.asarray(vector1, dtype=float)
    vector2 = np.asarray(vector2, dtype=float)
    norm_product = np.sqrt(np.sum(vector1**2)) * np.sqrt(np.sum(vector2**2))
    if norm_product == 0:
        # An all-zero vector (e.g. a document with no known terms) would give
        # 0/0 = NaN plus a RuntimeWarning; treat it as "no similarity".
        return 0.0
    return np.dot(vector1, vector2) / norm_product

## CountVectorizer

In [33]:
# Bag-of-words: learn the vocabulary from the preprocessed documents and
# build the sparse document-term count matrix in one step.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(raw_documents=preprocessed_corpus)

In [13]:
# Show the learned vocabulary, then the dense document-term count matrix.
print(vectorizer.get_feature_names_out(), bow_matrix.toarray(), sep="\n")

['2017' '31' 'agustus' 'antara' 'arjuna' 'arya' 'daftar' 'dari' 'dengan'
 'disampaikan' 'dongeng' 'endang' 'gatutkaca' 'heri' 'ini' 'judul'
 'kediri' 'kisah' 'lainnya' 'lakon' 'menceritakan' 'olah' 'orang'
 'pengembangan' 'perkawinan' 'pregiwa' 'purwanto' 'putra' 'putri' 'raden'
 'saya' 'sedikit' 'seperlunya' 'tentang' 'tua' 'untuk' 'wayang'
 'wrekodara' 'yang']
[[0 0 0 1 1 1 0 0 1 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 1 2 0 0 0 1 0 0
  0 1 0]
 [0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 0 0 0 0 2 1 1 0 1 0
  0 0 1]
 [1 1 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1
  1 0 0]]


## Cosine similarity between the document vectors built using CountVectorizer

In [34]:
# Densify the sparse matrix once; calling .toarray() inside the loop rebuilt
# the full dense matrix twice for every document pair.
bow_dense = bow_matrix.toarray()

# Compare every unordered pair of documents (i < j).
for i in range(bow_dense.shape[0]):
    for j in range(i + 1, bow_dense.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",
              cosine_similarity(bow_dense[i], bow_dense[j]))

The cosine similarity between the documents  0 and 1 is:  0.16692446522239712
The cosine similarity between the documents  0 and 2 is:  0.0
The cosine similarity between the documents  1 and 2 is:  0.0


In [35]:
# TF-IDF: learn the vocabulary and IDF weights from the preprocessed documents
# and build the sparse weighted document-term matrix in one step.
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(raw_documents=preprocessed_corpus)

In [36]:
# Show the vocabulary, the dense TF-IDF matrix, and finally its shape.
print(vectorizer.get_feature_names_out(), tf_idf_matrix.toarray(), sep="\n")
print("\nThe shape of the TF-IDF matrix is: ", tf_idf_matrix.shape)

['2017' '31' 'agustus' 'antara' 'arjuna' 'arya' 'daftar' 'dari' 'dengan'
 'disampaikan' 'dongeng' 'endang' 'gatutkaca' 'heri' 'ini' 'judul'
 'kediri' 'kisah' 'lainnya' 'lakon' 'menceritakan' 'olah' 'orang'
 'pengembangan' 'perkawinan' 'pregiwa' 'purwanto' 'putra' 'putri' 'raden'
 'saya' 'sedikit' 'seperlunya' 'tentang' 'tua' 'untuk' 'wayang'
 'wrekodara' 'yang']
[[0.         0.         0.         0.23745536 0.23745536 0.23745536
  0.         0.         0.18059092 0.         0.         0.23745536
  0.23745536 0.         0.18059092 0.         0.         0.18059092
  0.         0.         0.23745536 0.         0.         0.
  0.23745536 0.23745536 0.         0.23745536 0.23745536 0.47491072
  0.         0.         0.         0.23745536 0.         0.
  0.         0.23745536 0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.2520948  0.19172459 0.2520948  0.2520948  0.
  0.         0.         0.19172459 0.         0.         0.19172459
  0.         0.    

In [17]:
# Densify the sparse matrix once; calling .toarray() inside the loop rebuilt
# the full dense matrix twice for every document pair.
tf_idf_dense = tf_idf_matrix.toarray()

# Compare every unordered pair of documents (i < j).
for i in range(tf_idf_dense.shape[0]):
    for j in range(i + 1, tf_idf_dense.shape[0]):
        print("The cosine similarity between the documents ", i, "and", j, "is: ",
              cosine_similarity(tf_idf_dense[i], tf_idf_dense[j]))

The cosine similarity between the documents  0 and 1 is:  0.10387116284571872
The cosine similarity between the documents  0 and 2 is:  0.0
The cosine similarity between the documents  1 and 2 is:  0.0
