In [13]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Input and Expansion Query
import nltk
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
NLTK_StopWords = stopwords.words('indonesian')
from typing import Union

In [14]:
excluded_words = ["tempat", "waktu"]
NLTK_StopWords.append(["detik", "detikjatim", "detikjateng", "detikjabar", "detiksulsel", "detiksumbar", "detikbali", "detikpapua", "detiksulteng", "detikmaluku", "detjatim", "detikcom"])

In [15]:
def preprocessing(berita):
    s = berita.lower()
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = s.replace(' o ', ' ')
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    s = re.sub(r'[0-9]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    T = [t for t in tokens if ((t in excluded_words) or (t not in NLTK_StopWords))]
    return T

In [16]:
df_total = pd.read_csv('corpus/dataset/df_total.csv')
df_total = df_total[pd.notnull(df_total['description'])]
print(df_total.info())
print ('-'*90)
document_text= joblib.load('corpus/model/desc_text_train.pkl')
print(len(document_text))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1790 entries, 0 to 1789
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1790 non-null   object
 1   date         1790 non-null   object
 2   description  1790 non-null   object
 3   source       1790 non-null   object
dtypes: object(4)
memory usage: 69.9+ KB
None
------------------------------------------------------------------------------------------
1611


In [17]:
df_test = pd.read_csv('corpus/dataset/df_test.csv')
df_test = df_test[pd.notnull(df_test['description'])]
print(df_test.info())
print ('-'*90)
document_text_test= joblib.load('corpus/model/desc_text_test.pkl')
print(len(document_text_test))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 0 to 178
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        179 non-null    object
 1   date         179 non-null    object
 2   description  179 non-null    object
 3   source       179 non-null    object
dtypes: object(4)
memory usage: 7.0+ KB
None
------------------------------------------------------------------------------------------
179


In [18]:
df_train = pd.read_csv('corpus/dataset/df_train.csv')
df_train = df_train[pd.notnull(df_train['description'])]
print(df_train.info())
print ('-'*90)
document_text_train= joblib.load('corpus/model/desc_text_train.pkl')
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1611 entries, 0 to 1610
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1611 non-null   object
 1   date         1611 non-null   object
 2   description  1611 non-null   object
 3   source       1611 non-null   object
dtypes: object(4)
memory usage: 62.9+ KB
None
------------------------------------------------------------------------------------------
1611


In [19]:
# Load bow dataset
df_bow_what = pd.read_csv("bow/bow_what.csv")
df_bow_what.head()
df_bow_what.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111111 entries, 0 to 1111110
Data columns (total 3 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   tingkat setelah parent  1111111 non-null  int64 
 1   parent                  1111111 non-null  object
 2   similarity              1111111 non-null  object
dtypes: int64(1), object(2)
memory usage: 25.4+ MB


In [20]:
# Load bow dataset
df_bow_when = pd.read_csv("bow/bow_when.csv")
df_bow_when.head()
df_bow_when.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111111 entries, 0 to 1111110
Data columns (total 3 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   tingkat setelah parent  1111111 non-null  int64 
 1   parent                  1111051 non-null  object
 2   similarity              1111111 non-null  object
dtypes: int64(1), object(2)
memory usage: 25.4+ MB


In [21]:
# Load bow dataset
df_bow_where = pd.read_csv("bow/bow_where.csv")
df_bow_where.head()
df_bow_where.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 222222 entries, 0 to 222221
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   tingkat setelah parent  222222 non-null  int64 
 1   parent                  222222 non-null  object
 2   similarity              222222 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.1+ MB


In [22]:
# Load bow dataset
df_bow_who = pd.read_csv("bow/bow_who.csv")
df_bow_who.head()
df_bow_who.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1111111 entries, 0 to 1111110
Data columns (total 3 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   tingkat setelah parent  1111111 non-null  int64 
 1   parent                  1111111 non-null  object
 2   similarity              1111111 non-null  object
dtypes: int64(1), object(2)
memory usage: 25.4+ MB


In [23]:
# Ambil parent dari bow
bow_list_what = []
bow_list_when = []
bow_list_where = []
bow_list_who = []

for i in range(0, df_bow_what.shape[0]):
  bow_list_what.append(df_bow_what.iloc[i, 1])

for i in range(0, df_bow_when.shape[0]):
  bow_list_when.append(df_bow_when.iloc[i, 1])

for i in range(0, df_bow_where.shape[0]):
  bow_list_where.append(df_bow_where.iloc[i, 1])

for i in range(0, df_bow_who.shape[0]):
  bow_list_who.append(df_bow_who.iloc[i, 1])

print({
  'What': len(bow_list_what),
  'When': len(bow_list_when),
  'Where': len(bow_list_where),
  'Who': len(bow_list_who)
})


{'What': 1111111, 'When': 1111111, 'Where': 222222, 'Who': 1111111}


In [24]:
# Use data train
def cari_dokpertama(kueriAsli: str) -> list[str]:
    kueriPre = preprocessing(kueriAsli)
    kueriPre = " ".join(kueriPre)
    hasilSearch = []
    tfidf_matrix = joblib.load('corpus/matrix/tfidf_train.pkl')
    tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
    query_vec = tfidf_vectorizer.transform([kueriPre])
    results = cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
    for i in results.argsort()[-5:][::-1]:
        hasilSearch.append(df_total.iloc[i,-2])
    hasilSearch=". ".join(hasilSearch)
    return hasilSearch

In [25]:
##Keywords Extraction with YAKE
def keyword_yake(hasilSearch: str) -> list[str]:
    keywordYake=[]

    k_extractor = KeywordExtractor(lan="id", n=1, top=10)
    k_extractor2 = KeywordExtractor(lan="id", n=2, top=10)
    keywords = k_extractor.extract_keywords(text=hasilSearch)
    keywords = k_extractor2.extract_keywords(text=hasilSearch)
    keywordYake = [x for x, y in keywords]
    #keywordYake.append(keywords)
    #print (keywordYake)
    return keywordYake
#print("Keywords of article\n", keywords)

In [26]:
#Keywords Extraction with TFIDF
def keyword_tfidf(hasilSearch: str) -> list[str]:

    keywordtfidf=[]
    keywordtfidf2=[]

    total_words = re.sub(r'[^\w]', ' ', hasilSearch)
    total_words = total_words.lower().split()
    #print (total_words)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    tf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if (each_word in excluded_words) or (each_word not in NLTK_StopWords):
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
    #print(tf_score)
    def check_sent(word, sentences): 
        final = [all([w in x for w in word]) for x in sentences] 
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
        return int(len(sent_len))

    idf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if (each_word in excluded_words) or (each_word not in NLTK_StopWords):
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

    #print(idf_score)
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    #print(tf_idf_score)
    def get_top_n(dict_elem, n):
        result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
        hasil =list(result.keys())
        #print(list(result.keys()))        
        return hasil
    #print(get_top_n(tf_idf_score, 25))
    #print(len(get_top_n(tf_idf_score, 1)))
    keywordtfidf.append(get_top_n(tf_idf_score, 25))
    for i in range(len(keywordtfidf)):
        #print (i)
        totalKw=0
        totalKw=len(keywordtfidf[i])
        for j in range(totalKw):
            #print (j)
            keywordtfidf2.append(keywordtfidf[i][j])
    #print (keywordtfidf2)
    return keywordtfidf2

In [27]:
#Keywords Extraction with BERT
def keyword_bert(hasilSearch: str) -> list[str]:

    keywordbert=[]

    #for j in range(len(array_text)):
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 1))
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 2))

    #print("Keywords of article\n", keywords)
    for i in range (0,len (keyword1)):
        keywordbert.append(keyword1[i][0])
        keywordbert.append(keyword2[i][0])
    #print (keywordbert)
    return keywordbert

In [28]:
def rangking (keywordGabung: list[str], kueriAsli: str) -> list[str]:
    kandidatFinalCek=[]
    kandidatFinalFix=[]
    for i in keywordGabung:
        if (i not in kandidatFinalCek and i!=0):
            kandidatFinalCek.append(i)
    queries=[kueriAsli]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(kandidatFinalCek)
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 30
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFinalFix.append(kandidatFinalCek[idx])
    print ('kandidatFinalFix: ', kandidatFinalFix)
    return kandidatFinalFix


In [29]:
def keywordCustomBow(bowList: list[str], initialQuery: str) -> list[str]:
    cekDuplicate = []
    kandidatFix = []

    for i in bowList:
        if(i not in cekDuplicate and i!=0):
            cekDuplicate.append(i)

    queries=[initialQuery]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(cekDuplicate)
    
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 10
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFix.append(cekDuplicate[idx])
    return kandidatFix

In [36]:
# Creating query for what, when, where, who
what_initial_query = "apa sebenarnya kejadian atau peristiwa dalam berita"
when_initial_query = "kapan waktu berita tersebut terjadi"
where_initial_query = "di daerah mana tempat berita itu terjadi"
who_initial_query = "siapa orang, personel, atau lembaga yang terlibat dalam berita"

what_query = preprocessing(what_initial_query)
what_query = " ".join(what_query)
print (what_query)

when_query = preprocessing(when_initial_query)
when_query = " ".join(when_query)
print (when_query)

where_query = preprocessing(where_initial_query)
where_query = " ".join(where_query)
print (where_query)

who_query = preprocessing(who_initial_query)
who_query = " ".join(who_query)
print (who_query)


kejadian peristiwa berita
waktu berita
daerah tempat berita
orang personel lembaga terlibat berita


In [31]:
def prepareWData(initial_query: str, bow_list: list[str]):
    hasilkandidat = []
    keywordGabung = []
    kandidatFix = []
    kueriFix = []

    hasilSearch     = cari_dokpertama(initial_query)
    keywordYake     = keyword_yake(hasilSearch)
    keywordtfidf2   = keyword_tfidf(hasilSearch)
    keywordbert     = keyword_bert(hasilSearch)
    keywordBoW      = keywordCustomBow(bow_list, initial_query)

    for keyword in keywordYake:
        keywordGabung.append(keyword)
    for keyword in keywordtfidf2:
        keywordGabung.append(keyword)
    for keyword in keywordbert:
        keywordGabung.append(keyword)    
    for keyword in keywordBoW:
        keywordGabung.append(keyword)

    hasilrank = rangking(keywordGabung, initial_query)
    
    for word in hasilrank:
        kueriFix.append(word)
    for word in kueriFix:
        hasilkandidat.append(word)

    kueriFix = [preprocessing(i) for i in kueriFix]
    
    for word in kueriFix:
        for subWord in word:
            kandidatFix.append(subWord)

    kandidatFix = [" ".join(kandidatFix)]
    # print('*'*120)
    return kandidatFix, hasilrank

In [37]:
where_keyword_candidates, where_rank_result = prepareWData(where_query, bow_list_where)
what_keyword_candidates, what_rank_result = prepareWData(what_query, bow_list_what)
who_keyword_candidates, who_rank_result = prepareWData(who_query, bow_list_who)
when_keyword_candidates, when_rank_result = prepareWData(when_query, bow_list_when)

def wCalculation():
    document_result = []

    document_result_what = []
    document_result_who = []
    document_result_when = []
    document_result_where = []

    for i in range(0, len(document_text_test)-1):
        hasilWhat = []
        hasilWho = []
        hasilWhen = []
        hasilWhere = []

        teks = df_total.iloc[i, -2]
        tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
        tfidf_matrix = tfidf_vectorizer.fit_transform([teks])

        query_vec_what = tfidf_vectorizer.transform(what_keyword_candidates)
        results_what = cosine_similarity(tfidf_matrix, query_vec_what).reshape((-1))
        document_result_what.append(df_total.iloc[i, 2])

        for a in what_rank_result:
            cariW = re.findall(a, document_result_what[i])
            if cariW:
                hasilWhat.append(a)

        document_result.append([i, 'what', what_keyword_candidates, hasilWhat, results_what, 0, 0])
        # ------------------------------------------------------------------------------------------------------

        query_vec_who = tfidf_vectorizer.transform(who_keyword_candidates)
        results_who = cosine_similarity(tfidf_matrix, query_vec_who).reshape((-1))
        document_result_who.append(df_total.iloc[i, 2])

        for a in who_rank_result:
            cariW = re.findall(a, document_result_who[i])
            if cariW:
                hasilWho.append(a)

        document_result.append([i, 'who', who_keyword_candidates, hasilWho, results_who, 0, 0])
        # ------------------------------------------------------------------------------------------------------

        query_vec_when = tfidf_vectorizer.transform(when_keyword_candidates)
        results_when = cosine_similarity(tfidf_matrix, query_vec_when).reshape((-1))
        document_result_when.append(df_total.iloc[i, 2])

        for a in when_rank_result:
            cariW = re.findall(a, document_result_when[i])
            if cariW:
                hasilWhen.append(a)

        document_result.append([i, 'when', when_keyword_candidates, hasilWhen, results_when, 0, 0])
        # ------------------------------------------------------------------------------------------------------

        query_vec_where = tfidf_vectorizer.transform(where_keyword_candidates)
        results_where = cosine_similarity(tfidf_matrix, query_vec_where).reshape((-1))
        document_result_where.append(df_total.iloc[i, 2])

        for a in where_rank_result:
            cariW = re.findall(a, document_result_where[i])
            if cariW:
                hasilWhere.append(a)

        document_result.append([i, 'where', where_keyword_candidates, hasilWhere, results_where, 0, 0])
    
    writer = pd.DataFrame(document_result, columns=['Data','W', 'Query', 'Query Result', 'Similarity', 'True Positive', 'True Negative'])
    writer.to_csv('QE_Stat_V2_testing_result.csv', index=False, sep=',')

kandidatFinalFix:  ['stiria', 'noticias', 'satawan', 'sastranegara', 'mondulkiri', 'daerahnya', 'masayarakat', 'daerah', 'krajeńskie', 'kawasan', 'peristiwa', 'darurat', 'penguatan', 'kabupaten', 'kecamatan', 'rumah warga', 'personel', 'kebencanaan', 'melaporkan', 'pulusangi', 'tenggara', 'penguatan mewaspadai', 'mencatat', 'warga', 'rumah', 'merusak', 'terdekat', 'menggambarkan', 'pulusangi kecamatan', 'sulawesi']
kandidatFinalFix:  ['informations', 'peritiwa', 'kejadiannya', 'tajuknya', 'memancarkan', 'tinjauan', 'peristiwa', 'mengobarkan', 'kejadian', 'premis', 'pemangkasan', 'darurat', 'penguatan', 'penguatan mewaspadai', 'merusak', 'pulusangi', 'mengakibatkan', 'melaporkan', 'penanganan', 'kencang merusak', 'pembangunan', 'kencang menyikapi', 'bersamaan', 'kapusdatinkom', 'kabupaten', 'aktivitas', 'personel', 'kecamatan', 'prakiraan cuaca', 'terdekat']
kandidatFinalFix:  ['wiraswastawan', 'wiracaritawan', 'pelaporan', 'investigatif', 'wartawan', 'personelnya', 'hartawan', 'investi

In [40]:
wCalculation()