In [126]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np
import string

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Input and Expansion Query
import nltk
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
from typing import Union

In [127]:
excluded_words = ["tempat", "waktu", "gempa", "banjir"]

NLTK_StopWords = stopwords.words('indonesian')
NLTK_StopWords.extend(["detik", "detikjatim", "detikjateng", "detikjabar", "detiksulsel", "detiksumbar", "detikbali", "detikpapua", "detiksulteng", "detikmaluku", "detjatim", "detikcom", "allahumma", "aamiin", "allah", "bismillah"])
NLTK_StopWords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
NLTK_StopWords.extend(txt_stopword["stopwords"][0].split(' '))
NLTK_StopWords = set(NLTK_StopWords)

In [128]:
def preprocessing(berita):
    # Preprocessing
    s = berita.lower()
    s = s.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    s = s.encode('ascii', 'replace').decode('ascii')
    ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", s).split())
    s.replace("http://", " ").replace("https://", " ")
    s = re.sub('\s+', ' ', s)
    s = s.strip()
    s = s.translate(str.maketrans("","", string.punctuation))
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    s = re.sub(r'\b\w{1,1}\b', '', s)
    tokens = [token for token in s.split(" ") if token != ""]

    # Stopwords checking
    T = [t for t in tokens if ((t in excluded_words) or (t not in NLTK_StopWords))]
    return T

In [129]:
df_total = pd.read_csv('corpus/dataset/df_total.csv')
df_total = df_total[pd.notnull(df_total['description'])]
print(df_total.info())
print ('-'*90)
document_text= joblib.load('corpus/model/desc_text_train.pkl')
print(len(document_text))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1783 entries, 0 to 1782
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1783 non-null   object
 1   date         1783 non-null   object
 2   description  1783 non-null   object
 3   source       1783 non-null   object
dtypes: object(4)
memory usage: 69.6+ KB
None
------------------------------------------------------------------------------------------
1604


In [130]:
df_test = pd.read_csv('corpus/dataset/df_test.csv')
df_test = df_test[pd.notnull(df_test['description'])]
print(df_test.info())
print ('-'*90)
document_text_test= joblib.load('corpus/model/desc_text_test.pkl')
print(len(document_text_test))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 179 entries, 0 to 178
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        179 non-null    object
 1   date         179 non-null    object
 2   description  179 non-null    object
 3   source       179 non-null    object
dtypes: object(4)
memory usage: 7.0+ KB
None
------------------------------------------------------------------------------------------
179


In [131]:
df_train = pd.read_csv('corpus/dataset/df_train.csv')
df_train = df_train[pd.notnull(df_train['description'])]
print(df_train.info())
print ('-'*90)
document_text_train= joblib.load('corpus/model/desc_text_train.pkl')
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1604 entries, 0 to 1603
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1604 non-null   object
 1   date         1604 non-null   object
 2   description  1604 non-null   object
 3   source       1604 non-null   object
dtypes: object(4)
memory usage: 62.7+ KB
None
------------------------------------------------------------------------------------------
1604


In [132]:
# Load bow dataset
df_bow_what = pd.read_csv("bow/bow_what.csv")
df_bow_what.head()
df_bow_what.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17485 entries, 0 to 17484
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              17485 non-null  int64 
 1   tingkat setelah parent  17485 non-null  int64 
 2   parent                  17485 non-null  object
 3   similarity              17485 non-null  object
dtypes: int64(2), object(2)
memory usage: 546.5+ KB


In [133]:
# Load bow dataset
df_bow_when = pd.read_csv("bow/bow_when.csv")
df_bow_when.head()
df_bow_when.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26186 entries, 0 to 26185
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              26186 non-null  int64 
 1   tingkat setelah parent  26186 non-null  int64 
 2   parent                  26185 non-null  object
 3   similarity              26186 non-null  object
dtypes: int64(2), object(2)
memory usage: 818.4+ KB


In [134]:
# Load bow dataset
df_bow_where = pd.read_csv("bow/bow_where.csv")
df_bow_where.head()
df_bow_where.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9786 entries, 0 to 9785
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              9786 non-null   int64 
 1   tingkat setelah parent  9786 non-null   int64 
 2   parent                  9786 non-null   object
 3   similarity              9786 non-null   object
dtypes: int64(2), object(2)
memory usage: 305.9+ KB


In [135]:
# Load bow dataset
df_bow_who = pd.read_csv("bow/bow_who.csv")
df_bow_who.head()
df_bow_who.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17293 entries, 0 to 17292
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Unnamed: 0              17293 non-null  int64 
 1   tingkat setelah parent  17293 non-null  int64 
 2   parent                  17293 non-null  object
 3   similarity              17293 non-null  object
dtypes: int64(2), object(2)
memory usage: 540.5+ KB


In [136]:
# Ambil parent dari bow
bow_list_what = []
bow_list_when = []
bow_list_where = []
bow_list_who = []

for i in range(0, df_bow_what.shape[0]):
  bow_list_what.append(df_bow_what.iloc[i, 2])

for i in range(0, df_bow_when.shape[0]):
  bow_list_when.append(df_bow_when.iloc[i, 2])

for i in range(0, df_bow_where.shape[0]):
  bow_list_where.append(df_bow_where.iloc[i, 2])

for i in range(0, df_bow_who.shape[0]):
  bow_list_who.append(df_bow_who.iloc[i, 2])

print("What ", bow_list_what[0])
print("When ", bow_list_when[0])
print("Where ", bow_list_where[0])
print("Who ", bow_list_who[0])

print({
  'What': len(bow_list_what),
  'When': len(bow_list_when),
  'Where': len(bow_list_where),
  'Who': len(bow_list_who)
})


What  bencana
When  waktu
Where  surabaya
Who  korban
{'What': 17485, 'When': 26186, 'Where': 9786, 'Who': 17293}


In [137]:
# Use data train
def cari_dokpertama(kueriAsli: str) -> list[str]:
    kueriPre = preprocessing(kueriAsli)
    kueriPre = " ".join(kueriPre)
    hasilSearch = []
    tfidf_matrix = joblib.load('corpus/matrix/tfidf_train.pkl')
    tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
    query_vec = tfidf_vectorizer.transform([kueriPre])
    results = cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
    for i in results.argsort()[-5:][::-1]:
        hasilSearch.append(df_total.iloc[i,-2])
    hasilSearch=". ".join(hasilSearch)
    
    return hasilSearch

In [138]:
##Keywords Extraction with YAKE
def keyword_yake(hasilSearch: str) -> list[str]:
    keywordYake=[]

    k_extractor = KeywordExtractor(lan="id", n=1, top=10)
    k_extractor2 = KeywordExtractor(lan="id", n=2, top=10)
    keywords = k_extractor.extract_keywords(text=hasilSearch)
    keywords = k_extractor2.extract_keywords(text=hasilSearch)
    keywordYake = [x for x, y in keywords]
    #keywordYake.append(keywords)
    #print (keywordYake)
    return keywordYake
#print("Keywords of article\n", keywords)

In [139]:
#Keywords Extraction with TFIDF
def keyword_tfidf(hasilSearch: str) -> list[str]:

    keywordtfidf=[]
    keywordtfidf2=[]

    total_words = re.sub(r'[^\w]', ' ', hasilSearch)
    total_words = total_words.lower().split()
    #print (total_words)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    tf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if (each_word in excluded_words) or (each_word not in NLTK_StopWords):
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
    #print(tf_score)
    def check_sent(word, sentences): 
        final = [all([w in x for w in word]) for x in sentences] 
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
        return int(len(sent_len))

    idf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if (each_word in excluded_words) or (each_word not in NLTK_StopWords):
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

    #print(idf_score)
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    #print(tf_idf_score)
    def get_top_n(dict_elem, n):
        result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
        hasil =list(result.keys())
        #print(list(result.keys()))        
        return hasil
    #print(get_top_n(tf_idf_score, 25))
    #print(len(get_top_n(tf_idf_score, 1)))
    keywordtfidf.append(get_top_n(tf_idf_score, 25))
    for i in range(len(keywordtfidf)):
        #print (i)
        totalKw=0
        totalKw=len(keywordtfidf[i])
        for j in range(totalKw):
            #print (j)
            keywordtfidf2.append(keywordtfidf[i][j])
    #print (keywordtfidf2)
    return keywordtfidf2

In [140]:
#Keywords Extraction with BERT
def keyword_bert(hasilSearch: str) -> list[str]:

    keywordbert=[]

    #for j in range(len(array_text)):
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 1))
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 2))

    #print("Keywords of article\n", keywords)
    for i in range (0,len (keyword1)):
        keywordbert.append(keyword1[i][0])
        keywordbert.append(keyword2[i][0])
    #print (keywordbert)
    return keywordbert

In [141]:
def rangking (keywordGabung: list[str], kueriAsli: str) -> list[str]:
    kandidatFinalCek=[]
    kandidatFinalFix=[]
    for i in keywordGabung:
        if (i not in kandidatFinalCek and i!=0):
            kandidatFinalCek.append(i)
    queries=[kueriAsli]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(kandidatFinalCek)
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 30
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFinalFix.append(kandidatFinalCek[idx])

    print ('Kandidat Final Fix Rank: ', kandidatFinalFix)
    return kandidatFinalFix


In [142]:
def keywordCustomBow(bowList: list[str], initialQuery: str) -> list[str]:
    cekDuplicate = []
    kandidatFix = []

    for i in bowList:
        if(i not in cekDuplicate and i!=0):
            cekDuplicate.append(i)

    queries=[initialQuery]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(cekDuplicate)
    
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 10
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFix.append(cekDuplicate[idx])
    
    print ('Kandidat Final Fix BoW: ', kandidatFix)
    return kandidatFix

In [143]:
# Creating query for what, when, where, who
what_initial_query = "bencana apa yang terjadi dalam berita"
when_initial_query = "kapan waktu berita tersebut terjadi"
where_initial_query = "di daerah mana tempat berita itu terjadi"
who_initial_query = "siapa orang, personel, atau lembaga yang terlibat dalam berita"

what_query = preprocessing(what_initial_query)
what_query = " ".join(what_query)
print (what_query)

when_query = preprocessing(when_initial_query)
when_query = " ".join(when_query)
print (when_query)

where_query = preprocessing(where_initial_query)
where_query = " ".join(where_query)
print (where_query)

who_query = preprocessing(who_initial_query)
who_query = " ".join(who_query)
print (who_query)


bencana berita
waktu berita
daerah tempat berita
personel lembaga terlibat berita


In [144]:
def prepareWData(initial_query: str, bow_list: list[str]):
    hasilkandidat = []
    keywordGabung = []
    qeGabungan = []
    kueriFix = []

    hasilSearch     = cari_dokpertama(initial_query)
    # (ini yake + tfidf + bert) = qe statistik
    keywordYake     = keyword_yake(hasilSearch) # 20
    keywordtfidf2   = keyword_tfidf(hasilSearch) # 20
    keywordbert     = keyword_bert(hasilSearch) # 20
    # ini qe bow
    keywordBoW      = keywordCustomBow(bow_list, initial_query)

    for keyword in keywordYake:
        keywordGabung.append(keyword)
    for keyword in keywordtfidf2:
        keywordGabung.append(keyword)
    for keyword in keywordbert:
        keywordGabung.append(keyword)  

    # hasilrank = qe statistik
    hasilrank = rangking(keywordGabung, initial_query)
    
    for word in hasilrank:
        kueriFix.append(word)

    for word in kueriFix:
        hasilkandidat.append(word)

    kueriFix = [preprocessing(i) for i in kueriFix]
    
    for word in kueriFix:
        for subWord in word:
            qeGabungan.append(subWord)

    for word in keywordBoW:
        qeGabungan.append(word)

    # (hasil ranking + bow) = kandidat final
    qeGabungan = [" ".join(qeGabungan)]

    print('*'*120)
    qeStatistik = hasilrank
    qeBoW = keywordBoW

    return [qeGabungan, qeStatistik, qeBoW]

In [145]:
whatResultList = prepareWData(what_query, bow_list_what)
print("What")
print()
qeGabunganWhat = whatResultList[0]
qeStatistikWhat = whatResultList[1]
qeBoWWhat = whatResultList[2]

whoResultList = prepareWData(who_query, bow_list_who)
print("Who")
print()
qeGabunganWho = whoResultList[0]
qeStatistikWho = whoResultList[1]
qeBoWWho = whoResultList[2]

whenResultList = prepareWData(when_query, bow_list_when)
print("When")
print()
qeGabunganWhen = whenResultList[0]
qeStatistikWhen = whenResultList[1]
qeBoWWhen = whenResultList[2]

whereResultList = prepareWData(where_query, bow_list_where)
print("Where")
print()
qeGabunganWhere = whereResultList[0]
qeStatistikWhere = whereResultList[1]
qeBoWWhere = whereResultList[2]

Kandidat Final Fix BoW:  ['bencana', 'malapetaka', 'suryakencana', 'keniscayaan', 'tragis', 'musibah', 'kebinasaan', 'kelangkaan', 'informations', 'disaster']
Kandidat Final Fix Rank:  ['tanggap darurat', 'gempa', 'validasi', 'info', 'lumajang', 'bermagnitudo', 'serambagiantimurmaluku', 'daya', 'status tanggap', 'pemprov', 'sr', 'diguncang', 'cuit', 'lebak', 'lumajang diguncang', 'bmkgjogja', 'daya lumajang', 'verifikasi', 'terverifikasi', 'berlokasi', 'wib cuit', 'mengguncang', 'lintang selatan', 'mag36', 'cuit bmkgjogja', 'alit', 'bujur timur', '11295 bujur', '183656', '919']
************************************************************************************************************************
Kandidat Final Fix BoW:  ['wiraswastawan', 'wiracaritawan', 'pelaporan', 'wartawan', 'investigatif', 'personelnya', 'hartawan', 'jurnalis', 'stafnya', 'investigative']
Kandidat Final Fix Rank:  ['laporan', 'tulis stasiun', 'twitter', 'tulis', 'info', 'stasiun', 'notification', 'menimbulkan', '

In [146]:
def wCalculation():
    document_result = []

    document_result_w = []

    for i in range(0, len(document_text_test)-1):
        hasilWhat = []
        hasilWho = []
        hasilWhen = []
        hasilWhere = []

        teks = df_total.iloc[i, -2]
        tfidf_vectorizer = joblib.load('corpus/vectorizer/vectorizer.pkl')
        tfidf_matrix = tfidf_vectorizer.fit_transform([teks])
        document_result_w.append(df_total.iloc[i, -2])

        query_vec_what = tfidf_vectorizer.transform(qeGabunganWhat)
        results_what = cosine_similarity(tfidf_matrix, query_vec_what).reshape((-1))

        for a in qeStatistikWhat:
            cariW = re.findall(a, document_result_w[i])
            if cariW:
                hasilWhat.append(a)

        document_result.append([i, 'what', what_query, qeGabunganWhat, qeStatistikWhat, qeBoWWhat, hasilWhat, results_what, 0, 0, ""])
        # ------------------------------------------------------------------------------------------------------

        query_vec_who = tfidf_vectorizer.transform(qeGabunganWho)
        results_who = cosine_similarity(tfidf_matrix, query_vec_who).reshape((-1))

        for a in qeStatistikWho:
            cariW = re.findall(a, document_result_w[i])
            if cariW:
                hasilWho.append(a)

        document_result.append([i, 'who', who_query, qeGabunganWho, qeStatistikWho, qeBoWWho, hasilWho, results_who, 0, 0, ""])
        # ------------------------------------------------------------------------------------------------------

        query_vec_when = tfidf_vectorizer.transform(qeGabunganWhen)
        results_when = cosine_similarity(tfidf_matrix, query_vec_when).reshape((-1))

        for a in qeStatistikWhen:
            cariW = re.findall(a, document_result_w[i])
            if cariW:
                hasilWhen.append(a)

        document_result.append([i, 'when', when_query, qeGabunganWhen, qeStatistikWhen, qeBoWWhen, hasilWho, results_who, 0, 0, ""])
        # ------------------------------------------------------------------------------------------------------

        query_vec_where = tfidf_vectorizer.transform(qeGabunganWhere)
        results_where = cosine_similarity(tfidf_matrix, query_vec_where).reshape((-1))

        for a in qeStatistikWhere:
            cariW = re.findall(a, document_result_w[i])
            if cariW:
                hasilWhere.append(a)

        document_result.append([i, 'where', where_query, qeGabunganWhere, qeStatistikWhere, qeBoWWhere, hasilWhere, results_where, 0, 0, ""])
    
    writer = pd.DataFrame(document_result, columns=['Data','W', 'Query', 'QE Gabungan', 'QE Bow', 'QE Statistik', 'Hasil Query', 'Similarity', 'True Positive', 'True Negative', 'Skimming News'])
    writer.to_csv('QE_Stat_V2_testing_result.csv', index=False, sep=',')

In [148]:
wCalculation()