In [1]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

# Input and Expansion Query
import nltk
nltk.download('punkt')
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
NLTK_StopWords = stopwords.words('indonesian')

[nltk_data] Downloading package punkt to /usr/local/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def preprocessing(berita):
    s = berita.lower()
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    T = [t for t in tokens if (((t.lower() == "tempat") or (t.lower() == "waktu") or (t.lower() == "hari")) or (t not in NLTK_StopWords))]
    return T

In [3]:
df_test = pd.read_csv('df_test.csv')
df_test = df_test[pd.notnull(df_test['description'])]
print(df_test.info())
print ('-'*90)
document_text_test= joblib.load('desc_text_test.pkl')
print(len(document_text_test))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221 entries, 0 to 220
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        221 non-null    object
 1   date         221 non-null    object
 2   description  221 non-null    object
 3   source       221 non-null    object
dtypes: object(4)
memory usage: 8.6+ KB
None
------------------------------------------------------------------------------------------
221


In [4]:
df_train = pd.read_csv('df_train.csv')
df_train = df_train[pd.notnull(df_train['description'])]
print(df_train.info())
print ('-'*90)
document_text_train= joblib.load('desc_text_train.pkl')
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1979 entries, 0 to 1979
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1979 non-null   object
 1   date         1979 non-null   object
 2   description  1979 non-null   object
 3   source       1979 non-null   object
dtypes: object(4)
memory usage: 77.3+ KB
None
------------------------------------------------------------------------------------------
1980


In [5]:
def bow_read(bow):
    bow_read = pd.read_csv(bow)
    bow_text = []

    for i in range(0,bow_read.shape[0]):
        bow_text.append(bow_read.iloc[i,1])
        
    return(bow_read,bow_text)

In [6]:
def cari_dokpertama(kueriAsli):
    kueriPre=preprocessing(kueriAsli)
    kueriPre= " ".join (kueriPre)
    hasilSearch=[]
    tfidf_train = joblib.load('tfidf_train.pkl')
    tfidf_vectorizer = joblib.load('vectorizer.pkl')
    query_vec= tfidf_vectorizer.transform([kueriPre])
    # print('queryvec')
    print(query_vec)
    results=cosine_similarity(tfidf_train, query_vec).reshape((-1))
    # print(results)
    for i in results.argsort()[-5:][::-1]:
        hasilSearch.append(df_train.iloc[i,-2])
    hasilSearch=". ".join(hasilSearch)
    return hasilSearch

In [7]:
##Keywords Extraction with YAKE
def keyword_yake(hasilSearch):
    # print(hasilSearch)
    keywordYake=[]

    k_extractor = KeywordExtractor(lan="id", n=1, top=50)
    # print(k_extractor)
    k_extractor2 = KeywordExtractor(lan="id", n=2, top=50)
    # print(k_extractor2)
    keywords = k_extractor.extract_keywords(text=hasilSearch)
    # print("pertama : ",keywords)
    keywords = k_extractor2.extract_keywords(text=hasilSearch)
    # print("kedua : ",keywords)
    keywordYake = [x for x, y in keywords]
    # print(keywordYake)
    #keywordYake.append(keywords)
    # print (keywordYake)
    return keywordYake
#print("Keywords of article\n", keywords)

In [8]:
#Keywords Extraction with TFIDF
def keyword_tfidf(hasilSearch):

    keywordtfidf=[]
    keywordtfidf2=[]

    #doc = 'بَاب فرض الْوضُوء وسننه وهيآته وَفرض الْوضُوء سِتّ خِصَال النِّيَّة عمند غسل الْوَجْه وَغسل الْوَجْه وَغسل الذراعين مَعَ الْمرْفقين وَمسح مَا قل من الرَّأْس وَغسل الرجلَيْن مَعَ الْكَعْبَيْنِ وَالتَّرْتِيب وعَلى قَول الْوَلَاء وسننه عشر خِصَال خمس مِنْهَا قبل غسل الْوَجْه وَهِي التَّسْمِيَة وَغسل الْكَفَّيْنِ والمضمضة وَالِاسْتِنْشَاق وَالْمُبَالغَة فيههما إِلَّا للصَّائِم وَخمْس بعد غسل الْوَجْه وَهِي تَقْدِيم الْيُمْنَى على ليسرى وَمسح جَمِيع الرَّأْس وَمسح الْأُذُنَيْنِ ظاهرهما وباطنهما وَإِدْخَال الأصبعين فيهمَا وتخليل أَصَابِع الرجلَيْن . وَغسل دَاخل الْكَعْبَيْنِ وَلَيْسَ مسح لعنق من سنَنه وفضيلته تكراره ثَلَاثًا وزالواجب فِيهِ مرّة والمرتان أفضل وَالثَّلَاث أكمل وهيآته أَن يبْدَأ فِي تَطْهِير الْأَعْضَاء بمواضع الِابْتِدَاء . فَإِن اقْتصر على فروضه استة أَجزَأَهُ وَإِن ضيع حَظّ نَفسه فِيمَا ترك'
    total_words = re.sub(r'[^\w]', ' ', hasilSearch)
    total_words = total_words.lower().split()
    #print (total_words)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    tf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
    #print(tf_score)
    def check_sent(word, sentences): 
        final = [all([w in x for w in word]) for x in sentences] 
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
        return int(len(sent_len))

    idf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

    #print(idf_score)
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    #print(tf_idf_score)
    def get_top_n(dict_elem, n):
        result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
        hasil =list(result.keys())
        #print(list(result.keys()))        
        return hasil
    #print(get_top_n(tf_idf_score, 25))
    #print(len(get_top_n(tf_idf_score, 1)))
    keywordtfidf.append(get_top_n(tf_idf_score, 35))
    for i in range(len(keywordtfidf)):
        #print (i)
        totalKw=0
        totalKw=len(keywordtfidf[i])
        for j in range(totalKw):
            #print (j)
            keywordtfidf2.append(keywordtfidf[i][j])
    #print (keywordtfidf2)
    return keywordtfidf2

In [9]:
#Keywords Extraction with BERT
def keyword_bert(hasilSearch):
    # print(hasilSearch)

    keywordbert=[]

    #for j in range(len(array_text)):
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 1))
    # print(keyword1)
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 2))
    # print(keyword2)
    # keyword3 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 5))
    # print(keyword3)

    #print("Keywords of article\n", keywords)
    for i in range (0,len (keyword1)):
        keywordbert.append(keyword1[i][0])
        keywordbert.append(keyword2[i][0])
    # print (keywordbert)
    return keywordbert

In [10]:
def rangking (keywordGabung):
    keywordTemp=[]
    keywordFinal=[]

    def borda_sort(lists):
        scores = {}
        for l in lists:
            for idx, elem in enumerate(reversed(l)):
                if not elem in scores:
                    scores[elem] = 0
                scores[elem] += idx
        return sorted(scores.keys(), key=lambda elem: scores[elem], reverse=True)

    keywordTemp.append(borda_sort(keywordGabung))
    print ('kandidat temp',keywordTemp)
    print ('Total Kandidat temp: ',len(keywordTemp[0]))

    if len(keywordTemp[0])>30:
        print ('kurang dari 80')
        for i in range (0,80):
            keywordFinal.append(keywordTemp[0][i])
    elif len(keywordTemp[0])<80:
        print ('lebih dari 80')
        for i in range (0,len(keywordTemp)):
            for j in range (0,len(keywordTemp[0])):
                keywordFinal.append(keywordTemp[0][j])
    print ('Total Kandidat final: ',len(keywordFinal))
    print ('Kandidat final: ',keywordFinal)
    return keywordFinal

In [11]:
def keyword_BOW(keywordBOW, kueriAsli):
    cekDuplicate = []
    kandidatFix = []

    for i in keywordBOW:
        if(i not in cekDuplicate and i!=0):
            cekDuplicate.append(i)

    # queries=[kueriAsli]
    queries=kueriAsli
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(cekDuplicate)
    
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 1000
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFix.append(cekDuplicate[idx])
    return kandidatFix

In [12]:
def kandidatFix(kueriAsli,bow):
    #kueri what
    # kueriAsli='apa sebenarnya kejadian yang terjadi diberita tersebut'
    kueri=preprocessing(kueriAsli)
    kueri= [" ".join (kueri)]
    print (kueri)
    hasilkandidat=[]
    keywordGabung=[]
    kandidatFix=[]
    kueriFix=[]
    kueriFixWithDelimiter=[]
    


    hasilSearch=cari_dokpertama(kueriAsli)
    keywordYake=keyword_yake(hasilSearch)
    keywordtfidf2=keyword_tfidf(hasilSearch)
    keywordbert=keyword_bert (hasilSearch)
    keywordBOW=keyword_BOW(bow, kueri)

    keywordGabung.append(keywordYake)
    keywordGabung.append(keywordtfidf2)
    keywordGabung.append(keywordbert)
    # keywordGabung.append(keywordBOW)
    hasilrank=rangking(keywordGabung)

    for i in hasilrank:
        kueriFix.append(i)
    for x in keywordBOW:
        kueriFix.append(x)
    for j in kueriFix:
        hasilkandidat.append(j)
    kueriFixWithDelimiter=kueriFix
    kueriFix=[preprocessing(i) for i in kueriFix]
    for i in kueriFix:
        for j in i:
            kandidatFix.append(j)
    kandidatFix= [" ".join (kandidatFix)]
    
    print ('*'*120)
    return(kandidatFix,keywordGabung,keywordBOW,hasilrank,kueriFixWithDelimiter)

In [13]:
bow_when_read,bow_when_text = bow_read('bow_when.csv')



In [14]:
print("When     : ", bow_when_text[0])


When     :  hari


In [15]:
hasilDokumenWhen=[]
kueriAsliWhen='hari apa waktu terjadinya'
kandidatFixWhen,keywordGabungWhen,keywordBOW_When, hasilrankWhen, kueriFixWithDelimiter_When = kandidatFix(kueriAsliWhen, bow_when_text)

['hari waktu']
  (0, 16713)	1.0
kandidat temp [['ohio', 'tragedi ohio', 'menghubungi', 'ohio rabu', 'panggilan darurat', 'columbus', 'melepaskan', 'menghubungi layanan', 'mengembuskan', 'dunia mengembuskan', 'mengidentifikasi', 'tega membunuh', 'tertembak ayahnya', 'panggilan', 'berboncengan pelaku', 'pemberitahuan', 'ayah melepaskan', 'mengidentifikasi terkait', 'mengirimkan', 'pangestu', 'merilis pemberitahuan', 'menunggu', 'putrinya jane', 'tergeletak', 'mengidentifikasi pelaku', 'korban menghubungi', 'keberadaan', 'pelaku', 'tertembak', 'tersangka', 'kabar columbus', 'columbus dispatch', 'pelaku membunuh', 'berboncengan', 'korban anggota', 'anggota', 'mengembuskan napas', 'mengklaim', 'hubungan', 'ditembak', 'insyaallah', 'korban membunuh', 'tugasnya', 'kendaraan tugasnya', 'bangun petugas', 'waktu sang', 'waktu', 'penembakan', 'dunia', 'sudjana mengidentifikasi', 'membunuh', 'kolonel pangestu', 'penyelidikan', 'pangestu pelaku', 'metro jaya', 'berjanji', 'rekaman panggilan', 'diba

In [16]:
print(keywordBOW_When)
print(hasilrankWhen)

['harinya', 'seharinya', 'hari', 'zamannya', 'berhari', 'sehari', 'siangnya', 'masanya', 'ajalku', 'ajalnya', 'waktunya', 'jamannya', 'seharian', 'zaman', 'diwaktu', 'dizaman', 'siang', 'waktu', 'besoknya', 'jaman', 'dinihari', 'sejam', 'masa', 'hariku', '時間', 'bermasa', 'harimu', 'semasa', 'saat', 'menitnya', 'daytona', 'detik', 'saatnya', 'jamnya', 'dimasa', 'subuh', 'pekan', 'detiknya', 'tempoh', 'attimo', 'pertanggal', 'sesaat', 'pagi', 'berjam', 'bertanggal', 'mideen', 'paginya', 'latach', 'rayanya', 'tahunya', 'tahunnya', 'seiringnya', 'ketika', 'wieku', 'senna', 'sewaktu', 'eranya', 'termenung', 'kisaran', 'tahun', 'dzieje', 'penayangannya', 'segmennya', 'sempat', 'sebelum', 'heydeni', 'sekon', 'kemarin', 'deret', 'yadnya', 'aksinya', 'kondangan', 'nurlan', 'sabtu', 'dita', 'ramadan', 'kapan', 'urutan', 'sundulan', 'tayangan', 'penanyangannya', 'timing', 'dhesta', 'sepandjang', 'menit', 'mehdi', 'pospenas', 'imbang', 'antriannya', 'minggu', 'soeltanaat', 'awaludin', 'hitungan', 

In [17]:
testing_data = []

tfidf_vectorizer = joblib.load('vectorizer.pkl')

for i in range(0, len(document_text_test)-1):

    hasilWhen=[]

    teks=df_test.iloc[i,-2]
    tfidf_matrix = tfidf_vectorizer.fit_transform([teks])

    query_vec_When= tfidf_vectorizer.transform(kandidatFixWhen)
    results_when=cosine_similarity(tfidf_matrix, query_vec_When).reshape((-1))
    hasilDokumenWhen.append(df_test.iloc[i,2])
    for a in kueriFixWithDelimiter_When:
        cariW = re.findall(a,hasilDokumenWhen[i])
        #print(cariW)
        if cariW:
            hasilWhen.append(a)
    
    data = [i,df_test.iloc[i,2],'when',kueriAsliWhen,keywordBOW_When , keywordGabungWhen, kandidatFixWhen, hasilWhen, results_when,' ',' ']

    testing_data.append(data)

In [18]:
#write to csv
writer = pd.DataFrame(testing_data, columns=['No Document','Description', 'W','Pertanyaan', 'Keyword BOW', 'Keyword Gabung','kandidat fix','hasilW', 'Kemiripan', 'True Positif', 'True Negative'])
writer.to_csv('QE_Stat_testing_whenTes_result.csv', index=False, sep=',')