In [1]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Input and Expansion Query
import nltk
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
NLTK_StopWords = stopwords.words('indonesian')

In [2]:
def preprocessing(berita):
    s = berita.lower()
    s = s.replace('\n', ' ')
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    T = [t for t in tokens if t not in NLTK_StopWords]
    return T

In [3]:
df_total = pd.read_csv('df_total.csv')
df_total = df_total[pd.notnull(df_total['description'])]
print(df_total.info())
print ('-'*90)
document_text= joblib.load('document_text.pkl')
print(len(document_text))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27 entries, 0 to 26
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        27 non-null     object
 1   date         27 non-null     object
 2   description  27 non-null     object
dtypes: object(3)
memory usage: 864.0+ bytes
None
------------------------------------------------------------------------------------------
27


In [11]:
def cari_dokpertama(kueriAsli):
    kueriPre=preprocessing(kueriAsli)
    kueriPre= " ".join (kueriPre)
    hasilSearch=[]
    tfidf_matrix = joblib.load('tfidf.pkl')
    tfidf_vectorizer = joblib.load('vectorizer.pkl')
    query_vec= tfidf_vectorizer.transform([kueriPre])
    results=cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
    for i in results.argsort()[-5:][::-1]:
        hasilSearch.append(df_total.iloc[i,-1])
    hasilSearch=". ".join(hasilSearch)
    return hasilSearch



In [66]:
##Keywords Extraction with YAKE
def keyword_yake(hasilSearch):
    keywordYake=[]

    k_extractor = KeywordExtractor(lan="id", n=1, top=10)
    k_extractor2 = KeywordExtractor(lan="id", n=2, top=10)
    keywords = k_extractor.extract_keywords(text=hasilSearch)
    keywords = k_extractor2.extract_keywords(text=hasilSearch)
    keywordYake = [x for x, y in keywords]
    #keywordYake.append(keywords)
    #print (keywordYake)
    return keywordYake
#print("Keywords of article\n", keywords)

In [65]:
#Keywords Extraction with TFIDF
def keyword_tfidf(hasilSearch):

    keywordtfidf=[]
    keywordtfidf2=[]

    #doc = 'بَاب فرض الْوضُوء وسننه وهيآته وَفرض الْوضُوء سِتّ خِصَال النِّيَّة عمند غسل الْوَجْه وَغسل الْوَجْه وَغسل الذراعين مَعَ الْمرْفقين وَمسح مَا قل من الرَّأْس وَغسل الرجلَيْن مَعَ الْكَعْبَيْنِ وَالتَّرْتِيب وعَلى قَول الْوَلَاء وسننه عشر خِصَال خمس مِنْهَا قبل غسل الْوَجْه وَهِي التَّسْمِيَة وَغسل الْكَفَّيْنِ والمضمضة وَالِاسْتِنْشَاق وَالْمُبَالغَة فيههما إِلَّا للصَّائِم وَخمْس بعد غسل الْوَجْه وَهِي تَقْدِيم الْيُمْنَى على ليسرى وَمسح جَمِيع الرَّأْس وَمسح الْأُذُنَيْنِ ظاهرهما وباطنهما وَإِدْخَال الأصبعين فيهمَا وتخليل أَصَابِع الرجلَيْن . وَغسل دَاخل الْكَعْبَيْنِ وَلَيْسَ مسح لعنق من سنَنه وفضيلته تكراره ثَلَاثًا وزالواجب فِيهِ مرّة والمرتان أفضل وَالثَّلَاث أكمل وهيآته أَن يبْدَأ فِي تَطْهِير الْأَعْضَاء بمواضع الِابْتِدَاء . فَإِن اقْتصر على فروضه استة أَجزَأَهُ وَإِن ضيع حَظّ نَفسه فِيمَا ترك'
    total_words = re.sub(r'[^\w]', ' ', hasilSearch)
    total_words = total_words.lower().split()
    #print (total_words)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    tf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
    #print(tf_score)
    def check_sent(word, sentences): 
        final = [all([w in x for w in word]) for x in sentences] 
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
        return int(len(sent_len))

    idf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

    #print(idf_score)
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    #print(tf_idf_score)
    def get_top_n(dict_elem, n):
        result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
        hasil =list(result.keys())
        #print(list(result.keys()))        
        return hasil
    #print(get_top_n(tf_idf_score, 25))
    #print(len(get_top_n(tf_idf_score, 1)))
    keywordtfidf.append(get_top_n(tf_idf_score, 25))
    for i in range(len(keywordtfidf)):
        #print (i)
        totalKw=0
        totalKw=len(keywordtfidf[i])
        for j in range(totalKw):
            #print (j)
            keywordtfidf2.append(keywordtfidf[i][j])
    #print (keywordtfidf2)
    return keywordtfidf2

In [64]:
#Keywords Extraction with BERT
def keyword_bert(hasilSearch):

    keywordbert=[]

    #for j in range(len(array_text)):
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 1))
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 2))

    #print("Keywords of article\n", keywords)
    for i in range (0,len (keyword1)):
        keywordbert.append(keyword1[i][0])
        keywordbert.append(keyword2[i][0])
    #print (keywordbert)
    return keywordbert

In [71]:
def rangking (keywordGabung):
    keywordTemp=[]
    keywordFinal=[]

    def borda_sort(lists):
        scores = {}
        for l in lists:
            for idx, elem in enumerate(reversed(l)):
                if not elem in scores:
                    scores[elem] = 0
                scores[elem] += idx
        return sorted(scores.keys(), key=lambda elem: scores[elem], reverse=True)

    keywordTemp.append(borda_sort(keywordGabung))
    print ('kandidat temp',keywordTemp)
    print ('Total Kandidat temp: ',len(keywordTemp[0]))

    if len(keywordTemp[0])>30:
        print ('kurang dari 30')
        for i in range (0,30):
            keywordFinal.append(keywordTemp[0][i])
    elif len(keywordTemp[0])<30:
        print ('lebih dari 30')
        for i in range (0,len(keywordTemp)):
            for j in range (0,len(keywordTemp[0])):
                keywordFinal.append(keywordTemp[0][j])
    print ('Total Kandidat final: ',len(keywordFinal))
    print ('Kandidat final: ',keywordFinal)
    return keywordFinal


In [72]:
kueriAsli='pelaku'
kueri=preprocessing(kueriAsli)
kueri= [" ".join (kueri)]
print (kueri)
hasilkandidat=[]
keywordGabung=[]
kandidatFix=[]
kueriFix=[]
hasilDokumen=[]

hasilSearch=cari_dokpertama(kueriAsli)
keywordYake=keyword_yake(hasilSearch)
keywordtfidf2=keyword_tfidf(hasilSearch)
keywordbert=keyword_bert (hasilSearch)
for i in keywordYake:
    keywordGabung.append(i)
for i in keywordtfidf2:
    keywordGabung.append(i)
for i in keywordbert:
    keywordGabung.append(i)
hasilrank=rangking(keywordGabung,kueriAsli)
for i in hasilrank:
    kueriFix.append(i)
for j in kueriFix:
    hasilkandidat.append(j)
kueriFix=[preprocessing(i) for i in kueriFix]
for i in kueriFix:
    for j in i:
        kandidatFix.append(j)
kandidatFix= [" ".join (kandidatFix)]
print ('*'*120)
j=1
for i in range(0, 5):
    hasilW=[]
    teks=df_total.iloc[i,-1]
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([teks])
    query_vec= tfidf_vectorizer.transform(kandidatFix)
    results=cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
    hasilDokumen.append(df_total.iloc[i,2])
    for a in hasilrank:
        cariW = re.findall(a,hasilDokumen[i])
        #print(cariW)
        if cariW:
            hasilW.append(a)
    print (j)
    print("No ID Dokumen  : ", i)
    print("Tanggal        : ", df_total.iloc[i,1])
    print("Isi berita     : ", df_total.iloc[i,2])
    print ("Hasil W       : ",hasilW)
    print("(Kemiripan: %.4f) " % results)
    print ('*'*120)
    j+=1

['kecelakaan']
kandidat temp [['toyota', '166', '28', 'informasi', '165', 'rifki', 'avanza', 'dialami', 'sedan toyota', '5', 'sedan', 'iya', 'toyota vios', 'parah', 'lancar', 'bemper', 'sedan ditumpangi', 'kaca', 'kabupaten', 'mobil sedan', 'pecah', 'atap', 'minibus', 'mobilnya', 'sedan terbalik', 'arah bandung', 'penyok', 'kecamatan', 'bandung sumedang', 'diberitakan', 'lokasi kejadian', 'jelambar', '1152', 'kecelakaan', 'lalin', 'sedan mengalami', 'pembatas jalan', 'macet', 'jatiluhur', 'mobil', 'melibatkan', 'menimpa sedan', 'kecelakaan tunggal', 'kota', 'rancabali', 'tol cisumdawu', 'melintang', 'sedan km', 'menabrak pembatas', '400', 'jatiwangi', 'kanan', 'kecelakaan minibus']]
Total Kandidat temp:  53
kurang dari 30
Total Kandidat final:  30
Kandidat final:  ['toyota', '166', '28', 'informasi', '165', 'rifki', 'avanza', 'dialami', 'sedan toyota', '5', 'sedan', 'iya', 'toyota vios', 'parah', 'lancar', 'bemper', 'sedan ditumpangi', 'kaca', 'kabupaten', 'mobil sedan', 'pecah', 'atap

No ID Dokumen  :  23
Tanggal        :  Jumat, 28 Jan 2022 11:27 WIB


Isi berita     :  tabrak truk tol cipularang pikap ringsek mobil jenis pikap menabrak truk tronton ruas jalan tol cipularang km 87 purwakarta orang tewas akibat kejadian
(Kemiripan: 0.1796) 
************************************************************************************************************************
No ID Dokumen  :  24
Tanggal        :  Jumat, 28 Jan 2022 11:23 WIB


Isi berita     :  viral postingan korban jambret jaksel playing victim polisi sopir taksi unggahan pemotor ditabrak sopir taksi inget berita viral ga 3 yg ojol car menabrak orang jambret you have to know the facts yg ditabrak si eko jambret yes si eko yg mengaku korban playing victim fitnah si pengendara yg udah meninggal bunyi unggahan medsos jumat 28 1 2022 postingan muncul akun tiktok radensurya939 diduga ayah terduga jambret detikcom menghubungi bersangkutan berita diturunkan tanggapan mengkonfirmasi unggahan kabid humas polda metro jaya 