In [2]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

# Input and Expansion Query
import nltk
nltk.download('punkt')
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
NLTK_StopWords = stopwords.words('indonesian')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ryand\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def preprocessing(berita):
    s = berita.lower()
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    T = [t for t in tokens if (((t.lower() == "tempat") or (t.lower() == "waktu")) or (t not in NLTK_StopWords))]
    return T

In [4]:
df_test = pd.read_csv('df_test.csv')
df_test = df_test[pd.notnull(df_test['description'])]
print(df_test.info())
print ('-'*90)
document_text_test= joblib.load('document_text_test.pkl')
print(len(document_text_test))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 0 to 146
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        147 non-null    object
 1   date         147 non-null    object
 2   description  147 non-null    object
 3   source       147 non-null    object
dtypes: object(4)
memory usage: 5.7+ KB
None
------------------------------------------------------------------------------------------
147


In [5]:
df_train = pd.read_csv('df_train.csv')
df_train = df_train[pd.notnull(df_train['description'])]
print(df_train.info())
print ('-'*90)
document_text_train= joblib.load('document_text_train.pkl')
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1314 entries, 0 to 1313
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1314 non-null   object
 1   date         1314 non-null   object
 2   description  1314 non-null   object
 3   source       1314 non-null   object
dtypes: object(4)
memory usage: 51.3+ KB
None
------------------------------------------------------------------------------------------
1314


In [6]:
bow_ekonomi_read = pd.read_csv('bow_ekonomi.csv')
bow_ekonomi_read.head()

Unnamed: 0,tingkat setelah parent,parent,similarity
0,1,ekonomi,"[('perekonomian', 0.7561678290367126), ('ekono..."
1,2,perekonomian,"[('ekonomi', 0.7561678290367126), ('ekonominya..."
2,2,ekonominya,"[('perekonomian', 0.676762044429779), ('ekonom..."
3,2,politik,"[('politiknya', 0.6882383227348328), ('sosial'..."
4,2,sosial,"[('sosialnya', 0.6959102153778076), ('politik'..."


In [7]:
bow_ekonomi_text = []

for i in range(0, bow_ekonomi_read.shape[0]):
    bow_ekonomi_text.append(bow_ekonomi_read.iloc[i,1])

In [8]:
bow_where_read = pd.read_csv('bow_where.csv')
bow_where_text = []

for i in range(0, bow_where_read.shape[0]):
    bow_where_text.append(bow_where_read.iloc[i,1])

In [9]:
bow_when_read = pd.read_csv('bow_when.csv')
bow_when_text = []

for i in range(0, bow_when_read.shape[0]):
    bow_when_text.append(bow_when_read.iloc[i,1])

In [10]:
bow_who_read = pd.read_csv('bow_who.csv')
bow_who_text = []

for i in range(0, bow_who_read.shape[0]):
    bow_who_text.append(bow_who_read.iloc[i,1])

In [11]:
def cari_dokpertama(kueriAsli):
    kueriPre=preprocessing(kueriAsli)
    kueriPre= " ".join (kueriPre)
    hasilSearch=[]
    tfidf_matrix = joblib.load('tfidf_train.pkl')
    tfidf_vectorizer = joblib.load('vectorizer.pkl')
    query_vec= tfidf_vectorizer.transform([kueriPre])
    results=cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
    for i in results.argsort()[-5:][::-1]:
        hasilSearch.append(df_train.iloc[i,-2])
    hasilSearch=". ".join(hasilSearch)
    return hasilSearch



In [12]:
##Keywords Extraction with YAKE
def keyword_yake(hasilSearch):
    keywordYake=[]

    k_extractor = KeywordExtractor(lan="id", n=1, top=10)
    k_extractor2 = KeywordExtractor(lan="id", n=2, top=10)
    keywords = k_extractor.extract_keywords(text=hasilSearch)
    keywords = k_extractor2.extract_keywords(text=hasilSearch)
    keywordYake = [x for x, y in keywords]
    #keywordYake.append(keywords)
    #print (keywordYake)
    return keywordYake
#print("Keywords of article\n", keywords)

In [13]:
#Keywords Extraction with TFIDF
def keyword_tfidf(hasilSearch):

    keywordtfidf=[]
    keywordtfidf2=[]

    #doc = 'بَاب فرض الْوضُوء وسننه وهيآته وَفرض الْوضُوء سِتّ خِصَال النِّيَّة عمند غسل الْوَجْه وَغسل الْوَجْه وَغسل الذراعين مَعَ الْمرْفقين وَمسح مَا قل من الرَّأْس وَغسل الرجلَيْن مَعَ الْكَعْبَيْنِ وَالتَّرْتِيب وعَلى قَول الْوَلَاء وسننه عشر خِصَال خمس مِنْهَا قبل غسل الْوَجْه وَهِي التَّسْمِيَة وَغسل الْكَفَّيْنِ والمضمضة وَالِاسْتِنْشَاق وَالْمُبَالغَة فيههما إِلَّا للصَّائِم وَخمْس بعد غسل الْوَجْه وَهِي تَقْدِيم الْيُمْنَى على ليسرى وَمسح جَمِيع الرَّأْس وَمسح الْأُذُنَيْنِ ظاهرهما وباطنهما وَإِدْخَال الأصبعين فيهمَا وتخليل أَصَابِع الرجلَيْن . وَغسل دَاخل الْكَعْبَيْنِ وَلَيْسَ مسح لعنق من سنَنه وفضيلته تكراره ثَلَاثًا وزالواجب فِيهِ مرّة والمرتان أفضل وَالثَّلَاث أكمل وهيآته أَن يبْدَأ فِي تَطْهِير الْأَعْضَاء بمواضع الِابْتِدَاء . فَإِن اقْتصر على فروضه استة أَجزَأَهُ وَإِن ضيع حَظّ نَفسه فِيمَا ترك'
    total_words = re.sub(r'[^\w]', ' ', hasilSearch)
    total_words = total_words.lower().split()
    #print (total_words)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    tf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
    #print(tf_score)
    def check_sent(word, sentences): 
        final = [all([w in x for w in word]) for x in sentences] 
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
        return int(len(sent_len))

    idf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

    #print(idf_score)
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    #print(tf_idf_score)
    def get_top_n(dict_elem, n):
        result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
        hasil =list(result.keys())
        #print(list(result.keys()))        
        return hasil
    #print(get_top_n(tf_idf_score, 25))
    #print(len(get_top_n(tf_idf_score, 1)))
    keywordtfidf.append(get_top_n(tf_idf_score, 25))
    for i in range(len(keywordtfidf)):
        #print (i)
        totalKw=0
        totalKw=len(keywordtfidf[i])
        for j in range(totalKw):
            #print (j)
            keywordtfidf2.append(keywordtfidf[i][j])
    #print (keywordtfidf2)
    return keywordtfidf2

In [14]:
#Keywords Extraction with BERT
def keyword_bert(hasilSearch):

    keywordbert=[]

    #for j in range(len(array_text)):
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 1))
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=10, keyphrase_ngram_range=(1, 2))

    #print("Keywords of article\n", keywords)
    for i in range (0,len (keyword1)):
        keywordbert.append(keyword1[i][0])
        keywordbert.append(keyword2[i][0])
    #print (keywordbert)
    return keywordbert

In [15]:
def rangking (keywordGabung,kueriAsli):
    kandidatFinalCek=[]
    kandidatFinalFix=[]
    for i in keywordGabung:
        if (i not in kandidatFinalCek and i!=0):
            kandidatFinalCek.append(i)
    queries=[kueriAsli]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(kandidatFinalCek)
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 50
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFinalFix.append(kandidatFinalCek[idx])
    print ('kandidatFinalFix: ', kandidatFinalFix)
    return kandidatFinalFix


In [16]:
def keyword_BOW(keywordBOW, kueriAsli):
    cekDuplicate = []
    kandidatFix = []

    for i in keywordBOW:
        if(i not in cekDuplicate and i!=0):
            cekDuplicate.append(i)

    queries=[kueriAsli]
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(cekDuplicate)
    
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 10
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFix.append(cekDuplicate[idx])
    return kandidatFix

In [17]:
def kandidatFix(kueriAsli,bow):
    
    kueri=preprocessing(kueriAsli)
    kueri= [" ".join (kueri)]
    print (kueri)
    hasilkandidat=[]
    keywordGabung=[]
    kandidatFix=[]
    kueriFix=[]

    hasilSearch=cari_dokpertama(kueriAsli)
    keywordYake=keyword_yake(hasilSearch)
    keywordtfidf2=keyword_tfidf(hasilSearch)
    keywordbert=keyword_bert (hasilSearch)
    keywordBOW=keyword_BOW(bow, kueriAsli)

    for i in keywordYake:
        keywordGabung.append(i)
    for i in keywordtfidf2:
        keywordGabung.append(i)
    for i in keywordbert:
        keywordGabung.append(i)
    for i in keywordBOW:
        keywordGabung.append(i)
    hasilrank=rangking(keywordGabung,kueriAsli)
    # print(hasilrank)
    for i in hasilrank:
        kueriFix.append(i)
    for j in kueriFix:
        hasilkandidat.append(j)
    kueriFix=[preprocessing(i) for i in kueriFix]
    for i in kueriFix:
        for j in i:
            kandidatFix.append(j)
    kandidatFix= [" ".join (kandidatFix)]
    print ('*'*120)

    return(kandidatFix,keywordGabung,keywordBOW,hasilrank)


In [18]:
hasilDokumenWhat=[]
hasilDokumenWho=[]
hasilDokumenWhen=[]
hasilDokumenWhere=[]

kueriAsliWhat='apa yang terjadi diberita tersebut'
kueriAsliWhere='di daerah mana kejadian itu terjadi'
kueriAsliWho='siapa pelaku kejadian ini'
kueriAsliWhen='Kapan waktu kejadian tersebut'

kandidatFixWhat,keywordGabungWhat,keywordBOW_What, hasilrankWhat = kandidatFix(kueriAsliWhat, bow_ekonomi_text)
kandidatFixWhere,keywordGabungWhere,keywordBOW_Where, hasilrankWhere = kandidatFix(kueriAsliWhere, bow_where_text)
kandidatFixWho,keywordGabungWho,keywordBOW_Who, hasilrankWho = kandidatFix(kueriAsliWho, bow_who_text)
kandidatFixWhen,keywordGabungWhen,keywordBOW_When, hasilrankWhen = kandidatFix(kueriAsliWhen, bow_when_text)

['diberita']
kandidatFinalFix:  ['terwujudnya', 'terselenggaranya', 'masyarakatnya', 'konsumennya', 'nasabahnya', 'tercapainya', 'timbulnya', 'aksinya', 'keputusannya', 'perkembangannya', 'munculnya', 'menganaktirikan', 'ditetapkan menganaktirikan', 'diiringi', 'berhasil', 'sayangnya', 'terbukti', 'pemberlakuan', 'bikin', 'diprediksi', 'penggerak', 'ketentuan', 'dibuktikan', 'ancaman', 'juta orang', 'alami', 'of', 'resesi', 'mengoptimalkan', 'ketergantungan', 'pembatasan', 'kegiatan', 'perekonomian', '07', 'melambat', 'juta', 'lapangan pekerjaan', 'abdullah', 'angka kemiskinan', '7', 'indonesia berhasil', '8', 'perusahaan indonesia', 'reform', '4', 'memulihkan indonesia', 'kemiskinan maret', 'core', 'center', '31']
************************************************************************************************************************
['daerah kejadian']
kandidatFinalFix:  ['konsumsi masyarakat', 'masyarakat', 'maksud', 'temanggung', 'kuin', 'masyarakat tujuan', 'srondol', 'persebaya', 

In [None]:
# #kueri what
# kueriAsliWhat='apa sebenarnya kejadian yang terjadi diberita tersebut'
# kueriWhat=preprocessing(kueriAsliWhat)
# kueriWhat= [" ".join (kueriWhat)]
# print (kueriWhat)
# hasilkandidatWhat=[]
# keywordGabungWhat=[]
# kandidatFixWhat=[]
# kueriFixWhat=[]
# hasilDokumenWhat=[]

# hasilSearchWhat=cari_dokpertama(kueriAsliWhat)
# keywordYakeWhat=keyword_yake(hasilSearchWhat)
# keywordtfidf2What=keyword_tfidf(hasilSearchWhat)
# keywordbertWhat=keyword_bert (hasilSearchWhat)
# keywordBOW_What=keyword_BOW(bow_kecelakaan_text, kueriAsliWhat)

# for i in keywordYakeWhat:
#     keywordGabungWhat.append(i)
# for i in keywordtfidf2What:
#     keywordGabungWhat.append(i)
# for i in keywordbertWhat:
#     keywordGabungWhat.append(i)
# for i in keywordBOW_What:
#     keywordGabungWhat.append(i)
# hasilrankWhat=rangking(keywordGabungWhat,kueriAsliWhat)
# # print(hasilrank)
# for i in hasilrankWhat:
#     kueriFixWhat.append(i)
# for j in kueriFixWhat:
#     hasilkandidatWhat.append(j)
# kueriFixWhat=[preprocessing(i) for i in kueriFixWhat]
# for i in kueriFixWhat:
#     for j in i:
#         kandidatFixWhat.append(j)
# kandidatFixWhat= [" ".join (kandidatFixWhat)]
# print ('*'*120)


In [None]:
# #kueri where
# kueriAsliWhere='di daerah mana kejadian itu terjadi'
# kueriWhere=preprocessing(kueriAsliWhere)
# kueriWhere= [" ".join (kueriWhere)]
# print (kueriWhere)
# hasilkandidatWhere=[]
# keywordGabungWhere=[]
# kandidatFixWhere=[]
# kueriFixWhere=[]
# hasilDokumenWhere=[]

# hasilSearchWhere=cari_dokpertama(kueriAsliWhere)
# keywordYakeWhere=keyword_yake(hasilSearchWhere)
# keywordtfidf2Where=keyword_tfidf(hasilSearchWhere)
# keywordbertWhere=keyword_bert (hasilSearchWhere)
# keywordBOW_Where=keyword_BOW(bow_where_text, kueriAsliWhere)

# for i in keywordYakeWhere:
#     keywordGabungWhere.append(i)
# for i in keywordtfidf2Where:
#     keywordGabungWhere.append(i)
# for i in keywordbertWhere:
#     keywordGabungWhere.append(i)
# for i in keywordBOW_Where:
#     keywordGabungWhere.append(i)
# hasilrankWhere=rangking(keywordGabungWhere,kueriAsliWhere)
# # print(hasilrank)
# for i in hasilrankWhere:
#     kueriFixWhere.append(i)
# for j in kueriFixWhere:
#     hasilkandidatWhere.append(j)
# kueriFixWhere=[preprocessing(i) for i in kueriFixWhere]
# for i in kueriFixWhere:
#     for j in i:
#         kandidatFixWhere.append(j)
# kandidatFixWhere= [" ".join (kandidatFixWhere)]
# print ('*'*120)


In [None]:
# #kueri who
# kueriAsliWho='siapa pelaku kejadian ini'
# kueriWho=preprocessing(kueriAsliWho)
# kueriWho= [" ".join (kueriWho)]
# print (kueriWho)
# hasilkandidatWho=[]
# keywordGabungWho=[]
# kandidatFixWho=[]
# kueriFixWho=[]
# hasilDokumenWho=[]

# hasilSearchWho=cari_dokpertama(kueriAsliWho)
# keywordYakeWho=keyword_yake(hasilSearchWho)
# keywordtfidf2Who=keyword_tfidf(hasilSearchWho)
# keywordbertWho=keyword_bert (hasilSearchWho)
# keywordBOW_Who=keyword_BOW(bow_who_text, kueriAsliWho)

# for i in keywordYakeWho:
#     keywordGabungWho.append(i)
# for i in keywordtfidf2Who:
#     keywordGabungWho.append(i)
# for i in keywordbertWho:
#     keywordGabungWho.append(i)
# for i in keywordBOW_Who:
#     keywordGabungWho.append(i)
# hasilrankWho=rangking(keywordGabungWho,kueriAsliWho)
# # print(hasilrank)
# for i in hasilrankWho:
#     kueriFixWho.append(i)
# for j in kueriFixWho:
#     hasilkandidatWho.append(j)
# kueriFixWho=[preprocessing(i) for i in kueriFixWho]
# for i in kueriFixWho:
#     for j in i:
#         kandidatFixWho.append(j)
# kandidatFixWho= [" ".join (kandidatFixWho)]
# print ('*'*120)


In [None]:
# #kueri when
# kueriAsliWhen='Kapan waktu kejadian tersebut'
# kueriWhen=preprocessing(kueriAsliWhen)
# kueriWhen= [" ".join (kueriWhen)]
# print (kueriWhen)
# hasilkandidatWhen=[]
# keywordGabungWhen=[]
# kandidatFixWhen=[]
# kueriFixWhen=[]
# hasilDokumenWhen=[]

# hasilSearchWhen=cari_dokpertama(kueriAsliWhen)
# keywordYakeWhen=keyword_yake(hasilSearchWhen)
# keywordtfidf2When=keyword_tfidf(hasilSearchWhen)
# keywordbertWhen=keyword_bert (hasilSearchWhen)
# keywordBOW_When=keyword_BOW(bow_when_text, kueriAsliWhen)

# for i in keywordYakeWhen:
#     keywordGabungWhen.append(i)
# for i in keywordtfidf2When:
#     keywordGabungWhen.append(i)
# for i in keywordbertWhen:
#     keywordGabungWhen.append(i)
# for i in keywordBOW_When:
#     keywordGabungWhen.append(i)
# hasilrankWhen=rangking(keywordGabungWhen,kueriAsliWhen)
# # print(hasilrank)
# for i in hasilrankWhen:
#     kueriFixWhen.append(i)
# for j in kueriFixWhen:
#     hasilkandidatWhen.append(j)
# kueriFixWhen=[preprocessing(i) for i in kueriFixWhen]
# for i in kueriFixWhen:
#     for j in i:
#         kandidatFixWhen.append(j)
# kandidatFixWhen= [" ".join (kandidatFixWhen)]
# print ('*'*120)

In [None]:
j=1

testing_data = []
for i in range(0, len(document_text_test)-1):
    
    hasilWhat=[]

    teks=df_test.iloc[i,-2]
    tfidf_vectorizer = joblib.load('vectorizer.pkl')
    # tfidf_matrix = joblib.load('tfidf_test.pkl')
    tfidf_matrix = tfidf_vectorizer.fit_transform([teks])

    query_vec_What= tfidf_vectorizer.transform(kandidatFixWhat)
    results_what=cosine_similarity(tfidf_matrix, query_vec_What).reshape((-1))
    hasilDokumenWhat.append(df_test.iloc[i,2])
    for a in hasilrankWhat:
        cariW = re.findall(a,hasilDokumenWhat[i])
        #print(cariW)
        if cariW:
            hasilWhat.append(a)
    
    data = [i,df_test.iloc[i,2],'what',kueriAsliWhat,keywordBOW_What , keywordGabungWhat, hasilWhat, results_what,' ',' ']

    testing_data.append(data)


for i in range(0, len(document_text_test)-1):

    hasilWhere=[]

    query_vec_Where= tfidf_vectorizer.transform(kandidatFixWhere)
    results_where=cosine_similarity(tfidf_matrix, query_vec_Where).reshape((-1))
    hasilDokumenWhere.append(df_test.iloc[i,2])
    for a in hasilrankWhere:
        cariW = re.findall(a,hasilDokumenWhere[i])
        #print(cariW)
        if cariW:
            hasilWhere.append(a)
    
    data = [i,df_test.iloc[i,2],'where',kueriAsliWhere,keywordBOW_Where , keywordGabungWhere, hasilWhere, results_where,' ',' ']

    testing_data.append(data)


for i in range(0, len(document_text_test)-1):

    hasilWho=[]

    query_vec_Who= tfidf_vectorizer.transform(kandidatFixWho)
    results_who=cosine_similarity(tfidf_matrix, query_vec_Who).reshape((-1))
    hasilDokumenWho.append(df_test.iloc[i,2])
    for a in hasilrankWho:
        cariW = re.findall(a,hasilDokumenWho[i])
        #print(cariW)
        if cariW:
            hasilWho.append(a)
    
    data = [i,df_test.iloc[i,2],'who',kueriAsliWho,keywordBOW_Who , keywordGabungWho, hasilWho, results_who,' ',' ']

    testing_data.append(data)

for i in range(0, len(document_text_test)-1):

    hasilWhen=[]

    query_vec_When= tfidf_vectorizer.transform(kandidatFixWhen)
    results_when=cosine_similarity(tfidf_matrix, query_vec_When).reshape((-1))
    hasilDokumenWhen.append(df_test.iloc[i,2])
    for a in hasilrankWhen:
        cariW = re.findall(a,hasilDokumenWhen[i])
        #print(cariW)
        if cariW:
            hasilWhen.append(a)
    
    data = [i,df_test.iloc[i,2],'when',kueriAsliWhen,keywordBOW_When , keywordGabungWhen, hasilWhen, results_when,' ',' ']

    testing_data.append(data)
    
    # print (j)
    # print("No ID Dokumen  : ", i)
    # print("Tanggal        : ", df_test.iloc[i,1])
    # print("Isi berita     : ", df_test.iloc[i,2])
    # print ("Hasil W       : ",hasilW)
    # print("(Kemiripan: %.4f) " % results)
    # print ('*'*120)
    j+=1

In [None]:
#write to csv
writer = pd.DataFrame(testing_data, columns=['No Document','Description', 'W','Keyword W', 'Keyword BOW', 'Keyword Gabung','hasilW', 'Kemiripan', 'True Positif', 'True Negative'])
writer.to_csv('QE_Stat_V2_testing_result.csv', index=False, sep=',')

In [None]:
# kueriAsli='kecelakaan'
# kueri=preprocessing(kueriAsli)
# kueri= [" ".join (kueri)]
# print (kueri)
# hasilkandidat=[]
# keywordGabung=[]
# kandidatFix=[]
# kueriFix=[]
# hasilDokumen=[]
# hasilSearch=cari_dokpertama(kueriAsli)
# keywordYake=keyword_yake(hasilSearch)
# keywordtfidf2=keyword_tfidf(hasilSearch)
# keywordbert=keyword_bert (hasilSearch)
# keywordBOW=keyword_BOW(bow_text, kueriAsli)
# for i in keywordYake:
#     keywordGabung.append(i)
# for i in keywordtfidf2:
#     keywordGabung.append(i)
# for i in keywordbert:
#     keywordGabung.append(i)
# for i in keywordBOW:
#     keywordGabung.append(i)
# hasilrank=rangking(keywordGabung,kueriAsli)
# for i in hasilrank:
#     kueriFix.append(i)
# for j in kueriFix:
#     hasilkandidat.append(j)
# kueriFix=[preprocessing(i) for i in kueriFix]
# for i in kueriFix:
#     for j in i:
#         kandidatFix.append(j)
# kandidatFix= [" ".join (kandidatFix)]
# print ('*'*120)
# tfidf_matrix =joblib.load( "tfidf_test.pkl" )
# tfidf_vectorizer = joblib.load( "vectorizer.pkl" ) 
# query_vec= tfidf_vectorizer.transform(kandidatFix)
# results=cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
# j=1
# for i in results.argsort()[-10:][::-1]:
#     print (j)
#     print("No ID Dokumen  : ", i)
#     print("Tanggal        : ", df_test.iloc[i,1])
#     print("Isi berita     : ", df_test.iloc[i,2])
#     print("(Kemiripan: %.4f) " % results[i])
#     hasilDokumen.append(df_test.iloc[i,2])
#     print ('*'*120)
#     j+=1