In [8]:
# Import library-library
import os
import glob
from tqdm import tqdm
import numpy as np

# Data Preparation and Preprocessing
import pandas as pd
import re
from string import digits

# Word Embedding
import joblib
from keybert import KeyBERT
kw_extractor = KeyBERT('distilbert-base-nli-mean-tokens')
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer('xlm-r-distilroberta-base-paraphrase-v1')

# Input and Expansion Query
import nltk
nltk.download('punkt')
import math
from textblob import TextBlob
from yake import KeywordExtractor
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk import tokenize
from operator import itemgetter
from nltk.tokenize import word_tokenize
from textblob import TextBlob
NLTK_StopWords = stopwords.words('indonesian')

KeyboardInterrupt: 

In [2]:
def preprocessing(berita):
    s = berita.lower()
    s = s.replace('\n', ' ')
    s = s.replace('\r', ' ')
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    tokens = [token for token in s.split(" ") if token != ""]
    T = [t for t in tokens if (((t.lower() == "tempat") or (t.lower() == "waktu") or (t.lower() == "hari")) or (t not in NLTK_StopWords))]
    return T

In [3]:
kta="olahraga fengan sukamu"
print(preprocessing(kta))

['olahraga', 'fengan', 'sukamu']


In [4]:

df_test = pd.read_csv('df_test.csv')
df_test = df_test[pd.notnull(df_test['description'])]
print(df_test.info())
print ('-'*90)
document_text_test= joblib.load('document_text_test.pkl')
print(len(document_text_test))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 158 entries, 0 to 157
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        158 non-null    object
 1   date         158 non-null    object
 2   description  158 non-null    object
 3   source       158 non-null    object
dtypes: object(4)
memory usage: 6.2+ KB
None
------------------------------------------------------------------------------------------
158


In [5]:
df_train = pd.read_csv('df_train.csv')
df_train = df_train[pd.notnull(df_train['description'])]
print(df_train.info())
print ('-'*90)
document_text_train= joblib.load('document_text_train.pkl')
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1418 entries, 0 to 1418
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1418 non-null   object
 1   date         1418 non-null   object
 2   description  1418 non-null   object
 3   source       1418 non-null   object
dtypes: object(4)
memory usage: 55.4+ KB
None
------------------------------------------------------------------------------------------
1419


In [3]:
def bow_read(bow):
    bow_read = pd.read_csv(bow)
    bow_text = []

    for i in range(0,bow_read.shape[0]):
        if(bow_read.iloc[i,1] not in bow_text and bow_read.iloc[i,1]!=0):
            bow_text.append(bow_read.iloc[i,1])
        
    return(bow_read,bow_text)

In [7]:
def cari_dokpertama(kueriAsli):
    kueriPre=preprocessing(kueriAsli)
    kueriPre= " ".join (kueriPre)
    hasilSearch=[]
    tfidf_train = joblib.load('tfidf_train.pkl')
    tfidf_vectorizer = joblib.load('vectorizer.pkl')
    query_vec= tfidf_vectorizer.transform([kueriPre])
    # print('queryvec')
    print(query_vec)
    results=cosine_similarity(tfidf_train, query_vec).reshape((-1))
    # print(results)
    for i in results.argsort()[-5:][::-1]:
        hasilSearch.append(df_train.iloc[i,-2])
    hasilSearch=". ".join(hasilSearch)
    return hasilSearch



In [47]:
##Keywords Extraction with YAKE
def keyword_yake(hasilSearch):
    # print(hasilSearch)
    keywordYake=[]

    k_extractor = KeywordExtractor(lan="id", n=1, top=50)
    # print(k_extractor)
    k_extractor2 = KeywordExtractor(lan="id", n=2, top=50)
    # print(k_extractor2)
    keywords = k_extractor.extract_keywords(text=hasilSearch)
    # print("pertama : ",keywords)
    keywords = k_extractor2.extract_keywords(text=hasilSearch)
    # print("kedua : ",keywords)
    keywordYake = [x for x, y in keywords]
    # print(keywordYake)
    #keywordYake.append(keywords)
    # print (keywordYake)
    return keywordYake
#print("Keywords of article\n", keywords)

In [46]:
#Keywords Extraction with TFIDF
def keyword_tfidf(hasilSearch):

    keywordtfidf=[]
    keywordtfidf2=[]

    #doc = 'بَاب فرض الْوضُوء وسننه وهيآته وَفرض الْوضُوء سِتّ خِصَال النِّيَّة عمند غسل الْوَجْه وَغسل الْوَجْه وَغسل الذراعين مَعَ الْمرْفقين وَمسح مَا قل من الرَّأْس وَغسل الرجلَيْن مَعَ الْكَعْبَيْنِ وَالتَّرْتِيب وعَلى قَول الْوَلَاء وسننه عشر خِصَال خمس مِنْهَا قبل غسل الْوَجْه وَهِي التَّسْمِيَة وَغسل الْكَفَّيْنِ والمضمضة وَالِاسْتِنْشَاق وَالْمُبَالغَة فيههما إِلَّا للصَّائِم وَخمْس بعد غسل الْوَجْه وَهِي تَقْدِيم الْيُمْنَى على ليسرى وَمسح جَمِيع الرَّأْس وَمسح الْأُذُنَيْنِ ظاهرهما وباطنهما وَإِدْخَال الأصبعين فيهمَا وتخليل أَصَابِع الرجلَيْن . وَغسل دَاخل الْكَعْبَيْنِ وَلَيْسَ مسح لعنق من سنَنه وفضيلته تكراره ثَلَاثًا وزالواجب فِيهِ مرّة والمرتان أفضل وَالثَّلَاث أكمل وهيآته أَن يبْدَأ فِي تَطْهِير الْأَعْضَاء بمواضع الِابْتِدَاء . فَإِن اقْتصر على فروضه استة أَجزَأَهُ وَإِن ضيع حَظّ نَفسه فِيمَا ترك'
    total_words = re.sub(r'[^\w]', ' ', hasilSearch)
    total_words = total_words.lower().split()
    #print (total_words)
    total_word_length = len(total_words)
    total_sentences = tokenize.sent_tokenize(hasilSearch)
    total_sent_len = len(total_sentences)

    tf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in tf_score:
                tf_score[each_word] += 1
            else:
                tf_score[each_word] = 1

    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())
    #print(tf_score)
    def check_sent(word, sentences): 
        final = [all([w in x for w in word]) for x in sentences] 
        sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
        return int(len(sent_len))

    idf_score = {}
    for each_word in total_words:
        #print (each_word)
        each_word = each_word.replace('.','')
        if each_word not in NLTK_StopWords:
            if each_word in idf_score:
                idf_score[each_word] = check_sent(each_word, total_sentences)
            else:
                idf_score[each_word] = 1

    # Performing a log and divide
    idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())

    #print(idf_score)
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    #print(tf_idf_score)
    def get_top_n(dict_elem, n):
        result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n]) 
        hasil =list(result.keys())
        #print(list(result.keys()))        
        return hasil
    #print(get_top_n(tf_idf_score, 25))
    #print(len(get_top_n(tf_idf_score, 1)))
    keywordtfidf.append(get_top_n(tf_idf_score, 35))
    for i in range(len(keywordtfidf)):
        #print (i)
        totalKw=0
        totalKw=len(keywordtfidf[i])
        for j in range(totalKw):
            #print (j)
            keywordtfidf2.append(keywordtfidf[i][j])
    # print (keywordtfidf2)
    return keywordtfidf2

In [45]:
#Keywords Extraction with BERT
def keyword_bert(hasilSearch):
    # print(hasilSearch)

    keywordbert=[]

    #for j in range(len(array_text)):
    keyword1 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 1))
    # print(keyword1)
    keyword2 = kw_extractor.extract_keywords(hasilSearch, top_n=50, keyphrase_ngram_range=(1, 2))
    # print(keyword2)
    # keyword3 = kw_extractor.extract_keywords(hasilSearch, top_n=20, keyphrase_ngram_range=(1, 4))
    # print(keyword3)

    #print("Keywords of article\n", keywords)
    for i in range (0,len (keyword1)):
        keywordbert.append(keyword1[i][0])
        keywordbert.append(keyword2[i][0])
    # print (keywordbert)
    # print(len(keywordbert))
    return keywordbert

In [32]:
hasilDokumenWhen=[]
# kueriAsliWhen='hari apa waktu terjadinya'
hasilSearch="polda metro jaya insiden pembacokan duri kosambi cengkareng jakarta barat penembakan perumahan green lake city tangerang rangkaian peristiwa kejadian polisi mengamankan 25 orang john kei markasnya medan satria bekasi kapolda metro jaya irjen nana sudjana timnya pimpinan dirkrimum polda metro jaya kombes tubagus ade hidayat olah tempat kejadian peristiwa tkp pemeriksaan saksi saksi tim khusus satgas antibegal preman tim khusus gabungan polres tangerang kota anggota direktorat reserse kriminal langsung direktur kombes tubagus ade mendiskusikan penyelidikan hasil penyelidikan diduga pelaku penganiayaan pembunuhan perusakan undang undang darurat uu nomor 12 1951 irjen nana jumpa pers mapolda metro jaya jakarta senin 22 6 2020 nana peristiwa rangkaian minggu 21 6 kejadian diawali pembacokan mengakibatkan orang tewas ydr alias er pembacokan pertigaan abc duri kosambi cengkareng jakarta barat minggu 21 6 11 30 wib korban er ar berboncengan motor diadang pelaku turun mobil diduga kelompok john kei 5 7 orang kelompok nus kei wilayah kosambi cengkareng jakarta barat menyebabkan 1 orang meninggal dunia nama er bersangkutan meninggal luka bacok tempat irjen nana korban berinisial ar mengalami luka bacok orang putus jari tangannya 4 jari tangan putus bacokan nama ar"
keywordber=keyword_bert(hasilSearch)

[('medan', 0.5339), ('pembacokan', 0.5201), ('tangerang', 0.5142), ('bersangkutan', 0.5107), ('gabungan', 0.497), ('penembakan', 0.4725), ('cengkareng', 0.4716), ('pembunuhan', 0.4686), ('jakarta', 0.4684), ('mengamankan', 0.4634), ('mengakibatkan', 0.4623), ('penganiayaan', 0.4603), ('pelaku', 0.4602), ('pemeriksaan', 0.4594), ('perumahan', 0.4553), ('langsung', 0.4528), ('kelompok', 0.4503), ('anggota', 0.4492), ('green', 0.4479), ('peristiwa', 0.4478), ('tubagus', 0.4473), ('perusakan', 0.4457), ('penyelidikan', 0.4456), ('bacokan', 0.4372), ('pertigaan', 0.4332), ('berboncengan', 0.4266), ('rangkaian', 0.4236), ('mendiskusikan', 0.4215), ('mengalami', 0.4151), ('dirkrimum', 0.415), ('orang', 0.4148), ('kosambi', 0.414), ('minggu', 0.4102), ('pimpinan', 0.4048), ('lake', 0.4042), ('polda', 0.4037), ('tangannya', 0.4013), ('bacok', 0.4004), ('kriminal', 0.3976), ('diduga', 0.3959), ('korban', 0.3935), ('kapolda', 0.393), ('kombes', 0.3907), ('metro', 0.3899), ('meninggal', 0.3894), (

In [12]:
def rangking (keywordGabung):
    keywordTemp=[]
    keywordFinal=[]

    def borda_sort(lists):
        scores = {}
        for l in lists:
            for idx, elem in enumerate(reversed(l)):
                if not elem in scores:
                    scores[elem] = 0
                scores[elem] += idx
        return sorted(scores.keys(), key=lambda elem: scores[elem], reverse=True)

    keywordTemp.append(borda_sort(keywordGabung))
    print ('kandidat temp',keywordTemp)
    print ('Total Kandidat temp: ',len(keywordTemp[0]))

    if len(keywordTemp[0])>30:
        print ('kurang dari 80')
        for i in range (0,80):
            keywordFinal.append(keywordTemp[0][i])
    elif len(keywordTemp[0])<80:
        print ('lebih dari 80')
        for i in range (0,len(keywordTemp)):
            for j in range (0,len(keywordTemp[0])):
                keywordFinal.append(keywordTemp[0][j])
    print ('Total Kandidat final: ',len(keywordFinal))
    print ('Kandidat final: ',keywordFinal)
    return keywordFinal


In [13]:
def keyword_BOW(keywordBOW, kueriAsli):
    cekDuplicate = []
    kandidatFix = []

    for i in keywordBOW:
        if(i not in cekDuplicate and i!=0):
            cekDuplicate.append(i)

    # queries=[kueriAsli]
    queries=kueriAsli
    query_embeddings = embedder.encode(queries)
    corpus_embeddings4 = embedder.encode(cekDuplicate)
    
    # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
    closest_n = 1000
    for query, query_embedding in zip(queries, query_embeddings):
        distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings4, 'cosine')[0]
        results = zip(range(len(distances)), distances)
        results = sorted(results, key=lambda x: x[1])
        for idx, distance in results[0:closest_n]:
            kandidatFix.append(cekDuplicate[idx])
    return kandidatFix

In [14]:
def kandidatFix(kueriAsli,bow):
    #kueri what
    # kueriAsli='apa sebenarnya kejadian yang terjadi diberita tersebut'
    kueri=preprocessing(kueriAsli)
    kueri= [" ".join (kueri)]
    print (kueri)
    hasilkandidat=[]
    keywordGabung=[]
    kandidatFix=[]
    kueriFix=[]
    kueriFixWithDelimiter=[]
    


    hasilSearch=cari_dokpertama(kueriAsli)
    keywordYake=keyword_yake(hasilSearch)
    keywordtfidf2=keyword_tfidf(hasilSearch)
    keywordbert=keyword_bert (hasilSearch)
    keywordBOW=keyword_BOW(bow, kueri)

    keywordGabung.append(keywordYake)
    keywordGabung.append(keywordtfidf2)
    keywordGabung.append(keywordbert)
    # keywordGabung.append(keywordBOW)
    hasilrank=rangking(keywordGabung)

    for i in hasilrank:
        kueriFix.append(i)
    for x in keywordBOW:
        kueriFix.append(x)
    for j in kueriFix:
        hasilkandidat.append(j)
    kueriFixWithDelimiter=kueriFix
    kueriFix=[preprocessing(i) for i in kueriFix]
    for i in kueriFix:
        for j in i:
            kandidatFix.append(j)
    kandidatFix= [" ".join (kandidatFix)]
    
    print ('*'*120)
    return(kandidatFix,keywordGabung,keywordBOW,hasilrank,kueriFixWithDelimiter)


In [4]:
bow_kecelakaan_read,bow_kecelakaan_text = bow_read('bow_kecelakaan.csv')
bow_where_read,bow_where_text = bow_read('bow_where.csv')
bow_when_read,bow_when_text = bow_read('bow_when.csv')
bow_who_read,bow_who_text = bow_read('bow_who.csv')

In [5]:
print("What     : ", bow_kecelakaan_text[0])
print("Where    : ", bow_where_text[0])
print("When     : ", bow_when_text[0])
print("Who      : ", bow_who_text[0])

What     :  musibah
Where    :  semarang
When     :  waktunya
Who      :  pelakunya


In [6]:
print(len(bow_when_text))
cekDuplicate1 = []
kandidatFix2 = []

for i in bow_kecelakaan_text:
    if(i not in cekDuplicate1 and i!=0):
        cekDuplicate1.append(i)
print(len(cekDuplicate1))

11110
5576


In [43]:
# hasilDokumenWhen=[]
hasilDokumenWho=[]
# hasilDokumenWhere=[]
# hasilDokumenWhat=[]

# kueriAsliWhat='apa sebenarnya kejadian kecelakaan yang terjadi diberita tersebut'
# kueriAsliWhat='musibah apa yang terjadi'
# kueriAsliWhere='daerah lokasi tempat terjadinya kecelakaan'
# kueriAsliWhere='kota kejadian terjadi'
kueriAsliWho='siapa pelaku kejadian ini'
# kueriAsliWhen='hari apa waktu terjadinya'

# kandidatFixWhat,keywordGabungWhat,keywordBOW_What, hasilrankWhat, kueriFixWithDelimiter_What = kandidatFix(kueriAsliWhat, bow_kecelakaan_text)
# kandidatFixWhere,keywordGabungWhere,keywordBOW_Where, hasilrankWhere,kueriFixWithDelimiter_Where = kandidatFix(kueriAsliWhere, bow_where_text)
kandidatFixWho,keywordGabungWho,keywordBOW_Who, hasilrankWho, kueriFixWithDelimiter_Who = kandidatFix(kueriAsliWho, bow_who_text)
# kandidatFixWhen,keywordGabungWhen,keywordBOW_When, hasilrankWhen, kueriFixWithDelimiter_When = kandidatFix(kueriAsliWhen, bow_when_text)


['pelaku kejadian']
  (0, 10695)	0.8756401519728193
  (0, 6678)	0.4829641024476021
['kereta api', 'perlintasan sebidang', 'kereta', 'palang pintu', 'perlintasan', 'api', 'perlintasan kereta', 'sebidang', 'pengguna jalan', 'api perlintasan', 'jalan', 'keselamatan perlintasan', 'angkutan jalan', 'rel kereta', 'sebidang sesuai', 'kecelakaan perlintasan', 'lintas angkutan', 'kecelakaan', 'mobil', 'api jalan', 'palang', 'pintu kereta', 'polres kediri', 'lintas perlintasan', 'mobil dikendarai', 'blitar kediri', 'kediri akp', 'kediri penyelidikan', 'api peristiwa', 'lintas', 'pintu', 'melintas perlintasan', 'dikendarai ongki', 'rusak parah', 'jalan raya', 'api sebidang', 'api rapih', 'api palang', 'tol', 'berbunyi palang', 'kediri kecelakaan', 'sebidang kereta', 'mengalami luka', 'perjalanan kereta', 'munculah kereta', 'ongki agus', 'warga sedati', 'arah barat', 'pengguna', 'kendaraan']
['17', 'kecelakaan', '2009', 'confero', 'bernopol', 'n', '1835', 'ga', 'ditumpangi', 'terseret', '50', 'met

In [51]:
hasilDokumenWho=[]
kueriAsli='apa sebenarnya kejadian kecelakaan yang terjadi diberita tersebut'

kueri=preprocessing(kueriAsli)
kueri= [" ".join (kueri)]
print (kueri)
hasilkandidat=[]
keywordGabung=[]
kandidatFix=[]
kueriFix=[]
kueriFixWithDelimiter=[]



hasilSearch=cari_dokpertama(kueriAsli)
keywordYake=keyword_yake(hasilSearch)
keywordtfidf2=keyword_tfidf(hasilSearch)
keywordbert=keyword_bert (hasilSearch)

print("Yake  : ", keywordYake)
print("TFIDF : ", keywordtfidf2)
print("Bert  : ", keywordYake)
# keywordBOW=keyword_BOW(bow, kueri)

keywordGabung.append(keywordYake)
keywordGabung.append(keywordtfidf2)
keywordGabung.append(keywordbert)

hasilrank=rangking(keywordGabung)
print("Rank  : ", hasilrank)

['kejadian kecelakaan diberita']
  (0, 6678)	0.7510210317790929
  (0, 6606)	0.6602782821094958
Yake  :  ['hati hati', 'mobil minibus', 'melenceng kiri', 'pas belok', 'belok tekor', 'warga berusaha', 'meninggal dunia', 'kereta api', 'kiri menabrak', 'kiri gungun', 'mobil', 'belok melenceng', 'kiri jalan', 'minibus melenceng', 'tekor belok', 'mengangkat mobil', 'mobil mobilnya', 'kendaraannya mobil', 'jalan sangkuriang', 'tekor melenceng', 'pengemudi hati', 'dunia erin', 'hati mengendalikan', 'hati menguasai', 'sangkuriang menyebrang', 'mobilnya pas', 'pas diangkat', 'dibawa warga', 'warga keluarganya', 'mobil melaju', 'citeureup mobil', 'badan mobil', 'menyebut mobil', 'oleng kiri', 'terhimpit coba', 'berusaha mengangkat', 'dunia perawatan', 'perawatan meninggal', 'korban', 'cimahi jumat', 'kota cimahi', 'mobilnya diangkat', 'hati', 'minibus persimpangan', 'susah warga', 'dingin pas', 'tabrakan warga', 'konsentrasi pas', 'polres cimahi', 'cimahi iptu']
TFIDF :  ['cctv', 'video', 'api', 

In [38]:
# j=1

testing_data = []

tfidf_vectorizer = joblib.load('vectorizer.pkl')

#What
# for i in range(0, len(document_text_test)-1):
    
#     hasilWhat=[]

#     teks=df_test.iloc[i,-2]
#     tfidf_matrix = tfidf_vectorizer.fit_transform([teks])

#     query_vec_What= tfidf_vectorizer.transform(kandidatFixWhat)
#     results_what=cosine_similarity(tfidf_matrix, query_vec_What).reshape((-1))
#     hasilDokumenWhat.append(df_test.iloc[i,2])
#     for a in kueriFixWithDelimiter_What:
#         cariW = re.findall(a,hasilDokumenWhat[i])
#         #print(cariW)
#         if cariW:
#             hasilWhat.append(a)
    
#     data = [i,df_test.iloc[i,2],'what',kueriAsliWhat,keywordBOW_What , keywordGabungWhat,kandidatFixWhat, hasilWhat, results_what,' ',' ']

#     testing_data.append(data)

#Where
# for i in range(0, len(document_text_test)-1):

#     hasilWhere=[]
#     truePos = 0

#     teks=df_test.iloc[i,-2]
#     tfidf_matrix = tfidf_vectorizer.fit_transform([teks])

#     query_vec_Where= tfidf_vectorizer.transform(kandidatFixWhere)
#     results_where=cosine_similarity(tfidf_matrix, query_vec_Where).reshape((-1))
#     hasilDokumenWhere.append(df_test.iloc[i,2])
#     for a in kueriFixWithDelimiter_Where:
#         cariW = re.findall(a,hasilDokumenWhere[i])
#         #print(cariW)
#         if cariW:
#             hasilWhere.append(a)
#     # print(hasilWhere)
#     for x in hasilWhere:
#         for y in keywordBOW_Where:
#             if(x == y):
#                 truePos=1
    
#     data = [i,df_test.iloc[i,2],'where',kueriAsliWhere,keywordBOW_Where , keywordGabungWhere, kandidatFixWhere, hasilWhere, results_where,truePos,' ']

#     testing_data.append(data)

#Who
for i in range(0, len(document_text_test)-1):

    hasilWho=[]

    teks=df_test.iloc[i,-2]
    tfidf_matrix = tfidf_vectorizer.fit_transform([teks])

    query_vec_Who= tfidf_vectorizer.transform(kandidatFixWho)
    results_who=cosine_similarity(tfidf_matrix, query_vec_Who).reshape((-1))
    hasilDokumenWho.append(df_test.iloc[i,2])
    for a in kueriFixWithDelimiter_Who:
        cariW = re.findall(a,hasilDokumenWho[i])
        #print(cariW)
        if cariW:
            hasilWho.append(a)
    
    data = [i,df_test.iloc[i,2],'who',kueriAsliWho,keywordBOW_Who , keywordGabungWho, kandidatFixWho, hasilWho, results_who,' ',' ']

    testing_data.append(data)

#When
# for i in range(0, len(document_text_test)-1):

#     hasilWhen=[]

#     teks=df_test.iloc[i,-2]
#     tfidf_matrix = tfidf_vectorizer.fit_transform([teks])

#     query_vec_When= tfidf_vectorizer.transform(kandidatFixWhen)
#     results_when=cosine_similarity(tfidf_matrix, query_vec_When).reshape((-1))
#     hasilDokumenWhen.append(df_test.iloc[i,2])
#     for a in kueriFixWithDelimiter_When:
#         cariW = re.findall(a,hasilDokumenWhen[i])
#         #print(cariW)
#         if cariW:
#             hasilWhen.append(a)
    
#     data = [i,df_test.iloc[i,2],'when',kueriAsliWhen,keywordBOW_When , keywordGabungWhen, kandidatFixWhen, hasilWhen, results_when,' ',' ']

#     testing_data.append(data)


In [39]:
#write to csv
writer = pd.DataFrame(testing_data, columns=['No Document','Description', 'W','Pertanyaan', 'Keyword BOW', 'Keyword Gabung','kandidat fix','hasilW', 'Kemiripan', 'True Positif', 'True Negative'])
writer.to_csv('QE_Stat_testing_who_result.csv', index=False, sep=',')