In [215]:
# Import library-library
import os
import glob
import string

# Data Preparation and Preprocessing
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Word Embedding
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

from sklearn.metrics.pairwise import cosine_similarity

# Input and Expansion Query
from textblob import TextBlob
from nltk.tokenize import wordpunct_tokenize
#from googletrans import Translator

#Split to train and test
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/syubbanfakhriya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [216]:
excluded_words = ["tempat", "waktu", "gempa", "banjir"]

NLTK_StopWords = stopwords.words('indonesian')
NLTK_StopWords.extend(["detik", "detikjatim", "detikjateng", "detikjabar", "detiksulsel", "detiksumbar", "detikbali", "detikpapua", "detiksulteng", "detikmaluku", "detjatim", "detikcom", "allahumma", "aamiin", "aamiin", "allah", "bismillah"])
NLTK_StopWords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
NLTK_StopWords.extend(txt_stopword["stopwords"][0].split(' '))
NLTK_StopWords = set(NLTK_StopWords)

In [217]:
def preprocessing(berita):
    # Preprocessing
    s = berita.lower()
    s = s.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    s = s.encode('ascii', 'replace').decode('ascii')
    ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", s).split())
    s.replace("http://", " ").replace("https://", " ")
    s = re.sub('\s+', ' ', s)
    s = s.strip()
    s = s.translate(str.maketrans("","", string.punctuation))
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    s = re.sub(r'\b\w{1,1}\b', '', s)
    tokens = [token for token in s.split(" ") if token != ""]

    # Stopwords checking
    T = [t for t in tokens if ((t in excluded_words) or (t not in NLTK_StopWords))]
    return T

In [218]:
df =pd.read_csv("news.csv")
df.info()
# df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1790 entries, 0 to 1789
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1790 non-null   object
 1   date         1790 non-null   object
 2   description  1790 non-null   object
 3   source       1790 non-null   object
dtypes: object(4)
memory usage: 56.1+ KB


In [219]:
column = ['Title', 'Date', 'Description', 'Source']
df_total = pd.DataFrame()

df = pd.read_csv("news.csv")
df = df.dropna(subset=['description'], axis=0)
df.sort_values("description", inplace = True)
df.drop_duplicates(subset ="description", keep = False, inplace = True)

for x in range(0, df.shape[0]):
    text = preprocessing (df.iloc[x, -2])
    df.iloc[x, -2] = (' '.join(text)) 

In [220]:
df.head()
df_train, df_test = train_test_split(df,test_size=0.1)
df_train.head()

Unnamed: 0,title,date,description,source
1417,"\n Gempa Kembali Guncang Pasaman Barat,...","Sabtu, 26 Feb 2022 00:06 WIB",badan meteorologi klimatologi geofisika bmkg m...,www.detik.com
560,\n 4 Hektare Lahan Gambut di Riau Terba...,"Sabtu, 05 Feb 2022 09:16 WIB",lahan gambut seluas hektare milik masyarakat d...,www.detik.com
1648,"\n Topan Molave Terjang Vietnam, 2 Oran...","Rabu, 28 Okt 2020 15:33 WIB",dilaporkan tewas 26 hilang dilansir media peme...,www.detik.com
223,\n Pemukiman Warga Ludes Terbakar Gegar...,"Rabu, 04 Agu 2021 16:37 WIB",warga memandangi kerusakan rumah kebakaran hut...,www.detik.com
1079,\n Longsor Tebing Anak Bengawan Solo di...,"Kamis, 13 Jan 2022 10:39 WIB",longsor talut tebing sungai babadan desa babad...,www.detik.com


In [221]:
desc_text_train = []
desc_text_test = []

for x in range(0, df_train.shape[0]):
  desc_text_train.append(df_train.iloc[x, -2])

for x in range(0, df_test.shape[0]):
  desc_text_test.append(df_test.iloc[x, -2])

In [222]:
# print(df_test.info())
print("Panjang Dokumen Test : ", len(desc_text_test))

print ('-'*90)

# print(df_train.info())
print("Panjang Dokumen Train : ", len(desc_text_train))

Panjang Dokumen Test :  179
------------------------------------------------------------------------------------------
Panjang Dokumen Train :  1604


In [223]:
# Simpan teks asli dokumen
joblib.dump(desc_text_test, "corpus/model/desc_text_test.pkl")
joblib.dump(desc_text_train, "corpus/model/desc_text_train.pkl")

# Simpan dataframe hasil text processing sebelum TF-IDF Vectorizer
df.to_csv('corpus/dataset/df_total.csv', index=False)
df_test.to_csv('corpus/dataset/df_test.csv', index=False)
df_train.to_csv('corpus/dataset/df_train.csv', index=False)


In [224]:
# Fit transform menggunakan TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_test_matrix = tfidf_vectorizer.fit_transform(desc_text_test)
tfidf_train_matrix = tfidf_vectorizer.fit_transform(desc_text_train)

print('tfidf_test_matrix')
print(tfidf_test_matrix.shape)
print('tfidf_train_matrix')
print(tfidf_train_matrix.shape)

tfidf_test_matrix
(179, 5228)
tfidf_train_matrix
(1604, 18077)


In [225]:
joblib.dump(tfidf_vectorizer, "corpus/vectorizer/vectorizer.pkl")
joblib.dump(tfidf_test_matrix, "corpus/matrix/tfidf_test.pkl")
joblib.dump(tfidf_train_matrix, "corpus/matrix/tfidf_train.pkl")

['corpus/matrix/tfidf_train.pkl']

### Pencarian 

In [226]:
df_total = pd.read_csv('corpus/dataset/df_total.csv')
df_total = df_total[pd.notnull(df_total['description'])]
print(df_total.info())
print ('-'*90)

document_text_train= joblib.load('corpus/model/desc_text_train.pkl')
document_text_test= joblib.load('corpus/model/desc_text_test.pkl')
print(len(document_text_test))
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1783 entries, 0 to 1782
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1783 non-null   object
 1   date         1783 non-null   object
 2   description  1783 non-null   object
 3   source       1783 non-null   object
dtypes: object(4)
memory usage: 69.6+ KB
None
------------------------------------------------------------------------------------------
179
1604


In [227]:
kueri='gempa'
kueri=preprocessing(kueri)
kueri= [" ".join (kueri)]
print (kueri)

tfidf_matrix =joblib.load( "corpus/matrix/tfidf_train.pkl" )
tfidf_vectorizer = joblib.load( "corpus/vectorizer/vectorizer.pkl" ) 
query_vec= tfidf_vectorizer.transform(kueri)
results=cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
#print (results)
print("\n======================\n")
print("Top 10 most similar documents in corpus:")

j = 1

for i in results.argsort()[-10:][::-1]:
    print("No ID Dokumen  : ", i)
    print("Tanggal        : ", df_total.iloc[i,1])
    print("Isi berita     : ", df_total.iloc[i,2])
    print("(Score: %.4f) " % results[i])

['gempa']


Top 10 most similar documents in corpus:
No ID Dokumen  :  1424
Tanggal        :  Selasa, 22 Feb 2022 16:34 WIB
Isi berita     :  kebakaran melanda pondok pesantren miftahul khoirot karawang jawa barat senin 212 kemarin kondisi terkini pesantren terbakar foto udara kondisi kamar santri mengalami kebakaran polisi menyebut kebakaran korsleting kipas angin gedung lantai akibat peristiwa maut delapan santri dikabarkan meninggal dunia
(Score: 0.6169) 
No ID Dokumen  :  1175
Tanggal        :  Kamis, 18 Mar 2021 16:08 WIB
Isi berita     :  pasien dirawat rumah sakit jiwa rsj info pasien rsj diduga personel polri informasi beredar pesan grup whatsapp personel polri kabid humas polda aceh kombes winardy wartawan kamis 1832021 informasi personel polda winardy menyebut penelusuran polisi zainal dirawat rsj mengantar warga menerimanya dibawa rsj banda aceh winardy polisi asep winardy asep personel bko resimen kedung halang bogor konflik aceh berkecamuk dinyatakan hilang gempa bumi tsun