In [1]:
# Import library-library
import os
import glob
import string

# Data Preparation and Preprocessing
import pandas as pd
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Word Embedding
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

from sklearn.metrics.pairwise import cosine_similarity

# Input and Expansion Query
from textblob import TextBlob
from nltk.tokenize import wordpunct_tokenize
#from googletrans import Translator

#Split to train and test
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/syubbanfakhriya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
excluded_words = ["tempat", "waktu", "gempa", "banjir"]

NLTK_StopWords = stopwords.words('indonesian')
NLTK_StopWords.extend(["detik", "detikjatim", "detikjateng", "detikjabar", "detiksulsel", "detiksumbar", "detikbali", "detikpapua", "detiksulteng", "detikmaluku", "detjatim", "detikcom", "allahumma", "aamiin", "aamiin", "allah", "bismillah"])
NLTK_StopWords.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 
                       'kalo', 'amp', 'biar', 'bikin', 'bilang', 
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya', 
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't', 
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah'])
txt_stopword = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

# convert stopword string to list & append additional stopword
NLTK_StopWords.extend(txt_stopword["stopwords"][0].split(' '))
NLTK_StopWords = set(NLTK_StopWords)

In [3]:
def preprocessing(berita):
    # Preprocessing
    s = berita.lower()
    s = s.replace('\\t'," ").replace('\\n'," ").replace('\\u'," ").replace('\\',"")
    s = s.encode('ascii', 'replace').decode('ascii')
    ' '.join(re.sub("([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)"," ", s).split())
    s.replace("http://", " ").replace("https://", " ")
    s = re.sub('\s+', ' ', s)
    s = s.strip()
    s = s.translate(str.maketrans("","", string.punctuation))
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)
    s = re.sub(r'\b\w{1,1}\b', '', s)
    tokens = [token for token in s.split(" ") if token != ""]

    # Stopwords checking
    T = [t for t in tokens if ((t in excluded_words) or (t not in NLTK_StopWords))]
    return T

In [4]:
df =pd.read_csv("news.csv")
df.info()
# df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1655 entries, 0 to 1654
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1655 non-null   object
 1   date         1655 non-null   object
 2   description  1655 non-null   object
 3   source       1655 non-null   object
dtypes: object(4)
memory usage: 51.8+ KB


In [5]:
column = ['Title', 'Date', 'Description', 'Source']
df_total = pd.DataFrame()

df = pd.read_csv("news.csv")
df = df.dropna(subset=['description'], axis=0)
df.sort_values("description", inplace = True)
df.drop_duplicates(subset ="description", keep = False, inplace = True)

df.head()
df_train_unprocessed, df_test_unprocessed = train_test_split(df,test_size=0.1)

In [6]:
for x in range(0, df.shape[0]):
    text = preprocessing(df.iloc[x, -2])
    df.iloc[x, -2] = (' '.join(text)) 

In [7]:
df.head()
df_train, df_test = train_test_split(df,test_size=0.1)
df_train.head()

Unnamed: 0,title,date,description,source
1295,\n Atap Plafon Mal di Kemang Jaksel Amb...,"Sabtu, 05 Mar 2022 15:33 WIB",kawasan kemang jakarta selatan ambruk peristiw...,www.detik.com
608,"\n Kemarau Datang, Petani di Klaten Cur...","Senin, 13 Jul 2020 13:15 WIB",datangnya musim seminggu air air ponggok kecam...,www.detik.com
735,"\n Bencana Alam, Pandemi dan Urgensi Pe...","Minggu, 24 Jan 2021 16:04 WIB",rentetan bencana alam berulang pandemi covid19...,www.detik.com
1613,\n Gunung Merapi Muntahkan Awan Panas S...,"Minggu, 09 Jan 2022 14:58 WIB",gunung merapi memuntahkan awan panas minggu si...,www.detik.com
1551,\n Melihat Perjuangan Damkar Jinakkan K...,"Kamis, 12 Agu 2021 09:37 WIB",kebakaran lahan desa rawa jaya pemulutan ogan ...,www.detik.com


In [8]:
desc_text_test_unprocessed = []

for x in range(0, df_test_unprocessed.shape[0]):
  desc_text_test_unprocessed.append(df_test_unprocessed.iloc[x, -2])

joblib.dump(desc_text_test_unprocessed, "corpus/model/desc_text_test_unprocessed.pkl")

['corpus/model/desc_text_test_unprocessed.pkl']

In [9]:
desc_text_train = []
desc_text_test = []

for x in range(0, df_train.shape[0]):
  desc_text_train.append(df_train.iloc[x, -2])

for x in range(0, df_test.shape[0]):
  desc_text_test.append(df_test.iloc[x, -2])

In [10]:
# print(df_test.info())
print("Panjang Dokumen Test : ", len(desc_text_test))

print ('-'*90)

# print(df_train.info())
print("Panjang Dokumen Train : ", len(desc_text_train))

Panjang Dokumen Test :  166
------------------------------------------------------------------------------------------
Panjang Dokumen Train :  1489


In [11]:
# Simpan teks asli dokumen
joblib.dump(desc_text_test, "corpus/model/desc_text_test.pkl")
joblib.dump(desc_text_train, "corpus/model/desc_text_train.pkl")

# Simpan dataframe hasil text processing sebelum TF-IDF Vectorizer
df.to_csv('corpus/dataset/df_total.csv', index=False)
df_test.to_csv('corpus/dataset/df_test.csv', index=False)
df_train.to_csv('corpus/dataset/df_train.csv', index=False)


In [12]:
# Fit transform menggunakan TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_test_matrix = tfidf_vectorizer.fit_transform(desc_text_test)
tfidf_train_matrix = tfidf_vectorizer.fit_transform(desc_text_train)

print('tfidf_test_matrix')
print(tfidf_test_matrix.shape)
print('tfidf_train_matrix')
print(tfidf_train_matrix.shape)

tfidf_test_matrix
(166, 4393)
tfidf_train_matrix
(1489, 16943)


In [13]:
joblib.dump(tfidf_vectorizer, "corpus/vectorizer/vectorizer.pkl")
joblib.dump(tfidf_test_matrix, "corpus/matrix/tfidf_test.pkl")
joblib.dump(tfidf_train_matrix, "corpus/matrix/tfidf_train.pkl")

['corpus/matrix/tfidf_train.pkl']

### Pencarian 

In [14]:
df_total = pd.read_csv('corpus/dataset/df_total.csv')
df_total = df_total[pd.notnull(df_total['description'])]
print(df_total.info())
print ('-'*90)

document_text_train= joblib.load('corpus/model/desc_text_train.pkl')
document_text_test= joblib.load('corpus/model/desc_text_test.pkl')
print(len(document_text_test))
print(len(document_text_train))

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1655 entries, 0 to 1654
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        1655 non-null   object
 1   date         1655 non-null   object
 2   description  1655 non-null   object
 3   source       1655 non-null   object
dtypes: object(4)
memory usage: 64.6+ KB
None
------------------------------------------------------------------------------------------
166
1489


In [15]:
kueri='gempa'
kueri=preprocessing(kueri)
kueri= [" ".join (kueri)]
print (kueri)

tfidf_matrix =joblib.load( "corpus/matrix/tfidf_train.pkl" )
tfidf_vectorizer = joblib.load( "corpus/vectorizer/vectorizer.pkl" ) 
query_vec= tfidf_vectorizer.transform(kueri)
results=cosine_similarity(tfidf_matrix, query_vec).reshape((-1))
#print (results)
print("\n======================\n")
print("Top 10 most similar documents in corpus:")

j = 1

for i in results.argsort()[-10:][::-1]:
    print("No ID Dokumen  : ", i)
    print("Tanggal        : ", df_total.iloc[i,1])
    print("Isi berita     : ", df_total.iloc[i,2])
    print("(Score: %.4f) " % results[i])

['gempa']


Top 10 most similar documents in corpus:
No ID Dokumen  :  248
Tanggal        :  Kamis, 27 Mei  2021 10:21 WIB
Isi berita     :  beredar pesan singkat berisi peringatan gempa berkekuatan 85 potensi tsunami badan meteorologi klimatologi geofisika pusat gempa nasionalinatews peringatan kesalahan pengiriman daryono bmkg menginvestigasi investigasi beredar pesan singkat berisi peringatan gempa berkekuatan 85 badan meteorologi klimatologi geofisika bmkg bmkg kesalahan sistem pengiriman pesan peringatan tsunami tangkapan layar pesan singkat berisi peringatan gempa diunggah warganet twitter pesan berisi peringatan tsunami provinsi jawa timur ntb bali ntt jawa gempa berkekuatan 85 magnitudo peringatan tsunami jatim ntb bali ntt jateng gempa mag85 04jun21 101445wib lok1050ls 11480bt kdlmn10kmbmkg bunyi pesan dikirim kominfobmkg pesan terkirim kesalahan sistem pengiriman test maaf kesalahan system pengiriman testperingatan tsunami jatimntbbalinttjatengbmkg kepala pusat gempa bumi tsu