# Data Preparation

In [50]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Evaluation

In [51]:
# import package for displaying NER results
from spacy import load, displacy
import pandas as pd

In [59]:
# import model
link_to_model = "/content/drive/MyDrive/NER_v2.2/model10"
loaded_model = load(link_to_model)

In [53]:
test_text = """beliau pulang dinas aceh melewati jalan lokasi penembakan kabid humas bandung polda maluku kombes Muhammad Roem ohoirat roem bharaka tewas ditembak jalan mengaku mengkonfirmasi bharaka ditembak berkendara jalan ditembak situlah versi situ berboncengan versi roem roem ditangani pom ad penjelasan terkait kronologi penembakan memeriksa pom ad diberitakan menembak bharaka pratu ri menembak rekannya anggota tni prada prada dilaporkan kritis meninggal kondisi kritis rumah sakit di aceh barat meninggal anggota brimob kapendam xvi pattimura kolonel arh adi prayogo choirul fajar wartawan rabu 163 penembakan anggota tni polri rabu 163 pratu ri menjalani pemeriksaan intensif polisi militer motif penembakan pratu ri di jakarta barat diduga mengalami depresi bandar pasir mandoge indikasinya kondisi depresi akut pemeriksaan psikologis barat kejiwaan didalami diperiksa"""

In [54]:
doc = loaded_model(test_text)
displacy.render(doc, style="ent", jupyter=True)

In [20]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

beliau 0 6 lokasi
aceh 20 24 lokasi
bandung 70 77 lokasi
berkendara 194 204 lokasi
bharaka 363 370 lokasi
aceh 480 484 lokasi
barat 725 730 lokasi
pemeriksaan 810 821 lokasi
psikologis 822 832 lokasi


In [55]:
df_test = pd.read_csv('/content/drive/MyDrive/NER_v2.2/dataset_ner/data/df_test.csv')
df_test.head()

Unnamed: 0,title,date,description,source
0,\n 6 Kios Terminal Lama Bojonegoro lude...,"Jumat, 28 Jan 2022 09:24 WIB",enam kios bekas terminal bojonegoro ludes terb...,www.detik.com
1,\n Pantai Desa Teluk Ruh Rupat Utara Te...,"Sabtu, 05 Des 2020 15:30 WIB",pantai desa teluk rhu dimakan abrasi mengkhawa...,www.detik.com
2,\n Angin Kencang Landa Karanggedang Pur...,"Jumat, 04 Feb 2022 19:45 WIB",bencana angin kencang melanda desa karanggedan...,www.detik.com
3,\n Jejak Kebakaran Hebat yang Menghangu...,"Kamis, 22 Jul 2021 20:30 WIB",kebakaran hutan melanda daerah negara amerika ...,www.detik.com
4,"\n Gempa M 2,8 Guncang Pacitan","Minggu, 23 Jan 2022 10:26 WIB",bumi 28 guncang pacitan gempa pacitan 092807 w...,www.detik.com


In [56]:
lokasi_csv = '/content/drive/MyDrive/NER_v2.2/dataset_ner/lokasi.csv'
df_lokasi = pd.read_csv(lokasi_csv)
df_lokasi.head()

Unnamed: 0,no,daerah,tipe
0,1.0,aceh barat,kabkot
1,2.0,aceh barat daya,kabkot
2,3.0,aceh besar,kabkot
3,4.0,aceh jaya,kabkot
4,5.0,aceh selatan,kabkot


In [57]:
flattened_lokasi = []
for i in range(0, df_lokasi.shape[0]):
  flattened_lokasi.append(df_lokasi.iloc[i, -2])

In [47]:
visited = []

for ent in doc.ents:
  if (ent.text in flattened_lokasi) and (ent.text not in visited):
    print(ent.text)
    visited.append(ent.text)
    
visited.clear()

aceh
bandung


In [60]:
document_results = []
visited = []
for i in range(0, df_test.shape[0]):
  news = df_test.iloc[i, -2]
  doc = loaded_model(news)

  ents = []
  for ent in doc.ents:
    if (ent.text not in visited):
      ents.append(ent.text)
      visited.append(ent.text)

  visited.clear()

  document_results.append([
    ents,
    'where',
    '',
    '',
    '',
    news,
  ])

writer = pd.DataFrame(document_results, columns=['Data','W', 'True Positive', 'True Negative', 'Relevansi Berita', 'Skimming News'])
writer.to_excel('/content/drive/MyDrive/NER_v2.2/dataset_ner/result_where.xlsx', index=False)

In [None]:
# with open('/content/drive/MyDrive/NER_v2.2/ner.csv', 'w') as file:
#     file.writelines('token,entity\n')
for tok in doc.ents:
  print(tok.text, tok.label)
        # file.writelines(f"{tok.text},{tok.label}\n")

beliau 441812806324301794
pulang 441812806324301794
dinas 441812806324301794
melewati 441812806324301794
jalan 441812806324301794
lokasi 441812806324301794
penembakan 441812806324301794
kabid 441812806324301794
humas 441812806324301794
polda 441812806324301794
maluku 441812806324301794
kombes 441812806324301794
Muhammad 441812806324301794
Roem 441812806324301794
ohoirat 441812806324301794
roem 441812806324301794
bharaka 441812806324301794
tewas 441812806324301794
ditembak 441812806324301794
jalan 441812806324301794
mengaku 441812806324301794
mengkonfirmasi 441812806324301794
bharaka 441812806324301794
ditembak 441812806324301794
berkendara 441812806324301794
jalan 441812806324301794
ditembak 441812806324301794
situlah 441812806324301794
versi 441812806324301794
situ 441812806324301794
berboncengan 441812806324301794
versi 441812806324301794
roem 441812806324301794
roem 441812806324301794
ditangani 441812806324301794
pom 441812806324301794
ad 441812806324301794
penjelasan 44181280632430

In [None]:
# PERSON:      People, including fictional.
# NORP:        Nationalities or religious or political groups.
# FAC:         Buildings, airports, highways, bridges, etc.
# ORG:         Companies, agencies, institutions, etc.
# GPE:         Countries, cities, states.
# LOC:         Non-GPE locations, mountain ranges, bodies of water.
# PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
# EVENT:       Named hurricanes, battles, wars, sports events, etc.
# WORK_OF_ART: Titles of books, songs, etc.
# LAW:         Named documents made into laws.
# LANGUAGE:    Any named language.
# DATE:        Absolute or relative dates or periods.
# TIME:        Times smaller than a day.
# PERCENT:     Percentage, including ”%“.
# MONEY:       Monetary values, including unit.
# QUANTITY:    Measurements, as of weight or distance.
# ORDINAL:     “first”, “second”, etc.
# CARDINAL:    Numerals that do not fall under another type.