In [40]:
import os
import json
import re
import string
import pandas as pd
import numpy as np
import ast

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def load_data(dataPath):
    """Read the CSV file at ``dataPath`` into a DataFrame.

    The file is decoded as latin1.

    Args:
        dataPath: path (or any source accepted by ``pd.read_csv``) of the CSV.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(dataPath, encoding="latin1")


def preprocess_data(data):
    """Split a token-level NER table into per-sentence word and tag lists.

    Missing cells are forward-filled (so a "Sentence #" value that is only
    present on the first token of a sentence is propagated to the following
    rows), the "Word" column is lower-cased, and the rows are grouped by
    "Sentence #". The number of sentences is printed.

    Args:
        data: DataFrame with the columns "Sentence #", "Word" and "Tag".
            The caller's frame is not modified.

    Returns:
        ``(sentences, ner_tags)``: two parallel lists with one entry per
        sentence; ``sentences[k]`` is the list of words and ``ner_tags[k]``
        the list of tags of the k-th group (groups come in sorted key order).
    """
    # DataFrame.ffill() replaces the deprecated fillna(method="ffill") and
    # returns a new frame, so the column assignment below stays local.
    data = data.ffill()
    data['Word'] = data['Word'].str.lower()

    sentences, ner_tags = [], []
    # Iterating the groupby yields the groups in the same (sorted) key order
    # that groupby(...).apply(...) produced, with rows in original order.
    for _, sentence_rows in data.groupby("Sentence #"):
        sentences.append(sentence_rows["Word"].tolist())  # words only
        ner_tags.append(sentence_rows["Tag"].tolist())  # tags only

    print("전체 샘플 개수: {}".format(len(sentences)))
    return sentences, ner_tags


def tozenizer(sentences, ner_tags):
    """Fit one Keras ``Tokenizer`` on the words and another on the NER tags.

    Args:
        sentences: tokenised sentences handed to ``fit_on_texts``.
        ner_tags: tag sequences handed to ``fit_on_texts``.

    Returns:
        ``(word_tokenizer, tag_tokenizer)``, both already fitted.
    """
    # Every word is kept, but index 1 is assigned to the word 'OOV'.
    word_tokenizer = Tokenizer(oov_token='OOV')
    # Tags are stored internally with their upper case preserved.
    tag_tokenizer = Tokenizer(lower=False)

    word_tokenizer.fit_on_texts(sentences)
    tag_tokenizer.fit_on_texts(ner_tags)
    return word_tokenizer, tag_tokenizer

In [261]:
def process_ner(sentence, bio, begin_labels=None):
    """Collect runs of entity-labelled tokens as ``(phrase, label)`` pairs.

    A token belongs to an entity when its label is neither 0 nor 1. Adjacent
    entity tokens are joined with single spaces into one phrase, and the
    phrase carries the label of its first token.

    Args:
        sentence: sequence of tokens.
        bio: sequence of label ids, paired with ``sentence`` position by
            position; the longer of the two is truncated (``zip``).
        begin_labels: optional collection of label ids that mark the first
            token of an entity. When given, such a label always starts a new
            phrase, so two entities that touch each other are reported
            separately. When omitted (default), a whole run of entity labels
            is merged into one phrase regardless of the labels inside it.

    Returns:
        List of ``(phrase, label)`` tuples in order of appearance.
    """
    starts = frozenset() if begin_labels is None else frozenset(begin_labels)

    ner_list = []
    words, entity = [], None  # phrase being collected and its label
    for w, pred in zip(sentence, bio):
        is_entity = pred != 0 and pred != 1
        # Close the open phrase when the run ends or a new entity begins.
        if words and (not is_entity or pred in starts):
            ner_list.append((" ".join(words), entity))
            words = []
        if is_entity:
            if not words:
                entity = pred  # the first token decides the label
            words.append(w)

    if words:  # run that reaches the end of the sentence
        ner_list.append((" ".join(words), entity))

    return ner_list

In [346]:
# CSV corpus passed to load_data() to rebuild the tokenizers.
nerDataPath = 'Data/entity-annotated-corpus/ner_dataset.csv'
# Directory listed with os.listdir() to find the news JSON files
# (keeps its trailing slash because file names are appended by string concatenation).
jsonDataPath = 'Data/news/'
# CSV read into ner_df: one row per sentence with label-id columns, an 'index' column and the token list.
resultDataPath = 'Data/temp7.csv'

In [43]:
# Rebuild the vocabularies from the NER corpus (needed to obtain word_to_index).
nerData = load_data(nerDataPath)
sentences, ner_tags = preprocess_data(nerData)

src_tokenizer, tar_tokenizer = tozenizer(sentences, ner_tags)
word_to_index = src_tokenizer.word_index
index_to_ner = tar_tokenizer.index_word
# Register id 0 as 'PAD'. This writes into the tokenizer's own index_word dict.
# NOTE(review): presumably 0 is the padding id left unassigned by the tokenizer — confirm.
index_to_ner[0]='PAD'

전체 샘플 개수: 47959


In [347]:
ner_df = pd.read_csv(resultDataPath)

In [348]:
ner_df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,62,63,64,65,66,67,68,69,index,sentence
0,0,1,8,1,1,2,6,5,1,2,...,0,0,0,0,0,0,0,0,0,"['south', 'korean', 'ambassador', 'to', 'china..."
1,1,1,1,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,"['to', 'better', 'cope', 'with', 'the', 'growi..."
2,2,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['arguing', 'that', 'the', 'u.s.', 'missile-de..."
3,3,1,1,1,2,9,1,4,7,1,...,0,0,0,0,0,0,0,0,0,"['in', 'return', ',', 'south', 'korea', ""'s"", ..."
4,4,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['the', 'fundamental', 'reason', 'why', 'debat..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46750,46750,1,1,1,1,1,1,2,9,1,...,0,0,0,0,0,0,0,0,2769,"['still', ',', 'it', 'remains', 'unclear', 'wh..."
46751,46751,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2769,"['she', 'also', 'expressed', 'hope', 'that', '..."
46752,46752,2,9,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2769,"['south', 'korea', ""'s"", 'military', 'came', '..."
46753,46753,1,3,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2769,"['in', 'june', ',', 'an', 'army', 'sergeant', ..."


In [349]:
ner_df2 = pd.DataFrame()

In [350]:
# Build ner_df2: one row per sentence of ner_df, one column per entity
# category, each cell holding the comma-joined entity phrases of that category.

# Column order of the resulting table.
ner_columns = ['Geographical Entity', 'Organization', 'Person', 'Geopolitical Entity',
               'Time indicator', 'Artifact', 'Event', 'Natural Phenomenon']
# Label id of an entity's first token -> column the phrase is filed under.
# Phrases whose label is not listed here are dropped.
label_to_column = {2: 'Geographical Entity', 3: 'Time indicator', 4: 'Organization',
                   6: 'Person', 8: 'Geopolitical Entity', 11: 'Artifact',
                   12: 'Event', 15: 'Natural Phenomenon'}

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end: DataFrame.append inside the loop copied the whole frame on every
# iteration and no longer exists in pandas >= 2.0.
ner_rows = []
for i in range(ner_df.shape[0]):
    if i % 500 == 0:
        print(i)  # progress indicator

    row = ner_df.iloc[i]

    # Positions 1..70 of the row are taken as the label ids of the sentence.
    bio = list(row.iloc[1:71])
    # The token list is stored as the text of a Python list literal.
    sentence = ast.literal_eval(row['sentence'])

    ner_dict = {column: [] for column in ner_columns}
    for word, label in process_ner(sentence, bio):
        column = label_to_column.get(label)
        if column is not None:
            ner_dict[column].append(word)

    ner_rows.append([','.join(ner_dict[column]) for column in ner_columns])

ner_df2 = pd.DataFrame(data=ner_rows, columns=ner_columns)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500


In [351]:
i

46754

In [352]:
# Copy the 'index' column of ner_df onto the entity table (presumably the id of the
# article each sentence came from — confirm). The assignment aligns the two frames
# on their row labels, so both must share the same row index.
ner_df2['index'] = ner_df['index']

In [353]:
ner_df2

Unnamed: 0,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon,index
0,"china kim jang-soo,beijing,u.s.,south korea,no...",,,korean,tuesday,,,,0
1,"north korea,south korea",,u.s.officials,korean,,,,,0
2,"u.s.,china,beijing,seoul",,,,,,,,0
3,"south korea,china,seoul",defense ministry,,,,,,,0
4,"thaad,south korea,north korea,hong kong-based ...",,kim,,,,,,0
...,...,...,...,...,...,...,...,...,...
46750,"north korea,south korea",,,,,,,,2769
46751,,,,,,,,,2769
46752,south korea,,,,barracks,,,,2769
46753,north korea,,,,june,,,,2769


In [354]:
cols = ner_df2.iloc[:, 0:-1].columns

In [355]:
cols

Index(['Geographical Entity', 'Organization', 'Person', 'Geopolitical Entity',
       'Time indicator', 'Artifact', 'Event', 'Natural Phenomenon'],
      dtype='object')

In [356]:
# Collapse the per-sentence rows of ner_df2 into one row per 'index' value:
# for every entity column the non-empty cells of a group are comma-joined,
# keeping the original row order.

# A single groupby pass replaces filtering the whole frame once per id.
rows_by_index = {}
for key, group in ner_df2.groupby('index', sort=False):
    rows_by_index[key] = [
        ','.join(value for value in group[col] if value != '')
        for col in cols
    ]

# One output row for every id from 0 up to the 'index' of the last row of
# ner_df; ids without any sentence get empty strings in every column.
data = []
for i in range(ner_df.iloc[-1]['index'] + 1):
    data.append(rows_by_index.get(i, [''] * len(cols)))

In [357]:
result = pd.DataFrame(data=data, columns=list(cols))

In [358]:
# Expose the row label of `result` as an explicit 'index' column.
result['index'] = result.index

In [359]:
result

Unnamed: 0,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon,index
0,"china kim jang-soo,beijing,u.s.,south korea,no...","defense ministry,national security council","u.s.officials,kim,kim,han min-koo,kim,cheong w...","korean,korean,korean,american,chinese,chinese","tuesday,october,saturday",,,,0
1,south korea,"park,park",president park geun-hye,koreans,"tuesday,may",,,,1
2,"korea national,corp,seoul,seoul central,of korea","bank merrill lynch,state audit agency","president kang young-won,posco,knoc,lee myung-bak",canadian,"tuesday,2009",,,,2
3,"south korea,north korea,north korea since apri...","new york university,cnn,national intelligence ...","joo won-moon,joo,kim jung-wook,joo,kim,kim kuk...","koreans,chinese,korean,koreans,koreans","tuesday,since october 2013",,,,3
4,"south korea,tanzania,seoul,south korea,tanzani...","tanzanian embassy,ministry of foreign affairs,...","kwon hee-seog,mbelwa kairuki","african,korean,tanzanian","tuesday,1992,1992,three",,,,4
...,...,...,...,...,...,...,...,...,...
2765,"north korean,2015,korea,korea,japan,north kore...","seoul national university institute,university...","kim jong-un,kim,kim,kim,kim,kim,chang yong-seo...","korean president park geun-hye,korean,korean","new,thursday,70th,1910,minister-level,january,...",,,,2765
2766,"seoul,mount kumgang,reunion,south korea,japan,...",pyongyang,"kim jong-un,(yonhap)kim,kim echoed park,kim,ki...","korean,korean president park geun-hye,korean,k...","thursday,new,koreas,january,inter-korean ties,...",,,,2766
2767,"united states,north korea,china,beijing,north,...","us,us,us,sino-american cooperation","christopher hill,hill,'no surprises,hill","korean,american,chinese,korean,chinese,chinese...",wednesday,,,,2767
2768,"united states,china,iran,iran,north korea,syri...","state department,state department",,"korean,korean,korean,russian,sudanese,venezuelan",tuesday,,,,2768


In [165]:
# os.listdir() returns the entries in arbitrary order; sort them so that
# positional lookups such as datapaths[7] pick the same file on every run.
datapaths = sorted(os.listdir(jsonDataPath))

In [166]:
datapaths

['koreaherald_1517_0.json',
 'koreaherald_1517_1.json',
 'koreaherald_1517_2.json',
 'koreaherald_1517_3.json',
 'koreaherald_1517_4.json',
 'koreaherald_1517_5.json',
 'koreaherald_1517_6.json',
 'koreaherald_1517_7.json']

In [360]:
# Load one news JSON file (the entry at position 7 of datapaths) into a DataFrame.
# NOTE(review): the hard-coded 7 looks like it should match the '7' in
# resultDataPath ('Data/temp7.csv') — confirm both refer to the same batch.
with open(jsonDataPath + datapaths[7], 'r') as f:
    data = json.load(f)

    dataframe = pd.DataFrame.from_dict(data)

In [361]:
dataframe

Unnamed: 0,title,author,time,description,body,section
0,S. Korean envoy dismisses China's concern over...,KH디지털2,2015-05-12 17:01:00,South Korean Ambassador to China Kim Jang-soo ...,South Korean Ambassador to China Kim Jang-soo ...,International
1,Park renews calls for civil service pension re...,KH디지털2,2015-05-12 12:04:00,President Park Geun-hye called Tuesday for the...,President Park Geun-hye called Tuesday for the...,Politics
2,State oil firm raided over shady energy projects,KH디지털2,2015-05-12 12:03:00,Prosecutors raided a state-run oil company Tue...,Prosecutors raided a state-run oil company Tue...,Social affairs
3,Gov't vows effort for release of S. Koreans in...,KH디지털2,2015-05-12 12:00:00,South Korea said Tuesday it will continue all-...,South Korea said Tuesday it will continue all-...,Politics
4,"S. Korea, Tanzania in talks to open embassy in...",KH디지털2,2015-05-12 11:57:00,South Korea and Tanzania held talks Tuesday on...,South Korea and Tanzania held talks Tuesday on...,International
...,...,...,...,...,...,...
2765,N. Korean leader's speech arouses cautious opt...,KH디지털2,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea
2766,N. Korean leader open to inter-Korean summit t...,KH디지털2,2015-01-01 10:05:00,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea
2767,Ex-U.S. envoy calls for clearer communication ...,KH디지털2,2015-01-01 09:27:00,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea
2768,U.S. imposes sanctions on N. Korean firm,KH디지털2,2015-01-01 09:25:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea


In [362]:
dataframe = dataframe.reset_index(drop=True)
dataframe['index'] = dataframe.index

In [363]:
ner_df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,62,63,64,65,66,67,68,69,index,sentence
0,0,1,8,1,1,2,6,5,1,2,...,0,0,0,0,0,0,0,0,0,"['south', 'korean', 'ambassador', 'to', 'china..."
1,1,1,1,1,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,0,"['to', 'better', 'cope', 'with', 'the', 'growi..."
2,2,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['arguing', 'that', 'the', 'u.s.', 'missile-de..."
3,3,1,1,1,2,9,1,4,7,1,...,0,0,0,0,0,0,0,0,0,"['in', 'return', ',', 'south', 'korea', ""'s"", ..."
4,4,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['the', 'fundamental', 'reason', 'why', 'debat..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46750,46750,1,1,1,1,1,1,2,9,1,...,0,0,0,0,0,0,0,0,2769,"['still', ',', 'it', 'remains', 'unclear', 'wh..."
46751,46751,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2769,"['she', 'also', 'expressed', 'hope', 'that', '..."
46752,46752,2,9,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2769,"['south', 'korea', ""'s"", 'military', 'came', '..."
46753,46753,1,3,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2769,"['in', 'june', ',', 'an', 'army', 'sergeant', ..."


In [364]:
# Attach the article columns to the aggregated entity table, keeping every row
# of `result`. The join key is named explicitly instead of relying on "all
# column names the two frames happen to share", and validate= makes pandas
# raise if either side contains a duplicated 'index' value.
df = pd.merge(dataframe, result, on='index', how='right', validate='one_to_one')

In [365]:
df

Unnamed: 0,title,author,time,description,body,section,index,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon
0,S. Korean envoy dismisses China's concern over...,KH디지털2,2015-05-12 17:01:00,South Korean Ambassador to China Kim Jang-soo ...,South Korean Ambassador to China Kim Jang-soo ...,International,0,"china kim jang-soo,beijing,u.s.,south korea,no...","defense ministry,national security council","u.s.officials,kim,kim,han min-koo,kim,cheong w...","korean,korean,korean,american,chinese,chinese","tuesday,october,saturday",,,
1,Park renews calls for civil service pension re...,KH디지털2,2015-05-12 12:04:00,President Park Geun-hye called Tuesday for the...,President Park Geun-hye called Tuesday for the...,Politics,1,south korea,"park,park",president park geun-hye,koreans,"tuesday,may",,,
2,State oil firm raided over shady energy projects,KH디지털2,2015-05-12 12:03:00,Prosecutors raided a state-run oil company Tue...,Prosecutors raided a state-run oil company Tue...,Social affairs,2,"korea national,corp,seoul,seoul central,of korea","bank merrill lynch,state audit agency","president kang young-won,posco,knoc,lee myung-bak",canadian,"tuesday,2009",,,
3,Gov't vows effort for release of S. Koreans in...,KH디지털2,2015-05-12 12:00:00,South Korea said Tuesday it will continue all-...,South Korea said Tuesday it will continue all-...,Politics,3,"south korea,north korea,north korea since apri...","new york university,cnn,national intelligence ...","joo won-moon,joo,kim jung-wook,joo,kim,kim kuk...","koreans,chinese,korean,koreans,koreans","tuesday,since october 2013",,,
4,"S. Korea, Tanzania in talks to open embassy in...",KH디지털2,2015-05-12 11:57:00,South Korea and Tanzania held talks Tuesday on...,South Korea and Tanzania held talks Tuesday on...,International,4,"south korea,tanzania,seoul,south korea,tanzani...","tanzanian embassy,ministry of foreign affairs,...","kwon hee-seog,mbelwa kairuki","african,korean,tanzanian","tuesday,1992,1992,three",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2765,N. Korean leader's speech arouses cautious opt...,KH디지털2,2015-01-01 13:36:00,North Korean leader Kim Jong-un's New Year's D...,North Korean leader Kim Jong-un's New Year's D...,North Korea,2765,"north korean,2015,korea,korea,japan,north kore...","seoul national university institute,university...","kim jong-un,kim,kim,kim,kim,kim,chang yong-seo...","korean president park geun-hye,korean,korean","new,thursday,70th,1910,minister-level,january,...",,,
2766,N. Korean leader open to inter-Korean summit t...,KH디지털2,2015-01-01 10:05:00,North Korean leader Kim Jong-un said Thursday ...,North Korean leader Kim Jong-un said Thursday ...,North Korea,2766,"seoul,mount kumgang,reunion,south korea,japan,...",pyongyang,"kim jong-un,(yonhap)kim,kim echoed park,kim,ki...","korean,korean president park geun-hye,korean,k...","thursday,new,koreas,january,inter-korean ties,...",,,
2767,Ex-U.S. envoy calls for clearer communication ...,KH디지털2,2015-01-01 09:27:00,The United States should make its thoughts on ...,The United States should make its thoughts on ...,North Korea,2767,"united states,north korea,china,beijing,north,...","us,us,us,sino-american cooperation","christopher hill,hill,'no surprises,hill","korean,american,chinese,korean,chinese,chinese...",wednesday,,,
2768,U.S. imposes sanctions on N. Korean firm,KH디지털2,2015-01-01 09:25:00,The United States has imposed sanctions on a N...,The United States has imposed sanctions on a N...,North Korea,2768,"united states,china,iran,iran,north korea,syri...","state department,state department",,"korean,korean,korean,russian,sudanese,venezuelan",tuesday,,,


In [366]:
# Persist the merged table. No index= argument is passed, so the row index is
# written as an unnamed first column (it comes back as 'Unnamed: 0' on read_csv).
df.to_csv('Data/ner_tagged_news/7.csv')

In [224]:
dataframe.iloc[0][' body']

"At least 983 people were caught taking of intrusive pictures of women's body parts, during the summer holiday season, police said Sunday.\xa0The Korean National Police Agency checked 415 public restrooms and changing rooms at beaches, 705 restrooms at subway stations and 2,070 changing rooms at waterparks in an intensive crackdown that ended on Aug. 20.\xa0YonhapRestrooms and changing rooms have long been easy targets for rigging cameras up on the ceiling and wall.\xa0The number of voyeurism crimes has jumped tenfold from 517 in 2006 to 5,185 cases in 2016. The percentage of voyeurs among the total number of sex crime perpetrators also went up from 3.6 percent in 2006 to 24.9\xa0percent in 2015.\xa0Sexual crimes occur more frequently in the summer season, the 2012-2016 National Police Agency data showed, with an average of around 6,400 cases being reported from June to August, which is 30 percent more than those that occurred from December to February.\xa0During the 50-day intensive c