In [40]:
import os
import json
import re
import string
import pandas as pd
import numpy as np
import ast

In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [4]:
def load_data(dataPath):
    """Read the CSV file at ``dataPath`` into a DataFrame.

    The file is decoded as latin1.

    Args:
        dataPath: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(dataPath, encoding="latin1")


def preprocess_data(data):
    """Turn a token-per-row NER table into parallel word and tag lists.

    Missing cells are forward-filled from the row above, the ``Word`` column
    is lower-cased, and rows are grouped by the ``"Sentence #"`` column.
    The caller's frame is not modified.

    Args:
        data: DataFrame with at least the columns ``"Sentence #"``,
            ``"Word"`` and ``"Tag"``.

    Returns:
        ``(sentences, ner_tags)``: two lists of equal length. ``sentences[k]``
        holds the lower-cased words of the k-th group and ``ner_tags[k]`` the
        tags of the same rows. Groups come in sorted key order (the groupby
        default). An empty frame yields ``([], [])``.
    """
    # DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated
    # in recent pandas releases; it returns a new frame.
    data = data.ffill()
    data['Word'] = data['Word'].str.lower()

    sentences, ner_tags = [], []
    # Iterating over the groups directly avoids building (word, tag) pairs
    # only to unzip them again.
    for _, sentence_rows in data.groupby("Sentence #"):
        sentences.append(sentence_rows["Word"].tolist())
        ner_tags.append(sentence_rows["Tag"].tolist())

    # Message text: "total number of samples".
    print("전체 샘플 개수: {}".format(len(sentences)))

    return sentences, ner_tags


def tozenizer(sentences, ner_tags):
    """Fit one Keras ``Tokenizer`` on the words and one on the NER tags.

    Args:
        sentences: Word sequences passed to the word tokenizer.
        ner_tags: Tag sequences passed to the tag tokenizer.

    Returns:
        ``(src_tokenizer, tar_tokenizer)``: the fitted word tokenizer and the
        fitted tag tokenizer, in that order.
    """
    # Tag vocabulary: lower=False keeps the tag strings in their original case.
    tar_tokenizer = Tokenizer(lower=False)
    tar_tokenizer.fit_on_texts(ner_tags)

    # Word vocabulary: every word is kept; 'OOV' is registered as the
    # out-of-vocabulary token (it takes index 1).
    src_tokenizer = Tokenizer(oov_token='OOV')
    src_tokenizer.fit_on_texts(sentences)

    return src_tokenizer, tar_tokenizer

In [203]:
def process_ner(sentence, bio):
    """Collect runs of consecutive entity-tagged tokens from one sentence.

    Tokens and tags are paired positionally; the longer input is cut to the
    length of the shorter one. Tag values 0 and 1 mark tokens outside any
    entity. Every maximal run of tokens with any other tag becomes one
    entity, labelled with the tag of the run's first token, even when later
    tokens in the run carry a different tag.

    Args:
        sentence: Sequence of token strings.
        bio: Sequence of tag values, one per token.

    Returns:
        List of ``(text, tag)`` tuples in order of appearance, where ``text``
        is the run's tokens joined by single spaces.
    """
    entities = []
    span = []  # (token, tag) pairs of the run currently being collected

    for token, tag in zip(sentence, bio):
        if tag != 0 and tag != 1:
            span.append((token, tag))
            continue
        # A non-entity token closes any open run.
        if span:
            entities.append((" ".join(tok for tok, _ in span), span[0][1]))
            span = []

    # A run that reaches the end of the sentence is still open here.
    if span:
        entities.append((" ".join(tok for tok, _ in span), span[0][1]))

    return entities

In [255]:
# CSV passed to load_data(); the word and tag tokenizers are fitted on it.
nerDataPath = 'Data/entity-annotated-corpus/ner_dataset.csv'
# Directory that is listed for the news JSON files.
jsonDataPath = 'Data/news/'
# CSV with per-sentence tag indices and token lists; read into ner_df.
resultDataPath = 'Data/temp3.csv'

In [43]:
# Fit the tokenizers on the NER corpus to obtain word_to_index and index_to_ner.
nerData = load_data(nerDataPath)
sentences, ner_tags = preprocess_data(nerData)

src_tokenizer, tar_tokenizer = tozenizer(sentences, ner_tags)
word_to_index = src_tokenizer.word_index
index_to_ner = tar_tokenizer.index_word
# Add an entry for index 0, labelled 'PAD' (presumably the padding id — confirm).
index_to_ner[0]='PAD'

전체 샘플 개수: 47959


In [256]:
# Load the per-sentence tag table stored at resultDataPath.
ner_df = pd.read_csv(resultDataPath)

In [257]:
ner_df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,62,63,64,65,66,67,68,69,index,sentence
0,0,1,1,1,3,10,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['special', 'prosecutors', 'on', 'saturday', '..."
1,1,1,1,1,1,6,5,1,6,1,...,0,0,0,0,0,0,0,0,0,"['former', 'vice', 'culture', 'minister', 'chu..."
2,2,1,1,1,1,1,1,1,1,6,...,0,0,0,0,0,0,0,0,0,"['there', 'have', 'been', 'allegations', 'that..."
3,3,1,1,1,1,6,5,1,3,1,...,0,0,0,0,0,0,0,0,0,"['the', 'team', ""'s"", 'spokesman', 'lee', 'kyu..."
4,4,6,1,6,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['chung', 'and', 'shin', 'were', 'both', 'prev..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53157,53157,6,1,2,1,1,1,1,1,2,...,0,0,0,0,0,0,0,0,2999,"['hwang', 'added', 'china', 'is', 'unlikely', ..."
53158,53158,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2999,"['the', 'prime', 'minister', 'said', 'he', 'ma..."
53159,53159,2,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2999,"['china', ""'s"", 'foreign', 'ministry', 'has', ..."
53160,53160,6,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2999,"['hwang', 'also', 'pointed', 'out', 'that', 'i..."


In [258]:
# Start from an empty frame; the following cell fills ner_df2 with one row per sentence.
ner_df2 = pd.DataFrame()

In [259]:
# Build ner_df2: one row per sentence of ner_df, one column per entity type,
# each cell holding the comma-joined entity texts found in that sentence.
# NOTE(review): the recorded run of this cell stopped with
# "IndexError: list index out of range" at i == 49072; the cause is not visible
# here — inspect ner_df.iloc[49072] before trusting a full run.

# Output columns of the per-sentence entity table, in their final order.
entity_columns = ['Geographical Entity', 'Organization', 'Person', 'Geopolitical Entity',
                  'Time indicator', 'Artifact', 'Event', 'Natural Phenomenon']

# Tag index of an entity's first token -> column the entity is reported under.
# Entities whose first tag is not listed here are dropped.
tag_to_column = {2: 'Geographical Entity', 3: 'Time indicator', 4: 'Organization', 6: 'Person',
                 8: 'Geopolitical Entity', 11: 'Artifact', 12: 'Event', 15: 'Natural Phenomenon'}

# Rows are collected in a plain list and converted to a DataFrame once after the
# loop: DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call, which made the loop quadratic.
ner_rows = []
for i in range(ner_df.shape[0]):
    if i % 500 == 0:
        print(i)  # coarse progress indicator

    row = ner_df.iloc[i]

    # NOTE(review): assumes positions 1..70 of the row are the tag columns
    # '0'..'69' — confirm against the CSV layout.
    bio = list(row.iloc[1:71])
    # The token list is stored as the text of a Python list literal.
    sentence = ast.literal_eval(row['sentence'])

    ner_dict = {column: [] for column in entity_columns}
    for word, tag in process_ner(sentence, bio):
        column = tag_to_column.get(tag)
        if column is not None:
            ner_dict[column].append(word)

    ner_rows.append([','.join(ner_dict[column]) for column in entity_columns])

ner_df2 = pd.DataFrame(ner_rows, columns=entity_columns)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
12000
12500
13000
13500
14000
14500
15000
15500
16000
16500
17000
17500
18000
18500
19000
19500
20000
20500
21000
21500
22000
22500
23000
23500
24000
24500
25000
25500
26000
26500
27000
27500
28000
28500
29000
29500
30000
30500
31000
31500
32000
32500
33000
33500
34000
34500
35000
35500
36000
36500
37000
37500
38000
38500
39000
39500
40000
40500
41000
41500
42000
42500
43000
43500
44000
44500
45000
45500
46000
46500
47000
47500
48000
48500
49000


IndexError: list index out of range

In [260]:
i

49072

In [236]:
# Copy the 'index' column from ner_df. Series assignment aligns on the row
# labels, so this assumes ner_df2 rows line up with ner_df rows — TODO confirm.
ner_df2['index'] = ner_df['index']

In [237]:
ner_df2

Unnamed: 0,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon,index
0,"seoul,unrefined","us president donald trump ’s,twitter",,,two countries’,,,,0
1,,“job-killing” south korea-us,,,,,,,0
2,trump,,,,,,,,0
3,south korea,"us president donald trump (ap-yonhap)the,reuters",,,friday,,,,0
4,,washington times,,,saturday,,,,0
...,...,...,...,...,...,...,...,...,...
49708,,,,,,,,,2999
49709,,,,,,,,,2999
49710,,,,,october,,,,2999
49711,,,,,,,,,2999


In [238]:
# Labels of every column except the last one.
cols = ner_df2.iloc[:, 0:-1].columns

In [239]:
cols

Index(['Geographical Entity', 'Organization', 'Person', 'Geopolitical Entity',
       'Time indicator', 'Artifact', 'Event', 'Natural Phenomenon'],
      dtype='object')

In [244]:
# Collapse the sentence-level table to one row per 'index' value: for each
# column in cols, the non-empty cells of all rows sharing that value are joined
# with commas.

# Number of 'index' values to emit; ids 0..N_ARTICLES-1 each get a row.
# NOTE(review): 3000 was hard-coded in the original loop — confirm it matches
# the number of articles in the file being processed.
N_ARTICLES = 3000

# Split the table once instead of re-filtering the whole frame for every id.
rows_by_article = {key: group for key, group in ner_df2.groupby('index')}
no_rows = ner_df2.iloc[0:0]  # stand-in for an id that has no rows

data = []
for i in range(N_ARTICLES):
    index_df = rows_by_article.get(i, no_rows)
    data.append([','.join(k for k in index_df[col] if k != '') for col in cols])

In [245]:
# One row per entry of data, with the columns named after cols.
result = pd.DataFrame(data=data, columns=list(cols))

In [246]:
# result = dataframe.reset_index(drop=True)
# Store each row's label in an 'index' column.
result['index'] = result.index

In [247]:
result

Unnamed: 0,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon,index
0,"seoul,unrefined,trump,south korea,seoul,washin...","us president donald trump ’s,twitter,“job-kill...","kim kwan-jin,h.r. mcmaster,mcmaster ’s,buzzfee...","korean,korean,american","two countries’,friday,saturday,saturday,may be...",,,,0
1,,ministry of gender equality,"25-29 (27,6 percent),30-34 (10,(123rf)social,7...",korean,"monday,2013,past,8,8,between 20 and 24,20 (3,9...",,,,1
2,"south korea,south jeolla,mokpo municipality,seoul",us,"mokpo,(yonhap)the,,800-ton sewol",,"monday,may 1 , 2017,6 ,800-ton sewol,april 16 ...",,,,2
3,"democratic party of korea,tomorrow ,”,south ko...",democratic party of korea (yonhap)“(even) choi...,"yoo eun-hae,“it,‘hell joseon’,,” yoo,hell jose...",,"may 9 election,two-day,thursday,friday,monday,...",,,,3
4,,ministry of gender equality,(yonhap)a,,"monday,past,acquaintance",,,,4
...,...,...,...,...,...,...,...,...,...
2995,"south korea,denmark,seoul,denmark,copenhagen,s...",,"chung yoo-ra,choi jai-chul,mohammad ahsan,chun...","danish,korean,choi,danish,danish,korean,danish...","saturday,saturday,chung,end of",,,,2995
2996,"us,pyongyang,rok,rok,korea","us,us sometime later,us,brigade,usfk j5 strate...","kim jong-un,,000-2 ,000-strong,kim jong-un,dig...","korean,korean,korean,korean,korean",sunday,,,,2996
2997,"tokyo (afp),south korea,tokyo,tokyo,japan,huge...",,"6 million),abe,mainstream historians","japanese prime minister shinzo abe,japanese,ja...","friday,busan,ii,sunday,ii",,,,2997
2998,"park,choi,seoul,gwanghwamun square,april2014,s...","park and lit candles,park,national assembly,ul...","scandal-tainted president park geun-hye,choi s...",koreans,"saturday,11th consecutive saturday,oct,7 milli...",,,,2998


In [165]:
# os.listdir returns entries in arbitrary order; sort them so that positional
# access such as datapaths[2] picks the same file on every machine.
datapaths = sorted(os.listdir(jsonDataPath))

In [166]:
datapaths

['koreaherald_1517_0.json',
 'koreaherald_1517_1.json',
 'koreaherald_1517_2.json',
 'koreaherald_1517_3.json',
 'koreaherald_1517_4.json',
 'koreaherald_1517_5.json',
 'koreaherald_1517_6.json',
 'koreaherald_1517_7.json']

In [248]:
# Read one news file. The encoding is given explicitly so that non-ASCII text
# is decoded the same way regardless of the platform's default encoding.
with open(os.path.join(jsonDataPath, datapaths[2]), 'r', encoding='utf-8') as f:
    data = json.load(f)

# Built after the file is closed; the handle is no longer needed here.
dataframe = pd.DataFrame.from_dict(data)

In [249]:
dataframe

Unnamed: 0,title,author,time,description,body,section
0,"Trump’s crude, unilateral approach threatens t...",Shin Hyon-hee,2017-05-01 15:46:00,Rekindled by his demand for Seoul to pay for t...,Rekindled by his demand for Seoul to pay for t...,
1,Half of all Korean men pay for sex: report,Ock Hyun-ju,2017-05-01 15:43:00,More than half of Korean men have paid for sex...,More than half of Korean men have paid for sex...,Social affairs
2,Items belonging to ferry sinking victims conti...,a2017001,2017-05-01 15:37:00,MOKPO -- The number of recovered items belongi...,MOKPO -- The number of recovered items belongi...,Social affairs
3,Candidates tense over holidays’ effect on turnout,Korea Herald,2017-05-01 15:33:00,With an unusually long holiday week ahead of t...,With an unusually long holiday week ahead of t...,Politics
4,Most teenage prostitutions conducted via onlin...,a2017001,2017-05-01 15:32:00,More than seven out of 10 teenage prostitution...,More than seven out of 10 teenage prostitution...,Social affairs
...,...,...,...,...,...,...
2995,Korean envoy asks Denmark for swift decision o...,KH디지털2,2017-01-08 14:02:00,South Korea's top diplomat in Denmark has urge...,South Korea's top diplomat in Denmark has urge...,Social affairs
2996,US special units said to join in operation to ...,KH디지털2,2017-01-08 13:59:00,US special operation forces are expected to pa...,US special operation forces are expected to pa...,Diplomatic Circuit
2997,Japan PM urges S.Korea to remove 'comfort woma...,KH디지털2,2017-01-08 13:50:00,TOKYO (AFP) -- Japanese Prime Minister Shinzo ...,TOKYO (AFP) -- Japanese Prime Minister Shinzo ...,Diplomatic Circuit
2998,S. Koreans stage weekly rally demanding Park's...,KH디지털2,2017-01-08 13:45:00,Crowds of South Koreans took to streets across...,Crowds of South Koreans took to streets across...,Social affairs


In [250]:
# Renumber the rows 0..n-1 and store that number in an 'index' column.
dataframe = dataframe.reset_index(drop=True)
dataframe['index'] = dataframe.index

In [251]:
ner_df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,62,63,64,65,66,67,68,69,index,sentence
0,0,1,1,1,1,1,2,1,1,1,...,0,0,0,0,0,0,0,0,0,"['rekindled', 'by', 'his', 'demand', 'for', 's..."
1,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['on', 'the', 'campaign', 'trail', ',', 'he', ..."
2,2,1,1,1,1,1,1,2,1,1,...,0,0,0,0,0,0,0,0,0,"['just', 'over', '100', 'days', 'into', 'the',..."
3,3,4,6,5,5,5,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['us', 'president', 'donald', 'trump', '(ap-yo..."
4,4,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"['he', 'repeated', 'these', 'demands', 'in', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49708,49708,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2999,"['""', 'the', 'two', 'were', 'sprinkling', 'wat..."
49709,49709,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2999,"['two', 'other', 'workers', 'sustained', 'mino..."
49710,49710,1,1,1,1,1,1,3,1,1,...,0,0,0,0,0,0,0,0,2999,"['the', 'demolition', ',', 'which', 'began', '..."
49711,49711,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,2999,"['temporary', 'traffic', 'controls', 'are', 'i..."


In [252]:
# Attach the aggregated entities to the articles. The join key is named
# explicitly so the merge cannot silently switch keys if the two frames ever
# share another column name; how='right' keeps every row of result.
df = pd.merge(dataframe, result, how='right', on='index')

In [253]:
df

Unnamed: 0,title,author,time,description,body,section,index,Geographical Entity,Organization,Person,Geopolitical Entity,Time indicator,Artifact,Event,Natural Phenomenon
0,"Trump’s crude, unilateral approach threatens t...",Shin Hyon-hee,2017-05-01 15:46:00,Rekindled by his demand for Seoul to pay for t...,Rekindled by his demand for Seoul to pay for t...,,0,"seoul,unrefined,trump,south korea,seoul,washin...","us president donald trump ’s,twitter,“job-kill...","kim kwan-jin,h.r. mcmaster,mcmaster ’s,buzzfee...","korean,korean,american","two countries’,friday,saturday,saturday,may be...",,,
1,Half of all Korean men pay for sex: report,Ock Hyun-ju,2017-05-01 15:43:00,More than half of Korean men have paid for sex...,More than half of Korean men have paid for sex...,Social affairs,1,,ministry of gender equality,"25-29 (27,6 percent),30-34 (10,(123rf)social,7...",korean,"monday,2013,past,8,8,between 20 and 24,20 (3,9...",,,
2,Items belonging to ferry sinking victims conti...,a2017001,2017-05-01 15:37:00,MOKPO -- The number of recovered items belongi...,MOKPO -- The number of recovered items belongi...,Social affairs,2,"south korea,south jeolla,mokpo municipality,seoul",us,"mokpo,(yonhap)the,,800-ton sewol",,"monday,may 1 , 2017,6 ,800-ton sewol,april 16 ...",,,
3,Candidates tense over holidays’ effect on turnout,Korea Herald,2017-05-01 15:33:00,With an unusually long holiday week ahead of t...,With an unusually long holiday week ahead of t...,Politics,3,"democratic party of korea,tomorrow ,”,south ko...",democratic party of korea (yonhap)“(even) choi...,"yoo eun-hae,“it,‘hell joseon’,,” yoo,hell jose...",,"may 9 election,two-day,thursday,friday,monday,...",,,
4,Most teenage prostitutions conducted via onlin...,a2017001,2017-05-01 15:32:00,More than seven out of 10 teenage prostitution...,More than seven out of 10 teenage prostitution...,Social affairs,4,,ministry of gender equality,(yonhap)a,,"monday,past,acquaintance",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,Korean envoy asks Denmark for swift decision o...,KH디지털2,2017-01-08 14:02:00,South Korea's top diplomat in Denmark has urge...,South Korea's top diplomat in Denmark has urge...,Social affairs,2995,"south korea,denmark,seoul,denmark,copenhagen,s...",,"chung yoo-ra,choi jai-chul,mohammad ahsan,chun...","danish,korean,choi,danish,danish,korean,danish...","saturday,saturday,chung,end of",,,
2996,US special units said to join in operation to ...,KH디지털2,2017-01-08 13:59:00,US special operation forces are expected to pa...,US special operation forces are expected to pa...,Diplomatic Circuit,2996,"us,pyongyang,rok,rok,korea","us,us sometime later,us,brigade,usfk j5 strate...","kim jong-un,,000-2 ,000-strong,kim jong-un,dig...","korean,korean,korean,korean,korean",sunday,,,
2997,Japan PM urges S.Korea to remove 'comfort woma...,KH디지털2,2017-01-08 13:50:00,TOKYO (AFP) -- Japanese Prime Minister Shinzo ...,TOKYO (AFP) -- Japanese Prime Minister Shinzo ...,Diplomatic Circuit,2997,"tokyo (afp),south korea,tokyo,tokyo,japan,huge...",,"6 million),abe,mainstream historians","japanese prime minister shinzo abe,japanese,ja...","friday,busan,ii,sunday,ii",,,
2998,S. Koreans stage weekly rally demanding Park's...,KH디지털2,2017-01-08 13:45:00,Crowds of South Koreans took to streets across...,Crowds of South Koreans took to streets across...,Social affairs,2998,"park,choi,seoul,gwanghwamun square,april2014,s...","park and lit candles,park,national assembly,ul...","scandal-tainted president park geun-hye,choi s...",koreans,"saturday,11th consecutive saturday,oct,7 milli...",,,


In [254]:
# Write the merged table to disk. With to_csv defaults the row index is written
# as an extra unnamed first column.
df.to_csv('Data/ner_tagged_news/2.csv')

In [224]:
# Show the body text of the first row.
# NOTE(review): the key is ' body' with a leading space — confirm it matches
# the actual column name of the frame currently bound to dataframe.
dataframe.iloc[0][' body']

"At least 983 people were caught taking of intrusive pictures of women's body parts, during the summer holiday season, police said Sunday.\xa0The Korean National Police Agency checked 415 public restrooms and changing rooms at beaches, 705 restrooms at subway stations and 2,070 changing rooms at waterparks in an intensive crackdown that ended on Aug. 20.\xa0YonhapRestrooms and changing rooms have long been easy targets for rigging cameras up on the ceiling and wall.\xa0The number of voyeurism crimes has jumped tenfold from 517 in 2006 to 5,185 cases in 2016. The percentage of voyeurs among the total number of sex crime perpetrators also went up from 3.6 percent in 2006 to 24.9\xa0percent in 2015.\xa0Sexual crimes occur more frequently in the summer season, the 2012-2016 National Police Agency data showed, with an average of around 6,400 cases being reported from June to August, which is 30 percent more than those that occurred from December to February.\xa0During the 50-day intensive c